1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_pageout.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	The proverbial page-out daemon.
64  */
65 
66 #include "mach/kern_return.h"
67 #include <stdint.h>
68 #include <ptrauth.h>
69 
70 #include <debug.h>
71 
72 #include <mach/mach_types.h>
73 #include <mach/memory_object.h>
74 #include <mach/mach_host_server.h>
75 #include <mach/upl.h>
76 #include <mach/vm_map.h>
77 #include <mach/vm_param.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/sdt.h>
80 
81 #include <kern/kern_types.h>
82 #include <kern/counter.h>
83 #include <kern/host_statistics.h>
84 #include <kern/machine.h>
85 #include <kern/misc_protos.h>
86 #include <kern/sched.h>
87 #include <kern/thread.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/policy_internal.h>
91 #include <kern/thread_group.h>
92 
93 #include <os/log.h>
94 
95 #include <sys/kdebug_triage.h>
96 
97 #include <machine/vm_tuning.h>
98 #include <machine/commpage.h>
99 
100 #include <vm/pmap.h>
101 #include <vm/vm_compressor_pager_internal.h>
102 #include <vm/vm_fault_internal.h>
103 #include <vm/vm_map_internal.h>
104 #include <vm/vm_object_internal.h>
105 #include <vm/vm_page_internal.h>
106 #include <vm/vm_pageout_internal.h>
107 #include <vm/vm_protos_internal.h> /* must be last */
108 #include <vm/memory_object.h>
109 #include <vm/vm_purgeable_internal.h>
110 #include <vm/vm_shared_region.h>
111 #include <vm/vm_compressor_internal.h>
112 #include <vm/vm_kern_xnu.h>
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_ubc.h>
115 #include <vm/vm_reclaim_xnu.h>
116 
117 #include <san/kasan.h>
118 #include <sys/kern_memorystatus_xnu.h>
119 
120 #if CONFIG_PHANTOM_CACHE
121 #include <vm/vm_phantom_cache_internal.h>
122 #endif
123 
124 
125 #if UPL_DEBUG
126 #include <libkern/OSDebug.h>
127 #endif
128 
129 extern int cs_debug;
130 
131 #if CONFIG_MBUF_MCACHE
132 extern void mbuf_drain(boolean_t);
133 #endif /* CONFIG_MBUF_MCACHE */
134 
135 #if CONFIG_FREEZE
136 extern unsigned int memorystatus_frozen_count;
137 extern unsigned int memorystatus_suspended_count;
138 #endif /* CONFIG_FREEZE */
139 extern vm_pressure_level_t memorystatus_vm_pressure_level;
140 
141 extern lck_mtx_t memorystatus_jetsam_broadcast_lock;
142 extern uint32_t memorystatus_jetsam_fg_band_waiters;
143 extern uint32_t memorystatus_jetsam_bg_band_waiters;
144 
145 void vm_pressure_response(void);
146 extern void consider_vm_pressure_events(void);
147 
148 #define MEMORYSTATUS_SUSPENDED_THRESHOLD  4
149 
150 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
151 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
152 sched_cond_atomic_t vm_pageout_gc_cond;
153 #if CONFIG_VPS_DYNAMIC_PRIO
154 TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
155 #else
156 const bool vps_dynamic_priority_enabled = false;
157 #endif
158 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
159 
160 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
161 #if !XNU_TARGET_OS_OSX
162 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
163 #else /* !XNU_TARGET_OS_OSX */
164 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
165 #endif /* !XNU_TARGET_OS_OSX */
166 #endif
167 
168 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
169 #define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
170 #endif
171 
172 #ifndef VM_PAGE_LAUNDRY_MAX
173 #define VM_PAGE_LAUNDRY_MAX     128UL   /* maximum pageouts on a given pageout queue */
174 #endif  /* VM_PAGE_LAUNDRY_MAX */
175 
176 #ifndef VM_PAGEOUT_BURST_WAIT
177 #define VM_PAGEOUT_BURST_WAIT   1       /* milliseconds */
178 #endif  /* VM_PAGEOUT_BURST_WAIT */
179 
180 #ifndef VM_PAGEOUT_EMPTY_WAIT
181 #define VM_PAGEOUT_EMPTY_WAIT   50      /* milliseconds */
182 #endif  /* VM_PAGEOUT_EMPTY_WAIT */
183 
184 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
185 #define VM_PAGEOUT_DEADLOCK_WAIT 100    /* milliseconds */
186 #endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
187 
188 #ifndef VM_PAGEOUT_IDLE_WAIT
189 #define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
190 #endif  /* VM_PAGEOUT_IDLE_WAIT */
191 
192 #ifndef VM_PAGEOUT_SWAP_WAIT
193 #define VM_PAGEOUT_SWAP_WAIT    10      /* milliseconds */
194 #endif  /* VM_PAGEOUT_SWAP_WAIT */
195 
196 /*
197  * vm_page_max_speculative_age_q should be less than or equal to
198  * VM_PAGE_RESERVED_SPECULATIVE_AGE_Q, which is the number of allocated
199  * vm_page_queue_speculative entries.
200  */
201 
202 TUNABLE_DEV_WRITEABLE(unsigned int, vm_page_max_speculative_age_q, "vm_page_max_speculative_age_q", VM_PAGE_DEFAULT_MAX_SPECULATIVE_AGE_Q);
203 #ifndef VM_PAGE_SPECULATIVE_TARGET
204 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
205 #endif /* VM_PAGE_SPECULATIVE_TARGET */
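/*
 * Worked example (illustrative note, not part of the original source):
 * with a speculative percentage of, say, 5, VM_PAGE_SPECULATIVE_TARGET(total)
 * expands to (total) * 1 / (100 / 5) == (total) / 20, i.e. roughly 5% of
 * "total".  Because (100 / percentage) is integer division, percentages
 * that do not divide 100 evenly are rounded in the divisor first.
 */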
206 
207 
208 /*
209  *	To obtain a reasonable LRU approximation, the inactive queue
210  *	needs to be large enough to give pages on it a chance to be
211  *	referenced a second time.  This macro defines the fraction
212  *	of active+inactive pages that should be inactive.
213  *	The pageout daemon uses it to update vm_page_inactive_target.
214  *
215  *	If vm_page_free_count falls below vm_page_free_target and
216  *	vm_page_inactive_count is below vm_page_inactive_target,
217  *	then the pageout daemon starts running.
218  */
219 
220 #ifndef VM_PAGE_INACTIVE_TARGET
221 #define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
222 #endif  /* VM_PAGE_INACTIVE_TARGET */
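/*
 * Worked example (illustrative note, not part of the original source):
 * with 100000 pageable (active + inactive) pages, VM_PAGE_INACTIVE_TARGET
 * yields 100000 * 1 / 2 == 50000, so roughly half of the pageable pages
 * are targeted for the inactive queue.
 */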
223 
224 /*
225  *	Once the pageout daemon starts running, it keeps going
226  *	until vm_page_free_count meets or exceeds vm_page_free_target.
227  */
228 
229 #ifndef VM_PAGE_FREE_TARGET
230 #if !XNU_TARGET_OS_OSX
231 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
232 #else /* !XNU_TARGET_OS_OSX */
233 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
234 #endif /* !XNU_TARGET_OS_OSX */
235 #endif  /* VM_PAGE_FREE_TARGET */
236 
237 
238 /*
239  *	The pageout daemon always starts running once vm_page_free_count
240  *	falls below vm_page_free_min.
241  */
242 
243 #ifndef VM_PAGE_FREE_MIN
244 #if !XNU_TARGET_OS_OSX
245 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
246 #else /* !XNU_TARGET_OS_OSX */
247 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
248 #endif /* !XNU_TARGET_OS_OSX */
249 #endif  /* VM_PAGE_FREE_MIN */
250 
251 #if !XNU_TARGET_OS_OSX
252 #define VM_PAGE_FREE_RESERVED_LIMIT     100
253 #define VM_PAGE_FREE_MIN_LIMIT          1500
254 #define VM_PAGE_FREE_TARGET_LIMIT       2000
255 #else /* !XNU_TARGET_OS_OSX */
256 #define VM_PAGE_FREE_RESERVED_LIMIT     1700
257 #define VM_PAGE_FREE_MIN_LIMIT          3500
258 #define VM_PAGE_FREE_TARGET_LIMIT       4000
259 #endif /* !XNU_TARGET_OS_OSX */
260 
261 /*
262  *	When vm_page_free_count falls below vm_page_free_reserved,
263  *	only vm-privileged threads can allocate pages.  vm-privilege
264  *	allows the pageout daemon and default pager (and any other
265  *	associated threads needed for default pageout) to continue
266  *	operation by dipping into the reserved pool of pages.
267  */
268 
269 #ifndef VM_PAGE_FREE_RESERVED
270 #define VM_PAGE_FREE_RESERVED(n)        \
271 	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
272 #endif  /* VM_PAGE_FREE_RESERVED */
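/*
 * Worked example (illustrative note, not part of the original source):
 * with VM_PAGE_LAUNDRY_MAX at its default of 128, VM_PAGE_FREE_RESERVED(n)
 * evaluates to 6 * 128 + n == 768 + n pages held back for vm-privileged
 * threads.
 */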
273 
274 /*
275  *	When we dequeue pages from the inactive list, they are
276  *	reactivated (ie, put back on the active queue) if referenced.
277  *	However, it is possible to starve the free list if other
278  *	processors are referencing pages faster than we can turn off
279  *	the referenced bit.  So we limit the number of reactivations
280  *	we will make per call of vm_pageout_scan().
281  */
282 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
283 
284 #ifndef VM_PAGE_REACTIVATE_LIMIT
285 #if !XNU_TARGET_OS_OSX
286 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
287 #else /* !XNU_TARGET_OS_OSX */
288 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
289 #endif /* !XNU_TARGET_OS_OSX */
290 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
291 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
292 
293 int vm_pageout_protect_realtime = true;
294 
295 extern boolean_t hibernate_cleaning_in_progress;
296 
297 struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
298 struct pgo_iothread_state pgo_iothread_external_state;
299 
300 #if VM_PRESSURE_EVENTS
301 void vm_pressure_thread(void);
302 
303 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
304 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
305 
306 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
307 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
308 #endif
309 
310 static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
311 static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
312 static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);
313 
314 extern void vm_pageout_continue(void);
315 extern void vm_pageout_scan(void);
316 
317 boolean_t vm_pageout_running = FALSE;
318 
319 uint32_t vm_page_upl_tainted = 0;
320 uint32_t vm_page_iopl_tainted = 0;
321 
322 #if XNU_TARGET_OS_OSX
323 static boolean_t vm_pageout_waiter  = FALSE;
324 #endif /* XNU_TARGET_OS_OSX */
325 
326 
327 #if DEVELOPMENT || DEBUG
328 struct vm_pageout_debug vm_pageout_debug;
329 #endif
330 struct vm_pageout_vminfo vm_pageout_vminfo;
331 struct vm_pageout_state  vm_pageout_state;
332 struct vm_config         vm_config;
333 
334 struct  vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
335 struct  vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
336 #if DEVELOPMENT || DEBUG
337 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
338 #endif /* DEVELOPMENT || DEBUG */
339 
340 int         vm_upl_wait_for_pages = 0;
341 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
342 
343 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
344 
345 int     vm_debug_events = 0;
346 
347 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
348 
349 #if CONFIG_MEMORYSTATUS
350 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
351 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
352 #endif
353 
354 #if __AMP__
355 
356 
357 /*
358  * Bind compressor threads to e-cores unless there are multiple non-e clusters
359  */
360 #if (MAX_CPU_CLUSTERS > 2)
361 #define VM_COMPRESSOR_EBOUND_DEFAULT false
362 #elif defined(XNU_TARGET_OS_XR)
363 #define VM_COMPRESSOR_EBOUND_DEFAULT false
364 #else
365 #define VM_COMPRESSOR_EBOUND_DEFAULT true
366 #endif
367 
368 TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
369 int vm_pgo_pbound = 0;
370 extern void thread_soft_bind_cluster_type(thread_t, char);
371 
372 #endif /* __AMP__ */
373 
374 
375 /*
376  *	Routine:	vm_pageout_object_terminate
377  *	Purpose:
378  *		Destroy the pageout_object, and perform all of the
379  *		required cleanup actions.
380  *
381  *	In/Out conditions:
382  *		The object must be locked, and will be returned locked.
383  */
384 void
385 vm_pageout_object_terminate(
386 	vm_object_t     object)
387 {
388 	vm_object_t     shadow_object;
389 
390 	/*
391 	 * Deal with the deallocation (last reference) of a pageout object
392 	 * (used for cleaning-in-place) by dropping the paging references/
393 	 * freeing pages in the original object.
394 	 */
395 
396 	assert(object->pageout);
397 	shadow_object = object->shadow;
398 	vm_object_lock(shadow_object);
399 
400 	while (!vm_page_queue_empty(&object->memq)) {
401 		vm_page_t               p, m;
402 		vm_object_offset_t      offset;
403 
404 		p = (vm_page_t) vm_page_queue_first(&object->memq);
405 
406 		assert(vm_page_is_private(p));
407 		assert(p->vmp_free_when_done);
408 		p->vmp_free_when_done = FALSE;
409 		assert(!p->vmp_cleaning);
410 		assert(!p->vmp_laundry);
411 
412 		offset = p->vmp_offset;
413 		VM_PAGE_FREE(p);
414 		p = VM_PAGE_NULL;
415 
416 		m = vm_page_lookup(shadow_object,
417 		    offset + object->vo_shadow_offset);
418 
419 		if (m == VM_PAGE_NULL) {
420 			continue;
421 		}
422 
423 		assert((m->vmp_dirty) || (m->vmp_precious) ||
424 		    (m->vmp_busy && m->vmp_cleaning));
425 
426 		/*
427 		 * Handle the trusted pager throttle.
428 		 * Also decrement the burst throttle (if external).
429 		 */
430 		vm_page_lock_queues();
431 		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
432 			vm_pageout_throttle_up(m);
433 		}
434 
435 		/*
436 		 * Handle the "target" page(s). These pages are to be freed if
437 		 * successfully cleaned. Target pages are always busy, and are
438 		 * wired exactly once. The initial target pages are not mapped
439 		 * (so they cannot be referenced or modified), but converted target
440 		 * pages may have been modified between the selection as an
441 		 * adjacent page and conversion to a target.
442 		 */
443 		if (m->vmp_free_when_done) {
444 			assert(m->vmp_busy);
445 			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
446 			assert(m->vmp_wire_count == 1);
447 			m->vmp_cleaning = FALSE;
448 			m->vmp_free_when_done = FALSE;
449 			/*
450 			 * Revoke all access to the page. Since the object is
451 			 * locked, and the page is busy, this prevents the page
452 			 * from being dirtied after the pmap_disconnect() call
453 			 * returns.
454 			 *
455 			 * Since the page is left "dirty" but "not modified", we
456 			 * can detect whether the page was redirtied during
457 			 * pageout by checking the modify state.
458 			 */
459 			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
460 				SET_PAGE_DIRTY(m, FALSE);
461 			} else {
462 				m->vmp_dirty = FALSE;
463 			}
464 
465 			if (m->vmp_dirty) {
466 				vm_page_unwire(m, TRUE);        /* reactivates */
467 				counter_inc(&vm_statistics_reactivations);
468 				vm_page_wakeup_done(object, m);
469 			} else {
470 				vm_page_free(m);  /* clears busy, etc. */
471 			}
472 			vm_page_unlock_queues();
473 			continue;
474 		}
475 		/*
476 		 * Handle the "adjacent" pages. These pages were cleaned in
477 		 * place, and should be left alone.
478 		 * If prep_pin_count is nonzero, then someone is using the
479 		 * page, so make it active.
480 		 */
481 		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !vm_page_is_private(m)) {
482 			if (m->vmp_reference) {
483 				vm_page_activate(m);
484 			} else {
485 				vm_page_deactivate(m);
486 			}
487 		}
488 		if (m->vmp_overwriting) {
489 			/*
490 			 * the (COPY_OUT_FROM == FALSE) request_page_list case
491 			 */
492 			if (m->vmp_busy) {
493 				/*
494 				 * We do not re-set m->vmp_dirty !
495 				 * The page was busy so no extraneous activity
496 				 * could have occurred. COPY_INTO is a read into the
497 				 * new pages. CLEAN_IN_PLACE does actually write
498 				 * out the pages but handling outside of this code
499 				 * will take care of resetting dirty. We clear the
500 				 * modify however for the Programmed I/O case.
501 				 */
502 				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
503 
504 				m->vmp_busy = FALSE;
505 				m->vmp_absent = FALSE;
506 			} else {
507 				/*
508 				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
509 				 * Occurs when the original page was wired
510 				 * at the time of the list request
511 				 */
512 				assert(VM_PAGE_WIRED(m));
513 				vm_page_unwire(m, TRUE);        /* reactivates */
514 			}
515 			m->vmp_overwriting = FALSE;
516 		} else {
517 			m->vmp_dirty = FALSE;
518 		}
519 		m->vmp_cleaning = FALSE;
520 
521 		/*
522 		 * Wake up any thread waiting for the page to finish cleaning.
523 		 */
524 		vm_page_wakeup(object, m);
525 		vm_page_unlock_queues();
526 	}
527 	/*
528 	 * Account for the paging reference taken in vm_paging_object_allocate.
529 	 */
530 	vm_object_activity_end(shadow_object);
531 	vm_object_unlock(shadow_object);
532 
533 	assert(os_ref_get_count_raw(&object->ref_count) == 0);
534 	assert(object->paging_in_progress == 0);
535 	assert(object->activity_in_progress == 0);
536 	assert(object->resident_page_count == 0);
537 	return;
538 }
539 
540 /*
541  * Routine:	vm_pageclean_setup
542  *
543  * Purpose:	set up a page to be cleaned (made non-dirty), but not
544  *		necessarily flushed from the VM page cache.
545  *		This is accomplished by cleaning in place.
546  *
547  *		The page must not be busy, and new_object
548  *		must be locked.
549  *
550  */
551 static void
552 vm_pageclean_setup(
553 	vm_page_t               m,
554 	vm_page_t               new_m,
555 	vm_object_t             new_object,
556 	vm_object_offset_t      new_offset)
557 {
558 	assert(!m->vmp_busy);
559 #if 0
560 	assert(!m->vmp_cleaning);
561 #endif
562 
563 	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
564 
565 	/*
566 	 * Mark original page as cleaning in place.
567 	 */
568 	m->vmp_cleaning = TRUE;
569 	SET_PAGE_DIRTY(m, FALSE);
570 	m->vmp_precious = FALSE;
571 
572 	/*
573 	 * Convert the fictitious page to a private shadow of
574 	 * the real page.
575 	 */
576 	new_m->vmp_free_when_done = TRUE;
577 
578 	vm_page_lockspin_queues();
579 	vm_page_make_private(new_m, VM_PAGE_GET_PHYS_PAGE(m));
580 	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
581 	vm_page_unlock_queues();
582 
583 	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
584 	assert(!new_m->vmp_wanted);
585 	new_m->vmp_busy = FALSE;
586 }
587 
588 /*
589  *	Routine:	vm_pageout_initialize_page
590  *	Purpose:
591  *		Causes the specified page to be initialized in
592  *		the appropriate memory object. This routine is used to push
593  *		pages into a copy-object when they are modified in the
594  *		permanent object.
595  *
596  *		The page is moved to a temporary object and paged out.
597  *
598  *	In/out conditions:
599  *		The page in question must not be on any pageout queues.
600  *		The object to which it belongs must be locked.
601  *		The page must be busy, but not hold a paging reference.
602  *
603  *	Implementation:
604  *		Move this page to a completely new object.
605  */
606 void
607 vm_pageout_initialize_page(
608 	vm_page_t       m)
609 {
610 	vm_object_t             object;
611 	vm_object_offset_t      paging_offset;
612 	memory_object_t         pager;
613 
614 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
615 
616 	object = VM_PAGE_OBJECT(m);
617 
618 	assert(m->vmp_busy);
619 	assert(object->internal);
620 
621 	/*
622 	 *	Verify that we really want to clean this page
623 	 */
624 	assert(!m->vmp_absent);
625 	assert(m->vmp_dirty);
626 
627 	/*
628 	 *	Create a paging reference to let us play with the object.
629 	 */
630 	paging_offset = m->vmp_offset + object->paging_offset;
631 
632 	if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
633 		panic("reservation without pageout?"); /* alan */
634 
635 		VM_PAGE_FREE(m);
636 		vm_object_unlock(object);
637 
638 		return;
639 	}
640 
641 	/*
642 	 * If there's no pager, then we can't clean the page.  This should
643 	 * never happen since this should be a copy object and therefore not
644 	 * an external object, so the pager should always be there.
645 	 */
646 
647 	pager = object->pager;
648 
649 	if (pager == MEMORY_OBJECT_NULL) {
650 		panic("missing pager for copy object");
651 
652 		VM_PAGE_FREE(m);
653 		return;
654 	}
655 
656 	/*
657 	 * set the page for future call to vm_fault_list_request
658 	 */
659 	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
660 	SET_PAGE_DIRTY(m, FALSE);
661 
662 	/*
663 	 * keep the object from collapsing or terminating
664 	 */
665 	vm_object_paging_begin(object);
666 	vm_object_unlock(object);
667 
668 	/*
669 	 *	Write the data to its pager.
670 	 *	Note that the data is passed by naming the new object,
671 	 *	not a virtual address; the pager interface has been
672 	 *	manipulated to use the "internal memory" data type.
673 	 *	[The object reference from its allocation is donated
674 	 *	to the eventual recipient.]
675 	 */
676 	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
677 
678 	vm_object_lock(object);
679 	vm_object_paging_end(object);
680 }
681 
682 
683 /*
684  * vm_pageout_cluster:
685  *
686  * Given a page, queue it to the appropriate I/O thread,
687  * which will page it out and attempt to clean adjacent pages
688  * in the same operation.
689  *
690  * The object and queues must be locked. We will take a
691  * paging reference to prevent deallocation or collapse when we
692  * release the object lock back at the call site.  The I/O thread
693  * is responsible for consuming this reference.
694  *
695  * The page must not be on any pageout queue.
696  */
697 #if DEVELOPMENT || DEBUG
698 vmct_stats_t vmct_stats;
699 
700 int32_t vmct_active = 0;
701 uint64_t vm_compressor_epoch_start = 0;
702 uint64_t vm_compressor_epoch_stop = 0;
703 
704 typedef enum vmct_state_t {
705 	VMCT_IDLE,
706 	VMCT_AWAKENED,
707 	VMCT_ACTIVE,
708 } vmct_state_t;
709 vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
710 #endif
711 
712 
713 
714 static void
715 vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
716 {
717 	vm_object_t object = VM_PAGE_OBJECT(m);
718 
719 	VM_PAGE_CHECK(m);
720 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
721 	vm_object_lock_assert_exclusive(object);
722 
723 	/*
724 	 * Make sure it's OK to page this out.
725 	 */
726 	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
727 	assert(!m->vmp_cleaning && !m->vmp_laundry);
728 	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
729 
730 	/*
731 	 * protect the object from collapse or termination
732 	 */
733 	vm_object_activity_begin(object);
734 
735 
736 	/*
737 	 * pgo_laundry count is tied to the laundry bit
738 	 */
739 	m->vmp_laundry = TRUE;
740 	q->pgo_laundry++;
741 
742 	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
743 	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
744 
745 	if (object->internal == TRUE) {
746 		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
747 		m->vmp_busy = TRUE;
748 #if DEVELOPMENT || DEBUG
749 		/*
750 		 * The benchmark queue will be woken up independently by the benchmark
751 		 * itself.
752 		 */
753 		if (q != &vm_pageout_queue_benchmark) {
754 #else /* DEVELOPMENT || DEBUG */
755 		if (true) {
756 #endif /* DEVELOPMENT || DEBUG */
757 			/*
758 			 * Wake up the first compressor thread. It will wake subsequent
759 			 * threads if necessary.
760 			 */
761 			sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
762 			    pgo_iothread_internal_state[0].pgo_iothread);
763 		}
764 	} else {
765 		sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
766 	}
767 	VM_PAGE_CHECK(m);
768 }
769 
770 void
771 vm_pageout_cluster(vm_page_t m)
772 {
773 	struct          vm_pageout_queue *q;
774 	vm_object_t     object = VM_PAGE_OBJECT(m);
775 	if (object->internal) {
776 		q = &vm_pageout_queue_internal;
777 	} else {
778 		q = &vm_pageout_queue_external;
779 	}
780 	vm_pageout_cluster_to_queue(m, q);
781 }
782 
783 
784 /*
785  * A page is back from laundry or we are stealing it back from
786  * the laundering state.  See if there are some pages waiting to
787  * go to laundry and if we can let some of them go now.
788  *
789  * Object and page queues must be locked.
790  */
791 void
792 vm_pageout_throttle_up(
793 	vm_page_t       m)
794 {
795 	struct vm_pageout_queue *q;
796 	vm_object_t      m_object;
797 
798 	m_object = VM_PAGE_OBJECT(m);
799 
800 	assert(m_object != VM_OBJECT_NULL);
801 	assert(!is_kernel_object(m_object));
802 
803 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
804 	vm_object_lock_assert_exclusive(m_object);
805 
806 	if (m_object->internal == TRUE) {
807 		q = &vm_pageout_queue_internal;
808 	} else {
809 		q = &vm_pageout_queue_external;
810 	}
811 
812 	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
813 		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
814 		m->vmp_q_state = VM_PAGE_NOT_ON_Q;
815 
816 		VM_PAGE_ZERO_PAGEQ_ENTRY(m);
817 
818 		vm_object_activity_end(m_object);
819 
820 		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
821 	}
822 	if (m->vmp_laundry == TRUE) {
823 		m->vmp_laundry = FALSE;
824 		q->pgo_laundry--;
825 
826 		if (q->pgo_throttled == TRUE) {
827 			q->pgo_throttled = FALSE;
828 			thread_wakeup((event_t) &q->pgo_laundry);
829 		}
830 		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
831 			q->pgo_draining = FALSE;
832 			thread_wakeup((event_t) (&q->pgo_laundry + 1));
833 		}
834 		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
835 	}
836 }
837 
838 
839 static void
840 vm_pageout_throttle_up_batch(
841 	struct vm_pageout_queue *q,
842 	int             batch_cnt)
843 {
844 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
845 
846 	VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
847 
848 	q->pgo_laundry -= batch_cnt;
849 
850 	if (q->pgo_throttled == TRUE) {
851 		q->pgo_throttled = FALSE;
852 		thread_wakeup((event_t) &q->pgo_laundry);
853 	}
854 	if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
855 		q->pgo_draining = FALSE;
856 		thread_wakeup((event_t) (&q->pgo_laundry + 1));
857 	}
858 }
859 
860 
861 
862 /*
863  * VM memory pressure monitoring.
864  *
865  * vm_pageout_scan() keeps track of the number of pages it considers and
866  * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
867  *
868  * compute_memory_pressure() is called every second from compute_averages()
869  * and moves "vm_pageout_stat_now" forward, to start accumulating the number
870  * of reclaimed pages in a new vm_pageout_stat[] bucket.
871  *
872  * mach_vm_pressure_monitor() collects past statistics about memory pressure.
873  * The caller provides the number of seconds ("nsecs") worth of statistics
874  * it wants, up to 30 seconds.
875  * It computes the number of pages reclaimed in the past "nsecs" seconds and
876  * also returns the number of pages the system still needs to reclaim at this
877  * moment in time.
878  */
879 #if DEVELOPMENT || DEBUG
880 #define VM_PAGEOUT_STAT_SIZE    (30 * 8) + 1
881 #else
882 #define VM_PAGEOUT_STAT_SIZE    (1 * 8) + 1
883 #endif
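/*
 * Illustrative note (not part of the original source): the ring therefore
 * holds (30 * 8) + 1 == 241 buckets on DEVELOPMENT/DEBUG builds and
 * (1 * 8) + 1 == 9 buckets otherwise, i.e. eight buckets per second of
 * history plus the bucket currently being filled, which matches the
 * 8 * nsecs_monitored conversion in mach_vm_pressure_monitor() below.
 */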
884 struct vm_pageout_stat {
885 	unsigned long vm_page_active_count;
886 	unsigned long vm_page_speculative_count;
887 	unsigned long vm_page_inactive_count;
888 	unsigned long vm_page_anonymous_count;
889 
890 	unsigned long vm_page_free_count;
891 	unsigned long vm_page_wire_count;
892 	unsigned long vm_page_compressor_count;
893 
894 	unsigned long vm_page_pages_compressed;
895 	unsigned long vm_page_pageable_internal_count;
896 	unsigned long vm_page_pageable_external_count;
897 	unsigned long vm_page_xpmapped_external_count;
898 
899 	unsigned int pages_grabbed;
900 	unsigned int pages_freed;
901 
902 	unsigned int pages_compressed;
903 	unsigned int pages_grabbed_by_compressor;
904 	unsigned int failed_compressions;
905 
906 	unsigned int pages_evicted;
907 	unsigned int pages_purged;
908 
909 	unsigned int considered;
910 	unsigned int considered_bq_internal;
911 	unsigned int considered_bq_external;
912 
913 	unsigned int skipped_external;
914 	unsigned int skipped_internal;
915 	unsigned int filecache_min_reactivations;
916 
917 	unsigned int freed_speculative;
918 	unsigned int freed_cleaned;
919 	unsigned int freed_internal;
920 	unsigned int freed_external;
921 
922 	unsigned int cleaned_dirty_external;
923 	unsigned int cleaned_dirty_internal;
924 
925 	unsigned int inactive_referenced;
926 	unsigned int inactive_nolock;
927 	unsigned int reactivation_limit_exceeded;
928 	unsigned int forced_inactive_reclaim;
929 
930 	unsigned int throttled_internal_q;
931 	unsigned int throttled_external_q;
932 
933 	unsigned int phantom_ghosts_found;
934 	unsigned int phantom_ghosts_added;
935 
936 	unsigned int vm_page_realtime_count;
937 	unsigned int forcereclaimed_sharedcache;
938 	unsigned int forcereclaimed_realtime;
939 	unsigned int protected_sharedcache;
940 	unsigned int protected_realtime;
941 
942 } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];
943 
944 unsigned int vm_pageout_stat_now = 0;
945 
946 #define VM_PAGEOUT_STAT_BEFORE(i) \
947 	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
948 #define VM_PAGEOUT_STAT_AFTER(i) \
949 	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
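/*
 * Illustrative note (not part of the original source): these macros step
 * through vm_pageout_stats[] as a ring, e.g. VM_PAGEOUT_STAT_BEFORE(0)
 * wraps to VM_PAGEOUT_STAT_SIZE - 1 and
 * VM_PAGEOUT_STAT_AFTER(VM_PAGEOUT_STAT_SIZE - 1) wraps back to 0.
 */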
950 
951 #if VM_PAGE_BUCKETS_CHECK
952 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
953 #endif /* VM_PAGE_BUCKETS_CHECK */
954 
955 
956 void
957 record_memory_pressure(void);
958 void
959 record_memory_pressure(void)
960 {
961 	unsigned int vm_pageout_next;
962 
963 #if VM_PAGE_BUCKETS_CHECK
964 	/* check the consistency of VM page buckets at regular interval */
965 	static int counter = 0;
966 	if ((++counter % vm_page_buckets_check_interval) == 0) {
967 		vm_page_buckets_check();
968 	}
969 #endif /* VM_PAGE_BUCKETS_CHECK */
970 
971 	vm_pageout_state.vm_memory_pressure =
972 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
973 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
974 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
975 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
976 
977 	commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
978 
979 	/* move "now" forward */
980 	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
981 
982 	bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
983 
984 	vm_pageout_stat_now = vm_pageout_next;
985 }
986 
987 
988 /*
989  * IMPORTANT
990  * mach_vm_ctl_page_free_wanted() is called indirectly, via
991  * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
992  * it must be safe in the restricted stackshot context. Locks and/or
993  * blocking are not allowable.
994  */
995 unsigned int
996 mach_vm_ctl_page_free_wanted(void)
997 {
998 	unsigned int page_free_target, page_free_count, page_free_wanted;
999 
1000 	page_free_target = vm_page_free_target;
1001 	page_free_count = vm_page_free_count;
1002 	if (page_free_target > page_free_count) {
1003 		page_free_wanted = page_free_target - page_free_count;
1004 	} else {
1005 		page_free_wanted = 0;
1006 	}
1007 
1008 	return page_free_wanted;
1009 }
1010 
1011 
1012 /*
1013  * IMPORTANT:
1014  * mach_vm_pressure_monitor() is called when taking a stackshot, with
1015  * wait_for_pressure FALSE, so that code path must remain safe in the
1016  * restricted stackshot context. No blocking or locks are allowable
1017  * on that code path.
1018  */
1019 
1020 kern_return_t
1021 mach_vm_pressure_monitor(
1022 	boolean_t       wait_for_pressure,
1023 	unsigned int    nsecs_monitored,
1024 	unsigned int    *pages_reclaimed_p,
1025 	unsigned int    *pages_wanted_p)
1026 {
1027 	wait_result_t   wr;
1028 	unsigned int    vm_pageout_then, vm_pageout_now;
1029 	unsigned int    pages_reclaimed;
1030 	unsigned int    units_of_monitor;
1031 
1032 	units_of_monitor = 8 * nsecs_monitored;
1033 	/*
1034 	 * We don't take the vm_page_queue_lock here because we don't want
1035 	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1036 	 * thread when it's trying to reclaim memory.  We don't need fully
1037 	 * accurate monitoring anyway...
1038 	 */
1039 
1040 	if (wait_for_pressure) {
1041 		/* wait until there's memory pressure */
1042 		while (vm_page_free_count >= vm_page_free_target) {
1043 			wr = assert_wait((event_t) &vm_page_free_wanted,
1044 			    THREAD_INTERRUPTIBLE);
1045 			if (wr == THREAD_WAITING) {
1046 				wr = thread_block(THREAD_CONTINUE_NULL);
1047 			}
1048 			if (wr == THREAD_INTERRUPTED) {
1049 				return KERN_ABORTED;
1050 			}
1051 			if (wr == THREAD_AWAKENED) {
1052 				/*
1053 				 * The memory pressure might have already
1054 				 * been relieved but let's not block again
1055 				 * and let's report that there was memory
1056 				 * pressure at some point.
1057 				 */
1058 				break;
1059 			}
1060 		}
1061 	}
1062 
1063 	/* provide the number of pages the system wants to reclaim */
1064 	if (pages_wanted_p != NULL) {
1065 		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
1066 	}
1067 
1068 	if (pages_reclaimed_p == NULL) {
1069 		return KERN_SUCCESS;
1070 	}
1071 
1072 	/* provide number of pages reclaimed in the last "nsecs_monitored" */
1073 	vm_pageout_now = vm_pageout_stat_now;
1074 	pages_reclaimed = 0;
1075 	for (vm_pageout_then =
1076 	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1077 	    vm_pageout_then != vm_pageout_now &&
1078 	    units_of_monitor-- != 0;
1079 	    vm_pageout_then =
1080 	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1081 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1082 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1083 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1084 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1085 	}
1086 	*pages_reclaimed_p = pages_reclaimed;
1087 
1088 	return KERN_SUCCESS;
1089 }
1090 
1091 
1092 
1093 #if DEVELOPMENT || DEBUG
1094 
1095 static void
1096 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1097 
1098 /*
1099  * condition variable used to make sure there is
1100  * only a single sweep going on at a time
1101  */
1102 bool vm_pageout_disconnect_all_pages_active = false;
1103 
1104 void
1105 vm_pageout_disconnect_all_pages()
1106 {
1107 	vm_page_lock_queues();
1108 
1109 	if (vm_pageout_disconnect_all_pages_active) {
1110 		vm_page_unlock_queues();
1111 		return;
1112 	}
1113 	vm_pageout_disconnect_all_pages_active = true;
1114 
1115 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled,
1116 	    vm_page_throttled_count);
1117 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous,
1118 	    vm_page_anonymous_count);
1119 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive,
1120 	    (vm_page_inactive_count - vm_page_anonymous_count));
1121 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active,
1122 	    vm_page_active_count);
1123 #ifdef CONFIG_SECLUDED_MEMORY
1124 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded,
1125 	    vm_page_secluded_count);
1126 #endif /* CONFIG_SECLUDED_MEMORY */
1127 	vm_page_unlock_queues();
1128 
1129 	vm_pageout_disconnect_all_pages_active = false;
1130 }
1131 
1132 /* NB: assumes the page_queues lock is held on entry, returns with page queue lock held */
1133 void
1134 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1135 {
1136 	vm_page_t       m;
1137 	vm_object_t     t_object = NULL;
1138 	vm_object_t     l_object = NULL;
1139 	vm_object_t     m_object = NULL;
1140 	int             delayed_unlock = 0;
1141 	int             try_failed_count = 0;
1142 	int             disconnected_count = 0;
1143 	int             paused_count = 0;
1144 	int             object_locked_count = 0;
1145 
1146 	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1147 	    DBG_FUNC_START),
1148 	    q, qcount);
1149 
1150 	while (qcount && !vm_page_queue_empty(q)) {
1151 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1152 
1153 		m = (vm_page_t) vm_page_queue_first(q);
1154 		m_object = VM_PAGE_OBJECT(m);
1155 
1156 		if (m_object == VM_OBJECT_NULL) {
1157 			/*
1158 			 * Bumped into a free page. This should only happen on the
1159 			 * secluded queue
1160 			 */
1161 #if CONFIG_SECLUDED_MEMORY
1162 			assert(q == &vm_page_queue_secluded);
1163 #endif /* CONFIG_SECLUDED_MEMORY */
1164 			goto reenter_pg_on_q;
1165 		}
1166 
1167 		/*
1168 		 * check to see if we currently are working
1169 		 * with the same object... if so, we've
1170 		 * already got the lock
1171 		 */
1172 		if (m_object != l_object) {
1173 			/*
1174 			 * the object associated with candidate page is
1175 			 * different from the one we were just working
1176 			 * with... dump the lock if we still own it
1177 			 */
1178 			if (l_object != NULL) {
1179 				vm_object_unlock(l_object);
1180 				l_object = NULL;
1181 			}
1182 			if (m_object != t_object) {
1183 				try_failed_count = 0;
1184 			}
1185 
1186 			/*
1187 			 * Try to lock the object; since we've already got the
1188 			 * page queues lock, we can only 'try' for this one.
1189 			 * if the 'try' fails, we need to do a mutex_pause
1190 			 * to allow the owner of the object lock a chance to
1191 			 * run...
1192 			 */
1193 			if (!vm_object_lock_try_scan(m_object)) {
1194 				if (try_failed_count > 20) {
1195 					goto reenter_pg_on_q;
1196 				}
1197 				vm_page_unlock_queues();
1198 				mutex_pause(try_failed_count++);
1199 				vm_page_lock_queues();
1200 				delayed_unlock = 0;
1201 
1202 				paused_count++;
1203 
1204 				t_object = m_object;
1205 				continue;
1206 			}
1207 			object_locked_count++;
1208 
1209 			l_object = m_object;
1210 		}
1211 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry ||
1212 		    m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) ||
1213 		    m->vmp_free_when_done) {
1214 			/*
1215 			 * put it back on the head of its queue
1216 			 */
1217 			goto reenter_pg_on_q;
1218 		}
1219 		if (m->vmp_pmapped == TRUE) {
1220 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1221 
1222 			disconnected_count++;
1223 		}
1224 reenter_pg_on_q:
1225 		vm_page_queue_remove(q, m, vmp_pageq);
1226 		vm_page_queue_enter(q, m, vmp_pageq);
1227 
1228 		qcount--;
1229 		try_failed_count = 0;
1230 
1231 		if (delayed_unlock++ > 128) {
1232 			if (l_object != NULL) {
1233 				vm_object_unlock(l_object);
1234 				l_object = NULL;
1235 			}
1236 			lck_mtx_yield(&vm_page_queue_lock);
1237 			delayed_unlock = 0;
1238 		}
1239 	}
1240 	if (l_object != NULL) {
1241 		vm_object_unlock(l_object);
1242 		l_object = NULL;
1243 	}
1244 
1245 	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1246 	    DBG_FUNC_END),
1247 	    q, disconnected_count, object_locked_count, paused_count);
1248 }
1249 
1250 extern const char *proc_best_name(struct proc* proc);
1251 
1252 int
1253 vm_toggle_task_selfdonate_pages(task_t task)
1254 {
1255 	int state = 0;
1256 	if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1257 		printf("VM Donation mode is OFF on the system\n");
1258 		return state;
1259 	}
1260 	if (task != kernel_task) {
1261 		task_lock(task);
1262 		if (!task->donates_own_pages) {
1263 			printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1264 			task->donates_own_pages = true;
1265 			state = 1;
1266 		} else if (task->donates_own_pages) {
1267 			printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1268 			task->donates_own_pages = false;
1269 			state = 0;
1270 		}
1271 		task_unlock(task);
1272 	}
1273 	return state;
1274 }
1275 #endif /* DEVELOPMENT || DEBUG */
1276 
1277 void
1278 vm_task_set_selfdonate_pages(task_t task, bool donate)
1279 {
1280 	assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
1281 	assert(task != kernel_task);
1282 
1283 	task_lock(task);
1284 	task->donates_own_pages = donate;
1285 	task_unlock(task);
1286 }
1287 
1288 
1289 
1290 static size_t
1291 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1292 
1293 /*
1294  * condition variable used to make sure there is
1295  * only a single sweep going on at a time
1296  */
1297 boolean_t       vm_pageout_anonymous_pages_active = FALSE;
1298 
1299 
1300 kern_return_t
1301 vm_pageout_anonymous_pages()
1302 {
1303 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1304 		size_t throttled_pages_moved, anonymous_pages_moved, active_pages_moved;
1305 		vm_page_lock_queues();
1306 
1307 		if (vm_pageout_anonymous_pages_active == TRUE) {
1308 			vm_page_unlock_queues();
1309 			return KERN_RESOURCE_SHORTAGE;
1310 		}
1311 		vm_pageout_anonymous_pages_active = TRUE;
1312 		vm_page_unlock_queues();
1313 
1314 		throttled_pages_moved = vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1315 		anonymous_pages_moved = vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1316 		active_pages_moved = vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1317 
1318 		os_log(OS_LOG_DEFAULT,
1319 		    "%s: throttled pages moved: %zu, anonymous pages moved: %zu, active pages moved: %zu",
1320 		    __func__, throttled_pages_moved, anonymous_pages_moved, active_pages_moved);
1321 
1322 		if (VM_CONFIG_SWAP_IS_PRESENT) {
1323 			vm_consider_swapping();
1324 		}
1325 
1326 		vm_page_lock_queues();
1327 		vm_pageout_anonymous_pages_active = FALSE;
1328 		vm_page_unlock_queues();
1329 		return KERN_SUCCESS;
1330 	} else {
1331 		return KERN_NOT_SUPPORTED;
1332 	}
1333 }
1334 
1335 
1336 size_t
1337 vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
1338 {
1339 	vm_page_t       m;
1340 	vm_object_t     t_object = NULL;
1341 	vm_object_t     l_object = NULL;
1342 	vm_object_t     m_object = NULL;
1343 	int             delayed_unlock = 0;
1344 	int             try_failed_count = 0;
1345 	int             refmod_state;
1346 	int             pmap_options;
1347 	struct          vm_pageout_queue *iq;
1348 	ppnum_t         phys_page;
1349 	size_t          pages_moved = 0;
1350 
1351 
1352 	iq = &vm_pageout_queue_internal;
1353 
1354 	vm_page_lock_queues();
1355 
1356 #if DEVELOPMENT || DEBUG
1357 	if (perf_test) {
1358 		iq = &vm_pageout_queue_benchmark;
1359 		// ensure the benchmark queue isn't throttled
1360 		iq->pgo_maxlaundry = (unsigned int) qcount;
1361 	}
1362 #endif /* DEVELOPMENT || DEBUG */
1363 
1364 	while (qcount && !vm_page_queue_empty(q)) {
1365 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1366 
1367 		if (VM_PAGE_Q_THROTTLED(iq)) {
1368 			if (l_object != NULL) {
1369 				vm_object_unlock(l_object);
1370 				l_object = NULL;
1371 			}
1372 			iq->pgo_draining = TRUE;
1373 
1374 			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1375 			vm_page_unlock_queues();
1376 
1377 			thread_block(THREAD_CONTINUE_NULL);
1378 
1379 			vm_page_lock_queues();
1380 			delayed_unlock = 0;
1381 			continue;
1382 		}
1383 		m = (vm_page_t) vm_page_queue_first(q);
1384 		m_object = VM_PAGE_OBJECT(m);
1385 
1386 		/*
1387 		 * check to see if we currently are working
1388 		 * with the same object... if so, we've
1389 		 * already got the lock
1390 		 */
1391 		if (m_object != l_object) {
1392 			if (!m_object->internal) {
1393 				goto reenter_pg_on_q;
1394 			}
1395 
1396 			/*
1397 			 * the object associated with candidate page is
1398 			 * different from the one we were just working
1399 			 * with... dump the lock if we still own it
1400 			 */
1401 			if (l_object != NULL) {
1402 				vm_object_unlock(l_object);
1403 				l_object = NULL;
1404 			}
1405 			if (m_object != t_object) {
1406 				try_failed_count = 0;
1407 			}
1408 
1409 			/*
1410 			 * Try to lock the object; since we've already got the
1411 			 * page queues lock, we can only 'try' for this one.
1412 			 * if the 'try' fails, we need to do a mutex_pause
1413 			 * to allow the owner of the object lock a chance to
1414 			 * run...
1415 			 */
1416 			if (!vm_object_lock_try_scan(m_object)) {
1417 				if (try_failed_count > 20) {
1418 					goto reenter_pg_on_q;
1419 				}
1420 				vm_page_unlock_queues();
1421 				mutex_pause(try_failed_count++);
1422 				vm_page_lock_queues();
1423 				delayed_unlock = 0;
1424 
1425 				t_object = m_object;
1426 				continue;
1427 			}
1428 			l_object = m_object;
1429 		}
1430 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
1431 			/*
1432 			 * page is not to be cleaned
1433 			 * put it back on the head of its queue
1434 			 */
1435 			goto reenter_pg_on_q;
1436 		}
1437 		phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1438 
1439 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
1440 			refmod_state = pmap_get_refmod(phys_page);
1441 
1442 			if (refmod_state & VM_MEM_REFERENCED) {
1443 				m->vmp_reference = TRUE;
1444 			}
1445 			if (refmod_state & VM_MEM_MODIFIED) {
1446 				SET_PAGE_DIRTY(m, FALSE);
1447 			}
1448 		}
1449 		if (m->vmp_reference == TRUE) {
1450 			m->vmp_reference = FALSE;
1451 			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1452 			goto reenter_pg_on_q;
1453 		}
1454 		if (m->vmp_pmapped == TRUE) {
1455 			if (m->vmp_dirty || m->vmp_precious) {
1456 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
1457 			} else {
1458 				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1459 			}
1460 			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1461 			if (refmod_state & VM_MEM_MODIFIED) {
1462 				SET_PAGE_DIRTY(m, FALSE);
1463 			}
1464 		}
1465 
1466 		if (!m->vmp_dirty && !m->vmp_precious) {
1467 			vm_page_unlock_queues();
1468 			VM_PAGE_FREE(m);
1469 			vm_page_lock_queues();
1470 			delayed_unlock = 0;
1471 
1472 			goto next_pg;
1473 		}
1474 		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1475 			if (!m_object->pager_initialized) {
1476 				vm_page_unlock_queues();
1477 
1478 				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1479 
1480 				if (!m_object->pager_initialized) {
1481 					vm_object_compressor_pager_create(m_object);
1482 				}
1483 
1484 				vm_page_lock_queues();
1485 				delayed_unlock = 0;
1486 			}
1487 			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1488 				/*
1489 				 * We dropped the page queues lock above, so
1490 				 * "m" might no longer be on this queue...
1491 				 */
1492 				if (m != (vm_page_t) vm_page_queue_first(q)) {
1493 					continue;
1494 				}
1495 				goto reenter_pg_on_q;
1496 			}
1497 			/*
1498 			 * vm_object_compressor_pager_create will drop the object lock
1499 			 * which means 'm' may no longer be valid to use
1500 			 */
1501 			continue;
1502 		}
1503 
1504 		if (!perf_test) {
1505 			/*
1506 			 * we've already factored out pages in the laundry which
1507 			 * means this page can't be on the pageout queue so it's
1508 			 * safe to do the vm_page_queues_remove
1509 			 */
1510 			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
1511 			vm_page_queues_remove(m, TRUE);
1512 			if (donate) {
1513 				/*
1514 				 * The compressor needs to see this bit to know
1515 				 * where this page needs to land. Also if stolen,
1516 				 * this bit helps put the page back in the right
1517 				 * special queue where it belongs.
1518 				 */
1519 				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
1520 			}
1521 		} else {
1522 			vm_page_queue_remove(q, m, vmp_pageq);
1523 		}
1524 
1525 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1526 
1527 		vm_pageout_cluster_to_queue(m, iq);
1528 
1529 		pages_moved++;
1530 		goto next_pg;
1531 
1532 reenter_pg_on_q:
1533 		vm_page_queue_remove(q, m, vmp_pageq);
1534 		vm_page_queue_enter(q, m, vmp_pageq);
1535 next_pg:
1536 		qcount--;
1537 		try_failed_count = 0;
1538 
1539 		if (delayed_unlock++ > 128) {
1540 			if (l_object != NULL) {
1541 				vm_object_unlock(l_object);
1542 				l_object = NULL;
1543 			}
1544 			lck_mtx_yield(&vm_page_queue_lock);
1545 			delayed_unlock = 0;
1546 		}
1547 	}
1548 	if (l_object != NULL) {
1549 		vm_object_unlock(l_object);
1550 		l_object = NULL;
1551 	}
1552 	vm_page_unlock_queues();
1553 	return pages_moved;
1554 }
1555 
1556 
1557 
1558 /*
1559  * function in BSD to apply I/O throttle to the pageout thread
1560  */
1561 extern void vm_pageout_io_throttle(void);
1562 
1563 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
1564 	MACRO_BEGIN                                                     \
1565 	/* \
1566 	 * If a "reusable" page somehow made it back into \
1567 	 * the active queue, it's been re-used and is not \
1568 	 * quite re-usable. \
1569 	 * If the VM object was "all_reusable", consider it \
1570 	 * as "all re-used" instead of converting it to \
1571 	 * "partially re-used", which could be expensive. \
1572 	 */                                                             \
1573 	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
1574 	if ((m)->vmp_reusable ||                                        \
1575 	    (obj)->all_reusable) {                                      \
1576 	        vm_object_reuse_pages((obj),                            \
1577 	                              (m)->vmp_offset,                  \
1578 	                              (m)->vmp_offset + PAGE_SIZE_64,   \
1579 	                              FALSE);                           \
1580 	}                                                               \
1581 	MACRO_END
1582 
1583 
1584 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT         64
1585 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024
1586 
1587 #define FCS_IDLE                0
1588 #define FCS_DELAYED             1
1589 #define FCS_DEADLOCK_DETECTED   2
1590 
1591 struct flow_control {
1592 	int             state;
1593 	mach_timespec_t ts;
1594 };
1595 
1596 
1597 uint64_t vm_pageout_rejected_bq_internal = 0;
1598 uint64_t vm_pageout_rejected_bq_external = 0;
1599 uint64_t vm_pageout_skipped_bq_internal = 0;
1600 uint64_t vm_pageout_skipped_bq_external = 0;
1601 
1602 #define ANONS_GRABBED_LIMIT     2
1603 
1604 
1605 #if 0
1606 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1607 #endif
1608 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1609 
1610 #define VM_PAGEOUT_PB_NO_ACTION                         0
1611 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1612 #define VM_PAGEOUT_PB_THREAD_YIELD                      2
1613 
1614 
1615 #if 0
1616 static void
1617 vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
1618 {
1619 	if (*local_freeq) {
1620 		vm_page_unlock_queues();
1621 
1622 		VM_DEBUG_CONSTANT_EVENT(
1623 			vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1624 			vm_page_free_count, 0, 0, 1);
1625 
1626 		vm_page_free_list(*local_freeq, TRUE);
1627 
1628 		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1629 		    vm_page_free_count, *local_freed, 0, 1);
1630 
1631 		*local_freeq = NULL;
1632 		*local_freed = 0;
1633 
1634 		vm_page_lock_queues();
1635 	} else {
1636 		lck_mtx_yield(&vm_page_queue_lock);
1637 	}
1638 	*delayed_unlock = 1;
1639 }
1640 #endif
1641 
1642 
1643 static void
1644 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1645     vm_page_t *local_freeq, int *local_freed, int action)
1646 {
1647 	vm_page_unlock_queues();
1648 
1649 	if (*object != NULL) {
1650 		vm_object_unlock(*object);
1651 		*object = NULL;
1652 	}
1653 	if (*local_freeq) {
1654 		vm_page_free_list(*local_freeq, TRUE);
1655 
1656 		*local_freeq = NULL;
1657 		*local_freed = 0;
1658 	}
1659 	*delayed_unlock = 1;
1660 
1661 	switch (action) {
1662 	case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1663 		vm_consider_waking_compactor_swapper();
1664 		break;
1665 	case VM_PAGEOUT_PB_THREAD_YIELD:
1666 		thread_yield_internal(1);
1667 		break;
1668 	case VM_PAGEOUT_PB_NO_ACTION:
1669 	default:
1670 		break;
1671 	}
1672 	vm_page_lock_queues();
1673 }
1674 
1675 
1676 static struct vm_pageout_vminfo last;
1677 
1678 uint64_t last_vm_page_pages_grabbed = 0;
1679 
1680 extern  uint32_t c_segment_pages_compressed;
1681 
1682 extern uint64_t shared_region_pager_reclaimed;
1683 extern struct memory_object_pager_ops shared_region_pager_ops;
1684 
1685 void
1686 update_vm_info(void)
1687 {
1688 	unsigned long tmp;
1689 	uint64_t tmp64;
1690 
1691 	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1692 	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1693 	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1694 	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1695 
1696 	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1697 	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1698 	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1699 
1700 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1701 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1702 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1703 	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1704 	vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1705 
1706 	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1707 	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1708 	last.vm_pageout_considered_page = tmp;
1709 
1710 	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1711 	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1712 	last.vm_pageout_compressions = tmp64;
1713 
1714 	tmp = vm_pageout_vminfo.vm_compressor_failed;
1715 	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1716 	last.vm_compressor_failed = tmp;
1717 
1718 	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1719 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1720 	last.vm_compressor_pages_grabbed = tmp64;
1721 
1722 	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1723 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1724 	last.vm_phantom_cache_found_ghost = tmp;
1725 
1726 	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1727 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1728 	last.vm_phantom_cache_added_ghost = tmp;
1729 
1730 	tmp64 = counter_load(&vm_page_grab_count);
1731 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1732 	last_vm_page_pages_grabbed = tmp64;
1733 
1734 	tmp = vm_pageout_vminfo.vm_page_pages_freed;
1735 	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1736 	last.vm_page_pages_freed = tmp;
1737 
1738 
1739 	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1740 		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1741 		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1742 		last.vm_pageout_pages_evicted = tmp;
1743 
1744 		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1745 		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1746 		last.vm_pageout_pages_purged = tmp;
1747 
1748 		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1749 		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1750 		last.vm_pageout_freed_speculative = tmp;
1751 
1752 		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1753 		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1754 		last.vm_pageout_freed_external = tmp;
1755 
1756 		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1757 		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1758 		last.vm_pageout_inactive_referenced = tmp;
1759 
1760 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1761 		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1762 		last.vm_pageout_scan_inactive_throttled_external = tmp;
1763 
1764 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1765 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1766 		last.vm_pageout_inactive_dirty_external = tmp;
1767 
1768 		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1769 		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1770 		last.vm_pageout_freed_cleaned = tmp;
1771 
1772 		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1773 		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1774 		last.vm_pageout_inactive_nolock = tmp;
1775 
1776 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1777 		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1778 		last.vm_pageout_scan_inactive_throttled_internal = tmp;
1779 
1780 		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1781 		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1782 		last.vm_pageout_skipped_external = tmp;
1783 
1784 		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1785 		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1786 		last.vm_pageout_skipped_internal = tmp;
1787 
1788 		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1789 		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1790 		last.vm_pageout_reactivation_limit_exceeded = tmp;
1791 
1792 		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1793 		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1794 		last.vm_pageout_inactive_force_reclaim = tmp;
1795 
1796 		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1797 		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1798 		last.vm_pageout_freed_internal = tmp;
1799 
1800 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1801 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1802 		last.vm_pageout_considered_bq_internal = tmp;
1803 
1804 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1805 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1806 		last.vm_pageout_considered_bq_external = tmp;
1807 
1808 		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1809 		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1810 		last.vm_pageout_filecache_min_reactivated = tmp;
1811 
1812 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1813 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1814 		last.vm_pageout_inactive_dirty_internal = tmp;
1815 
1816 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1817 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1818 		last.vm_pageout_forcereclaimed_sharedcache = tmp;
1819 
1820 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1821 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1822 		last.vm_pageout_forcereclaimed_realtime = tmp;
1823 
1824 		tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1825 		vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1826 		last.vm_pageout_protected_sharedcache = tmp;
1827 
1828 		tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1829 		vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1830 		last.vm_pageout_protected_realtime = tmp;
1831 	}
1832 
1833 	KDBG((VMDBG_CODE(DBG_VM_INFO1)) | DBG_FUNC_NONE,
1834 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1835 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1836 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1837 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count);
1838 
1839 	KDBG((VMDBG_CODE(DBG_VM_INFO2)) | DBG_FUNC_NONE,
1840 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1841 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1842 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count);
1843 
1844 	KDBG((VMDBG_CODE(DBG_VM_INFO3)) | DBG_FUNC_NONE,
1845 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1846 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1847 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1848 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count);
1849 
1850 	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1851 	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1852 	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1853 		KDBG((VMDBG_CODE(DBG_VM_INFO4)) | DBG_FUNC_NONE,
1854 		    vm_pageout_stats[vm_pageout_stat_now].considered,
1855 		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1856 		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
1857 		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced);
1858 
1859 		KDBG((VMDBG_CODE(DBG_VM_INFO5)) | DBG_FUNC_NONE,
1860 		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1861 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1862 		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1863 		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock);
1864 
1865 		KDBG((VMDBG_CODE(DBG_VM_INFO6)) | DBG_FUNC_NONE,
1866 		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1867 		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1868 		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1869 		    vm_pageout_stats[vm_pageout_stat_now].skipped_external);
1870 
1871 		KDBG((VMDBG_CODE(DBG_VM_INFO7)) | DBG_FUNC_NONE,
1872 		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1873 		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1874 		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1875 		    vm_pageout_stats[vm_pageout_stat_now].freed_internal);
1876 
1877 		KDBG((VMDBG_CODE(DBG_VM_INFO8)) | DBG_FUNC_NONE,
1878 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1879 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1880 		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1881 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal);
1882 
1883 		KDBG((VMDBG_CODE(DBG_VM_INFO10)) | DBG_FUNC_NONE,
1884 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1885 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1886 		    vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1887 		    vm_pageout_stats[vm_pageout_stat_now].protected_realtime);
1888 	}
1889 	KDBG((VMDBG_CODE(DBG_VM_INFO9)) | DBG_FUNC_NONE,
1890 	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1891 	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1892 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1893 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added);
1894 
1895 	record_memory_pressure();
1896 }
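
/*
 * The sampling above repeatedly applies one delta pattern: read a
 * monotonically increasing counter, record the difference from the previous
 * sample in the current vm_pageout_stats[] slot, then remember the new value.
 * The sketch below is kept under #if 0 so it is not built; the helper name
 * sample_delta64() is hypothetical and only shows the shape of that pattern,
 * assuming the underlying counters never decrease.
 */
#if 0
static inline unsigned int
sample_delta64(uint64_t cur, uint64_t *prev)
{
	/* unsigned subtraction still yields the correct delta across wrap */
	unsigned int delta = (unsigned int)(cur - *prev);

	*prev = cur;
	return delta;
}
#endif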
1897 
1898 extern boolean_t hibernation_vmqueues_inspection;
1899 
1900 /*
1901  * Return values for functions called by vm_pageout_scan
1902  * that control its flow.
1903  *
1904  * PROCEED -- vm_pageout_scan will keep making forward progress.
1905  * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1906  * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1907  */
1908 
1909 #define VM_PAGEOUT_SCAN_PROCEED                 (0)
1910 #define VM_PAGEOUT_SCAN_DONE_RETURN             (1)
1911 #define VM_PAGEOUT_SCAN_NEXT_ITERATION          (2)
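
/*
 * A minimal sketch (kept under #if 0, not built) of how vm_pageout_scan()
 * consumes these return codes from its helper functions: NEXT_ITERATION maps
 * to 'continue', DONE_RETURN unwinds the scan, and PROCEED simply falls
 * through.  The helper name vps_some_helper() is hypothetical.
 */
#if 0
	for (;;) {
		int retval = vps_some_helper();

		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
			continue;       /* re-evaluate the memory situation */
		}
		if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
			return;         /* page demand satisfied */
		}
		/* VM_PAGEOUT_SCAN_PROCEED: keep making forward progress */
	}
#endif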
1912 
1913 /*
1914  * This function is called only from vm_pageout_scan and
1915  * it moves overflow secluded pages (one-at-a-time) to the
1916  * batched 'local' free Q or active Q.
1917  */
1918 static void
1919 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1920 {
1921 #if CONFIG_SECLUDED_MEMORY
1922 	/*
1923 	 * Deal with secluded_q overflow.
1924 	 */
1925 	if (vm_page_secluded_count > vm_page_secluded_target) {
1926 		vm_page_t secluded_page;
1927 
1928 		/*
1929 		 * SECLUDED_AGING_BEFORE_ACTIVE:
1930 		 * Excess secluded pages go to the active queue and
1931 		 * will later go to the inactive queue.
1932 		 */
1933 		assert((vm_page_secluded_count_free +
1934 		    vm_page_secluded_count_inuse) ==
1935 		    vm_page_secluded_count);
1936 		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1937 		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1938 
1939 		vm_page_queues_remove(secluded_page, FALSE);
1940 		assert(!vm_page_is_fictitious(secluded_page));
1941 		assert(!VM_PAGE_WIRED(secluded_page));
1942 
1943 		if (secluded_page->vmp_object == 0) {
1944 			/* transfer to free queue */
1945 			assert(secluded_page->vmp_busy);
1946 			secluded_page->vmp_snext = *local_freeq;
1947 			*local_freeq = secluded_page;
1948 			*local_freed += 1;
1949 		} else {
1950 			/* transfer to head of active queue */
1951 			vm_page_enqueue_active(secluded_page, FALSE);
1952 			secluded_page = VM_PAGE_NULL;
1953 		}
1954 	}
1955 #else /* CONFIG_SECLUDED_MEMORY */
1956 
1957 #pragma unused(local_freeq)
1958 #pragma unused(local_freed)
1959 
1960 	return;
1961 
1962 #endif /* CONFIG_SECLUDED_MEMORY */
1963 }
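
/*
 * The 'local_freeq' handling above (and throughout vm_pageout_scan) batches
 * pages on a singly-linked list threaded through vmp_snext so they can be
 * returned with a single vm_page_free_list() call instead of one free per
 * page.  A minimal sketch of that batching, kept under #if 0 and not built;
 * the helper name local_freeq_push() is hypothetical.
 */
#if 0
static void
local_freeq_push(vm_page_t *local_freeq, int *local_freed, vm_page_t p)
{
	p->vmp_snext = *local_freeq;    /* chain onto the local batch */
	*local_freeq = p;
	*local_freed += 1;
}
#endif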
1964 
1965 
1966 /*
1967  * This function is called only from vm_pageout_scan and
1968  * it initializes the loop targets for vm_pageout_scan().
1969  */
1970 static void
1971 vps_init_page_targets(void)
1972 {
1973 	/*
1974 	 * LD TODO: Other page targets should be calculated here too.
1975 	 */
1976 	vm_page_anonymous_min = vm_page_inactive_target / 20;
1977 
1978 	if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1979 		vm_pageout_state.vm_page_speculative_percentage = 50;
1980 	} else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1981 		vm_pageout_state.vm_page_speculative_percentage = 1;
1982 	}
1983 
1984 	vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1985 	    vm_page_inactive_count);
1986 }
1987 
1988 /*
1989  * This function is called only from vm_pageout_scan and
1990  * it purges a single VM object at a time and will either
1991  * make vm_pageout_scan() restart the loop or keep moving forward.
1992  */
1993 static int
1994 vps_purge_object()
1995 {
1996 	int             force_purge;
1997 
1998 	assert(available_for_purge >= 0);
1999 	force_purge = 0; /* no force-purging */
2000 
2001 #if VM_PRESSURE_EVENTS
2002 	vm_pressure_level_t pressure_level;
2003 
2004 	pressure_level = memorystatus_vm_pressure_level;
2005 
2006 	if (pressure_level > kVMPressureNormal) {
2007 		if (pressure_level >= kVMPressureCritical) {
2008 			force_purge = vm_pageout_state.memorystatus_purge_on_critical;
2009 		} else if (pressure_level >= kVMPressureUrgent) {
2010 			force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
2011 		} else if (pressure_level >= kVMPressureWarning) {
2012 			force_purge = vm_pageout_state.memorystatus_purge_on_warning;
2013 		}
2014 	}
2015 #endif /* VM_PRESSURE_EVENTS */
2016 
2017 	if (available_for_purge || force_purge) {
2018 		memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
2019 
2020 		VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
2021 		if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
2022 			VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
2023 			VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
2024 			memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2025 
2026 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2027 		}
2028 		VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2029 		memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2030 	}
2031 
2032 	return VM_PAGEOUT_SCAN_PROCEED;
2033 }
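
/*
 * The force-purge budget above scales with the memorystatus pressure level:
 * normal pressure never force-purges, while warning/urgent/critical use the
 * corresponding memorystatus_purge_on_* knobs.  A condensed sketch of that
 * mapping, kept under #if 0 and not built; purge_budget_for_level() is a
 * hypothetical name.
 */
#if 0
static int
purge_budget_for_level(vm_pressure_level_t level)
{
	if (level >= kVMPressureCritical) {
		return vm_pageout_state.memorystatus_purge_on_critical;
	}
	if (level >= kVMPressureUrgent) {
		return vm_pageout_state.memorystatus_purge_on_urgent;
	}
	if (level >= kVMPressureWarning) {
		return vm_pageout_state.memorystatus_purge_on_warning;
	}
	return 0;       /* kVMPressureNormal: no force-purging */
}
#endif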
2034 
2035 /*
2036  * This function is called only from vm_pageout_scan and
2037  * it will try to age the next speculative Q if the oldest
2038  * one is empty.
2039  */
2040 static int
2041 vps_age_speculative_queue(boolean_t force_speculative_aging)
2042 {
2043 #define DELAY_SPECULATIVE_AGE   1000
2044 
2045 	/*
2046 	 * try to pull pages from the aging bins...
2047 	 * see vm_page_internal.h for an explanation of how
2048 	 * this mechanism works
2049 	 */
2050 	boolean_t                       can_steal = FALSE;
2051 	int                             num_scanned_queues;
2052 	static int                      delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop. */
2053 	mach_timespec_t                 ts;
2054 	struct vm_speculative_age_q     *aq;
2055 	struct vm_speculative_age_q     *sq;
2056 
2057 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2058 
2059 	aq = &vm_page_queue_speculative[speculative_steal_index];
2060 
2061 	num_scanned_queues = 0;
2062 	while (vm_page_queue_empty(&aq->age_q) &&
2063 	    num_scanned_queues++ != vm_page_max_speculative_age_q) {
2064 		speculative_steal_index++;
2065 
2066 		if (speculative_steal_index > vm_page_max_speculative_age_q) {
2067 			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2068 		}
2069 
2070 		aq = &vm_page_queue_speculative[speculative_steal_index];
2071 	}
2072 
2073 	if (num_scanned_queues == vm_page_max_speculative_age_q + 1) {
2074 		/*
2075 		 * XXX We've scanned all the speculative
2076 		 * queues but still haven't found one
2077 		 * that is not empty, even though
2078 		 * vm_page_speculative_count is not 0.
2079 		 */
2080 		if (!vm_page_queue_empty(&sq->age_q)) {
2081 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2082 		}
2083 #if DEVELOPMENT || DEBUG
2084 		panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2085 #endif
2086 		/* readjust... */
2087 		vm_page_speculative_count = 0;
2088 		/* ... and continue */
2089 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2090 	}
2091 
2092 	if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
2093 		can_steal = TRUE;
2094 	} else {
2095 		if (!delay_speculative_age) {
2096 			mach_timespec_t ts_fully_aged;
2097 
2098 			ts_fully_aged.tv_sec = (vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
2099 			ts_fully_aged.tv_nsec = ((vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
2100 			    * 1000 * NSEC_PER_USEC;
2101 
2102 			ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2103 
2104 			clock_sec_t sec;
2105 			clock_nsec_t nsec;
2106 			clock_get_system_nanotime(&sec, &nsec);
2107 			ts.tv_sec = (unsigned int) sec;
2108 			ts.tv_nsec = nsec;
2109 
2110 			if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
2111 				can_steal = TRUE;
2112 			} else {
2113 				delay_speculative_age++;
2114 			}
2115 		} else {
2116 			delay_speculative_age++;
2117 			if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
2118 				delay_speculative_age = 0;
2119 			}
2120 		}
2121 	}
2122 	if (can_steal == TRUE) {
2123 		vm_page_speculate_ageit(aq);
2124 	}
2125 
2126 	return VM_PAGEOUT_SCAN_PROCEED;
2127 }
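
/*
 * The 'fully aged' cutoff above converts a total age expressed in
 * milliseconds (max queue index * vm_page_speculative_q_age_ms) into a
 * mach_timespec_t before adding it to the queue's birth timestamp.  A small
 * sketch of just that conversion, kept under #if 0 and not built;
 * ms_to_mach_timespec() is a hypothetical name.
 */
#if 0
static mach_timespec_t
ms_to_mach_timespec(unsigned int ms)
{
	mach_timespec_t t;

	t.tv_sec = ms / 1000;                           /* whole seconds */
	t.tv_nsec = (ms % 1000) * 1000 * NSEC_PER_USEC; /* ms -> usec -> nsec */
	return t;
}
#endif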
2128 
2129 /*
2130  * This function is called only from vm_pageout_scan and
2131  * it evicts a single VM object from the cache.
2132  */
2133 static int inline
2134 vps_object_cache_evict(vm_object_t *object_to_unlock)
2135 {
2136 	static int                      cache_evict_throttle = 0;
2137 	struct vm_speculative_age_q     *sq;
2138 
2139 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2140 
2141 	if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2142 		int     pages_evicted;
2143 
2144 		if (*object_to_unlock != NULL) {
2145 			vm_object_unlock(*object_to_unlock);
2146 			*object_to_unlock = NULL;
2147 		}
2148 		KDBG(0x13001ec | DBG_FUNC_START);
2149 
2150 		pages_evicted = vm_object_cache_evict(100, 10);
2151 
2152 		KDBG(0x13001ec | DBG_FUNC_END, pages_evicted);
2153 
2154 		if (pages_evicted) {
2155 			vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2156 
2157 			VM_DEBUG_EVENT(vm_pageout_cache_evict, DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2158 			    vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2159 			memoryshot(DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2160 
2161 			/*
2162 			 * we just freed up to 100 pages,
2163 			 * so go back to the top of the main loop
2164 			 * and re-evaluate the memory situation
2165 			 */
2166 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2167 		} else {
2168 			cache_evict_throttle = 1000;
2169 		}
2170 	}
2171 	if (cache_evict_throttle) {
2172 		cache_evict_throttle--;
2173 	}
2174 
2175 	return VM_PAGEOUT_SCAN_PROCEED;
2176 }
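
/*
 * cache_evict_throttle above implements a simple back-off: after an eviction
 * pass that frees nothing, the next 1000 calls skip eviction while the
 * counter decays by one per call.  An illustrative sketch of that counter
 * pattern, kept under #if 0 and not built; the static variable name is
 * hypothetical.
 */
#if 0
static int cache_evict_throttle_sketch;

static void
throttled_evict_pass(void)
{
	if (cache_evict_throttle_sketch == 0) {
		if (vm_object_cache_evict(100, 10) == 0) {
			/* nothing reclaimable: back off for the next 1000 passes */
			cache_evict_throttle_sketch = 1000;
		}
	}
	if (cache_evict_throttle_sketch) {
		cache_evict_throttle_sketch--;
	}
}
#endif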
2177 
2178 
2179 /*
2180  * This function is called only from vm_pageout_scan and
2181  * it calculates the filecache min. that needs to be maintained
2182  * it calculates the filecache minimum that needs to be maintained
2183  */
2184 static void
2185 vps_calculate_filecache_min(void)
2186 {
2187 	int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2188 
2189 #if CONFIG_JETSAM
2190 	/*
2191 	 * don't let the filecache_min fall below 15% of available memory
2192 	 * on systems with an active compressor that isn't nearing its
2193 	 * limits w/r to accepting new data
2194 	 *
2195 	 * on systems w/o the compressor/swapper, the filecache is always
2196 	 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2197 	 * since most (if not all) of the anonymous pages are in the
2198 	 * throttled queue (which isn't counted as available) which
2199 	 * effectively disables this filter
2200 	 */
2201 	if (vm_compressor_low_on_space() || divisor == 0) {
2202 		vm_pageout_state.vm_page_filecache_min = 0;
2203 	} else {
2204 		vm_pageout_state.vm_page_filecache_min =
2205 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2206 	}
2207 #else
2208 	if (vm_compressor_out_of_space() || divisor == 0) {
2209 		vm_pageout_state.vm_page_filecache_min = 0;
2210 	} else {
2211 		/*
2212 		 * don't let the filecache_min fall below the specified critical level
2213 		 */
2214 		vm_pageout_state.vm_page_filecache_min =
2215 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2216 	}
2217 #endif
2218 	if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2219 		vm_pageout_state.vm_page_filecache_min = 0;
2220 	}
2221 }
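
/*
 * The floor computed above is (AVAILABLE_NON_COMPRESSED_MEMORY * 10) /
 * divisor pages, i.e. a 10/divisor fraction of the available pool (for
 * example, a divisor of 100 would yield a 10% floor).  A worked sketch of
 * just that arithmetic, kept under #if 0 and not built; filecache_floor()
 * and its parameters are hypothetical, and the real divisor comes from
 * vm_pageout_state.vm_page_filecache_min_divisor.
 */
#if 0
static uint32_t
filecache_floor(uint32_t pages_available, int divisor)
{
	if (divisor == 0) {
		return 0;               /* divisor of 0 disables the floor */
	}
	return (pages_available * 10) / divisor;
}
#endif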
2222 
2223 /*
2224  * This function is called only from vm_pageout_scan and
2225  * it updates the flow control time to detect if VM pageout scan
2226  * isn't making progress.
2227  */
2228 static void
2229 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2230 {
2231 	mach_timespec_t ts;
2232 	clock_sec_t sec;
2233 	clock_nsec_t nsec;
2234 
2235 	ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2236 	ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2237 	clock_get_system_nanotime(&sec, &nsec);
2238 	flow_control->ts.tv_sec = (unsigned int) sec;
2239 	flow_control->ts.tv_nsec = nsec;
2240 	ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2241 
2242 	flow_control->state = FCS_DELAYED;
2243 
2244 	vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2245 }
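
/*
 * The deadline armed above is an absolute timestamp: "now" plus
 * vm_pageout_deadlock_wait milliseconds.  FCS_DELAYED later compares the
 * current time against it with CMP_MACH_TIMESPEC().  A small sketch of that
 * expiry check, kept under #if 0 and not built; deadline_expired() is a
 * hypothetical name.
 */
#if 0
static boolean_t
deadline_expired(const mach_timespec_t *deadline)
{
	clock_sec_t     sec;
	clock_nsec_t    nsec;
	mach_timespec_t now;

	clock_get_system_nanotime(&sec, &nsec);
	now.tv_sec = (unsigned int) sec;
	now.tv_nsec = nsec;

	return CMP_MACH_TIMESPEC(&now, deadline) >= 0;
}
#endif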
2246 
2247 /*
2248  * This function is called only from vm_pageout_scan and
2249  * it is the flow control logic of VM pageout scan which
2250  * controls whether it should block and for how long.
2251  * Any blocking of vm_pageout_scan happens ONLY in this function.
2252  */
2253 static int
2254 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2255     vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2256 {
2257 	boolean_t       exceeded_burst_throttle = FALSE;
2258 	unsigned int    msecs = 0;
2259 	uint32_t        inactive_external_count;
2260 	mach_timespec_t ts;
2261 	struct  vm_pageout_queue *iq;
2262 	struct  vm_pageout_queue *eq;
2263 	struct  vm_speculative_age_q *sq;
2264 
2265 	iq = &vm_pageout_queue_internal;
2266 	eq = &vm_pageout_queue_external;
2267 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2268 
2269 	/*
2270 	 * Sometimes we have to pause:
2271 	 *	1) No inactive pages - nothing to do.
2272 	 *	2) Loop control - no acceptable pages found on the inactive queue
2273 	 *         within the last vm_pageout_burst_inactive_throttle iterations
2274 	 *	3) Flow control - default pageout queue is full
2275 	 */
2276 	if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2277 	    vm_page_queue_empty(&vm_page_queue_anonymous) &&
2278 	    vm_page_queue_empty(&vm_page_queue_cleaned) &&
2279 	    vm_page_queue_empty(&sq->age_q)) {
2280 		VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2281 		msecs = vm_pageout_state.vm_pageout_empty_wait;
2282 	} else if (inactive_burst_count >=
2283 	    MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2284 	    (vm_page_inactive_count +
2285 	    vm_page_speculative_count))) {
2286 		VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2287 		msecs = vm_pageout_state.vm_pageout_burst_wait;
2288 
2289 		exceeded_burst_throttle = TRUE;
2290 	} else if (VM_PAGE_Q_THROTTLED(iq) &&
2291 	    VM_DYNAMIC_PAGING_ENABLED()) {
2292 		clock_sec_t sec;
2293 		clock_nsec_t nsec;
2294 
2295 		switch (flow_control->state) {
2296 		case FCS_IDLE:
2297 			if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2298 			    vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2299 				/*
2300 				 * since the compressor is running independently of vm_pageout_scan
2301 				 * let's not wait for it just yet... as long as we have a healthy supply
2302 				 * of filecache pages to work with, let's keep stealing those.
2303 				 */
2304 				inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2305 
2306 				if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2307 				    (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2308 					*anons_grabbed = ANONS_GRABBED_LIMIT;
2309 					VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2310 					return VM_PAGEOUT_SCAN_PROCEED;
2311 				}
2312 			}
2313 
2314 			vps_flow_control_reset_deadlock_timer(flow_control);
2315 			msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2316 
2317 			break;
2318 
2319 		case FCS_DELAYED:
2320 			clock_get_system_nanotime(&sec, &nsec);
2321 			ts.tv_sec = (unsigned int) sec;
2322 			ts.tv_nsec = nsec;
2323 
2324 			if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2325 				/*
2326 				 * the pageout thread for the default pager is potentially
2327 				 * deadlocked since the
2328 				 * default pager queue has been throttled for more than the
2329 				 * allowable time... we need to move some clean pages or dirty
2330 				 * pages belonging to the external pagers if they aren't throttled
2331 				 * pages belonging to the external pagers if they aren't throttled...
2332 				 * blocked waiting for pages... we'll move one page for each of
2333 				 * these plus a fixed amount to break the logjam... once we're done
2334 				 * moving this number of pages, we'll re-enter the FSC_DELAYED state
2335 				 * moving this number of pages, we'll re-enter the FCS_DELAYED state
2336 				 * whether we've broken the deadlock except through observation
2337 				 * of the queue associated with the default pager... we need to
2338 				 * stop moving pages and allow the system to run to see what
2339 				 * state it settles into.
2340 				 */
2341 
2342 				*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2343 				    vm_page_free_wanted + vm_page_free_wanted_privileged;
2344 				VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2345 				flow_control->state = FCS_DEADLOCK_DETECTED;
2346 				sched_cond_signal(&vm_pageout_gc_cond, vm_pageout_gc_thread);
2347 				return VM_PAGEOUT_SCAN_PROCEED;
2348 			}
2349 			/*
2350 			 * just resniff instead of trying
2351 			 * to compute a new delay time... we're going to be
2352 			 * awakened immediately upon a laundry completion,
2353 			 * so we won't wait any longer than necessary
2354 			 */
2355 			msecs = vm_pageout_state.vm_pageout_idle_wait;
2356 			break;
2357 
2358 		case FCS_DEADLOCK_DETECTED:
2359 			if (*vm_pageout_deadlock_target) {
2360 				return VM_PAGEOUT_SCAN_PROCEED;
2361 			}
2362 
2363 			vps_flow_control_reset_deadlock_timer(flow_control);
2364 			msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2365 
2366 			break;
2367 		}
2368 	} else {
2369 		/*
2370 		 * No need to pause...
2371 		 */
2372 		return VM_PAGEOUT_SCAN_PROCEED;
2373 	}
2374 
2375 	vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2376 
2377 	vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2378 	    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2379 
2380 	if (vm_page_free_count >= vm_page_free_target) {
2381 		/*
2382 		 * we're here because
2383 		 *  1) someone else freed up some pages while we had
2384 		 *     the queues unlocked above
2385 		 * and we've hit one of the 3 conditions that
2386 		 * cause us to pause the pageout scan thread
2387 		 *
2388 		 * since we already have enough free pages,
2389 		 * let's avoid stalling and return normally
2390 		 *
2391 		 * before we return, make sure the pageout I/O threads
2392 		 * are running throttled in case there are still requests
2393 		 * in the laundry... since we have enough free pages
2394 		 * we don't need the laundry to be cleaned in a timely
2395 		 * fashion... so let's avoid interfering with foreground
2396 		 * activity
2397 		 *
2398 		 * we don't want to hold vm_page_queue_free_lock when
2399 		 * calling vm_pageout_adjust_eq_iothrottle (since it
2400 		 * may cause other locks to be taken), so we do the initial
2401 		 * check outside of the lock.  Once we take the lock,
2402 		 * we recheck the condition since it may have changed.
2403 		 * if it has, no problem, we will make the threads
2404 		 * non-throttled before actually blocking
2405 		 */
2406 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
2407 	}
2408 	vm_free_page_lock();
2409 
2410 	if (vm_page_free_count >= vm_page_free_target &&
2411 	    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2412 		return VM_PAGEOUT_SCAN_DONE_RETURN;
2413 	}
2414 	vm_free_page_unlock();
2415 
2416 	if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2417 		/*
2418 		 * we're most likely about to block due to one of
2419 		 * the 3 conditions that cause vm_pageout_scan to
2420 		 * not be able to make forward progress w/r
2421 		 * to providing new pages to the free queue,
2422 		 * so unthrottle the I/O threads in case we
2423 		 * have laundry to be cleaned... it needs
2424 		 * to be completed ASAP.
2425 		 *
2426 		 * even if we don't block, we want the io threads
2427 		 * running unthrottled since the sum of free +
2428 		 * clean pages is still under our free target
2429 		 */
2430 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2431 	}
2432 	if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2433 		/*
2434 		 * if we get here we're below our free target and
2435 		 * we're stalling due to a full laundry queue or
2436 		 * we don't have any inactive pages other than
2437 		 * those in the clean queue...
2438 		 * however, we have pages on the clean queue that
2439 		 * can be moved to the free queue, so let's not
2440 		 * stall the pageout scan
2441 		 */
2442 		flow_control->state = FCS_IDLE;
2443 		return VM_PAGEOUT_SCAN_PROCEED;
2444 	}
2445 	if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2446 		flow_control->state = FCS_IDLE;
2447 		return VM_PAGEOUT_SCAN_PROCEED;
2448 	}
2449 
2450 	VM_CHECK_MEMORYSTATUS;
2451 
2452 	if (flow_control->state != FCS_IDLE) {
2453 		VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2454 	}
2455 
2456 	iq->pgo_throttled = TRUE;
2457 	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2458 
2459 	vm_page_unlock_queues();
2460 
2461 	assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2462 
2463 	VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2464 	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2465 	memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2466 
2467 	thread_block(THREAD_CONTINUE_NULL);
2468 
2469 	VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2470 	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2471 	memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2472 
2473 	vm_page_lock_queues();
2474 
2475 	iq->pgo_throttled = FALSE;
2476 
2477 	vps_init_page_targets();
2478 
2479 	return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2480 }
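
/*
 * Taken together, the flow-control states above form a small state machine:
 * FCS_IDLE (no throttling concern) -> FCS_DELAYED (internal queue throttled,
 * deadline armed) -> FCS_DEADLOCK_DETECTED (deadline expired, relief target
 * set), with FCS_IDLE restored once the queue drains or clean pages appear.
 * A condensed sketch of those transitions, kept under #if 0, not built, and
 * deliberately simplified relative to the function above; the
 * deadline_has_passed flag is hypothetical.
 */
#if 0
	switch (flow_control->state) {
	case FCS_IDLE:
		/* internal queue throttled: arm a deadline and wait */
		vps_flow_control_reset_deadlock_timer(flow_control);
		break;
	case FCS_DELAYED:
		if (deadline_has_passed) {
			flow_control->state = FCS_DEADLOCK_DETECTED;
		}
		break;
	case FCS_DEADLOCK_DETECTED:
		/* relief target drained: re-arm and fall back to FCS_DELAYED */
		vps_flow_control_reset_deadlock_timer(flow_control);
		break;
	}
#endif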
2481 
2482 extern boolean_t vm_darkwake_mode;
2483 /*
2484  * This function is called only from vm_pageout_scan and
2485  * it will find and return the most appropriate page to be
2486  * reclaimed.
2487  */
2488 static int
2489 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2490     boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2491 {
2492 	vm_page_t                       m = NULL;
2493 	vm_object_t                     m_object = VM_OBJECT_NULL;
2494 	uint32_t                        inactive_external_count;
2495 	struct vm_speculative_age_q     *sq;
2496 	struct vm_pageout_queue         *iq;
2497 	int                             retval = VM_PAGEOUT_SCAN_PROCEED;
2498 
2499 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2500 	iq = &vm_pageout_queue_internal;
2501 
2502 	*is_page_from_bg_q = FALSE;
2503 
2504 	m = NULL;
2505 	m_object = VM_OBJECT_NULL;
2506 
2507 	if (VM_DYNAMIC_PAGING_ENABLED()) {
2508 		assert(vm_page_throttled_count == 0);
2509 		assert(vm_page_queue_empty(&vm_page_queue_throttled));
2510 	}
2511 
2512 	/*
2513 	 * Try for a clean-queue inactive page.
2514 	 * These are pages that vm_pageout_scan tried to steal earlier, but
2515 	 * were dirty and had to be cleaned.  Pick them up now that they are clean.
2516 	 */
2517 	if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2518 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2519 
2520 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2521 
2522 		goto found_page;
2523 	}
2524 
2525 	/*
2526 	 * The next most eligible pages are ones we paged in speculatively,
2527 	 * but which have not yet been touched and have been aged out.
2528 	 */
2529 	if (!vm_page_queue_empty(&sq->age_q)) {
2530 		m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2531 
2532 		assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2533 
2534 		if (!m->vmp_dirty || force_anonymous == FALSE) {
2535 			goto found_page;
2536 		} else {
2537 			m = NULL;
2538 		}
2539 	}
2540 
2541 #if !CONFIG_JETSAM
2542 	if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2543 		if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2544 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2545 			assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2546 			goto found_page;
2547 		}
2548 	}
2549 #endif /* !CONFIG_JETSAM */
2550 
2551 	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2552 		vm_object_t     bg_m_object = NULL;
2553 
2554 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2555 
2556 		bg_m_object = VM_PAGE_OBJECT(m);
2557 
2558 		if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2559 			/*
2560 			 * This page is on the background queue
2561 			 * but not on a pageable queue OR is busy during
2562 			 * darkwake mode when the target is artificially lowered.
2563 			 * If it is busy during darkwake mode, and we don't skip it,
2564 			 * we will just swing back around and try again with the same
2565 			 * queue and might hit the same page or its neighbor in a
2566 			 * similar state. Both of these are transient states and will
2567 			 * get resolved, but, at this point let's ignore this page.
2568 			 */
2569 			if (vm_darkwake_mode && m->vmp_busy) {
2570 				if (bg_m_object->internal) {
2571 					vm_pageout_skipped_bq_internal++;
2572 				} else {
2573 					vm_pageout_skipped_bq_external++;
2574 				}
2575 			}
2576 		} else if (force_anonymous == FALSE || bg_m_object->internal) {
2577 			if (bg_m_object->internal &&
2578 			    (VM_PAGE_Q_THROTTLED(iq) ||
2579 			    vm_compressor_out_of_space() == TRUE ||
2580 			    vm_page_free_count < (vm_page_free_reserved / 4))) {
2581 				vm_pageout_skipped_bq_internal++;
2582 			} else {
2583 				*is_page_from_bg_q = TRUE;
2584 
2585 				if (bg_m_object->internal) {
2586 					vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2587 				} else {
2588 					vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2589 				}
2590 				goto found_page;
2591 			}
2592 		}
2593 	}
2594 
2595 	inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2596 
2597 	if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2598 	    (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2599 		*grab_anonymous = TRUE;
2600 		*anons_grabbed = 0;
2601 
2602 		if (VM_CONFIG_SWAP_IS_ACTIVE) {
2603 			vm_pageout_vminfo.vm_pageout_skipped_external++;
2604 		} else {
2605 			if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2606 				 * No swap and we are at dangerously low levels of free memory.
2607 				 * No swap and we are in dangerously low levels of free memory.
2608 				 * If we keep going ahead with anonymous pages, we are going to run into a situation
2609 				 * where the compressor will be stuck waiting for free pages (if it isn't already).
2610 				 *
2611 				 * So, pick a file backed page...
2612 				 */
2613 				*grab_anonymous = FALSE;
2614 				*anons_grabbed = ANONS_GRABBED_LIMIT;
2615 				vm_pageout_vminfo.vm_pageout_skipped_internal++;
2616 			}
2617 		}
2618 		goto want_anonymous;
2619 	}
2620 	*grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2621 
2622 #if CONFIG_JETSAM
2623 	/* If the file-backed pool has accumulated
2624 	 * significantly more pages than the jetsam
2625 	 * threshold, prefer to reclaim those
2626 	 * inline to minimise compute overhead of reclaiming
2627 	 * anonymous pages.
2628 	 * This calculation does not account for the CPU local
2629 	 * external page queues, as those are expected to be
2630 	 * much smaller relative to the global pools.
2631 	 */
2632 
2633 	struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2634 
2635 	if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2636 		if (vm_page_pageable_external_count >
2637 		    vm_pageout_state.vm_page_filecache_min) {
2638 			if ((vm_page_pageable_external_count *
2639 			    vm_pageout_memorystatus_fb_factor_dr) >
2640 			    (memorystatus_get_critical_page_shortage_threshold() *
2641 			    vm_pageout_memorystatus_fb_factor_nr)) {
2642 				*grab_anonymous = FALSE;
2643 
2644 				VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2645 			}
2646 		}
2647 		if (*grab_anonymous) {
2648 			VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2649 		}
2650 	}
2651 #endif /* CONFIG_JETSAM */
2652 
2653 want_anonymous:
2654 	if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2655 		if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2656 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2657 
2658 			assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2659 			*anons_grabbed = 0;
2660 
2661 			if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2662 				if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2663 					if ((++(*reactivated_this_call) % 100)) {
2664 						vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2665 
2666 						vm_page_activate(m);
2667 						counter_inc(&vm_statistics_reactivations);
2668 #if DEVELOPMENT || DEBUG
2669 						if (*is_page_from_bg_q == TRUE) {
2670 							if (m_object->internal) {
2671 								vm_pageout_rejected_bq_internal++;
2672 							} else {
2673 								vm_pageout_rejected_bq_external++;
2674 							}
2675 						}
2676 #endif /* DEVELOPMENT || DEBUG */
2677 						vm_pageout_state.vm_pageout_inactive_used++;
2678 
2679 						m = NULL;
2680 						retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2681 
2682 						goto found_page;
2683 					}
2684 
2685 					/*
2686 					 * steal 1 of the file backed pages even if
2687 					 * we are under the limit that has been set
2688 					 * for a healthy filecache
2689 					 */
2690 				}
2691 			}
2692 			goto found_page;
2693 		}
2694 	}
2695 	if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2696 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2697 
2698 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2699 		*anons_grabbed += 1;
2700 
2701 		goto found_page;
2702 	}
2703 
2704 	m = NULL;
2705 
2706 found_page:
2707 	*victim_page = m;
2708 
2709 	return retval;
2710 }
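
/*
 * The selection above amounts to a fixed priority order over the page
 * queues: cleaned pages first, then aged speculative pages, then (when
 * enabled) donated and background pages, then file-backed inactive versus
 * anonymous pages depending on the filecache floor and ANONS_GRABBED_LIMIT.
 * A condensed sketch of that ordering, kept under #if 0, not built, and
 * ignoring the reactivation and skip paths handled by the real function.
 */
#if 0
	if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
	} else if (!vm_page_queue_empty(&sq->age_q)) {
		m = (vm_page_t) vm_page_queue_first(&sq->age_q);
	} else if (!grab_anonymous && !vm_page_queue_empty(&vm_page_queue_inactive)) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
	} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
	}
#endif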
2711 
2712 /*
2713  * This function is called only from vm_pageout_scan and
2714  * it will put a page back on the active/inactive queue
2715  * if we can't reclaim it for some reason.
2716  */
2717 static void
2718 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2719 {
2720 	if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2721 		vm_page_enqueue_inactive(m, FALSE);
2722 	} else {
2723 		vm_page_activate(m);
2724 	}
2725 
2726 #if DEVELOPMENT || DEBUG
2727 	vm_object_t m_object = VM_PAGE_OBJECT(m);
2728 
2729 	if (page_from_bg_q == TRUE) {
2730 		if (m_object->internal) {
2731 			vm_pageout_rejected_bq_internal++;
2732 		} else {
2733 			vm_pageout_rejected_bq_external++;
2734 		}
2735 	}
2736 #endif /* DEVELOPMENT || DEBUG */
2737 }
2738 
2739 /*
2740  * This function is called only from vm_pageout_scan and
2741  * it will try to grab the victim page's VM object (m_object)
2742  * which differs from the previous victim page's object (object).
2743  */
2744 static int
2745 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2746 {
2747 	struct vm_speculative_age_q *sq;
2748 
2749 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2750 
2751 	/*
2752 	 * the object associated with the candidate page is
2753 	 * different from the one we were just working
2754 	 * with... dump the lock if we still own it
2755 	 */
2756 	if (*object != NULL) {
2757 		vm_object_unlock(*object);
2758 		*object = NULL;
2759 	}
2760 	/*
2761 	 * Try to lock object; since we've already got the
2762 	 * page queues lock, we can only 'try' for this one.
2763 	 * if the 'try' fails, we need to do a mutex_pause
2764 	 * to allow the owner of the object lock a chance to
2765 	 * run... otherwise, we're likely to trip over this
2766 	 * object in the same state as we work our way through
2767 	 * the queue... clumps of pages associated with the same
2768 	 * object are fairly typical on the inactive and active queues
2769 	 */
2770 	if (!vm_object_lock_try_scan(m_object)) {
2771 		vm_page_t m_want = NULL;
2772 
2773 		vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2774 
2775 		if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2776 			VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2777 		}
2778 
2779 		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2780 
2781 		m->vmp_reference = FALSE;
2782 
2783 		if (!m_object->object_is_shared_cache) {
2784 			/*
2785 			 * don't apply this optimization if this is the shared cache
2786 			 * object, it's too easy to get rid of very hot and important
2787 			 * pages...
2788 			 * m->vmp_object must be stable since we hold the page queues lock...
2789 			 * we can update the scan_collisions field sans the object lock
2790 			 * since it is a separate field and this is the only spot that does
2791 			 * a read-modify-write operation and it is never executed concurrently...
2792 			 * we can asynchronously set this field to 0 when creating a UPL, so it
2793 			 * is possible for the value to be a bit non-deterministic, but that's ok
2794 			 * since it's only used as a hint
2795 			 */
2796 			m_object->scan_collisions = 1;
2797 		}
2798 		if (page_from_bg_q) {
2799 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2800 		} else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2801 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2802 		} else if (!vm_page_queue_empty(&sq->age_q)) {
2803 			m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2804 		} else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2805 		    !vm_page_queue_empty(&vm_page_queue_inactive)) {
2806 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2807 		} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2808 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2809 		}
2810 
2811 		/*
2812 		 * this is the next object we're going to be interested in...
2813 		 * try to make sure it's available after the mutex_pause
2814 		 * returns control
2815 		 */
2816 		if (m_want) {
2817 			vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2818 		}
2819 
2820 		vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2821 
2822 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2823 	} else {
2824 		*object = m_object;
2825 		vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2826 	}
2827 
2828 	return VM_PAGEOUT_SCAN_PROCEED;
2829 }
2830 
2831 /*
2832  * This function is called only from vm_pageout_scan and
2833  * it notices that pageout scan may be rendered ineffective
2834  * due to an FS deadlock and will jetsam a process if possible.
2835  * If jetsam isn't supported, it'll move the page to the active
2836  * queue to try to get some different pages pushed onwards so
2837  * we can try to get out of this scenario.
2838  */
2839 static void
2840 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2841     boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2842 {
2843 	struct  vm_pageout_queue *eq;
2844 	vm_object_t cur_object = VM_OBJECT_NULL;
2845 
2846 	cur_object = *object;
2847 
2848 	eq = &vm_pageout_queue_external;
2849 
2850 	if (cur_object->internal == FALSE) {
2851 		/*
2852 		 * we need to break up the following potential deadlock case...
2853 		 *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2854 		 *  b) The thread doing the writing is waiting for pages while holding the truncate lock
2855 		 *  c) Most of the pages in the inactive queue belong to this file.
2856 		 *
2857 		 * we are potentially in this deadlock because...
2858 		 *  a) the external pageout queue is throttled
2859 		 *  b) we're done with the active queue and moved on to the inactive queue
2860 		 *  c) we've got a dirty external page
2861 		 *
2862 		 * since we don't know the reason for the external pageout queue being throttled we
2863 		 * must suspect that we are deadlocked, so move the current page onto the active queue
2864 		 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2865 		 *
2866 		 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2867 		 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2868 		 * pool the next time we select a victim page... if we can make enough new free pages,
2869 		 * the deadlock will break, the external pageout queue will empty and it will no longer
2870 		 * be throttled
2871 		 *
2872 		 * if we have jetsam configured, keep a count of the pages reactivated this way so
2873 		 * that we can try to find clean pages in the active/inactive queues before
2874 		 * deciding to jetsam a process
2875 		 */
2876 		vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2877 
2878 		vm_page_check_pageable_safe(m);
2879 		assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2880 		vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2881 		m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2882 		vm_page_active_count++;
2883 		vm_page_pageable_external_count++;
2884 
2885 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2886 
2887 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2888 
2889 #pragma unused(force_anonymous)
2890 
2891 		*vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2892 
2893 		if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2894 			*vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2895 			/*
2896 			 * Possible deadlock scenario so request jetsam action
2897 			 */
2898 			memorystatus_kill_on_vps_starvation();
2899 			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, DBG_VM_PAGEOUT_JETSAM, DBG_FUNC_NONE,
2900 			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2901 		}
2902 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2903 
2904 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2905 
2906 		*force_anonymous = TRUE;
2907 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2908 	} else {
2909 		vm_page_activate(m);
2910 		counter_inc(&vm_statistics_reactivations);
2911 
2912 #if DEVELOPMENT || DEBUG
2913 		if (is_page_from_bg_q == TRUE) {
2914 			if (cur_object->internal) {
2915 				vm_pageout_rejected_bq_internal++;
2916 			} else {
2917 				vm_pageout_rejected_bq_external++;
2918 			}
2919 		}
2920 #endif /* DEVELOPMENT || DEBUG */
2921 
2922 		vm_pageout_state.vm_pageout_inactive_used++;
2923 	}
2924 }
2925 
2926 
2927 void
2928 vm_page_balance_inactive(int max_to_move)
2929 {
2930 	vm_page_t m;
2931 
2932 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2933 
2934 	if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2935 		/*
2936 		 * It is likely that the hibernation code path is
2937 		 * dealing with these very queues as we are about
2938 		 * to move pages around in/from them and completely
2939 		 * change the linkage of the pages.
2940 		 *
2941 		 * And so we skip the rebalancing of these queues.
2942 		 */
2943 		return;
2944 	}
2945 	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2946 	    vm_page_inactive_count +
2947 	    vm_page_speculative_count);
2948 
2949 	while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2950 		VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2951 
2952 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2953 
2954 		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2955 		assert(!m->vmp_laundry);
2956 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
2957 		assert(!vm_page_is_guard(m));
2958 
2959 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2960 
2961 		/*
2962 		 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2963 		 *
2964 		 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2965 		 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2966 		 * new reference happens. If no further references happen on the page after that remote TLB flushes
2967 		 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2968 		 * by pageout_scan, which is just fine since the last reference would have happened quite far
2969 		 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2970 		 * have happened before we moved the page
2971 		 */
2972 		if (m->vmp_pmapped == TRUE) {
2973 			/*
2974 			 * We might be holding the page queue lock as a
2975 			 * spin lock and clearing the "referenced" bit could
2976 			 * take a while if there are lots of mappings of
2977 			 * that page, so make sure we acquire the lock as
2978 			 * a mutex to avoid a spinlock timeout.
2979 			 */
2980 			vm_page_lockconvert_queues();
2981 			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2982 		}
2983 
2984 		/*
2985 		 * The page might be absent or busy,
2986 		 * but vm_page_deactivate can handle that.
2987 		 * FALSE indicates that we don't want a H/W clear reference
2988 		 */
2989 		vm_page_deactivate_internal(m, FALSE);
2990 	}
2991 }
2992 
2993 /*
2994  *	vm_pageout_scan does the dirty work for the pageout daemon.
2995  *	It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2996  *	held and vm_page_free_wanted == 0.
2997  */
2998 void
2999 vm_pageout_scan(void)
3000 {
3001 	unsigned int loop_count = 0;
3002 	unsigned int inactive_burst_count = 0;
3003 	unsigned int reactivated_this_call;
3004 	unsigned int reactivate_limit;
3005 	vm_page_t   local_freeq = NULL;
3006 	int         local_freed = 0;
3007 	int         delayed_unlock;
3008 	int         delayed_unlock_limit = 0;
3009 	int         refmod_state = 0;
3010 	int     vm_pageout_deadlock_target = 0;
3011 	struct  vm_pageout_queue *iq;
3012 	struct  vm_pageout_queue *eq;
3013 	struct  vm_speculative_age_q *sq;
3014 	struct  flow_control    flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
3015 	boolean_t inactive_throttled = FALSE;
3016 	vm_object_t     object = NULL;
3017 	uint32_t        inactive_reclaim_run;
3018 	boolean_t       grab_anonymous = FALSE;
3019 	boolean_t       force_anonymous = FALSE;
3020 	boolean_t       force_speculative_aging = FALSE;
3021 	int             anons_grabbed = 0;
3022 	int             page_prev_q_state = 0;
3023 	boolean_t       page_from_bg_q = FALSE;
3024 	uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
3025 	vm_object_t     m_object = VM_OBJECT_NULL;
3026 	int             retval = 0;
3027 	boolean_t       lock_yield_check = FALSE;
3028 
3029 
3030 	VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_START,
3031 	    vm_pageout_vminfo.vm_pageout_freed_speculative,
3032 	    vm_pageout_state.vm_pageout_inactive_clean,
3033 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3034 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3035 
3036 	flow_control.state = FCS_IDLE;
3037 	iq = &vm_pageout_queue_internal;
3038 	eq = &vm_pageout_queue_external;
3039 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3040 
3041 	/* Ask the pmap layer to return any pages it no longer needs. */
3042 	pmap_release_pages_fast();
3043 
3044 	vm_page_lock_queues();
3045 
3046 	delayed_unlock = 1;
3047 
3048 	/*
3049 	 *	Calculate the max number of referenced pages on the inactive
3050 	 *	queue that we will reactivate.
3051 	 */
3052 	reactivated_this_call = 0;
3053 	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3054 	    vm_page_inactive_count);
3055 	inactive_reclaim_run = 0;
3056 
3057 	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3058 
3059 	/*
3060 	 *	We must limit the rate at which we send pages to the pagers
3061 	 *	so that we don't tie up too many pages in the I/O queues.
3062 	 *	We implement a throttling mechanism using the laundry count
3063 	 *      to limit the number of pages outstanding to the default
3064 	 *	and external pagers.  We can bypass the throttles and look
3065 	 *	for clean pages if the pageout queues don't drain in a timely
3066 	 *	fashion since this may indicate that the pageout paths are
3067 	 *	stalled waiting for memory, which only we can provide.
3068 	 */
3069 
3070 	vps_init_page_targets();
3071 	assert(object == NULL);
3072 	assert(delayed_unlock != 0);
3073 
3074 	for (;;) {
3075 		vm_page_t m;
3076 
3077 		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3078 
3079 		if (lock_yield_check) {
3080 			lock_yield_check = FALSE;
3081 
3082 			if (delayed_unlock++ > delayed_unlock_limit) {
3083 				vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3084 				    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3085 			} else if (vm_pageout_scan_wants_object) {
3086 				vm_page_unlock_queues();
3087 				mutex_pause(0);
3088 				vm_page_lock_queues();
3089 			} else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3090 				VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3091 			}
3092 		}
3093 
3094 		if (vm_upl_wait_for_pages < 0) {
3095 			vm_upl_wait_for_pages = 0;
3096 		}
3097 
3098 		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3099 
3100 		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3101 			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3102 		}
3103 
3104 		vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3105 
3106 		assert(delayed_unlock);
3107 
3108 		/*
3109 		 * maintain our balance
3110 		 */
3111 		vm_page_balance_inactive(1);
3112 
3113 
3114 		/**********************************************************************
3115 		* above this point we're playing with the active and secluded queues
3116 		* below this point we're playing with the throttling mechanisms
3117 		* and the inactive queue
3118 		**********************************************************************/
3119 
3120 		if (vm_page_free_count + local_freed >= vm_page_free_target) {
3121 			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3122 
3123 			vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3124 			    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3125 			/*
3126 			 * make sure the pageout I/O threads are running
3127 			 * throttled in case there are still requests
3128 			 * in the laundry... since we have met our targets
3129 			 * we don't need the laundry to be cleaned in a timely
3130 			 * fashion... so let's avoid interfering with foreground
3131 			 * activity
3132 			 */
3133 			vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3134 
3135 			vm_free_page_lock();
3136 
3137 			if ((vm_page_free_count >= vm_page_free_target) &&
3138 			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3139 				/*
3140 				 * done - we have met our target *and*
3141 				 * there is no one waiting for a page.
3142 				 */
3143 return_from_scan:
3144 				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3145 
3146 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3147 				    vm_pageout_state.vm_pageout_inactive,
3148 				    vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3149 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_END,
3150 				    vm_pageout_vminfo.vm_pageout_freed_speculative,
3151 				    vm_pageout_state.vm_pageout_inactive_clean,
3152 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3153 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3154 
3155 				return;
3156 			}
3157 			vm_free_page_unlock();
3158 		}
3159 
3160 		/*
3161 		 * Before anything, we check if we have any ripe volatile
3162 		 * objects around. If so, try to purge the first object.
3163 		 * If the purge fails, fall through to reclaim a page instead.
3164 		 * If the purge succeeds, go back to the top and reevaluate
3165 		 * the new memory situation.
3166 		 */
3167 		retval = vps_purge_object();
3168 
3169 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3170 			/*
3171 			 * Success
3172 			 */
3173 			if (object != NULL) {
3174 				vm_object_unlock(object);
3175 				object = NULL;
3176 			}
3177 
3178 			lock_yield_check = FALSE;
3179 			continue;
3180 		}
3181 
3182 
3183 		/*
3184 		 * If our 'aged' queue is empty and we have some speculative pages
3185 		 * in the other queues, let's go through and see if we need to age
3186 		 * them.
3187 		 *
3188 		 * If we succeed in aging a speculative queue, or everything simply
3189 		 * looks normal w.r.t. queue age and queue counts, we keep going.
3190 		 *
3191 		 * If, for some reason, we seem to have a mismatch between the spec.
3192 		 * page count and the page queues, we reset those variables and
3193 		 * restart the loop (LD TODO: Track this better?).
3194 		 */
3195 		if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3196 			retval = vps_age_speculative_queue(force_speculative_aging);
3197 
3198 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3199 				lock_yield_check = FALSE;
3200 				continue;
3201 			}
3202 		}
3203 		force_speculative_aging = FALSE;
3204 
3205 		/*
3206 		 * Check to see if we need to evict objects from the cache.
3207 		 *
3208 		 * Note: 'object' here doesn't have anything to do with
3209 		 * the eviction part. We just need to make sure we have dropped
3210 		 * any object lock we might be holding if we need to go down
3211 		 * into the eviction logic.
3212 		 */
3213 		retval = vps_object_cache_evict(&object);
3214 
3215 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3216 			lock_yield_check = FALSE;
3217 			continue;
3218 		}
3219 
3220 
3221 		/*
3222 		 * Calculate our filecache_min that will affect the loop
3223 		 * going forward.
3224 		 */
3225 		vps_calculate_filecache_min();
3226 
3227 		/*
3228 		 * LD TODO: Use a structure to hold all state variables for a single
3229 		 * vm_pageout_scan iteration and pass that structure to this function instead.
3230 		 */
3231 		retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3232 		    &delayed_unlock, &local_freeq, &local_freed,
3233 		    &vm_pageout_deadlock_target, inactive_burst_count);
3234 
3235 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3236 			if (loop_count >= vm_page_inactive_count) {
3237 				loop_count = 0;
3238 			}
3239 
3240 			inactive_burst_count = 0;
3241 
3242 			assert(object == NULL);
3243 			assert(delayed_unlock != 0);
3244 
3245 			lock_yield_check = FALSE;
3246 			continue;
3247 		} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3248 			goto return_from_scan;
3249 		}
3250 
3251 		flow_control.state = FCS_IDLE;
3252 
3253 		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3254 		    vm_pageout_inactive_external_forced_reactivate_limit);
3255 		loop_count++;
3256 		inactive_burst_count++;
3257 		vm_pageout_state.vm_pageout_inactive++;
3258 
3259 		/*
3260 		 * Choose a victim.
3261 		 */
3262 
3263 		m = NULL;
3264 		retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3265 
3266 		if (m == NULL) {
3267 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3268 				inactive_burst_count = 0;
3269 
3270 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3271 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3272 				}
3273 
3274 				lock_yield_check = TRUE;
3275 				continue;
3276 			}
3277 
3278 			/*
3279 			 * if we've gotten here, we have no victim page.
3280 			 * check whether we haven't finished balancing the queues,
3281 			 * or we have a page on the aged speculative queue that we
3282 			 * skipped due to force_anonymous == TRUE... or we have
3283 			 * speculative pages that we can prematurely age... in
3284 			 * one of these cases we'll keep going, else panic
3285 			 */
3286 			force_anonymous = FALSE;
3287 			VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3288 
3289 			if (!vm_page_queue_empty(&sq->age_q)) {
3290 				lock_yield_check = TRUE;
3291 				continue;
3292 			}
3293 
3294 			if (vm_page_speculative_count) {
3295 				force_speculative_aging = TRUE;
3296 				lock_yield_check = TRUE;
3297 				continue;
3298 			}
3299 			panic("vm_pageout: no victim");
3300 
3301 			/* NOTREACHED */
3302 		}
3303 
3304 		assert(VM_PAGE_PAGEABLE(m));
3305 		m_object = VM_PAGE_OBJECT(m);
3306 		force_anonymous = FALSE;
3307 
3308 		page_prev_q_state = m->vmp_q_state;
3309 		/*
3310 		 * we just found this page on one of our queues...
3311 		 * it can't also be on the pageout queue, so safe
3312 		 * to call vm_page_queues_remove
3313 		 */
3314 		bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3315 		vm_page_queues_remove(m, TRUE);
3316 		if (donate) {
3317 			/*
3318 			 * The compressor needs to see this bit to know
3319 			 * where this page needs to land. Also if stolen,
3320 			 * this bit helps put the page back in the right
3321 			 * special queue where it belongs.
3322 			 */
3323 			m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3324 		}
3325 
3326 		assert(!m->vmp_laundry);
3327 		assert(vm_page_is_canonical(m));
3328 		assert(!is_kernel_object(m_object));
3329 
3330 		vm_pageout_vminfo.vm_pageout_considered_page++;
3331 
3332 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3333 
3334 		/*
3335 		 * check to see if we currently are working
3336 		 * with the same object... if so, we've
3337 		 * already got the lock
3338 		 */
3339 		if (m_object != object) {
3340 			boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3341 
3342 			/*
3343 			 * vps_switch_object() will always drop the 'object' lock first
3344 			 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3345 			 * either 'm_object' or NULL.
3346 			 */
3347 			retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3348 
3349 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3350 				lock_yield_check = TRUE;
3351 				continue;
3352 			}
3353 		}
3354 		assert(m_object == object);
3355 		assert(VM_PAGE_OBJECT(m) == m_object);
3356 
3357 		if (m->vmp_busy) {
3358 			/*
3359 			 *	Somebody is already playing with this page.
3360 			 *	Put it back on the appropriate queue
3361 			 *
3362 			 */
3363 			VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3364 
3365 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3366 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3367 			}
3368 
3369 			vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3370 
3371 			lock_yield_check = TRUE;
3372 			continue;
3373 		}
3374 
3375 		/*
3376 		 *   if (m->vmp_cleaning && !m->vmp_free_when_done)
3377 		 *	If already cleaning this page in place
3378 		 *	just leave it off the paging queues.
3379 		 *	We can leave the page mapped, and upl_commit_range
3380 		 *	will put it on the clean queue.
3381 		 *
3382 		 *   if (m->vmp_free_when_done && !m->vmp_cleaning)
3383 		 *	an msync INVALIDATE is in progress...
3384 		 *	this page has been marked for destruction
3385 		 *      after it has been cleaned,
3386 		 *      but not yet gathered into a UPL
3387 		 *	where 'cleaning' will be set...
3388 		 *	just leave it off the paging queues
3389 		 *
3390 		 *   if (m->vmp_free_when_done && m->vmp_cleaning)
3391 		 *	an msync INVALIDATE is in progress
3392 		 *	and the UPL has already gathered this page...
3393 		 *	just leave it off the paging queues
3394 		 */
3395 		if (m->vmp_free_when_done || m->vmp_cleaning) {
3396 			lock_yield_check = TRUE;
3397 			continue;
3398 		}
3399 
3400 
3401 		/*
3402 		 *	If it's absent, in error or the object is no longer alive,
3403 		 *	we can reclaim the page... in the no longer alive case,
3404 		 *	there are 2 states the page can be in that preclude us
3405 		 *	from reclaiming it - busy or cleaning - that we've already
3406 		 *	dealt with
3407 		 */
3408 		if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3409 		    (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3410 			if (m->vmp_absent) {
3411 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3412 			} else if (!object->alive ||
3413 			    (!object->internal &&
3414 			    object->pager == MEMORY_OBJECT_NULL)) {
3415 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3416 			} else {
3417 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3418 			}
3419 reclaim_page:
3420 			if (vm_pageout_deadlock_target) {
3421 				VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3422 				vm_pageout_deadlock_target--;
3423 			}
3424 
3425 			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3426 
3427 			if (object->internal) {
3428 				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3429 			} else {
3430 				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3431 			}
3432 			assert(!m->vmp_cleaning);
3433 			assert(!m->vmp_laundry);
3434 
3435 			if (!object->internal &&
3436 			    object->pager != NULL &&
3437 			    object->pager->mo_pager_ops == &shared_region_pager_ops) {
3438 				shared_region_pager_reclaimed++;
3439 			}
3440 
3441 			m->vmp_busy = TRUE;
3442 
3443 			/*
3444 			 * remove page from object here since we're already
3445 			 * behind the object lock... defer the rest of the work
3446 			 * we'd normally do in vm_page_free_prepare_object
3447 			 * until 'vm_page_free_list' is called
3448 			 */
3449 			if (m->vmp_tabled) {
3450 				vm_page_remove(m, TRUE);
3451 			}
3452 
3453 			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3454 			m->vmp_snext = local_freeq;
3455 			local_freeq = m;
3456 			local_freed++;
3457 
3458 			if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3459 				vm_pageout_vminfo.vm_pageout_freed_speculative++;
3460 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3461 				vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3462 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3463 				vm_pageout_vminfo.vm_pageout_freed_internal++;
3464 			} else {
3465 				vm_pageout_vminfo.vm_pageout_freed_external++;
3466 			}
3467 
3468 			inactive_burst_count = 0;
3469 
3470 			lock_yield_check = TRUE;
3471 			continue;
3472 		}
3473 		if (object->vo_copy == VM_OBJECT_NULL) {
3474 			/*
3475 			 * No one else can have any interest in this page.
3476 			 * If this is an empty purgable object, the page can be
3477 			 * reclaimed even if dirty.
3478 			 * If the page belongs to a volatile purgable object, we
3479 			 * reactivate it if the compressor isn't active.
3480 			 */
3481 			if (object->purgable == VM_PURGABLE_EMPTY) {
3482 				if (m->vmp_pmapped == TRUE) {
3483 					/* unmap the page */
3484 					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3485 					if (refmod_state & VM_MEM_MODIFIED) {
3486 						SET_PAGE_DIRTY(m, FALSE);
3487 					}
3488 				}
3489 				if (m->vmp_dirty || m->vmp_precious) {
3490 					/* we saved the cost of cleaning this page! */
3491 					vm_page_purged_count++;
3492 				}
3493 				goto reclaim_page;
3494 			}
3495 
3496 			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3497 				/*
3498 				 * With the VM compressor, the cost of
3499 				 * reclaiming a page is much lower (no I/O),
3500 				 * so if we find a "volatile" page, it's better
3501 				 * to let it get compressed rather than letting
3502 				 * it occupy a full page until it gets purged.
3503 				 * So no need to check for "volatile" here.
3504 				 */
3505 			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
3506 				/*
3507 				 * Avoid cleaning a "volatile" page which might
3508 				 * be purged soon.
3509 				 */
3510 
3511 				/* if it's wired, we can't put it on our queue */
3512 				assert(!VM_PAGE_WIRED(m));
3513 
3514 				/* just stick it back on! */
3515 				reactivated_this_call++;
3516 
3517 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3518 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3519 				}
3520 
3521 				goto reactivate_page;
3522 			}
3523 		} /* vo_copy NULL */
3524 		/*
3525 		 *	If it's being used, reactivate.
3526 		 *	(Fictitious pages are either busy or absent.)
3527 		 *	First, update the reference and dirty bits
3528 		 *	to make sure the page is unreferenced.
3529 		 */
3530 		refmod_state = -1;
3531 
3532 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3533 			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3534 
3535 			if (refmod_state & VM_MEM_REFERENCED) {
3536 				m->vmp_reference = TRUE;
3537 			}
3538 			if (refmod_state & VM_MEM_MODIFIED) {
3539 				SET_PAGE_DIRTY(m, FALSE);
3540 			}
3541 		}
3542 
3543 		if (m->vmp_reference || m->vmp_dirty) {
3544 			/* deal with a rogue "reusable" page */
3545 			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3546 		}
3547 
3548 		if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3549 			vm_pageout_state.vm_page_xpmapped_min = 0;
3550 		} else {
3551 			vm_pageout_state.vm_page_xpmapped_min = (vm_page_pageable_external_count * 10) /
3552 			    vm_pageout_state.vm_page_xpmapped_min_divisor;
3553 		}
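		/*
		 * Worked example (the divisor value is illustrative): with a
		 * divisor of 20, vm_page_xpmapped_min works out to
		 * (external * 10) / 20, i.e. half of
		 * vm_page_pageable_external_count; a divisor of 0 disables the
		 * check by forcing the threshold to 0.
		 */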
3554 
3555 		if (!m->vmp_no_cache &&
3556 		    page_from_bg_q == FALSE &&
3557 		    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3558 		    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3559 			/*
3560 			 * The page we pulled off the inactive list has
3561 			 * been referenced.  It is possible for other
3562 			 * processors to be touching pages faster than we
3563 			 * can clear the referenced bit and traverse the
3564 			 * inactive queue, so we limit the number of
3565 			 * reactivations.
3566 			 */
3567 			if (++reactivated_this_call >= reactivate_limit &&
3568 			    !object->object_is_shared_cache &&
3569 			    !((m->vmp_realtime ||
3570 			    object->for_realtime) &&
3571 			    vm_pageout_protect_realtime)) {
3572 				vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3573 			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3574 				vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3575 				if (object->object_is_shared_cache) {
3576 					vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3577 				} else if (m->vmp_realtime ||
3578 				    object->for_realtime) {
3579 					vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3580 				}
3581 			} else {
3582 				uint32_t isinuse;
3583 
3584 				if (reactivated_this_call >= reactivate_limit) {
3585 					if (object->object_is_shared_cache) {
3586 						vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3587 					} else if ((m->vmp_realtime ||
3588 					    object->for_realtime) &&
3589 					    vm_pageout_protect_realtime) {
3590 						vm_pageout_vminfo.vm_pageout_protected_realtime++;
3591 					}
3592 				}
3593 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3594 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3595 				}
3596 
3597 				vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3598 reactivate_page:
3599 				if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3600 				    vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3601 					/*
3602 					 * no explicit mappings of this object exist
3603 					 * and it's not open via the filesystem
3604 					 */
3605 					vm_page_deactivate(m);
3606 					VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3607 				} else {
3608 					/*
3609 					 * The page was/is being used, so put back on active list.
3610 					 */
3611 					vm_page_activate(m);
3612 					counter_inc(&vm_statistics_reactivations);
3613 					inactive_burst_count = 0;
3614 				}
3615 #if DEVELOPMENT || DEBUG
3616 				if (page_from_bg_q == TRUE) {
3617 					if (m_object->internal) {
3618 						vm_pageout_rejected_bq_internal++;
3619 					} else {
3620 						vm_pageout_rejected_bq_external++;
3621 					}
3622 				}
3623 #endif /* DEVELOPMENT || DEBUG */
3624 
3625 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3626 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3627 				}
3628 				vm_pageout_state.vm_pageout_inactive_used++;
3629 
3630 				lock_yield_check = TRUE;
3631 				continue;
3632 			}
3633 			/*
3634 			 * Make sure we call pmap_get_refmod() if it
3635 			 * wasn't already called just above, to update
3636 			 * the dirty bit.
3637 			 */
3638 			if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3639 				refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3640 				if (refmod_state & VM_MEM_MODIFIED) {
3641 					SET_PAGE_DIRTY(m, FALSE);
3642 				}
3643 			}
3644 		}
3645 
3646 		/*
3647 		 * we've got a candidate page to steal...
3648 		 *
3649 		 * m->vmp_dirty is up to date courtesy of the
3650 		 * preceding check for m->vmp_reference... if
3651 		 * we get here, then m->vmp_reference had to be
3652 		 * FALSE (or possibly "reactivate_limit" was
3653 		 * exceeded), but in either case we called
3654 		 * pmap_get_refmod() and updated both
3655 		 * m->vmp_reference and m->vmp_dirty
3656 		 *
3657 		 * if it's dirty or precious we need to
3658 		 * see if the target queue is throttled...
3659 		 * if it is, we need to skip over it by moving it back
3660 		 * to the end of the inactive queue
3661 		 */
3662 
3663 		inactive_throttled = FALSE;
3664 
3665 		if (m->vmp_dirty || m->vmp_precious) {
3666 			if (object->internal) {
3667 				if (VM_PAGE_Q_THROTTLED(iq)) {
3668 					inactive_throttled = TRUE;
3669 				}
3670 			} else if (VM_PAGE_Q_THROTTLED(eq)) {
3671 				inactive_throttled = TRUE;
3672 			}
3673 		}
3674 throttle_inactive:
3675 		if (!VM_DYNAMIC_PAGING_ENABLED() &&
3676 		    object->internal && m->vmp_dirty &&
3677 		    (object->purgable == VM_PURGABLE_DENY ||
3678 		    object->purgable == VM_PURGABLE_NONVOLATILE ||
3679 		    object->purgable == VM_PURGABLE_VOLATILE)) {
3680 			vm_page_check_pageable_safe(m);
3681 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3682 			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3683 			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3684 			vm_page_throttled_count++;
3685 
3686 			VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3687 
3688 			inactive_burst_count = 0;
3689 
3690 			lock_yield_check = TRUE;
3691 			continue;
3692 		}
3693 		if (inactive_throttled == TRUE) {
3694 			vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3695 			    &force_anonymous, page_from_bg_q);
3696 
3697 			inactive_burst_count = 0;
3698 
3699 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3700 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3701 			}
3702 
3703 			lock_yield_check = TRUE;
3704 			continue;
3705 		}
3706 
3707 		/*
3708 		 * we've got a page that we can steal...
3709 		 * eliminate all mappings and make sure
3710 		 * we have the up-to-date modified state
3711 		 *
3712 		 * if we need to do a pmap_disconnect then we
3713 		 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3714 		 * provides the true state atomically... the
3715 		 * page was still mapped up to the pmap_disconnect
3716 		 * and may have been dirtied at the last microsecond
3717 		 *
3718 		 * Note that if 'pmapped' is FALSE then the page is not
3719 		 * and has not been in any map, so there is no point calling
3720 		 * pmap_disconnect().  m->vmp_dirty could have been set in anticipation
3721 		 * of likely usage of the page.
3722 		 */
3723 		if (m->vmp_pmapped == TRUE) {
3724 			int pmap_options;
3725 
3726 			/*
3727 			 * Don't count this page as going into the compressor
3728 			 * if any of these are true:
3729 			 * 1) compressed pager isn't enabled
3730 			 * 2) Freezer enabled device with compressed pager
3731 			 *    backend (exclusive use) i.e. most of the VM system
3732 			 *    (including vm_pageout_scan) has no knowledge of
3733 			 *    the compressor
3734 			 * 3) This page belongs to a file and hence will not be
3735 			 *    sent into the compressor
3736 			 */
3737 			if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3738 			    object->internal == FALSE) {
3739 				pmap_options = 0;
3740 			} else if (m->vmp_dirty || m->vmp_precious) {
3741 				/*
3742 				 * VM knows that this page is dirty (or
3743 				 * precious) and needs to be compressed
3744 				 * rather than freed.
3745 				 * Tell the pmap layer to count this page
3746 				 * as "compressed".
3747 				 */
3748 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
3749 			} else {
3750 				/*
3751 				 * VM does not know if the page needs to
3752 				 * be preserved but the pmap layer might tell
3753 				 * us if any mapping has "modified" it.
3754 				 * Let the pmap layer count this page
3755 				 * as compressed if and only if it has been
3756 				 * modified.
3757 				 */
3758 				pmap_options =
3759 				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3760 			}
3761 			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3762 			    pmap_options,
3763 			    NULL);
3764 			if (refmod_state & VM_MEM_MODIFIED) {
3765 				SET_PAGE_DIRTY(m, FALSE);
3766 			}
3767 		}
3768 
3769 		/*
3770 		 * reset our count of pages that have been reclaimed
3771 		 * since the last page was 'stolen'
3772 		 */
3773 		inactive_reclaim_run = 0;
3774 
3775 		/*
3776 		 *	If it's clean and not precious, we can free the page.
3777 		 */
3778 		if (!m->vmp_dirty && !m->vmp_precious) {
3779 			vm_pageout_state.vm_pageout_inactive_clean++;
3780 
3781 			/*
3782 			 * OK, at this point we have found a page we are going to free.
3783 			 */
3784 #if CONFIG_PHANTOM_CACHE
3785 			if (!object->internal) {
3786 				vm_phantom_cache_add_ghost(m);
3787 			}
3788 #endif
3789 			goto reclaim_page;
3790 		}
3791 
3792 		/*
3793 		 * The page may have been dirtied since the last check
3794 		 * for a throttled target queue (which may have been skipped
3795 		 * if the page was clean then).  With the dirty page
3796 		 * disconnected here, we can make one final check.
3797 		 */
3798 		if (object->internal) {
3799 			if (VM_PAGE_Q_THROTTLED(iq)) {
3800 				inactive_throttled = TRUE;
3801 			}
3802 		} else if (VM_PAGE_Q_THROTTLED(eq)) {
3803 			inactive_throttled = TRUE;
3804 		}
3805 
3806 		if (inactive_throttled == TRUE) {
3807 			goto throttle_inactive;
3808 		}
3809 #if !CONFIG_JETSAM
3810 		memorystatus_update_available_page_count(AVAILABLE_NON_COMPRESSED_MEMORY);
3811 #endif /* !CONFIG_JETSAM */
3812 
3813 		if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3814 			VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3815 		}
3816 
3817 		if (object->internal) {
3818 			vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3819 		} else {
3820 			vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3821 		}
3822 
3823 		/*
3824 		 * internal pages will go to the compressor...
3825 		 * external pages will go to the appropriate pager to be cleaned
3826 		 * and upon completion will end up on 'vm_page_queue_cleaned' which
3827 		 * is a preferred queue to steal from
3828 		 */
3829 		vm_pageout_cluster(m);
3830 		inactive_burst_count = 0;
3831 
3832 		/*
3833 		 * back to top of pageout scan loop
3834 		 */
3835 	}
3836 }
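
/*
 * Rough shape of the scan loop above (informal pseudocode, not exhaustive):
 *
 *	for (;;) {
 *		occasionally drop the queue locks / yield;
 *		vm_page_balance_inactive();                    // active -> inactive
 *		if (free + locally freed >= target && nobody is waiting)
 *			return;                                // targets met
 *		try to purge a ripe volatile object;
 *		age speculative queues / evict cached objects if needed;
 *		apply flow control (may block, may return);
 *		pick a victim from the inactive/speculative/background queues;
 *		if (page is clean and not precious)
 *			free it;
 *		else
 *			vm_pageout_cluster(m);                 // compressor or pager
 *	}
 */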
3837 
3838 
3839 void
3840 vm_page_free_reserve(
3841 	int pages)
3842 {
3843 	int             free_after_reserve;
3844 
3845 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3846 		if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3847 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3848 		} else {
3849 			vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3850 		}
3851 	} else {
3852 		if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3853 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3854 		} else {
3855 			vm_page_free_reserved += pages;
3856 		}
3857 	}
3858 	free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3859 
3860 	vm_page_free_min = vm_page_free_reserved +
3861 	    VM_PAGE_FREE_MIN(free_after_reserve);
3862 
3863 	if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3864 		vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3865 	}
3866 
3867 	vm_page_free_target = vm_page_free_reserved +
3868 	    VM_PAGE_FREE_TARGET(free_after_reserve);
3869 
3870 	if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3871 		vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3872 	}
3873 
3874 	if (vm_page_free_target < vm_page_free_min + 5) {
3875 		vm_page_free_target = vm_page_free_min + 5;
3876 	}
3877 
3878 	vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3879 }
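
/*
 * Informal example of the resulting ordering (assuming the VM_PAGE_FREE_MIN /
 * VM_PAGE_FREE_TARGET macros yield non-negative values, as they normally do):
 *
 *	vm_page_free_reserved <= vm_page_free_min < vm_page_free_target
 *
 * since vm_page_free_target is forced to at least vm_page_free_min + 5 above,
 * and vm_page_throttle_limit lands at roughly half of vm_page_free_target.
 */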
3880 
3881 /*
3882  *	vm_pageout is the high level pageout daemon.
3883  */
3884 
3885 void
3886 vm_pageout_continue(void)
3887 {
3888 	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3889 	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3890 
3891 	vm_free_page_lock();
3892 	vm_pageout_running = TRUE;
3893 	vm_free_page_unlock();
3894 
3895 	vm_pageout_scan();
3896 	/*
3897 	 * we hold both the vm_page_queue_free_lock
3898 	 * and the vm_page_queues_lock at this point
3899 	 */
3900 	assert(vm_page_free_wanted == 0);
3901 	assert(vm_page_free_wanted_privileged == 0);
3902 	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3903 
3904 	vm_pageout_running = FALSE;
3905 #if XNU_TARGET_OS_OSX
3906 	if (vm_pageout_waiter) {
3907 		vm_pageout_waiter = FALSE;
3908 		thread_wakeup((event_t)&vm_pageout_waiter);
3909 	}
3910 #endif /* XNU_TARGET_OS_OSX */
3911 
3912 	vm_free_page_unlock();
3913 	vm_page_unlock_queues();
3914 
3915 	thread_block((thread_continue_t)vm_pageout_continue);
3916 	/*NOTREACHED*/
3917 }
3918 
3919 #if XNU_TARGET_OS_OSX
3920 kern_return_t
3921 vm_pageout_wait(uint64_t deadline)
3922 {
3923 	kern_return_t kr;
3924 
3925 	vm_free_page_lock();
3926 	for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
3927 		vm_pageout_waiter = TRUE;
3928 		if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3929 			    &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3930 			    (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3931 			kr = KERN_OPERATION_TIMED_OUT;
3932 		}
3933 	}
3934 	vm_free_page_unlock();
3935 
3936 	return kr;
3937 }
3938 #endif /* XNU_TARGET_OS_OSX */
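
/*
 * Usage sketch for vm_pageout_wait() (the 1-second deadline is illustrative):
 *
 *	uint64_t deadline;
 *
 *	clock_interval_to_deadline(1, NSEC_PER_SEC, &deadline);
 *	if (vm_pageout_wait(deadline) == KERN_OPERATION_TIMED_OUT) {
 *		// a pageout pass was still running when the deadline expired
 *	}
 */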
3939 
3940 OS_NORETURN
3941 static void
3942 vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
3943 {
3944 	vm_page_t       m = NULL;
3945 	vm_object_t     object;
3946 	vm_object_offset_t offset;
3947 	memory_object_t pager;
3948 	struct vm_pageout_queue *q = ethr->q;
3949 
3950 	/* On systems with a compressor, the external IO thread clears its
3951 	 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3952 	 * creation)
3953 	 */
3954 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3955 		current_thread()->options &= ~TH_OPT_VMPRIV;
3956 	}
3957 
3958 	sched_cond_ack(&(ethr->pgo_wakeup));
3959 
3960 	while (true) {
3961 		vm_page_lockspin_queues();
3962 
3963 		while (!vm_page_queue_empty(&q->pgo_pending)) {
3964 			q->pgo_busy = TRUE;
3965 			vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3966 
3967 			assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3968 			VM_PAGE_CHECK(m);
3969 			/*
3970 			 * grab a snapshot of the object and offset this
3971 			 * page is tabled in so that we can relookup this
3972 			 * page after we've taken the object lock - these
3973 			 * fields are stable while we hold the page queues lock
3974 			 * but as soon as we drop it, there is nothing to keep
3975 			 * this page in this object... we hold an activity_in_progress
3976 			 * on this object which will keep it from terminating
3977 			 */
3978 			object = VM_PAGE_OBJECT(m);
3979 			offset = m->vmp_offset;
3980 
3981 			m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3982 			VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3983 
3984 			vm_page_unlock_queues();
3985 
3986 			vm_object_lock(object);
3987 
3988 			m = vm_page_lookup(object, offset);
3989 
3990 			if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
3991 			    !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
3992 				/*
3993 				 * it's either the same page that someone else has
3994 				 * started cleaning (or it's finished cleaning or
3995 				 * been put back on the pageout queue), or
3996 				 * the page has been freed or we have found a
3997 				 * new page at this offset... in all of these cases
3998 				 * we merely need to release the activity_in_progress
3999 				 * we took when we put the page on the pageout queue
4000 				 */
4001 				vm_object_activity_end(object);
4002 				vm_object_unlock(object);
4003 
4004 				vm_page_lockspin_queues();
4005 				continue;
4006 			}
4007 			pager = object->pager;
4008 
4009 			if (pager == MEMORY_OBJECT_NULL) {
4010 				/*
4011 				 * This pager has been destroyed by either
4012 				 * memory_object_destroy or vm_object_destroy, and
4013 				 * so there is nowhere for the page to go.
4014 				 */
4015 				if (m->vmp_free_when_done) {
4016 					/*
4017 					 * Just free the page... VM_PAGE_FREE takes
4018 					 * care of cleaning up all the state...
4019 					 * including doing the vm_pageout_throttle_up
4020 					 */
4021 					VM_PAGE_FREE(m);
4022 				} else {
4023 					vm_page_lockspin_queues();
4024 
4025 					vm_pageout_throttle_up(m);
4026 					vm_page_activate(m);
4027 
4028 					vm_page_unlock_queues();
4029 
4030 					/*
4031 					 *	And we are done with it.
4032 					 */
4033 				}
4034 				vm_object_activity_end(object);
4035 				vm_object_unlock(object);
4036 
4037 				vm_page_lockspin_queues();
4038 				continue;
4039 			}
4040 	#if 0
4041 			/*
4042 			 * we don't hold the page queue lock
4043 			 * so this check isn't safe to make
4044 			 */
4045 			VM_PAGE_CHECK(m);
4046 	#endif
4047 			/*
4048 			 * give back the activity_in_progress reference we
4049 			 * took when we queued up this page and replace it
4050 			 * with a paging_in_progress reference that will
4051 			 * also keep the paging offset from changing and
4052 			 * prevent the object from terminating
4053 			 */
4054 			vm_object_activity_end(object);
4055 			vm_object_paging_begin(object);
4056 			vm_object_unlock(object);
4057 
4058 			/*
4059 			 * Send the data to the pager.
4060 			 * any pageout clustering happens there
4061 			 */
4062 			memory_object_data_return(pager,
4063 			    m->vmp_offset + object->paging_offset,
4064 			    PAGE_SIZE,
4065 			    NULL,
4066 			    NULL,
4067 			    FALSE,
4068 			    FALSE,
4069 			    0);
4070 
4071 			vm_object_lock(object);
4072 			vm_object_paging_end(object);
4073 			vm_object_unlock(object);
4074 
4075 			vm_pageout_io_throttle();
4076 
4077 			vm_page_lockspin_queues();
4078 		}
4079 		q->pgo_busy = FALSE;
4080 
4081 		vm_page_unlock_queues();
4082 		sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
4083 	}
4084 	/*NOTREACHED*/
4085 }
4086 
4087 uint32_t vm_compressor_time_thread; /* Set via sysctl 'vm.compressor_timing_enabled' to record time accrued by this thread. */
4088 
4089 #if DEVELOPMENT || DEBUG
4090 static void
4091 vm_pageout_record_thread_time(int cqid, int ncomps)
4092 {
4093 	if (__improbable(vm_compressor_time_thread)) {
4094 		vmct_stats.vmct_runtimes[cqid] = thread_get_runtime_self();
4095 		vmct_stats.vmct_pages[cqid] += ncomps;
4096 		vmct_stats.vmct_iterations[cqid]++;
4097 		if (ncomps > vmct_stats.vmct_maxpages[cqid]) {
4098 			vmct_stats.vmct_maxpages[cqid] = ncomps;
4099 		}
4100 		if (ncomps < vmct_stats.vmct_minpages[cqid]) {
4101 			vmct_stats.vmct_minpages[cqid] = ncomps;
4102 		}
4103 	}
4104 }
4105 #endif
4106 
4107 static void *
4108 vm_pageout_select_filling_chead(struct pgo_iothread_state *cq, vm_page_t m)
4109 {
4110 	/*
4111 	 * Technically we need the pageq locks to manipulate the vmp_on_specialq field.
4112 	 * However, this page has been removed from all queues and is only
4113 	 * known to this compressor thread dealing with this local queue.
4114 	 *
4115 	 * TODO: Add a second localq that is the early localq and
4116 	 * put special pages like this one on that queue in the block above
4117 	 * under the pageq lock to avoid this 'works but not clean' logic.
4118 	 */
4119 	void *donate_queue_head;
4120 #if XNU_TARGET_OS_OSX /* tag:DONATE */
4121 	donate_queue_head = &cq->current_early_swapout_chead;
4122 #else /* XNU_TARGET_OS_OSX */
4123 	donate_queue_head = &cq->current_late_swapout_chead;
4124 #endif /* XNU_TARGET_OS_OSX */
4125 	if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4126 		m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4127 		return donate_queue_head;
4128 	} else {
4129 		return &cq->current_regular_swapout_chead;
4130 	}
4131 }
4132 
4133 #define         MAX_FREE_BATCH          32
4134 
4135 OS_NORETURN
4136 static void
4137 vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
4138 {
4139 	struct vm_pageout_queue *q;
4140 	vm_page_t       m = NULL;
4141 	boolean_t       pgo_draining;
4142 	vm_page_t   local_q;
4143 	int         local_cnt;
4144 	vm_page_t   local_freeq = NULL;
4145 	int         local_freed = 0;
4146 	int         local_batch_size;
4147 #if DEVELOPMENT || DEBUG
4148 	int       ncomps = 0;
4149 	boolean_t marked_active = FALSE;
4150 	int       num_pages_processed = 0;
4151 #endif
4152 	void *chead = NULL;
4153 
4154 	KDBG_FILTERED(0xe040000c | DBG_FUNC_END);
4155 
4156 	sched_cond_ack(&(cq->pgo_wakeup));
4157 
4158 	q = cq->q;
4159 
4160 	while (true) { /* this top loop is for the compressor_running_perf_test running at full speed without blocking */
4161 #if DEVELOPMENT || DEBUG
4162 		bool benchmark_accounting = false;
4163 		/* If we're running the compressor perf test, only process the benchmark pages.
4164 		 * We'll get back to our regular queue once the benchmark is done */
4165 		if (compressor_running_perf_test) {
4166 			q = cq->benchmark_q;
4167 			if (!vm_page_queue_empty(&q->pgo_pending)) {
4168 				benchmark_accounting = true;
4169 			} else {
4170 				q = cq->q;
4171 				benchmark_accounting = false;
4172 			}
4173 		}
4174 #endif /* DEVELOPMENT || DEBUG */
4175 
4176 #if __AMP__
4177 		if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
4178 			local_batch_size = (q->pgo_maxlaundry >> 3);
4179 			local_batch_size = MAX(local_batch_size, 16);
4180 		} else {
4181 			local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4182 		}
4183 #else
4184 		local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4185 #endif
4186 
4187 #if RECORD_THE_COMPRESSED_DATA
4188 		if (q->pgo_laundry) {
4189 			c_compressed_record_init();
4190 		}
4191 #endif
4192 		while (true) { /* this loop is for working through all the pages in the pending queue */
4193 			int     pages_left_on_q = 0;
4194 
4195 			local_cnt = 0;
4196 			local_q = NULL;
4197 
4198 			KDBG_FILTERED(0xe0400014 | DBG_FUNC_START);
4199 
4200 			vm_page_lock_queues();
4201 #if DEVELOPMENT || DEBUG
4202 			if (marked_active == FALSE) {
4203 				vmct_active++;
4204 				vmct_state[cq->id] = VMCT_ACTIVE;
4205 				marked_active = TRUE;
4206 				if (vmct_active == 1) {
4207 					vm_compressor_epoch_start = mach_absolute_time();
4208 				}
4209 			}
4210 #endif
4211 			KDBG_FILTERED(0xe0400014 | DBG_FUNC_END);
4212 
4213 			KDBG_FILTERED(0xe0400018 | DBG_FUNC_START, q->pgo_laundry);
4214 
4215 			/* empty the entire content of the thread input q to local_q, but not more than local_batch_size pages */
4216 			while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4217 				vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4218 				assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4219 				VM_PAGE_CHECK(m);
4220 
4221 				m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4222 				VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4223 				m->vmp_laundry = FALSE;
4224 
4225 				m->vmp_snext = local_q;
4226 				local_q = m;
4227 				local_cnt++;
4228 			}
4229 			if (local_q == NULL) {
4230 				break;
4231 			}
4232 
4233 			q->pgo_busy = TRUE;
4234 
4235 			if ((pgo_draining = q->pgo_draining) == FALSE) {
4236 				vm_pageout_throttle_up_batch(q, local_cnt);
4237 				pages_left_on_q = q->pgo_laundry;
4238 			} else {
4239 				pages_left_on_q = q->pgo_laundry - local_cnt;
4240 			}
4241 
4242 			vm_page_unlock_queues();
4243 
4244 #if !RECORD_THE_COMPRESSED_DATA
4245 			/* if we have lots to compress, wake up the other thread to help.
4246 			 * disabled when recording data since the recorded data is not protected by a mutex, so this may cause races */
4247 			if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4248 				// wake up the next compressor thread
4249 				sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
4250 				    pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4251 			}
4252 #endif
4253 			KDBG_FILTERED(0xe0400018 | DBG_FUNC_END, q->pgo_laundry);
4254 
4255 			while (local_q) {
4256 				KDBG_FILTERED(0xe0400024 | DBG_FUNC_START, local_cnt);
4257 
4258 				m = local_q;
4259 				local_q = m->vmp_snext;
4260 				m->vmp_snext = NULL;
4261 
4262 
4263 				chead = vm_pageout_select_filling_chead(cq, m);
4264 
4265 				if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4266 #if DEVELOPMENT || DEBUG
4267 					ncomps++;
4268 #endif
4269 					KDBG_FILTERED(0xe0400024 | DBG_FUNC_END, local_cnt);
4270 
4271 					m->vmp_snext = local_freeq;
4272 					local_freeq = m;
4273 					local_freed++;
4274 
4275 					/* if we gathered enough free pages, free them now */
4276 					if (local_freed >= MAX_FREE_BATCH) {
4277 						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4278 
4279 						vm_page_free_list(local_freeq, TRUE);
4280 
4281 						local_freeq = NULL;
4282 						local_freed = 0;
4283 					}
4284 				}
4285 #if DEVELOPMENT || DEBUG
4286 				num_pages_processed++;
4287 #endif /* DEVELOPMENT || DEBUG */
4288 #if !CONFIG_JETSAM /* Maybe: if there's no JETSAM, be more proactive in waking up anybody that needs free pages */
4289 				while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4290 					kern_return_t   wait_result;
4291 					int             need_wakeup = 0;
4292 
4293 					if (local_freeq) {
4294 						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4295 
4296 						vm_page_free_list(local_freeq, TRUE);
4297 						local_freeq = NULL;
4298 						local_freed = 0;
4299 
4300 						continue;
4301 					}
4302 					vm_free_page_lock_spin();
4303 
4304 					if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4305 						if (vm_page_free_wanted_privileged++ == 0) {
4306 							need_wakeup = 1;
4307 						}
4308 						wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4309 
4310 						vm_free_page_unlock();
4311 
4312 						if (need_wakeup) {
4313 							thread_wakeup((event_t)&vm_page_free_wanted);
4314 						}
4315 
4316 						if (wait_result == THREAD_WAITING) {
4317 							thread_block(THREAD_CONTINUE_NULL);
4318 						}
4319 					} else {
4320 						vm_free_page_unlock();
4321 					}
4322 				}
4323 #endif
4324 			}  /* while (local_q) */
4325 			/* free any leftovers in the freeq */
4326 			if (local_freeq) {
4327 				OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4328 
4329 				vm_page_free_list(local_freeq, TRUE);
4330 				local_freeq = NULL;
4331 				local_freed = 0;
4332 			}
4333 			if (pgo_draining == TRUE) {
4334 				vm_page_lockspin_queues();
4335 				vm_pageout_throttle_up_batch(q, local_cnt);
4336 				vm_page_unlock_queues();
4337 			}
4338 		}
4339 		KDBG_FILTERED(0xe040000c | DBG_FUNC_START);
4340 
4341 		/*
4342 		 * queue lock is held and our q is empty
4343 		 */
4344 		q->pgo_busy = FALSE;
4345 #if DEVELOPMENT || DEBUG
4346 		if (marked_active == TRUE) {
4347 			vmct_active--;
4348 			vmct_state[cq->id] = VMCT_IDLE;
4349 
4350 			if (vmct_active == 0) {
4351 				vm_compressor_epoch_stop = mach_absolute_time();
4352 				assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4353 				    "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4354 				    vm_compressor_epoch_start, vm_compressor_epoch_stop);
4355 				/* This interval includes intervals where one or more
4356 				 * compressor threads were pre-empted
4357 				 */
4358 				vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4359 			}
4360 		}
4361 		if (compressor_running_perf_test && benchmark_accounting) {
4362 			/*
4363 			 * We could turn ON compressor_running_perf_test while still processing
4364 			 * regular non-benchmark pages. We shouldn't count them here else we
4365 			 * could overshoot. We might also still be populating that benchmark Q
4366 			 * and be under pressure. So we will go back to the regular queues. And
4367 			 * benchmark accounting will be off for that case too.
4368 			 */
4369 			compressor_perf_test_pages_processed += num_pages_processed;
4370 			thread_wakeup(&compressor_perf_test_pages_processed);
4371 		}
4372 #endif
4373 		vm_page_unlock_queues();
4374 #if DEVELOPMENT || DEBUG
4375 		vm_pageout_record_thread_time(cq->id, ncomps);
4376 #endif
4377 
4378 		KDBG_FILTERED(0xe0400018 | DBG_FUNC_END);
4379 #if DEVELOPMENT || DEBUG
4380 		if (compressor_running_perf_test && benchmark_accounting) {
4381 			/*
4382 			 * We've been exclusively compressing pages from the benchmark queue,
4383 			 * do 1 pass over the internal queue before blocking.
4384 			 */
4385 			continue;
4386 		}
4387 #endif
4388 
4389 		sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4390 	}
4391 	/*NOTREACHED*/
4392 }
4393 
4394 /* resolves the pager and maintains stats in the pager and in the vm_object */
4395 kern_return_t
4396 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4397 {
4398 	vm_object_t     object;
4399 	memory_object_t pager;
4400 	int             compressed_count_delta;
4401 	kern_return_t   retval;
4402 
4403 	object = VM_PAGE_OBJECT(m);
4404 
4405 	assert(!m->vmp_free_when_done);
4406 	assert(!m->vmp_laundry);
4407 
4408 	pager = object->pager;
4409 
4410 	if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4411 		KDBG_FILTERED(0xe0400010 | DBG_FUNC_START, object, pager);
4412 
4413 		vm_object_lock(object);
4414 
4415 		/*
4416 		 * If there is no memory object for the page, create
4417 		 * one and hand it to the compression pager.
4418 		 */
4419 
4420 		if (!object->pager_initialized) {
4421 			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4422 		}
4423 		if (!object->pager_initialized) {
4424 			vm_object_compressor_pager_create(object);
4425 		}
4426 
4427 		pager = object->pager;
4428 
4429 		if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4430 			/*
4431 			 * Still no pager for the object,
4432 			 * or the pager has been destroyed.
4433 			 * Reactivate the page.
4434 			 *
4435 			 * Should only happen if there is no
4436 			 * compression pager
4437 			 */
4438 			vm_page_wakeup_done(object, m);
4439 
4440 			vm_page_lockspin_queues();
4441 			vm_page_activate(m);
4442 			VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4443 			vm_page_unlock_queues();
4444 
4445 			/*
4446 			 *	And we are done with it.
4447 			 */
4448 			vm_object_activity_end(object);
4449 			vm_object_unlock(object);
4450 
4451 			return KERN_FAILURE;
4452 		}
4453 		vm_object_unlock(object);
4454 
4455 		KDBG_FILTERED(0xe0400010 | DBG_FUNC_END, object, pager);
4456 	}
4457 	assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4458 	assert(object->activity_in_progress > 0);
4459 
4460 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4461 	if (m->vmp_unmodified_ro == true) {
4462 		os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed);
4463 	}
4464 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4465 
4466 	vm_compressor_options_t flags = 0;
4467 
4468 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4469 	if (m->vmp_unmodified_ro) {
4470 		flags |= C_PAGE_UNMODIFIED;
4471 	}
4472 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4473 
4474 
4475 	retval = vm_compressor_pager_put(
4476 		pager,
4477 		m->vmp_offset + object->paging_offset,
4478 		VM_PAGE_GET_PHYS_PAGE(m),
4479 		current_chead,
4480 		scratch_buf,
4481 		&compressed_count_delta,
4482 		flags);
4483 
4484 	vm_object_lock(object);
4485 
4486 	assert(object->activity_in_progress > 0);
4487 	assert(VM_PAGE_OBJECT(m) == object);
4488 	assert( !VM_PAGE_WIRED(m));
4489 
4490 	vm_compressor_pager_count(pager,
4491 	    compressed_count_delta,
4492 	    FALSE,                       /* shared_lock */
4493 	    object);
4494 
4495 	if (retval == KERN_SUCCESS) {
4496 		/*
4497 		 * If the object is purgeable, its owner's
4498 		 * purgeable ledgers will be updated in
4499 		 * vm_page_remove() but the page still
4500 		 * contributes to the owner's memory footprint,
4501 		 * so account for it as such.
4502 		 */
4503 		if (m->vmp_tabled) {
4504 			vm_page_remove(m, TRUE);
4505 		}
4506 		if ((object->purgable != VM_PURGABLE_DENY ||
4507 		    object->vo_ledger_tag) &&
4508 		    object->vo_owner != NULL) {
4509 			/* one more compressed purgeable/tagged page */
4510 			vm_object_owner_compressed_update(object,
4511 			    compressed_count_delta);
4512 		}
4513 		counter_inc(&vm_statistics_compressions);
4514 	} else {
4515 		vm_page_wakeup_done(object, m);
4516 
4517 		vm_page_lockspin_queues();
4518 
4519 		vm_page_activate(m);
4520 		vm_pageout_vminfo.vm_compressor_failed++;
4521 
4522 		vm_page_unlock_queues();
4523 	}
4524 	vm_object_activity_end(object);
4525 	vm_object_unlock(object);
4526 
4527 	return retval;
4528 }
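
/*
 * Caller-side sketch (mirrors vm_pageout_iothread_internal_continue above):
 *
 *	if (vm_pageout_compress_page(chead, scratch_buf, m) == KERN_SUCCESS) {
 *		// page contents now live in the compressor; free the vm_page
 *		m->vmp_snext = local_freeq;
 *		local_freeq = m;
 *	}
 *	// on failure the page has already been reactivated for us
 */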
4529 
4530 
4531 static void
4532 vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4533 {
4534 	uint32_t        policy;
4535 
4536 	if (hibernate_cleaning_in_progress == TRUE) {
4537 		req_lowpriority = FALSE;
4538 	}
4539 
4540 	if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4541 		vm_page_unlock_queues();
4542 
4543 		if (req_lowpriority == TRUE) {
4544 			policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4545 			DTRACE_VM(laundrythrottle);
4546 		} else {
4547 			policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4548 			DTRACE_VM(laundryunthrottle);
4549 		}
4550 		proc_set_thread_policy(ethr->pgo_iothread,
4551 		    TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4552 
4553 		vm_page_lock_queues();
4554 		ethr->q->pgo_lowpriority = req_lowpriority;
4555 	}
4556 }
4557 
4558 OS_NORETURN
4559 static void
4560 vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
4561 {
4562 	thread_t        self = current_thread();
4563 
4564 	self->options |= TH_OPT_VMPRIV;
4565 
4566 	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4567 
4568 	proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4569 	    TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4570 
4571 	vm_page_lock_queues();
4572 
4573 	vm_pageout_queue_external.pgo_lowpriority = TRUE;
4574 	vm_pageout_queue_external.pgo_inited = TRUE;
4575 
4576 	vm_page_unlock_queues();
4577 
4578 #if CONFIG_THREAD_GROUPS
4579 	thread_group_vm_add();
4580 #endif /* CONFIG_THREAD_GROUPS */
4581 
4582 	vm_pageout_iothread_external_continue(ethr, 0);
4583 	/*NOTREACHED*/
4584 }
4585 
4586 
4587 OS_NORETURN
4588 static void
4589 vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
4590 {
4591 	thread_t        self = current_thread();
4592 
4593 	self->options |= TH_OPT_VMPRIV;
4594 
4595 	vm_page_lock_queues();
4596 
4597 	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4598 	vm_pageout_queue_internal.pgo_inited = TRUE;
4599 
4600 #if DEVELOPMENT || DEBUG
4601 	vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
4602 	vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
4603 	vm_pageout_queue_benchmark.pgo_busy = FALSE;
4604 #endif /* DEVELOPMENT || DEBUG */
4605 
4606 	vm_page_unlock_queues();
4607 
4608 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4609 		thread_vm_bind_group_add();
4610 	}
4611 
4612 #if CONFIG_THREAD_GROUPS
4613 	thread_group_vm_add();
4614 #endif /* CONFIG_THREAD_GROUPS */
4615 
4616 #if __AMP__
4617 	if (vm_compressor_ebound) {
4618 		/*
4619 		 * Use the soft bound option for vm_compressor to allow it to run on
4620 		 * P-cores if E-cluster is unavailable.
4621 		 */
4622 		thread_soft_bind_cluster_type(self, 'E');
4623 	}
4624 #endif /* __AMP__ */
4625 
4626 	thread_set_thread_name(current_thread(), "VM_compressor");
4627 #if DEVELOPMENT || DEBUG
4628 	vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
4629 #endif
4630 	vm_pageout_iothread_internal_continue(cthr, 0);
4631 
4632 	/*NOTREACHED*/
4633 }
4634 
4635 kern_return_t
4636 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4637 {
4638 	if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4639 		return KERN_SUCCESS;
4640 	} else {
4641 		return KERN_FAILURE; /* Already set */
4642 	}
4643 }
4644 
4645 extern boolean_t        memorystatus_manual_testing_on;
4646 extern unsigned int     memorystatus_level;
4647 
4648 
4649 #if VM_PRESSURE_EVENTS
4650 
4651 boolean_t vm_pressure_events_enabled = FALSE;
4652 
4653 extern uint64_t next_warning_notification_sent_at_ts;
4654 extern uint64_t next_critical_notification_sent_at_ts;
4655 
4656 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS    (30)    /* 30 minutes. */
4657 
4658 /*
4659  * The last time there was a change in pressure level OR we forced a check
4660  * because the system is stuck in a non-normal pressure level.
4661  */
4662 uint64_t  vm_pressure_last_level_transition_abs = 0;
4663 
4664 /*
4665  * This is how long the system waits 'stuck' in an unchanged non-normal pressure
4666  * level before re-sending notifications for that level again.
4667  */
4668 int  vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4669 
4670 void
4671 vm_pressure_response(void)
4672 {
4673 	vm_pressure_level_t     old_level = kVMPressureNormal;
4674 	int                     new_level = -1;
4675 	unsigned int            total_pages;
4676 	uint64_t                available_memory = 0;
4677 	uint64_t                curr_ts, abs_time_since_level_transition, time_in_ns;
4678 	bool                    force_check = false;
4679 	int                     time_in_mins;
4680 
4681 
4682 	if (vm_pressure_events_enabled == FALSE) {
4683 		return;
4684 	}
4685 
4686 	available_memory = (uint64_t) memorystatus_get_available_page_count();
4687 
4688 	total_pages = (unsigned int) atop_64(max_mem);
4689 #if CONFIG_SECLUDED_MEMORY
4690 	total_pages -= vm_page_secluded_count;
4691 #endif /* CONFIG_SECLUDED_MEMORY */
4692 	memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
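	/*
	 * Rough worked example (numbers are illustrative, assuming 4K pages):
	 * on a 4GB system (~1M pages) with ~250K available pages,
	 * memorystatus_level comes out to about (250K * 100) / 1M == 25.
	 */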
4693 
4694 	if (memorystatus_manual_testing_on) {
4695 		return;
4696 	}
4697 
4698 	curr_ts = mach_absolute_time();
4699 	abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4700 
4701 	absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4702 	time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4703 	force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
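	/*
	 * e.g. (numbers illustrative): a level unchanged for 45 minutes gives
	 * time_in_mins == 45 >= 30 (the default threshold), so force_check
	 * becomes true and the current non-normal level is re-notified below.
	 */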
4704 
4705 	old_level = memorystatus_vm_pressure_level;
4706 
4707 	switch (memorystatus_vm_pressure_level) {
4708 	case kVMPressureNormal:
4709 	{
4710 		if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4711 			new_level = kVMPressureCritical;
4712 		} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4713 			new_level = kVMPressureWarning;
4714 		}
4715 		break;
4716 	}
4717 
4718 	case kVMPressureWarning:
4719 	case kVMPressureUrgent:
4720 	{
4721 		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4722 			new_level = kVMPressureNormal;
4723 		} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4724 			new_level = kVMPressureCritical;
4725 		} else if (force_check) {
4726 			new_level = kVMPressureWarning;
4727 			next_warning_notification_sent_at_ts = curr_ts;
4728 		}
4729 		break;
4730 	}
4731 
4732 	case kVMPressureCritical:
4733 	{
4734 		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4735 			new_level = kVMPressureNormal;
4736 		} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4737 			new_level = kVMPressureWarning;
4738 		} else if (force_check) {
4739 			new_level = kVMPressureCritical;
4740 			next_critical_notification_sent_at_ts = curr_ts;
4741 		}
4742 		break;
4743 	}
4744 
4745 	default:
4746 		return;
4747 	}
4748 
4749 	if (new_level != -1 || force_check) {
4750 		if (new_level != -1) {
4751 			memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4752 
4753 			if (new_level != (int) old_level) {
4754 				VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4755 				    new_level, old_level, 0, 0);
4756 			}
4757 		} else {
4758 			VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4759 			    new_level, old_level, force_check, 0);
4760 		}
4761 
4762 		if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
4763 			/*
4764 			 * We don't want to schedule a wakeup while hibernation is in progress
4765 			 * because that could collide with checks for non-monotonicity in the scheduler.
4766 			 * We do however do all the updates to memorystatus_vm_pressure_level because
4767 			 * we _might_ want to use that for decisions regarding which pages or how
4768 			 * many pages we want to dump in hibernation.
4769 			 */
4770 			return;
4771 		}
4772 
4773 		if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4774 			if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4775 				thread_wakeup(&vm_pressure_thread);
4776 			}
4777 
4778 			if (old_level != memorystatus_vm_pressure_level) {
4779 				thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4780 			}
4781 			vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4782 		}
4783 	}
4784 }
4785 #endif /* VM_PRESSURE_EVENTS */
4786 
4787 
4788 /**
4789  * Called by a kernel thread to ask if a number of pages may be wired.
4790  */
4791 kern_return_t
4792 mach_vm_wire_level_monitor(int64_t requested_pages)
4793 {
4794 	if (requested_pages <= 0) {
4795 		return KERN_INVALID_ARGUMENT;
4796 	}
4797 
4798 	const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4799 	/**
4800 	 * Available pages can be negative in the case where more system memory is
4801 	 * wired than the threshold, so we must use a signed integer.
4802 	 */
4803 	const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4804 
4805 	if (requested_pages > available_pages) {
4806 		return KERN_RESOURCE_SHORTAGE;
4807 	}
4808 	return KERN_SUCCESS;
4809 }
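
/*
 * A minimal usage sketch: a hypothetical caller checking for headroom before
 * wiring a buffer. The helper name and page count below are assumptions for
 * illustration only.
 *
 *	static kern_return_t
 *	example_wire_buffer_check(void)
 *	{
 *		int64_t pages_needed = 256;	// hypothetical request size
 *
 *		// returns KERN_RESOURCE_SHORTAGE if wiring pages_needed more
 *		// pages would exceed vm_global_user_wire_limit
 *		return mach_vm_wire_level_monitor(pages_needed);
 *	}
 */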
4810 
4811 /*
4812  * Function called by a kernel thread to either get the current pressure level or
4813  * wait until memory pressure changes from a given level.
4814  */
4815 kern_return_t
4816 mach_vm_pressure_level_monitor(boolean_t wait_for_pressure, unsigned int *pressure_level)
4817 {
4818 #if !VM_PRESSURE_EVENTS
4819 	(void)wait_for_pressure;
4820 	(void)pressure_level;
4821 	return KERN_NOT_SUPPORTED;
4822 #else /* VM_PRESSURE_EVENTS */
4823 
4824 	uint32_t *waiters = NULL;
4825 	wait_result_t wr = 0;
4826 	vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4827 
4828 	if (pressure_level == NULL) {
4829 		return KERN_INVALID_ARGUMENT;
4830 	}
4831 	if (!wait_for_pressure && (*pressure_level == kVMPressureBackgroundJetsam ||
4832 	    *pressure_level == kVMPressureForegroundJetsam)) {
4833 		return KERN_INVALID_ARGUMENT;
4834 	}
4835 
4836 	if (wait_for_pressure) {
4837 		switch (*pressure_level) {
4838 		case kVMPressureForegroundJetsam:
4839 		case kVMPressureBackgroundJetsam:
4840 
4841 			if (*pressure_level == kVMPressureForegroundJetsam) {
4842 				waiters = &memorystatus_jetsam_fg_band_waiters;
4843 			} else {
4844 				/* kVMPressureBackgroundJetsam */
4845 				waiters = &memorystatus_jetsam_bg_band_waiters;
4846 			}
4847 
4848 			lck_mtx_lock(&memorystatus_jetsam_broadcast_lock);
4849 			wr = assert_wait((event_t)waiters, THREAD_INTERRUPTIBLE);
4850 			if (wr == THREAD_WAITING) {
4851 				*waiters += 1;
4852 				lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4853 				wr = thread_block(THREAD_CONTINUE_NULL);
4854 			} else {
4855 				lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4856 			}
4857 
4858 			if (wr != THREAD_AWAKENED) {
4859 				return KERN_ABORTED;
4860 			}
4861 
4862 			return KERN_SUCCESS;
4863 		case kVMPressureNormal:
4864 		case kVMPressureWarning:
4865 		case kVMPressureUrgent:
4866 		case kVMPressureCritical:
4867 			while (old_level == *pressure_level) {
4868 				wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4869 				    THREAD_INTERRUPTIBLE);
4870 				if (wr == THREAD_WAITING) {
4871 					wr = thread_block(THREAD_CONTINUE_NULL);
4872 				}
4873 				if (wr == THREAD_INTERRUPTED) {
4874 					return KERN_ABORTED;
4875 				}
4876 
4877 				if (wr == THREAD_AWAKENED) {
4878 					old_level = memorystatus_vm_pressure_level;
4879 				}
4880 			}
4881 			break;
4882 		default:
4883 			return KERN_INVALID_ARGUMENT;
4884 		}
4885 	}
4886 
4887 	*pressure_level = old_level;
4888 	return KERN_SUCCESS;
4889 #endif /* VM_PRESSURE_EVENTS */
4890 }
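
/*
 * A minimal usage sketch, assuming a kernel-thread caller that first reads the
 * current pressure level and then blocks until it changes; 'level' is a
 * hypothetical local.
 *
 *	unsigned int level = 0;
 *
 *	// non-blocking query: returns the current level in 'level'
 *	if (mach_vm_pressure_level_monitor(FALSE, &level) == KERN_SUCCESS) {
 *		// block until the level moves away from the value just read
 *		(void) mach_vm_pressure_level_monitor(TRUE, &level);
 *	}
 */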
4891 
4892 #if VM_PRESSURE_EVENTS
4893 void
4894 vm_pressure_thread(void)
4895 {
4896 	static boolean_t thread_initialized = FALSE;
4897 
4898 	if (thread_initialized == TRUE) {
4899 		vm_pageout_state.vm_pressure_thread_running = TRUE;
4900 		consider_vm_pressure_events();
4901 		vm_pageout_state.vm_pressure_thread_running = FALSE;
4902 	}
4903 
4904 #if CONFIG_THREAD_GROUPS
4905 	thread_group_vm_add();
4906 #endif /* CONFIG_THREAD_GROUPS */
4907 
4908 	thread_set_thread_name(current_thread(), "VM_pressure");
4909 	thread_initialized = TRUE;
4910 	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4911 	thread_block((thread_continue_t)vm_pressure_thread);
4912 }
4913 #endif /* VM_PRESSURE_EVENTS */
4914 
4915 
4916 /*
4917  * called once per second via "compute_averages"
4918  */
4919 void
4920 compute_pageout_gc_throttle(__unused void *arg)
4921 {
4922 	if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4923 		vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4924 		sched_cond_signal(&vm_pageout_gc_cond, vm_pageout_gc_thread);
4925 	}
4926 }
4927 
4928 /*
4929  * vm_pageout_garbage_collect can also be called when the zone allocator needs
4930  * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4931  * jetsams. We need to check if the zone map size is above its jetsam limit to
4932  * decide if this was indeed the case.
4933  *
4934  * We need to do this on a different thread for the following reasons:
4935  *
4936  * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4937  * itself causing the system to hang. We perform synchronous jetsams if we're
4938  * leaking in the VM map entries zone, so the leaking process could be doing a
4939  * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4940  * jetsam itself. We also need the vm_map lock on the process termination path,
4941  * which would now lead the dying process to deadlock against itself.
4942  *
4943  * 2. The jetsam path might need to allocate zone memory itself. We could try
4944  * using the non-blocking variant of zalloc for this path, but we can still
4945  * end up trying to do a kmem_alloc when the zone maps are almost full.
4946  */
4947 __dead2
4948 void
4949 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4950 {
4951 	assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4952 
4953 	if (step != VM_PAGEOUT_GC_INIT) {
4954 		sched_cond_ack(&vm_pageout_gc_cond);
4955 	}
4956 
4957 	while (true) {
4958 		if (step == VM_PAGEOUT_GC_INIT) {
4959 			/* first time being called is not about GC */
4960 #if CONFIG_THREAD_GROUPS
4961 			thread_group_vm_add();
4962 #endif /* CONFIG_THREAD_GROUPS */
4963 			step = VM_PAGEOUT_GC_COLLECT;
4964 		} else if (zone_map_nearing_exhaustion()) {
4965 			/*
4966 			 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4967 			 *
4968 			 * Bail out after calling zone_gc (which triggers the
4969 			 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4970 			 * operations that clear out a bunch of caches might allocate zone
4971 			 * memory themselves (e.g. vm_map operations would need VM map
4972 			 * entries). Since the zone map is almost full at this point, we
4973 			 * could end up with a panic. We just need to quickly jetsam a
4974 			 * process and exit here.
4975 			 *
4976 			 * It could so happen that we were woken up to relieve memory
4977 			 * pressure and the zone map also happened to be near its limit at
4978 			 * the time, in which case we'll skip out early. But that should be
4979 			 * ok; if memory pressure persists, the thread will simply be woken
4980 			 * up again.
4981 			 */
4982 
4983 			zone_gc(ZONE_GC_JETSAM);
4984 		} else {
4985 			/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4986 			boolean_t buf_large_zfree = FALSE;
4987 			boolean_t first_try = TRUE;
4988 
4989 			stack_collect();
4990 
4991 			consider_machine_collect();
4992 #if CONFIG_DEFERRED_RECLAIM
4993 			vm_deferred_reclamation_gc(RECLAIM_GC_TRIM, RECLAIM_OPTIONS_NONE);
4994 #endif /* CONFIG_DEFERRED_RECLAIM */
4995 #if CONFIG_MBUF_MCACHE
4996 			mbuf_drain(FALSE);
4997 #endif /* CONFIG_MBUF_MCACHE */
4998 
4999 			do {
5000 				if (consider_buffer_cache_collect != NULL) {
5001 					buf_large_zfree = (*consider_buffer_cache_collect)(0);
5002 				}
5003 				if (first_try == TRUE || buf_large_zfree == TRUE) {
5004 					/*
5005 					 * zone_gc should be last, because the other operations
5006 					 * might return memory to zones.
5007 					 */
5008 					zone_gc(ZONE_GC_TRIM);
5009 				}
5010 				first_try = FALSE;
5011 			} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
5012 
5013 			consider_machine_adjust();
5014 		}
5015 
5016 		sched_cond_wait_parameter(&vm_pageout_gc_cond, THREAD_UNINT, vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
5017 	}
5018 	__builtin_unreachable();
5019 }
5020 
5021 
5022 #if VM_PAGE_BUCKETS_CHECK
5023 #if VM_PAGE_FAKE_BUCKETS
5024 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
5025 #endif /* VM_PAGE_FAKE_BUCKETS */
5026 #endif /* VM_PAGE_BUCKETS_CHECK */
5027 
5028 
5029 
5030 void
5031 vm_set_restrictions(unsigned int num_cpus)
5032 {
5033 	int vm_restricted_to_single_processor = 0;
5034 
5035 	if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
5036 		kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
5037 		vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
5038 	} else {
5039 		assert(num_cpus > 0);
5040 
5041 		if (num_cpus <= 3) {
5042 			/*
5043 			 * On systems with a limited number of CPUs, bind the
5044 			 * 4 major threads that can free memory and that tend to use
5045 			 * a fair bit of CPU under pressured conditions to a single processor.
5046 			 * This ensures that these threads don't hog all of the available CPUs
5047 			 * (important for camera launch), while allowing them to run independently
5048 			 * with respect to locks... the 4 threads are
5049 			 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
5050 			 * vm_compressor_swap_trigger_thread (minor and major compactions),
5051 			 * and memorystatus_thread (jetsams).
5052 			 *
5053 			 * The first time each thread runs, it is responsible for checking the
5054 			 * state of vm_restricted_to_single_processor, and if TRUE it calls
5055 			 * thread_bind_master...  someday this should be replaced with a group
5056 			 * scheduling mechanism and KPI.
5057 			 */
5058 			vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5059 		} else {
5060 			vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5061 		}
5062 	}
5063 }
5064 
5065 /*
5066  * Set up vm_config based on the vm_compressor_mode.
5067  * Must run BEFORE the pageout thread starts up.
5068  */
5069 __startup_func
5070 void
5071 vm_config_init(void)
5072 {
5073 	bzero(&vm_config, sizeof(vm_config));
5074 
5075 	switch (vm_compressor_mode) {
5076 	case VM_PAGER_DEFAULT:
5077 		printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
5078 		OS_FALLTHROUGH;
5079 
5080 	case VM_PAGER_COMPRESSOR_WITH_SWAP:
5081 		vm_config.compressor_is_present = TRUE;
5082 		vm_config.swap_is_present = TRUE;
5083 		vm_config.compressor_is_active = TRUE;
5084 		vm_config.swap_is_active = TRUE;
5085 		break;
5086 
5087 	case VM_PAGER_COMPRESSOR_NO_SWAP:
5088 		vm_config.compressor_is_present = TRUE;
5089 		vm_config.swap_is_present = TRUE;
5090 		vm_config.compressor_is_active = TRUE;
5091 		break;
5092 
5093 	case VM_PAGER_FREEZER_DEFAULT:
5094 		printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5095 		OS_FALLTHROUGH;
5096 
5097 	case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5098 		vm_config.compressor_is_present = TRUE;
5099 		vm_config.swap_is_present = TRUE;
5100 		break;
5101 
5102 	case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5103 		vm_config.compressor_is_present = TRUE;
5104 		vm_config.swap_is_present = TRUE;
5105 		vm_config.compressor_is_active = TRUE;
5106 		vm_config.freezer_swap_is_active = TRUE;
5107 		break;
5108 
5109 	case VM_PAGER_NOT_CONFIGURED:
5110 		break;
5111 
5112 	default:
5113 		printf("unknown compressor mode - %x\n", vm_compressor_mode);
5114 		break;
5115 	}
5116 }
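
/*
 * For example: booting with vm_compressor_mode == VM_PAGER_COMPRESSOR_NO_SWAP
 * leaves vm_config.compressor_is_present, vm_config.swap_is_present and
 * vm_config.compressor_is_active TRUE while vm_config.swap_is_active stays
 * FALSE, i.e. the compressor runs but swap is not actively used.
 */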
5117 
5118 __startup_func
5119 static void
5120 vm_pageout_create_gc_thread(void)
5121 {
5122 	thread_t thread;
5123 
5124 	sched_cond_init(&vm_pageout_gc_cond);
5125 	if (kernel_thread_create(vm_pageout_garbage_collect,
5126 	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
5127 		panic("vm_pageout_garbage_collect: create failed");
5128 	}
5129 	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
5130 	if (thread->reserved_stack == 0) {
5131 		assert(thread->kernel_stack);
5132 		thread->reserved_stack = thread->kernel_stack;
5133 	}
5134 
5135 	/* thread is started in vm_pageout() */
5136 	vm_pageout_gc_thread = thread;
5137 }
5138 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5139 
5140 void
5141 vm_pageout(void)
5142 {
5143 	thread_t        self = current_thread();
5144 	thread_t        thread;
5145 	kern_return_t   result;
5146 	spl_t           s;
5147 
5148 	/*
5149 	 * Set thread privileges.
5150 	 */
5151 	s = splsched();
5152 
5153 #if CONFIG_VPS_DYNAMIC_PRIO
5154 	if (vps_dynamic_priority_enabled) {
5155 		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
5156 		thread_set_eager_preempt(self);
5157 	} else {
5158 		sched_set_kernel_thread_priority(self, BASEPRI_VM);
5159 	}
5160 #else /* CONFIG_VPS_DYNAMIC_PRIO */
5161 	sched_set_kernel_thread_priority(self, BASEPRI_VM);
5162 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
5163 
5164 	thread_lock(self);
5165 	self->options |= TH_OPT_VMPRIV;
5166 	thread_unlock(self);
5167 
5168 	if (!self->reserved_stack) {
5169 		self->reserved_stack = self->kernel_stack;
5170 	}
5171 
5172 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
5173 	    !vps_dynamic_priority_enabled) {
5174 		thread_vm_bind_group_add();
5175 	}
5176 
5177 
5178 #if CONFIG_THREAD_GROUPS
5179 	thread_group_vm_add();
5180 #endif /* CONFIG_THREAD_GROUPS */
5181 
5182 #if __AMP__
5183 	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
5184 	if (vm_pgo_pbound) {
5185 		/*
5186 		 * Use the soft bound option for vm pageout to allow it to run on
5187 		 * E-cores if P-cluster is unavailable.
5188 		 */
5189 		thread_soft_bind_cluster_type(self, 'P');
5190 	}
5191 #endif /* __AMP__ */
5192 
5193 	PE_parse_boot_argn("vmpgo_protect_realtime",
5194 	    &vm_pageout_protect_realtime,
5195 	    sizeof(vm_pageout_protect_realtime));
5196 	splx(s);
5197 
5198 	thread_set_thread_name(current_thread(), "VM_pageout_scan");
5199 
5200 	/*
5201 	 *	Initialize some paging parameters.
5202 	 */
5203 
5204 	vm_pageout_state.vm_pressure_thread_running = FALSE;
5205 	vm_pageout_state.vm_pressure_changed = FALSE;
5206 	vm_pageout_state.memorystatus_purge_on_warning = 2;
5207 	vm_pageout_state.memorystatus_purge_on_urgent = 5;
5208 	vm_pageout_state.memorystatus_purge_on_critical = 8;
5209 	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
5210 	vm_pageout_state.vm_page_speculative_percentage = 5;
5211 	vm_pageout_state.vm_page_speculative_target = 0;
5212 
5213 	vm_pageout_state.vm_pageout_swap_wait = 0;
5214 	vm_pageout_state.vm_pageout_idle_wait = 0;
5215 	vm_pageout_state.vm_pageout_empty_wait = 0;
5216 	vm_pageout_state.vm_pageout_burst_wait = 0;
5217 	vm_pageout_state.vm_pageout_deadlock_wait = 0;
5218 	vm_pageout_state.vm_pageout_deadlock_relief = 0;
5219 	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
5220 
5221 	vm_pageout_state.vm_pageout_inactive = 0;
5222 	vm_pageout_state.vm_pageout_inactive_used = 0;
5223 	vm_pageout_state.vm_pageout_inactive_clean = 0;
5224 
5225 	vm_pageout_state.vm_memory_pressure = 0;
5226 	vm_pageout_state.vm_page_filecache_min = 0;
5227 #if CONFIG_JETSAM
5228 	vm_pageout_state.vm_page_filecache_min_divisor = 70;
5229 	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
5230 #else
5231 	vm_pageout_state.vm_page_filecache_min_divisor = 27;
5232 	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
5233 #endif
5234 	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
5235 
5236 	vm_pageout_state.vm_pageout_considered_page_last = 0;
5237 
5238 	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
5239 		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
5240 	}
5241 
5242 	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
5243 		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
5244 	}
5245 
5246 	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
5247 		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
5248 	}
5249 
5250 	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
5251 		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
5252 	}
5253 
5254 	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
5255 		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
5256 	}
5257 
5258 	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
5259 		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
5260 	}
5261 
5262 	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
5263 		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
5264 	}
5265 	/*
5266 	 * Even if we've already called vm_page_free_reserve,
5267 	 * call it again here to ensure that the targets are
5268 	 * accurately calculated (it uses vm_page_free_count_init);
5269 	 * calling it with an arg of 0 will not change the reserve
5270 	 * but will re-calculate free_min and free_target.
5271 	 */
5272 	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
5273 		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
5274 	} else {
5275 		vm_page_free_reserve(0);
5276 	}
5277 
5278 	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
5279 	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5280 
5281 	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5282 	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5283 
5284 	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5285 
5286 #if DEVELOPMENT || DEBUG
5287 	bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
5288 	vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
5289 #endif /* DEVELOPMENT || DEBUG */
5290 
5291 
5292 	/* internal pageout thread started when default pager registered first time */
5293 	/* external pageout and garbage collection threads started here */
5294 	struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
5295 	ethr->id = 0;
5296 	ethr->q = &vm_pageout_queue_external;
5297 	/* in external_state these swapout cheads are never used; they are only used in internal_state for the compressor */
5298 	ethr->current_early_swapout_chead = NULL;
5299 	ethr->current_regular_swapout_chead = NULL;
5300 	ethr->current_late_swapout_chead = NULL;
5301 	ethr->scratch_buf = NULL;
5302 #if DEVELOPMENT || DEBUG
5303 	ethr->benchmark_q = NULL;
5304 #endif /* DEVELOPMENT || DEBUG */
5305 	sched_cond_init(&(ethr->pgo_wakeup));
5306 
5307 	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
5308 	    (void *)ethr, BASEPRI_VM,
5309 	    &(ethr->pgo_iothread));
5310 	if (result != KERN_SUCCESS) {
5311 		panic("vm_pageout: Unable to create external thread (%d)\n", result);
5312 	}
5313 	thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");
5314 
5315 	thread_mtx_lock(vm_pageout_gc_thread);
5316 	thread_start(vm_pageout_gc_thread);
5317 	thread_mtx_unlock(vm_pageout_gc_thread);
5318 
5319 #if VM_PRESSURE_EVENTS
5320 	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5321 	    BASEPRI_DEFAULT,
5322 	    &thread);
5323 
5324 	if (result != KERN_SUCCESS) {
5325 		panic("vm_pressure_thread: create failed");
5326 	}
5327 
5328 	thread_deallocate(thread);
5329 #endif
5330 
5331 	vm_object_reaper_init();
5332 
5333 
5334 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5335 		vm_compressor_init();
5336 	}
5337 
5338 #if VM_PRESSURE_EVENTS
5339 	vm_pressure_events_enabled = TRUE;
5340 #endif /* VM_PRESSURE_EVENTS */
5341 
5342 #if CONFIG_PHANTOM_CACHE
5343 	vm_phantom_cache_init();
5344 #endif
5345 #if VM_PAGE_BUCKETS_CHECK
5346 #if VM_PAGE_FAKE_BUCKETS
5347 	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5348 	    (uint64_t) vm_page_fake_buckets_start,
5349 	    (uint64_t) vm_page_fake_buckets_end);
5350 	pmap_protect(kernel_pmap,
5351 	    vm_page_fake_buckets_start,
5352 	    vm_page_fake_buckets_end,
5353 	    VM_PROT_READ);
5354 //	*(char *) vm_page_fake_buckets_start = 'x';	/* panic! */
5355 #endif /* VM_PAGE_FAKE_BUCKETS */
5356 #endif /* VM_PAGE_BUCKETS_CHECK */
5357 
5358 #if VM_OBJECT_TRACKING
5359 	vm_object_tracking_init();
5360 #endif /* VM_OBJECT_TRACKING */
5361 
5362 #if __arm64__
5363 //	vm_tests();
5364 #endif /* __arm64__ */
5365 
5366 	vm_pageout_continue();
5367 
5368 	/*
5369 	 * Unreached code!
5370 	 *
5371 	 * The vm_pageout_continue() call above never returns, so the code below is never
5372 	 * executed.  We take advantage of this to declare several DTrace VM related probe
5373 	 * points that our kernel doesn't have an analog for.  These are probe points that
5374 	 * exist in Solaris and are in the DTrace documentation, so people may have written
5375 	 * scripts that use them.  Declaring the probe points here means their scripts will
5376 	 * compile and execute which we want for portability of the scripts, but since this
5377 	 * section of code is never reached, the probe points will simply never fire.  Yes,
5378 	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
5379 	 * Solaris specific VM events in mind, not portability to different VM implementations.
5380 	 */
5381 
5382 	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5383 	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5384 	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5385 	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5386 	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5387 	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5388 	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5389 	/*NOTREACHED*/
5390 }
5391 
5392 
5393 
5394 kern_return_t
5395 vm_pageout_internal_start(void)
5396 {
5397 	kern_return_t   result = KERN_SUCCESS;
5398 	host_basic_info_data_t hinfo;
5399 	vm_offset_t     buf, bufsize;
5400 
5401 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5402 
5403 	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5404 #define BSD_HOST 1
5405 	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5406 
5407 	assert(hinfo.max_cpus > 0);
5408 
5409 #if !XNU_TARGET_OS_OSX
5410 	vm_pageout_state.vm_compressor_thread_count = 1;
5411 #else /* !XNU_TARGET_OS_OSX */
5412 	if (hinfo.max_cpus > 4) {
5413 		vm_pageout_state.vm_compressor_thread_count = 2;
5414 	} else {
5415 		vm_pageout_state.vm_compressor_thread_count = 1;
5416 	}
5417 #endif /* !XNU_TARGET_OS_OSX */
5418 #if     __AMP__
5419 	if (vm_compressor_ebound) {
5420 		vm_pageout_state.vm_compressor_thread_count = 2;
5421 	}
5422 #endif
5423 	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5424 	    sizeof(vm_pageout_state.vm_compressor_thread_count));
5425 
5426 	/* did we get an unreasonable number from the boot-args? */
5427 	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5428 		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5429 	}
5430 	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5431 		vm_pageout_state.vm_compressor_thread_count = 1;
5432 	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5433 		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5434 	}
5435 
5436 	vm_pageout_queue_internal.pgo_maxlaundry =
5437 	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5438 
5439 	PE_parse_boot_argn("vmpgoi_maxlaundry",
5440 	    &vm_pageout_queue_internal.pgo_maxlaundry,
5441 	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5442 
5443 #if DEVELOPMENT || DEBUG
5444 	// Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
5445 	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
5446 #endif /* DEVELOPMENT || DEBUG */
5447 
5448 	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5449 
5450 	kmem_alloc(kernel_map, &buf,
5451 	    bufsize * vm_pageout_state.vm_compressor_thread_count,
5452 	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
5453 	    VM_KERN_MEMORY_COMPRESSOR);
5454 
5455 	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5456 		struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
5457 		iq->id = i;
5458 		iq->q = &vm_pageout_queue_internal;
5459 		iq->current_early_swapout_chead = NULL;
5460 		iq->current_regular_swapout_chead = NULL;
5461 		iq->current_late_swapout_chead = NULL;
5462 		iq->scratch_buf = (char *)(buf + i * bufsize);
5463 #if DEVELOPMENT || DEBUG
5464 		iq->benchmark_q = &vm_pageout_queue_benchmark;
5465 #endif /* DEVELOPMENT || DEBUG */
5466 		sched_cond_init(&(iq->pgo_wakeup));
5467 		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5468 		    (void *)iq, BASEPRI_VM,
5469 		    &(iq->pgo_iothread));
5470 
5471 		if (result != KERN_SUCCESS) {
5472 			panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
5473 		}
5474 	}
5475 	return result;
5476 }
5477 
5478 #if CONFIG_IOSCHED
5479 /*
5480  * To support I/O Expedite for compressed files we mark the UPLs with special flags.
5481  * The way decmpfs works is that we create a big UPL which marks all the pages needed to
5482  * represent the compressed file as busy. We tag this UPL with the flag UPL_DECMP_REQ. Decmpfs
5483  * then issues smaller I/Os for the compressed data, decompresses it, and puts the data into the pages
5484  * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5485  * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req UPL using the
5486  * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5487  * by the req UPL lock (the reverse link doesn't need synchronization since we never inspect this link
5488  * unless the real I/O UPL is being destroyed).
5489  */
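
/*
 * A minimal sketch of the flow described above; 'req_upl' is a hypothetical
 * name for the big UPL covering the compressed file:
 *
 *	upl_mark_decmp(req_upl);	// sets UPL_DECMP_REQ and remembers req_upl
 *					// on the creating thread
 *	// ...the same thread then creates the smaller real-I/O UPL; upl_create()
 *	// sees the thread's decmp_upl and calls upl_set_decmp_info(), which sets
 *	// UPL_DECMP_REAL_IO, links the new UPL to req_upl via decmp_io_upl and
 *	// takes a reference on req_upl...
 *	// ...issue the compressed-file I/O and decompress into req_upl's pages...
 *	upl_unmark_decmp(req_upl);	// clear the creating thread's decmp_upl link
 */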
5490 
5491 
5492 static void
5493 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5494 {
5495 	assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5496 
5497 	upl_lock(src_upl);
5498 	if (src_upl->decmp_io_upl) {
5499 		/*
5500 		 * If there is already a live real I/O UPL, ignore this new UPL.
5501 		 * This case should rarely happen and even if it does, it just means
5502 		 * that we might issue a spurious expedite which the driver is expected
5503 		 * to handle.
5504 		 */
5505 		upl_unlock(src_upl);
5506 		return;
5507 	}
5508 	src_upl->decmp_io_upl = (void *)upl;
5509 	src_upl->ref_count++;
5510 
5511 	upl->flags |= UPL_DECMP_REAL_IO;
5512 	upl->decmp_io_upl = (void *)src_upl;
5513 	upl_unlock(src_upl);
5514 }
5515 #endif /* CONFIG_IOSCHED */
5516 
5517 #if UPL_DEBUG
5518 int     upl_debug_enabled = 1;
5519 #else
5520 int     upl_debug_enabled = 0;
5521 #endif
5522 
5523 static upl_t
5524 upl_create(int type, int flags, upl_size_t size)
5525 {
5526 	uint32_t pages = (uint32_t)atop(round_page_32(size));
5527 	upl_t    upl;
5528 
5529 	assert(page_aligned(size));
5530 
5531 	/*
5532 	 * FIXME: this code assumes the allocation always succeeds,
5533 	 *        however `pages` can be up to MAX_UPL_SIZE.
5534 	 *
5535 	 *        The allocation size is above 32k (resp. 128k)
5536 	 *        on 16k pages (resp. 4k), which kalloc might fail
5537 	 *        to allocate.
5538 	 */
5539 	upl = kalloc_type(struct upl, struct upl_page_info,
5540 	    (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
5541 	if (type & UPL_CREATE_INTERNAL) {
5542 		flags |= UPL_INTERNAL;
5543 	}
5544 
5545 	if (type & UPL_CREATE_LITE) {
5546 		flags |= UPL_LITE;
5547 		if (pages) {
5548 			upl->lite_list = bitmap_alloc(pages);
5549 		}
5550 	}
5551 
5552 	upl->flags = flags;
5553 	upl->ref_count = 1;
5554 	upl_lock_init(upl);
5555 #if CONFIG_IOSCHED
5556 	if (type & UPL_CREATE_IO_TRACKING) {
5557 		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5558 	}
5559 
5560 	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5561 		/* Only support expedite on internal UPLs */
5562 		thread_t        curthread = current_thread();
5563 		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
5564 		    Z_WAITOK | Z_ZERO);
5565 		upl->flags |= UPL_EXPEDITE_SUPPORTED;
5566 		if (curthread->decmp_upl != NULL) {
5567 			upl_set_decmp_info(upl, curthread->decmp_upl);
5568 		}
5569 	}
5570 #endif
5571 #if CONFIG_IOSCHED || UPL_DEBUG
5572 	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5573 		upl->upl_creator = current_thread();
5574 		upl->flags |= UPL_TRACKED_BY_OBJECT;
5575 	}
5576 #endif
5577 
5578 #if UPL_DEBUG
5579 	upl->upl_create_btref = btref_get(__builtin_frame_address(0), 0);
5580 #endif /* UPL_DEBUG */
5581 
5582 	return upl;
5583 }
5584 
5585 static void
5586 upl_destroy(upl_t upl)
5587 {
5588 	uint32_t pages;
5589 
5590 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5591 
5592 	if (upl->ext_ref_count) {
5593 		panic("upl(%p) ext_ref_count", upl);
5594 	}
5595 
5596 #if CONFIG_IOSCHED
5597 	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5598 		upl_t src_upl;
5599 		src_upl = upl->decmp_io_upl;
5600 		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5601 		upl_lock(src_upl);
5602 		src_upl->decmp_io_upl = NULL;
5603 		upl_unlock(src_upl);
5604 		upl_deallocate(src_upl);
5605 	}
5606 #endif /* CONFIG_IOSCHED */
5607 
5608 #if CONFIG_IOSCHED || UPL_DEBUG
5609 	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5610 	    !(upl->flags & UPL_VECTOR)) {
5611 		vm_object_t     object;
5612 
5613 		if (upl->flags & UPL_SHADOWED) {
5614 			object = upl->map_object->shadow;
5615 		} else {
5616 			object = upl->map_object;
5617 		}
5618 
5619 		vm_object_lock(object);
5620 		queue_remove(&object->uplq, upl, upl_t, uplq);
5621 		vm_object_activity_end(object);
5622 		vm_object_collapse(object, 0, TRUE);
5623 		vm_object_unlock(object);
5624 	}
5625 #endif
5626 	/*
5627 	 * drop a reference on the map_object whether or
5628 	 * not a pageout object is inserted
5629 	 */
5630 	if (upl->flags & UPL_SHADOWED) {
5631 		vm_object_deallocate(upl->map_object);
5632 	}
5633 
5634 	if (upl->flags & UPL_DEVICE_MEMORY) {
5635 		pages = 1;
5636 	} else {
5637 		pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
5638 	}
5639 
5640 	upl_lock_destroy(upl);
5641 
5642 #if CONFIG_IOSCHED
5643 	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5644 		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
5645 	}
5646 #endif
5647 
5648 #if UPL_DEBUG
5649 	for (int i = 0; i < upl->upl_commit_index; i++) {
5650 		btref_put(upl->upl_commit_records[i].c_btref);
5651 	}
5652 	btref_put(upl->upl_create_btref);
5653 #endif /* UPL_DEBUG */
5654 
5655 	if ((upl->flags & UPL_LITE) && pages) {
5656 		bitmap_free(upl->lite_list, pages);
5657 	}
5658 	kfree_type(struct upl, struct upl_page_info,
5659 	    (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
5660 }
5661 
5662 void
5663 upl_deallocate(upl_t upl)
5664 {
5665 	upl_lock(upl);
5666 
5667 	if (--upl->ref_count == 0) {
5668 		if (vector_upl_is_valid(upl)) {
5669 			vector_upl_deallocate(upl);
5670 		}
5671 		upl_unlock(upl);
5672 
5673 		if (upl->upl_iodone) {
5674 			upl_callout_iodone(upl);
5675 		}
5676 
5677 		upl_destroy(upl);
5678 	} else {
5679 		upl_unlock(upl);
5680 	}
5681 }
5682 
5683 #if CONFIG_IOSCHED
5684 void
5685 upl_mark_decmp(upl_t upl)
5686 {
5687 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5688 		upl->flags |= UPL_DECMP_REQ;
5689 		upl->upl_creator->decmp_upl = (void *)upl;
5690 	}
5691 }
5692 
5693 void
5694 upl_unmark_decmp(upl_t upl)
5695 {
5696 	if (upl && (upl->flags & UPL_DECMP_REQ)) {
5697 		upl->upl_creator->decmp_upl = NULL;
5698 	}
5699 }
5700 
5701 #endif /* CONFIG_IOSCHED */
5702 
5703 #define VM_PAGE_Q_BACKING_UP(q)         \
5704 	((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
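
/*
 * For example (illustrative arithmetic only): with pgo_maxlaundry == 128, the
 * queue is considered to be backing up once pgo_laundry reaches
 * (128 * 8) / 10 == 102 pages, i.e. roughly 80% of the laundry limit.
 */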
5705 
5706 boolean_t must_throttle_writes(void);
5707 
5708 boolean_t
5709 must_throttle_writes()
5710 {
5711 	if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5712 	    vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5713 		return TRUE;
5714 	}
5715 
5716 	return FALSE;
5717 }
5718 
5719 int vm_page_delayed_work_ctx_needed = 0;
5720 KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5721 
5722 __startup_func
5723 static void
5724 vm_page_delayed_work_init_ctx(void)
5725 {
5726 	uint16_t min_delayed_work_ctx_allocated = 16;
5727 
5728 	/*
5729 	 * try really hard to always keep NCPU elements around in the zone
5730 	 * in order for the UPL code to almost always get an element.
5731 	 */
5732 	if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5733 		min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5734 	}
5735 
5736 	zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5737 }
5738 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5739 
5740 struct vm_page_delayed_work*
5741 vm_page_delayed_work_get_ctx(void)
5742 {
5743 	struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5744 
5745 	dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5746 
5747 	if (__probable(dw_ctx)) {
5748 		dw_ctx->delayed_owner = current_thread();
5749 	} else {
5750 		vm_page_delayed_work_ctx_needed++;
5751 	}
5752 	return dw_ctx ? dw_ctx->dwp : NULL;
5753 }
5754 
5755 void
5756 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5757 {
5758 	struct  vm_page_delayed_work_ctx *ldw_ctx;
5759 
5760 	ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5761 	ldw_ctx->delayed_owner = NULL;
5762 
5763 	zfree(dw_ctx_zone, ldw_ctx);
5764 }
5765 
5766 /*
5767  *	Routine:	vm_object_upl_request
5768  *	Purpose:
5769  *		Cause the population of a portion of a vm_object.
5770  *		Depending on the nature of the request, the pages
5771  *		returned may contain valid data or be uninitialized.
5772  *		A page list structure, listing the physical pages
5773  *		will be returned upon request.
5774  *		This function is called by the file system or any other
5775  *		supplier of backing store to a pager.
5776  *		IMPORTANT NOTE: The caller must still respect the relationship
5777  *		between the vm_object and its backing memory object.  The
5778  *		caller MUST NOT substitute changes in the backing file
5779  *		without first doing a memory_object_lock_request on the
5780  *		target range unless it is know that the pages are not
5781  *		shared with another entity at the pager level.
5782  *		Copy_in_to:
5783  *			if a page list structure is present
5784  *			return the mapped physical pages, where a
5785  *			page is not present, return a non-initialized
5786  *			one.  If the no_sync bit is turned on, don't
5787  *			call the pager unlock to synchronize with other
5788  *			possible copies of the page. Leave pages busy
5789  *			in the original object, if a page list structure
5790  *			was specified.  When a commit of the page list
5791  *			pages is done, the dirty bit will be set for each one.
5792  *		Copy_out_from:
5793  *			If a page list structure is present, return
5794  *			all mapped pages.  Where a page does not exist
5795  *			map a zero filled one. Leave pages busy in
5796  *			the original object.  If a page list structure
5797  *			is not specified, this call is a no-op.
5798  *
5799  *		Note:  access of default pager objects has a rather interesting
5800  *		twist.  The caller of this routine, presumably the file system
5801  *		page cache handling code, will never actually make a request
5802  *		against a default pager backed object.  Only the default
5803  *		pager will make requests on backing store related vm_objects
5804  *		In this way the default pager can maintain the relationship
5805  *		between backing store files (abstract memory objects) and
5806  *		the vm_objects (cache objects), they support.
5807  *
5808  */
5809 
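
/*
 * A minimal sketch, assuming a hypothetical caller gathering dirty pages for
 * pageout through an internal, lite UPL; 'object', 'offset' and the request
 * size are assumptions for illustration only.
 *
 *	upl_t                   upl = NULL;
 *	unsigned int            count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
 *	kern_return_t           kr;
 *
 *	kr = vm_object_upl_request(object, offset, 16 * PAGE_SIZE, &upl,
 *	    NULL, &count,
 *	    UPL_SET_INTERNAL | UPL_SET_LITE | UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY,
 *	    VM_KERN_MEMORY_NONE);
 *	// on success, the UPL's internal page list describes the gathered pages;
 *	// the caller is expected to commit or abort the UPL range when done.
 */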
5810 __private_extern__ kern_return_t
5811 vm_object_upl_request(
5812 	vm_object_t             object,
5813 	vm_object_offset_t      offset,
5814 	upl_size_t              size,
5815 	upl_t                   *upl_ptr,
5816 	upl_page_info_array_t   user_page_list,
5817 	unsigned int            *page_list_count,
5818 	upl_control_flags_t     cntrl_flags,
5819 	vm_tag_t                tag)
5820 {
5821 	vm_page_t               dst_page = VM_PAGE_NULL;
5822 	vm_object_offset_t      dst_offset;
5823 	upl_size_t              xfer_size;
5824 	unsigned int            size_in_pages;
5825 	boolean_t               dirty;
5826 	boolean_t               hw_dirty;
5827 	upl_t                   upl = NULL;
5828 	unsigned int            entry;
5829 	vm_page_t               alias_page = NULL;
5830 	int                     refmod_state = 0;
5831 	vm_object_t             last_copy_object;
5832 	uint32_t                last_copy_version;
5833 	struct  vm_page_delayed_work    dw_array;
5834 	struct  vm_page_delayed_work    *dwp, *dwp_start;
5835 	bool                    dwp_finish_ctx = TRUE;
5836 	int                     dw_count;
5837 	int                     dw_limit;
5838 	int                     io_tracking_flag = 0;
5839 	int                     grab_options;
5840 	int                     page_grab_count = 0;
5841 	ppnum_t                 phys_page;
5842 	pmap_flush_context      pmap_flush_context_storage;
5843 	boolean_t               pmap_flushes_delayed = FALSE;
5844 	task_t                  task = current_task();
5845 
5846 	dwp_start = dwp = NULL;
5847 
5848 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
5849 		/*
5850 		 * For forward compatibility's sake,
5851 		 * reject any unknown flag.
5852 		 */
5853 		return KERN_INVALID_VALUE;
5854 	}
5855 	if ((!object->internal) && (object->paging_offset != 0)) {
5856 		panic("vm_object_upl_request: external object with non-zero paging offset");
5857 	}
5858 	if (object->phys_contiguous) {
5859 		panic("vm_object_upl_request: contiguous object specified");
5860 	}
5861 
5862 	assertf(page_aligned(offset) && page_aligned(size),
5863 	    "offset 0x%llx size 0x%x",
5864 	    offset, size);
5865 
5866 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5867 
5868 	dw_count = 0;
5869 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5870 	dwp_start = vm_page_delayed_work_get_ctx();
5871 	if (dwp_start == NULL) {
5872 		dwp_start = &dw_array;
5873 		dw_limit = 1;
5874 		dwp_finish_ctx = FALSE;
5875 	}
5876 
5877 	dwp = dwp_start;
5878 
5879 	if (size > MAX_UPL_SIZE_BYTES) {
5880 		size = MAX_UPL_SIZE_BYTES;
5881 	}
5882 
5883 	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5884 		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5885 	}
5886 
5887 #if CONFIG_IOSCHED || UPL_DEBUG
5888 	if (object->io_tracking || upl_debug_enabled) {
5889 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5890 	}
5891 #endif
5892 #if CONFIG_IOSCHED
5893 	if (object->io_tracking) {
5894 		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5895 	}
5896 #endif
5897 
5898 	if (cntrl_flags & UPL_SET_INTERNAL) {
5899 		if (cntrl_flags & UPL_SET_LITE) {
5900 			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5901 		} else {
5902 			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5903 		}
5904 		user_page_list = size ? upl->page_list : NULL;
5905 	} else {
5906 		if (cntrl_flags & UPL_SET_LITE) {
5907 			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5908 		} else {
5909 			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5910 		}
5911 	}
5912 	*upl_ptr = upl;
5913 
5914 	if (user_page_list) {
5915 		user_page_list[0].device = FALSE;
5916 	}
5917 
5918 	if (cntrl_flags & UPL_SET_LITE) {
5919 		upl->map_object = object;
5920 	} else {
5921 		upl->map_object = vm_object_allocate(size);
5922 		vm_object_lock(upl->map_object);
5923 		/*
5924 		 * No neeed to lock the new object: nobody else knows
5925 		 * about it yet, so it's all ours so far.
5926 		 */
5927 		upl->map_object->shadow = object;
5928 		VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
5929 		VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
5930 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5931 		upl->map_object->vo_shadow_offset = offset;
5932 		upl->map_object->wimg_bits = object->wimg_bits;
5933 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
5934 		    "object %p shadow_offset 0x%llx",
5935 		    upl->map_object, upl->map_object->vo_shadow_offset);
5936 		vm_object_unlock(upl->map_object);
5937 
5938 		alias_page = vm_page_create_fictitious();
5939 
5940 		upl->flags |= UPL_SHADOWED;
5941 	}
5942 	if (cntrl_flags & UPL_FOR_PAGEOUT) {
5943 		upl->flags |= UPL_PAGEOUT;
5944 	}
5945 
5946 	vm_object_lock(object);
5947 	vm_object_activity_begin(object);
5948 
5949 	grab_options = 0;
5950 #if CONFIG_SECLUDED_MEMORY
5951 	if (object->can_grab_secluded) {
5952 		grab_options |= VM_PAGE_GRAB_SECLUDED;
5953 	}
5954 #endif /* CONFIG_SECLUDED_MEMORY */
5955 
5956 	/*
5957 	 * we can lock in the paging_offset once paging_in_progress is set
5958 	 */
5959 	upl->u_size = size;
5960 	upl->u_offset = offset + object->paging_offset;
5961 
5962 #if CONFIG_IOSCHED || UPL_DEBUG
5963 	if (object->io_tracking || upl_debug_enabled) {
5964 		vm_object_activity_begin(object);
5965 		queue_enter(&object->uplq, upl, upl_t, uplq);
5966 	}
5967 #endif
5968 	if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
5969 		/*
5970 		 * Honor copy-on-write obligations
5971 		 *
5972 		 * The caller is gathering these pages and
5973 		 * might modify their contents.  We need to
5974 		 * make sure that the copy object has its own
5975 		 * private copies of these pages before we let
5976 		 * the caller modify them.
5977 		 */
5978 		vm_object_update(object,
5979 		    offset,
5980 		    size,
5981 		    NULL,
5982 		    NULL,
5983 		    FALSE,              /* should_return */
5984 		    MEMORY_OBJECT_COPY_SYNC,
5985 		    VM_PROT_NO_CHANGE);
5986 
5987 		VM_PAGEOUT_DEBUG(upl_cow, 1);
5988 		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
5989 	}
5990 	/*
5991 	 * remember which copy object we synchronized with
5992 	 */
5993 	last_copy_object = object->vo_copy;
5994 	last_copy_version = object->vo_copy_version;
5995 	entry = 0;
5996 
5997 	xfer_size = size;
5998 	dst_offset = offset;
5999 	size_in_pages = size / PAGE_SIZE;
6000 
6001 	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
6002 	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
6003 		object->scan_collisions = 0;
6004 	}
6005 
6006 	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
6007 		boolean_t       isSSD = FALSE;
6008 
6009 #if !XNU_TARGET_OS_OSX
6010 		isSSD = TRUE;
6011 #else /* !XNU_TARGET_OS_OSX */
6012 		vnode_pager_get_isSSD(object->pager, &isSSD);
6013 #endif /* !XNU_TARGET_OS_OSX */
6014 		vm_object_unlock(object);
6015 
6016 		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6017 
6018 		if (isSSD == TRUE) {
6019 			delay(1000 * size_in_pages);
6020 		} else {
6021 			delay(5000 * size_in_pages);
6022 		}
6023 		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6024 
6025 		vm_object_lock(object);
6026 	}
6027 
6028 	while (xfer_size) {
6029 		dwp->dw_mask = 0;
6030 
6031 		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
6032 			vm_object_unlock(object);
6033 			alias_page = vm_page_create_fictitious();
6034 			vm_object_lock(object);
6035 		}
6036 		if (cntrl_flags & UPL_COPYOUT_FROM) {
6037 			upl->flags |= UPL_PAGE_SYNC_DONE;
6038 
6039 			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
6040 			    vm_page_is_fictitious(dst_page) ||
6041 			    dst_page->vmp_absent ||
6042 			    VMP_ERROR_GET(dst_page) ||
6043 			    dst_page->vmp_cleaning ||
6044 			    (VM_PAGE_WIRED(dst_page))) {
6045 				if (user_page_list) {
6046 					user_page_list[entry].phys_addr = 0;
6047 				}
6048 
6049 				goto try_next_page;
6050 			}
6051 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6052 
6053 			/*
6054 			 * grab this up front...
6055 			 * a high percentage of the time we're going to
6056 			 * need the hardware modification state a bit later
6057 			 * anyway... so we can eliminate an extra call into
6058 			 * the pmap layer by grabbing it here and recording it
6059 			 */
6060 			if (dst_page->vmp_pmapped) {
6061 				refmod_state = pmap_get_refmod(phys_page);
6062 			} else {
6063 				refmod_state = 0;
6064 			}
6065 
6066 			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
6067 				/*
6068 				 * page is on inactive list and referenced...
6069 				 * reactivate it now... this gets it out of the
6070 				 * way of vm_pageout_scan which would have to
6071 				 * reactivate it upon tripping over it
6072 				 */
6073 				dwp->dw_mask |= DW_vm_page_activate;
6074 			}
6075 			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
6076 				/*
6077 				 * we're only asking for DIRTY pages to be returned
6078 				 */
6079 				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
6080 					/*
6081 					 * if this is the page stolen by vm_pageout_scan to be
6082 					 * cleaned (as opposed to a buddy being clustered in),
6083 					 * or this request is not being driven by a PAGEOUT cluster,
6084 					 * then we only need to check for the page being dirty or
6085 					 * precious to decide whether to return it
6086 					 */
6087 					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
6088 						goto check_busy;
6089 					}
6090 					goto dont_return;
6091 				}
6092 				/*
6093 				 * this is a request for a PAGEOUT cluster and this page
6094 				 * is merely along for the ride as a 'buddy'... not only
6095 				 * does it have to be dirty to be returned, but it also
6096 				 * can't have been referenced recently...
6097 				 */
6098 				if ((hibernate_cleaning_in_progress == TRUE ||
6099 				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
6100 				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
6101 				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
6102 					goto check_busy;
6103 				}
6104 dont_return:
6105 				/*
6106 				 * if we reach here, we're not to return
6107 				 * the page... go on to the next one
6108 				 */
6109 				if (dst_page->vmp_laundry == TRUE) {
6110 					/*
6111 					 * if we get here, the page is not 'cleaning' (filtered out above).
6112 					 * since it has been referenced, remove it from the laundry
6113 					 * so we don't pay the cost of an I/O to clean a page
6114 					 * we're just going to take back
6115 					 */
6116 					vm_page_lockspin_queues();
6117 
6118 					vm_pageout_steal_laundry(dst_page, TRUE);
6119 					vm_page_activate(dst_page);
6120 
6121 					vm_page_unlock_queues();
6122 				}
6123 				if (user_page_list) {
6124 					user_page_list[entry].phys_addr = 0;
6125 				}
6126 
6127 				goto try_next_page;
6128 			}
6129 check_busy:
6130 			if (dst_page->vmp_busy) {
6131 				if (cntrl_flags & UPL_NOBLOCK) {
6132 					if (user_page_list) {
6133 						user_page_list[entry].phys_addr = 0;
6134 					}
6135 					dwp->dw_mask = 0;
6136 
6137 					goto try_next_page;
6138 				}
6139 				/*
6140 				 * someone else is playing with the
6141 				 * page.  We will have to wait.
6142 				 */
6143 				vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
6144 
6145 				continue;
6146 			}
6147 			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6148 				vm_page_lockspin_queues();
6149 
6150 				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6151 					/*
6152 					 * we've buddied up a page for a clustered pageout
6153 					 * that has already been moved to the pageout
6154 					 * queue by pageout_scan... we need to remove
6155 					 * it from the queue and drop the laundry count
6156 					 * on that queue
6157 					 */
6158 					vm_pageout_throttle_up(dst_page);
6159 				}
6160 				vm_page_unlock_queues();
6161 			}
6162 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6163 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6164 
6165 			if (phys_page > upl->highest_page) {
6166 				upl->highest_page = phys_page;
6167 			}
6168 
6169 			assert(!pmap_is_noencrypt(phys_page));
6170 
6171 			if (cntrl_flags & UPL_SET_LITE) {
6172 				unsigned int    pg_num;
6173 
6174 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6175 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6176 				bitmap_set(upl->lite_list, pg_num);
6177 
6178 				if (hw_dirty) {
6179 					if (pmap_flushes_delayed == FALSE) {
6180 						pmap_flush_context_init(&pmap_flush_context_storage);
6181 						pmap_flushes_delayed = TRUE;
6182 					}
6183 					pmap_clear_refmod_options(phys_page,
6184 					    VM_MEM_MODIFIED,
6185 					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6186 					    &pmap_flush_context_storage);
6187 				}
6188 
6189 				/*
6190 				 * Mark original page as cleaning
6191 				 * in place.
6192 				 */
6193 				dst_page->vmp_cleaning = TRUE;
6194 				dst_page->vmp_precious = FALSE;
6195 			} else {
6196 				/*
6197 				 * use pageclean setup, it is more
6198 				 * convenient even for the pageout
6199 				 * cases here
6200 				 */
6201 				vm_object_lock(upl->map_object);
6202 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6203 				vm_object_unlock(upl->map_object);
6204 
6205 				alias_page->vmp_absent = FALSE;
6206 				alias_page = NULL;
6207 			}
6208 			if (dirty) {
6209 				SET_PAGE_DIRTY(dst_page, FALSE);
6210 			} else {
6211 				dst_page->vmp_dirty = FALSE;
6212 			}
6213 
6214 			if (!dirty) {
6215 				dst_page->vmp_precious = TRUE;
6216 			}
6217 
6218 			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6219 				if (!VM_PAGE_WIRED(dst_page)) {
6220 					dst_page->vmp_free_when_done = TRUE;
6221 				}
6222 			}
6223 		} else {
6224 			if ((cntrl_flags & UPL_WILL_MODIFY) &&
6225 			    (object->vo_copy != last_copy_object ||
6226 			    object->vo_copy_version != last_copy_version)) {
6227 				/*
6228 				 * Honor copy-on-write obligations
6229 				 *
6230 				 * The copy object has changed since we
6231 				 * last synchronized for copy-on-write.
6232 				 * Another copy object might have been
6233 				 * inserted while we released the object's
6234 				 * lock.  Since someone could have seen the
6235 				 * original contents of the remaining pages
6236 				 * through that new object, we have to
6237 				 * synchronize with it again for the remaining
6238 				 * pages only.  The previous pages are "busy"
6239 				 * so they can not be seen through the new
6240 				 * mapping.  The new mapping will see our
6241 				 * upcoming changes for those previous pages,
6242 				 * but that's OK since they couldn't see what
6243 				 * was there before.  It's just a race anyway
6244 				 * and there's no guarantee of consistency or
6245 				 * atomicity.  We just don't want new mappings
6246 				 * to see both the *before* and *after* pages.
6247 				 */
6248 				if (object->vo_copy != VM_OBJECT_NULL) {
6249 					vm_object_update(
6250 						object,
6251 						dst_offset,/* current offset */
6252 						xfer_size, /* remaining size */
6253 						NULL,
6254 						NULL,
6255 						FALSE,     /* should_return */
6256 						MEMORY_OBJECT_COPY_SYNC,
6257 						VM_PROT_NO_CHANGE);
6258 
6259 					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6260 					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6261 				}
6262 				/*
6263 				 * remember the copy object we synced with
6264 				 */
6265 				last_copy_object = object->vo_copy;
6266 				last_copy_version = object->vo_copy_version;
6267 			}
6268 			dst_page = vm_page_lookup(object, dst_offset);
6269 
6270 			if (dst_page != VM_PAGE_NULL) {
6271 				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6272 					/*
6273 					 * skip over pages already present in the cache
6274 					 */
6275 					if (user_page_list) {
6276 						user_page_list[entry].phys_addr = 0;
6277 					}
6278 
6279 					goto try_next_page;
6280 				}
6281 				if (vm_page_is_fictitious(dst_page)) {
6282 					panic("need corner case for fictitious page");
6283 				}
6284 
6285 				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6286 					/*
6287 					 * someone else is playing with the
6288 					 * page.  We will have to wait.
6289 					 */
6290 					vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
6291 
6292 					continue;
6293 				}
6294 				if (dst_page->vmp_laundry) {
6295 					vm_pageout_steal_laundry(dst_page, FALSE);
6296 				}
6297 			} else {
6298 				if (object->private) {
6299 					/*
6300 					 * This is a nasty wrinkle for users
6301 					 * of UPL who encounter device or
6302 					 * private memory; however, it is
6303 					 * unavoidable: only a fault can
6304 					 * resolve the actual backing
6305 					 * physical page by asking the
6306 					 * backing device.
6307 					 */
6308 					if (user_page_list) {
6309 						user_page_list[entry].phys_addr = 0;
6310 					}
6311 
6312 					goto try_next_page;
6313 				}
6314 				if (object->scan_collisions) {
6315 					/*
6316 					 * the pageout_scan thread is trying to steal
6317 					 * pages from this object, but has run into our
6318 					 * lock... grab 2 pages from the head of the object...
6319 					 * the first is freed on behalf of pageout_scan, the
6320 					 * 2nd is for our own use... we use vm_object_page_grab
6321 					 * in both cases to avoid taking pages from the free
6322 					 * list since we are under memory pressure and our
6323 					 * lock on this object is getting in the way of
6324 					 * relieving it
6325 					 */
6326 					dst_page = vm_object_page_grab(object);
6327 
6328 					if (dst_page != VM_PAGE_NULL) {
6329 						vm_page_release(dst_page,
6330 						    FALSE);
6331 					}
6332 
6333 					dst_page = vm_object_page_grab(object);
6334 				}
6335 				if (dst_page == VM_PAGE_NULL) {
6336 					/*
6337 					 * need to allocate a page
6338 					 */
6339 					dst_page = vm_page_grab_options(grab_options);
6340 					if (dst_page != VM_PAGE_NULL) {
6341 						page_grab_count++;
6342 					}
6343 				}
6344 				if (dst_page == VM_PAGE_NULL) {
6345 					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6346 						/*
6347 						 * we don't want to stall waiting for pages to come onto the free list
6348 						 * while we're already holding absent pages in this UPL...
6349 						 * the caller will deal with the empty slots
6350 						 */
6351 						if (user_page_list) {
6352 							user_page_list[entry].phys_addr = 0;
6353 						}
6354 
6355 						goto try_next_page;
6356 					}
6357 					/*
6358 					 * no pages available... wait
6359 					 * then try again for the same
6360 					 * offset...
6361 					 */
6362 					vm_object_unlock(object);
6363 
6364 					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6365 
6366 					VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6367 
6368 					VM_PAGE_WAIT();
6369 					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6370 
6371 					VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6372 
6373 					vm_object_lock(object);
6374 
6375 					continue;
6376 				}
6377 				vm_page_insert(dst_page, object, dst_offset);
6378 
6379 				dst_page->vmp_absent = TRUE;
6380 				dst_page->vmp_busy = FALSE;
6381 
6382 				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6383 					/*
6384 					 * if UPL_RET_ONLY_ABSENT was specified,
6385 					 * then we're definitely setting up a
6386 					 * UPL for a clustered read/pagein
6387 					 * operation... mark the pages as clustered
6388 					 * so upl_commit_range can put them on the
6389 					 * speculative list
6390 					 */
6391 					dst_page->vmp_clustered = TRUE;
6392 
6393 					if (!(cntrl_flags & UPL_FILE_IO)) {
6394 						counter_inc(&vm_statistics_pageins);
6395 					}
6396 				}
6397 			}
6398 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6399 
6400 			dst_page->vmp_overwriting = TRUE;
6401 
6402 			if (dst_page->vmp_pmapped) {
6403 				if (!(cntrl_flags & UPL_FILE_IO)) {
6404 					/*
6405 					 * eliminate all mappings from the
6406 					 * original object and its progeny
6407 					 */
6408 					refmod_state = pmap_disconnect(phys_page);
6409 				} else {
6410 					refmod_state = pmap_get_refmod(phys_page);
6411 				}
6412 			} else {
6413 				refmod_state = 0;
6414 			}
6415 
6416 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6417 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6418 
6419 			if (cntrl_flags & UPL_SET_LITE) {
6420 				unsigned int    pg_num;
6421 
6422 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6423 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6424 				bitmap_set(upl->lite_list, pg_num);
6425 
6426 				if (hw_dirty) {
6427 					pmap_clear_modify(phys_page);
6428 				}
6429 
6430 				/*
6431 				 * Mark original page as cleaning
6432 				 * in place.
6433 				 */
6434 				dst_page->vmp_cleaning = TRUE;
6435 				dst_page->vmp_precious = FALSE;
6436 			} else {
6437 				/*
6438 				 * use pageclean setup; it is more
6439 				 * convenient even for the pageout
6440 				 * cases here
6441 				 */
6442 				vm_object_lock(upl->map_object);
6443 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6444 				vm_object_unlock(upl->map_object);
6445 
6446 				alias_page->vmp_absent = FALSE;
6447 				alias_page = NULL;
6448 			}
6449 
6450 			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6451 				upl->flags &= ~UPL_CLEAR_DIRTY;
6452 				upl->flags |= UPL_SET_DIRTY;
6453 				dirty = TRUE;
6454 				/*
6455 				 * Page belonging to a code-signed object is about to
6456 				 * be written. Mark it tainted and disconnect it from
6457 				 * all pmaps so processes have to fault it back in and
6458 				 * deal with the tainted bit.
6459 				 */
6460 				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6461 					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6462 					vm_page_upl_tainted++;
6463 					if (dst_page->vmp_pmapped) {
6464 						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6465 						if (refmod_state & VM_MEM_REFERENCED) {
6466 							dst_page->vmp_reference = TRUE;
6467 						}
6468 					}
6469 				}
6470 			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6471 				/*
6472 				 * clean in place for read implies
6473 				 * that a write will be done on all
6474 				 * the pages that are dirty before
6475 				 * a UPL commit is done.  The caller
6476 				 * is obligated to preserve the
6477 				 * contents of all pages marked dirty
6478 				 */
6479 				upl->flags |= UPL_CLEAR_DIRTY;
6480 			}
6481 			dst_page->vmp_dirty = dirty;
6482 
6483 			if (!dirty) {
6484 				dst_page->vmp_precious = TRUE;
6485 			}
6486 
6487 			if (!VM_PAGE_WIRED(dst_page)) {
6488 				/*
6489 				 * deny access to the target page while
6490 				 * it is being worked on
6491 				 */
6492 				dst_page->vmp_busy = TRUE;
6493 			} else {
6494 				dwp->dw_mask |= DW_vm_page_wire;
6495 			}
6496 
6497 			/*
6498 			 * We might be about to satisfy a fault which has been
6499 			 * requested. So no need for the "restart" bit.
6500 			 */
6501 			dst_page->vmp_restart = FALSE;
6502 			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6503 				/*
6504 				 * expect the page to be used
6505 				 */
6506 				dwp->dw_mask |= DW_set_reference;
6507 			}
6508 			if (cntrl_flags & UPL_PRECIOUS) {
6509 				if (object->internal) {
6510 					SET_PAGE_DIRTY(dst_page, FALSE);
6511 					dst_page->vmp_precious = FALSE;
6512 				} else {
6513 					dst_page->vmp_precious = TRUE;
6514 				}
6515 			} else {
6516 				dst_page->vmp_precious = FALSE;
6517 			}
6518 		}
6519 		if (dst_page->vmp_busy) {
6520 			upl->flags |= UPL_HAS_BUSY;
6521 		}
6522 		if (VM_PAGE_WIRED(dst_page)) {
6523 			upl->flags |= UPL_HAS_WIRED;
6524 		}
6525 
6526 		if (phys_page > upl->highest_page) {
6527 			upl->highest_page = phys_page;
6528 		}
6529 		assert(!pmap_is_noencrypt(phys_page));
6530 		if (user_page_list) {
6531 			user_page_list[entry].phys_addr = phys_page;
6532 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
6533 			user_page_list[entry].absent    = dst_page->vmp_absent;
6534 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
6535 			user_page_list[entry].precious  = dst_page->vmp_precious;
6536 			user_page_list[entry].device    = FALSE;
6537 			user_page_list[entry].needed    = FALSE;
6538 			if (dst_page->vmp_clustered == TRUE) {
6539 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6540 			} else {
6541 				user_page_list[entry].speculative = FALSE;
6542 			}
6543 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6544 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6545 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6546 			user_page_list[entry].mark      = FALSE;
6547 		}
6548 		/*
6549 		 * if UPL_RET_ONLY_ABSENT is set, then
6550 		 * we are working with a fresh page and we've
6551 		 * just set the clustered flag on it to
6552 		 * indicate that it was drug in as part of a
6553 		 * indicate that it was dragged in as part of a
6554 		 */
6555 		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6556 			/*
6557 			 * someone is explicitly grabbing this page...
6558 			 * update clustered and speculative state
6559 			 *
6560 			 */
6561 			if (dst_page->vmp_clustered) {
6562 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
6563 			}
6564 		}
6565 try_next_page:
6566 		if (dwp->dw_mask) {
6567 			if (dwp->dw_mask & DW_vm_page_activate) {
6568 				counter_inc(&vm_statistics_reactivations);
6569 			}
6570 
6571 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6572 
6573 			if (dw_count >= dw_limit) {
6574 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6575 
6576 				dwp = dwp_start;
6577 				dw_count = 0;
6578 			}
6579 		}
6580 		entry++;
6581 		dst_offset += PAGE_SIZE_64;
6582 		xfer_size -= PAGE_SIZE;
6583 	}
6584 	if (dw_count) {
6585 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6586 		dwp = dwp_start;
6587 		dw_count = 0;
6588 	}
6589 
6590 	if (alias_page != NULL) {
6591 		VM_PAGE_FREE(alias_page);
6592 	}
6593 	if (pmap_flushes_delayed == TRUE) {
6594 		pmap_flush(&pmap_flush_context_storage);
6595 	}
6596 
6597 	if (page_list_count != NULL) {
6598 		if (upl->flags & UPL_INTERNAL) {
6599 			*page_list_count = 0;
6600 		} else if (*page_list_count > entry) {
6601 			*page_list_count = entry;
6602 		}
6603 	}
6604 #if UPL_DEBUG
6605 	upl->upl_state = 1;
6606 #endif
6607 	vm_object_unlock(object);
6608 
6609 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6610 	if (task != NULL) {
6611 		counter_add(&task->pages_grabbed_upl, page_grab_count);
6612 	}
6613 
6614 	if (dwp_start && dwp_finish_ctx) {
6615 		vm_page_delayed_work_finish_ctx(dwp_start);
6616 		dwp_start = dwp = NULL;
6617 	}
6618 
6619 	return KERN_SUCCESS;
6620 }
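
/*
 * Note (illustrative, not built): the loop above batches per-page work with
 * the delayed-work mechanism rather than taking the page-queues lock for
 * every page.  The idiom, using the dwp/dw_count/dw_limit state set up
 * earlier in this function, is roughly:
 *
 *	dwp->dw_mask |= DW_set_reference;
 *	VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
 *	if (dw_count >= dw_limit) {
 *		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
 *		dwp = dwp_start;
 *		dw_count = 0;
 *	}
 *
 * with a final flush of any partial batch once the loop exits.
 */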
6621 
6622 int cs_executable_create_upl = 0;
6623 extern int proc_selfpid(void);
6624 extern char *proc_name_address(void *p);
6625 
6626 kern_return_t
6627 vm_map_create_upl(
6628 	vm_map_t                map,
6629 	vm_map_address_t        offset,
6630 	upl_size_t              *upl_size,
6631 	upl_t                   *upl,
6632 	upl_page_info_array_t   page_list,
6633 	unsigned int            *count,
6634 	upl_control_flags_t     *flags,
6635 	vm_tag_t                tag)
6636 {
6637 	vm_map_entry_t          entry;
6638 	upl_control_flags_t     caller_flags;
6639 	int                     force_data_sync;
6640 	int                     sync_cow_data;
6641 	vm_object_t             local_object;
6642 	vm_map_offset_t         local_offset;
6643 	vm_map_offset_t         local_start;
6644 	kern_return_t           ret;
6645 	vm_map_address_t        original_offset;
6646 	vm_map_size_t           original_size, adjusted_size;
6647 	vm_map_offset_t         local_entry_start;
6648 	vm_object_offset_t      local_entry_offset;
6649 	vm_object_offset_t      offset_in_mapped_page;
6650 	boolean_t               release_map = FALSE;
6651 
6652 
6653 start_with_map:
6654 
6655 	original_offset = offset;
6656 	original_size = *upl_size;
6657 	adjusted_size = original_size;
6658 
6659 	caller_flags = *flags;
6660 
6661 	if (caller_flags & ~UPL_VALID_FLAGS) {
6662 		/*
6663 		 * For forward compatibility's sake,
6664 		 * reject any unknown flag.
6665 		 */
6666 		ret = KERN_INVALID_VALUE;
6667 		goto done;
6668 	}
6669 	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6670 	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6671 
6672 	if (upl == NULL) {
6673 		ret = KERN_INVALID_ARGUMENT;
6674 		goto done;
6675 	}
6676 
6677 REDISCOVER_ENTRY:
6678 	vm_map_lock_read(map);
6679 
6680 	if (!vm_map_lookup_entry(map, offset, &entry)) {
6681 		vm_map_unlock_read(map);
6682 		ret = KERN_FAILURE;
6683 		goto done;
6684 	}
6685 
6686 	local_entry_start = entry->vme_start;
6687 	local_entry_offset = VME_OFFSET(entry);
6688 
6689 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6690 		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6691 	}
6692 
6693 	if (entry->vme_end - original_offset < adjusted_size) {
6694 		adjusted_size = entry->vme_end - original_offset;
6695 		assert(adjusted_size > 0);
6696 		*upl_size = (upl_size_t) adjusted_size;
6697 		assert(*upl_size == adjusted_size);
6698 	}
6699 
6700 	if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6701 		*flags = 0;
6702 
6703 		if (!entry->is_sub_map &&
6704 		    VME_OBJECT(entry) != VM_OBJECT_NULL) {
6705 			if (VME_OBJECT(entry)->private) {
6706 				*flags = UPL_DEV_MEMORY;
6707 			}
6708 
6709 			if (VME_OBJECT(entry)->phys_contiguous) {
6710 				*flags |= UPL_PHYS_CONTIG;
6711 			}
6712 		}
6713 		vm_map_unlock_read(map);
6714 		ret = KERN_SUCCESS;
6715 		goto done;
6716 	}
6717 
6718 	offset_in_mapped_page = 0;
6719 	if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6720 		offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6721 		*upl_size = (upl_size_t)
6722 		    (vm_map_round_page(original_offset + adjusted_size,
6723 		    VM_MAP_PAGE_MASK(map))
6724 		    - offset);
6725 
6726 		offset_in_mapped_page = original_offset - offset;
6727 		assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6728 
6729 		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6730 	}
6731 
6732 	if (!entry->is_sub_map) {
6733 		if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6734 		    !VME_OBJECT(entry)->phys_contiguous) {
6735 			if (*upl_size > MAX_UPL_SIZE_BYTES) {
6736 				*upl_size = MAX_UPL_SIZE_BYTES;
6737 			}
6738 		}
6739 
6740 		/*
6741 		 *      Create an object if necessary.
6742 		 */
6743 		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6744 			if (entry->max_protection == VM_PROT_NONE) {
6745 				/* don't create an object for a reserved range */
6746 				vm_map_unlock_read(map);
6747 				ret = KERN_PROTECTION_FAILURE;
6748 				goto done;
6749 			}
6750 
6751 			if (vm_map_lock_read_to_write(map)) {
6752 				goto REDISCOVER_ENTRY;
6753 			}
6754 
6755 			VME_OBJECT_SET(entry,
6756 			    vm_object_allocate((vm_size_t)
6757 			    vm_object_round_page((entry->vme_end - entry->vme_start))),
6758 			    false, 0);
6759 			VME_OFFSET_SET(entry, 0);
6760 			assert(entry->use_pmap);
6761 
6762 			vm_map_lock_write_to_read(map);
6763 		}
6764 
6765 		if (!(caller_flags & UPL_COPYOUT_FROM) &&
6766 		    !(entry->protection & VM_PROT_WRITE)) {
6767 			vm_map_unlock_read(map);
6768 			ret = KERN_PROTECTION_FAILURE;
6769 			goto done;
6770 		}
6771 	}
6772 
6773 #if !XNU_TARGET_OS_OSX
6774 	if (map->pmap != kernel_pmap &&
6775 	    (caller_flags & UPL_COPYOUT_FROM) &&
6776 	    (entry->protection & VM_PROT_EXECUTE) &&
6777 	    !(entry->protection & VM_PROT_WRITE)) {
6778 		vm_offset_t     kaddr;
6779 		vm_size_t       ksize;
6780 
6781 		/*
6782 		 * We're about to create a read-only UPL backed by
6783 		 * memory from an executable mapping.
6784 		 * Wiring the pages would result in the pages being copied
6785 		 * (due to the "MAP_PRIVATE" mapping) and no longer
6786 		 * code-signed, so no longer eligible for execution.
6787 		 * Instead, let's copy the data into a kernel buffer and
6788 		 * create the UPL from this kernel buffer.
6789 		 * The kernel buffer is then freed, leaving the UPL holding
6790 		 * the last reference on the VM object, so the memory will
6791 		 * be released when the UPL is committed.
6792 		 */
6793 
6794 		vm_map_unlock_read(map);
6795 		entry = VM_MAP_ENTRY_NULL;
6796 		/* allocate kernel buffer */
6797 		ksize = round_page(*upl_size);
6798 		kaddr = 0;
6799 		ret = kmem_alloc(kernel_map, &kaddr, ksize,
6800 		    KMA_PAGEABLE | KMA_DATA, tag);
6801 		if (ret == KERN_SUCCESS) {
6802 			/* copyin the user data */
6803 			ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6804 		}
6805 		if (ret == KERN_SUCCESS) {
6806 			if (ksize > *upl_size) {
6807 				/* zero out the extra space in kernel buffer */
6808 				memset((void *)(kaddr + *upl_size),
6809 				    0,
6810 				    ksize - *upl_size);
6811 			}
6812 			/* create the UPL from the kernel buffer */
6813 			vm_object_offset_t      offset_in_object;
6814 			vm_object_offset_t      offset_in_object_page;
6815 
6816 			offset_in_object = offset - local_entry_start + local_entry_offset;
6817 			offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6818 			assert(offset_in_object_page < PAGE_SIZE);
6819 			assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6820 			*upl_size -= offset_in_object_page + offset_in_mapped_page;
6821 			ret = vm_map_create_upl(kernel_map,
6822 			    (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6823 			    upl_size, upl, page_list, count, flags, tag);
6824 		}
6825 		if (kaddr != 0) {
6826 			/* free the kernel buffer */
6827 			kmem_free(kernel_map, kaddr, ksize);
6828 			kaddr = 0;
6829 			ksize = 0;
6830 		}
6831 #if DEVELOPMENT || DEBUG
6832 		DTRACE_VM4(create_upl_from_executable,
6833 		    vm_map_t, map,
6834 		    vm_map_address_t, offset,
6835 		    upl_size_t, *upl_size,
6836 		    kern_return_t, ret);
6837 #endif /* DEVELOPMENT || DEBUG */
6838 		goto done;
6839 	}
6840 #endif /* !XNU_TARGET_OS_OSX */
6841 
6842 	if (!entry->is_sub_map) {
6843 		local_object = VME_OBJECT(entry);
6844 		assert(local_object != VM_OBJECT_NULL);
6845 	}
6846 
6847 	if (!entry->is_sub_map &&
6848 	    !entry->needs_copy &&
6849 	    *upl_size != 0 &&
6850 	    local_object->vo_size > *upl_size && /* partial UPL */
6851 	    entry->wired_count == 0 && /* No COW for entries that are wired */
6852 	    (map->pmap != kernel_pmap) && /* alias checks */
6853 	    (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6854 	    ||
6855 	    ( /* case 2 */
6856 		    local_object->internal &&
6857 		    (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6858 		    os_ref_get_count_raw(&local_object->ref_count) > 1))) {
6859 		vm_prot_t       prot;
6860 
6861 		/*
6862 		 * Case 1:
6863 		 * Set up the targeted range for copy-on-write to avoid
6864 		 * applying true_share/copy_delay to the entire object.
6865 		 *
6866 		 * Case 2:
6867 		 * This map entry covers only part of an internal
6868 		 * object.  There could be other map entries covering
6869 		 * other areas of this object and some of these map
6870 		 * entries could be marked as "needs_copy", which
6871 		 * assumes that the object is COPY_SYMMETRIC.
6872 		 * To avoid marking this object as COPY_DELAY and
6873 		 * "true_share", let's shadow it and mark the new
6874 		 * (smaller) object as "true_share" and COPY_DELAY.
6875 		 */
6876 
6877 		if (vm_map_lock_read_to_write(map)) {
6878 			goto REDISCOVER_ENTRY;
6879 		}
6880 		vm_map_lock_assert_exclusive(map);
6881 		assert(VME_OBJECT(entry) == local_object);
6882 
6883 		vm_map_clip_start(map,
6884 		    entry,
6885 		    vm_map_trunc_page(offset,
6886 		    VM_MAP_PAGE_MASK(map)));
6887 		vm_map_clip_end(map,
6888 		    entry,
6889 		    vm_map_round_page(offset + *upl_size,
6890 		    VM_MAP_PAGE_MASK(map)));
6891 		if ((entry->vme_end - offset) < *upl_size) {
6892 			*upl_size = (upl_size_t) (entry->vme_end - offset);
6893 			assert(*upl_size == entry->vme_end - offset);
6894 		}
6895 
6896 		prot = entry->protection & ~VM_PROT_WRITE;
6897 		if (override_nx(map, VME_ALIAS(entry)) && prot) {
6898 			prot |= VM_PROT_EXECUTE;
6899 		}
6900 		vm_object_pmap_protect(local_object,
6901 		    VME_OFFSET(entry),
6902 		    entry->vme_end - entry->vme_start,
6903 		    ((entry->is_shared ||
6904 		    map->mapped_in_other_pmaps)
6905 		    ? PMAP_NULL
6906 		    : map->pmap),
6907 		    VM_MAP_PAGE_SIZE(map),
6908 		    entry->vme_start,
6909 		    prot);
6910 
6911 		assert(entry->wired_count == 0);
6912 
6913 		/*
6914 		 * Lock the VM object and re-check its status: if it's mapped
6915 		 * in another address space, we could still be racing with
6916 		 * another thread holding that other VM map exclusively.
6917 		 */
6918 		vm_object_lock(local_object);
6919 		if (local_object->true_share) {
6920 			/* object is already in proper state: no COW needed */
6921 			assert(local_object->copy_strategy !=
6922 			    MEMORY_OBJECT_COPY_SYMMETRIC);
6923 		} else {
6924 			/* not true_share: ask for copy-on-write below */
6925 			assert(local_object->copy_strategy ==
6926 			    MEMORY_OBJECT_COPY_SYMMETRIC);
6927 			entry->needs_copy = TRUE;
6928 		}
6929 		vm_object_unlock(local_object);
6930 
6931 		vm_map_lock_write_to_read(map);
6932 	}
6933 
6934 	if (entry->needs_copy) {
6935 		/*
6936 		 * Honor copy-on-write for COPY_SYMMETRIC
6937 		 * strategy.
6938 		 */
6939 		vm_map_t                local_map;
6940 		vm_object_t             object;
6941 		vm_object_offset_t      new_offset;
6942 		vm_prot_t               prot;
6943 		boolean_t               wired;
6944 		vm_map_version_t        version;
6945 		vm_map_t                real_map;
6946 		vm_prot_t               fault_type;
6947 
6948 		local_map = map;
6949 
6950 		if (caller_flags & UPL_COPYOUT_FROM) {
6951 			fault_type = VM_PROT_READ | VM_PROT_COPY;
6952 			vm_counters.create_upl_extra_cow++;
6953 			vm_counters.create_upl_extra_cow_pages +=
6954 			    (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6955 		} else {
6956 			fault_type = VM_PROT_WRITE;
6957 		}
6958 		if (vm_map_lookup_and_lock_object(&local_map,
6959 		    offset, fault_type,
6960 		    OBJECT_LOCK_EXCLUSIVE,
6961 		    &version, &object,
6962 		    &new_offset, &prot, &wired,
6963 		    NULL,
6964 		    &real_map, NULL) != KERN_SUCCESS) {
6965 			if (fault_type == VM_PROT_WRITE) {
6966 				vm_counters.create_upl_lookup_failure_write++;
6967 			} else {
6968 				vm_counters.create_upl_lookup_failure_copy++;
6969 			}
6970 			vm_map_unlock_read(local_map);
6971 			ret = KERN_FAILURE;
6972 			goto done;
6973 		}
6974 		if (real_map != local_map) {
6975 			vm_map_unlock(real_map);
6976 		}
6977 		vm_map_unlock_read(local_map);
6978 
6979 		vm_object_unlock(object);
6980 
6981 		goto REDISCOVER_ENTRY;
6982 	}
6983 
6984 	if (entry->is_sub_map) {
6985 		vm_map_t        submap;
6986 
6987 		submap = VME_SUBMAP(entry);
6988 		local_start = entry->vme_start;
6989 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6990 
6991 		vm_map_reference(submap);
6992 		vm_map_unlock_read(map);
6993 
6994 		DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
6995 		offset += offset_in_mapped_page;
6996 		*upl_size -= offset_in_mapped_page;
6997 
6998 		if (release_map) {
6999 			vm_map_deallocate(map);
7000 		}
7001 		map = submap;
7002 		release_map = TRUE;
7003 		offset = local_offset + (offset - local_start);
7004 		goto start_with_map;
7005 	}
7006 
7007 	if (sync_cow_data &&
7008 	    (VME_OBJECT(entry)->shadow ||
7009 	    VME_OBJECT(entry)->vo_copy)) {
7010 		local_object = VME_OBJECT(entry);
7011 		local_start = entry->vme_start;
7012 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7013 
7014 		vm_object_reference(local_object);
7015 		vm_map_unlock_read(map);
7016 
7017 		if (local_object->shadow && local_object->vo_copy) {
7018 			vm_object_lock_request(local_object->shadow,
7019 			    ((vm_object_offset_t)
7020 			    ((offset - local_start) +
7021 			    local_offset) +
7022 			    local_object->vo_shadow_offset),
7023 			    *upl_size, FALSE,
7024 			    MEMORY_OBJECT_DATA_SYNC,
7025 			    VM_PROT_NO_CHANGE);
7026 		}
7027 		sync_cow_data = FALSE;
7028 		vm_object_deallocate(local_object);
7029 
7030 		goto REDISCOVER_ENTRY;
7031 	}
7032 	if (force_data_sync) {
7033 		local_object = VME_OBJECT(entry);
7034 		local_start = entry->vme_start;
7035 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7036 
7037 		vm_object_reference(local_object);
7038 		vm_map_unlock_read(map);
7039 
7040 		vm_object_lock_request(local_object,
7041 		    ((vm_object_offset_t)
7042 		    ((offset - local_start) +
7043 		    local_offset)),
7044 		    (vm_object_size_t)*upl_size,
7045 		    FALSE,
7046 		    MEMORY_OBJECT_DATA_SYNC,
7047 		    VM_PROT_NO_CHANGE);
7048 
7049 		force_data_sync = FALSE;
7050 		vm_object_deallocate(local_object);
7051 
7052 		goto REDISCOVER_ENTRY;
7053 	}
7054 	if (VME_OBJECT(entry)->private) {
7055 		*flags = UPL_DEV_MEMORY;
7056 	} else {
7057 		*flags = 0;
7058 	}
7059 
7060 	if (VME_OBJECT(entry)->phys_contiguous) {
7061 		*flags |= UPL_PHYS_CONTIG;
7062 	}
7063 
7064 	local_object = VME_OBJECT(entry);
7065 	local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7066 	local_start = entry->vme_start;
7067 
7068 
7069 	/*
7070 	 * Wiring will copy the pages to the shadow object.
7071 	 * The shadow object will not be code-signed so
7072 	 * attempting to execute code from these copied pages
7073 	 * would trigger a code-signing violation.
7074 	 */
7075 	if (entry->protection & VM_PROT_EXECUTE) {
7076 #if MACH_ASSERT
7077 		printf("pid %d[%s] create_upl out of executable range from "
7078 		    "0x%llx to 0x%llx: side effects may include "
7079 		    "code-signing violations later on\n",
7080 		    proc_selfpid(),
7081 		    (get_bsdtask_info(current_task())
7082 		    ? proc_name_address(get_bsdtask_info(current_task()))
7083 		    : "?"),
7084 		    (uint64_t) entry->vme_start,
7085 		    (uint64_t) entry->vme_end);
7086 #endif /* MACH_ASSERT */
7087 		DTRACE_VM2(cs_executable_create_upl,
7088 		    uint64_t, (uint64_t)entry->vme_start,
7089 		    uint64_t, (uint64_t)entry->vme_end);
7090 		cs_executable_create_upl++;
7091 	}
7092 
7093 	vm_object_lock(local_object);
7094 
7095 	/*
7096 	 * Ensure that this object is "true_share" and "copy_delay" now,
7097 	 * while we're still holding the VM map lock.  After we unlock the map,
7098 	 * anything could happen to that mapping, including some copy-on-write
7099 	 * activity.  We need to make sure that the IOPL will point at the
7100 	 * same memory as the mapping.
7101 	 */
7102 	if (local_object->true_share) {
7103 		assert(local_object->copy_strategy !=
7104 		    MEMORY_OBJECT_COPY_SYMMETRIC);
7105 	} else if (!is_kernel_object(local_object) &&
7106 	    local_object != compressor_object &&
7107 	    !local_object->phys_contiguous) {
7108 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7109 		if (!local_object->true_share &&
7110 		    vm_object_tracking_btlog) {
7111 			btlog_record(vm_object_tracking_btlog, local_object,
7112 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
7113 			    btref_get(__builtin_frame_address(0), 0));
7114 		}
7115 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7116 		VM_OBJECT_SET_TRUE_SHARE(local_object, TRUE);
7117 		if (local_object->copy_strategy ==
7118 		    MEMORY_OBJECT_COPY_SYMMETRIC) {
7119 			local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7120 		}
7121 	}
7122 
7123 	vm_object_reference_locked(local_object);
7124 	vm_object_unlock(local_object);
7125 
7126 	vm_map_unlock_read(map);
7127 
7128 	offset += offset_in_mapped_page;
7129 	assert(*upl_size > offset_in_mapped_page);
7130 	*upl_size -= offset_in_mapped_page;
7131 
7132 	ret = vm_object_iopl_request(local_object,
7133 	    ((vm_object_offset_t)
7134 	    ((offset - local_start) + local_offset)),
7135 	    *upl_size,
7136 	    upl,
7137 	    page_list,
7138 	    count,
7139 	    caller_flags,
7140 	    tag);
7141 	vm_object_deallocate(local_object);
7142 
7143 
7144 done:
7145 	if (release_map) {
7146 		vm_map_deallocate(map);
7147 	}
7148 
7149 	return ret;
7150 }
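
/*
 * Hypothetical usage sketch (not part of the build): a typical caller of
 * vm_map_create_upl() supplies a mapped range and receives back a UPL,
 * an optional page list and the (possibly adjusted) size.  The variable
 * names and the flag combination below are illustrative only.
 *
 *	upl_t                   upl = NULL;
 *	upl_size_t              upl_size = (upl_size_t)io_size;
 *	unsigned int            count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
 *	upl_control_flags_t     flags = UPL_COPYOUT_FROM | UPL_SET_LITE;
 *	kern_return_t           kr;
 *
 *	kr = vm_map_create_upl(map, uaddr, &upl_size, &upl, NULL, &count,
 *	    &flags, VM_KERN_MEMORY_NONE);
 *	if (kr == KERN_SUCCESS) {
 *		... perform the I/O, then upl_commit() or upl_abort() ...
 *	}
 */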
7151 
7152 /*
7153  * Internal routine to enter a UPL into a VM map.
7154  *
7155  * JMM - This should just be doable through the standard
7156  * vm_map_enter() API.
7157  */
7158 kern_return_t
7159 vm_map_enter_upl_range(
7160 	vm_map_t                map,
7161 	upl_t                   upl,
7162 	vm_object_offset_t      offset_to_map,
7163 	vm_size_t               size_to_map,
7164 	vm_prot_t               prot_to_map,
7165 	vm_map_offset_t         *dst_addr)
7166 {
7167 	vm_map_size_t           size;
7168 	vm_object_offset_t      offset;
7169 	vm_map_offset_t         addr;
7170 	vm_page_t               m;
7171 	kern_return_t           kr;
7172 	int                     isVectorUPL = 0, curr_upl = 0;
7173 	upl_t                   vector_upl = NULL;
7174 	mach_vm_offset_t        vector_upl_dst_addr = 0;
7175 	vm_map_t                vector_upl_submap = NULL;
7176 	upl_offset_t            subupl_offset = 0;
7177 	upl_size_t              subupl_size = 0;
7178 
7179 	if (upl == UPL_NULL) {
7180 		return KERN_INVALID_ARGUMENT;
7181 	}
7182 
7183 	DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%lx (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7184 	assert(map == kernel_map);
7185 
7186 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7187 		int mapped = 0, valid_upls = 0;
7188 		vector_upl = upl;
7189 
7190 		upl_lock(vector_upl);
7191 		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7192 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7193 			if (upl == NULL) {
7194 				continue;
7195 			}
7196 			valid_upls++;
7197 			if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7198 				mapped++;
7199 			}
7200 		}
7201 
7202 		if (mapped) {
7203 			if (mapped != valid_upls) {
7204 				panic("Only %d of the %d sub-upls within the Vector UPL are already mapped", mapped, valid_upls);
7205 			} else {
7206 				upl_unlock(vector_upl);
7207 				return KERN_FAILURE;
7208 			}
7209 		}
7210 
7211 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7212 			panic("TODO4K: vector UPL not implemented");
7213 		}
7214 
7215 		vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7216 		    vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7217 		    VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7218 		    VM_KERN_MEMORY_NONE).kmr_submap;
7219 		map = vector_upl_submap;
7220 		vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7221 		curr_upl = 0;
7222 	} else {
7223 		upl_lock(upl);
7224 	}
7225 
7226 process_upl_to_enter:
7227 	if (isVectorUPL) {
7228 		if (curr_upl == vector_upl_max_upls(vector_upl)) {
7229 			*dst_addr = vector_upl_dst_addr;
7230 			upl_unlock(vector_upl);
7231 			return KERN_SUCCESS;
7232 		}
7233 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7234 		if (upl == NULL) {
7235 			goto process_upl_to_enter;
7236 		}
7237 
7238 		vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7239 		*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7240 	} else {
7241 		/*
7242 		 * check to see if already mapped
7243 		 */
7244 		if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7245 			upl_unlock(upl);
7246 			return KERN_FAILURE;
7247 		}
7248 	}
7249 
7250 	if ((!(upl->flags & UPL_SHADOWED)) &&
7251 	    ((upl->flags & UPL_HAS_BUSY) ||
7252 	    !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7253 		vm_object_t             object;
7254 		vm_page_t               alias_page;
7255 		vm_object_offset_t      new_offset;
7256 		unsigned int            pg_num;
7257 
7258 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7259 		object = upl->map_object;
7260 		upl->map_object = vm_object_allocate(vm_object_round_page(size));
7261 
7262 		vm_object_lock(upl->map_object);
7263 
7264 		upl->map_object->shadow = object;
7265 		VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
7266 		VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
7267 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7268 		upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7269 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
7270 		    "object %p shadow_offset 0x%llx",
7271 		    upl->map_object,
7272 		    (uint64_t)upl->map_object->vo_shadow_offset);
7273 		upl->map_object->wimg_bits = object->wimg_bits;
7274 		offset = upl->map_object->vo_shadow_offset;
7275 		new_offset = 0;
7276 
7277 		upl->flags |= UPL_SHADOWED;
7278 
7279 		while (size) {
7280 			pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7281 			assert(pg_num == new_offset / PAGE_SIZE);
7282 
7283 			if (bitmap_test(upl->lite_list, pg_num)) {
7284 				alias_page = vm_page_create_fictitious();
7285 
7286 				vm_object_lock(object);
7287 
7288 				m = vm_page_lookup(object, offset);
7289 				if (m == VM_PAGE_NULL) {
7290 					panic("vm_upl_map: page missing");
7291 				}
7292 
7293 				/*
7294 				 * Convert the fictitious page to a private
7295 				 * shadow of the real page.
7296 				 */
7297 				alias_page->vmp_free_when_done = TRUE;
7298 				/*
7299 				 * since m is a page in the upl it must
7300 				 * already be wired or BUSY, so it's
7301 				 * safe to assign the underlying physical
7302 				 * page to the alias
7303 				 */
7304 
7305 				vm_object_unlock(object);
7306 
7307 				vm_page_lockspin_queues();
7308 				vm_page_make_private(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7309 				vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7310 				vm_page_unlock_queues();
7311 
7312 				vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7313 
7314 				assert(!alias_page->vmp_wanted);
7315 				alias_page->vmp_busy = FALSE;
7316 				alias_page->vmp_absent = FALSE;
7317 			}
7318 			size -= PAGE_SIZE;
7319 			offset += PAGE_SIZE_64;
7320 			new_offset += PAGE_SIZE_64;
7321 		}
7322 		vm_object_unlock(upl->map_object);
7323 	}
7324 	if (upl->flags & UPL_SHADOWED) {
7325 		if (isVectorUPL) {
7326 			offset = 0;
7327 		} else {
7328 			offset = offset_to_map;
7329 		}
7330 	} else {
7331 		offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7332 		if (!isVectorUPL) {
7333 			offset += offset_to_map;
7334 		}
7335 	}
7336 
7337 	if (isVectorUPL) {
7338 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7339 	} else {
7340 		size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7341 	}
7342 
7343 	vm_object_reference(upl->map_object);
7344 
7345 	if (!isVectorUPL) {
7346 		*dst_addr = 0;
7347 		/*
7348 		 * NEED A UPL_MAP ALIAS
7349 		 */
7350 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7351 		    VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7352 		    upl->map_object, offset, FALSE,
7353 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7354 
7355 		if (kr != KERN_SUCCESS) {
7356 			vm_object_deallocate(upl->map_object);
7357 			upl_unlock(upl);
7358 			return kr;
7359 		}
7360 	} else {
7361 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7362 		    VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7363 		    upl->map_object, offset, FALSE,
7364 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7365 		if (kr) {
7366 			panic("vm_map_enter failed for a Vector UPL");
7367 		}
7368 	}
7369 	upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7370 	                                        /* this will have to be an increment rather than */
7371 	                                        /* an assignment. */
7372 	vm_object_lock(upl->map_object);
7373 
7374 	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7375 		m = vm_page_lookup(upl->map_object, offset);
7376 
7377 		if (m) {
7378 			m->vmp_pmapped = TRUE;
7379 
7380 			/*
7381 			 * CODE SIGNING ENFORCEMENT: page has been wpmapped,
7382 			 * but only in kernel space. If this was on a user map,
7383 			 * we'd have to set the wpmapped bit.
7384 			 */
7385 			/* m->vmp_wpmapped = TRUE; */
7386 			assert(map->pmap == kernel_pmap);
7387 
7388 			kr = pmap_enter_check(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, TRUE);
7389 
7390 			assert(kr == KERN_SUCCESS);
7391 #if KASAN
7392 			kasan_notify_address(addr, PAGE_SIZE_64);
7393 #endif
7394 		}
7395 		offset += PAGE_SIZE_64;
7396 	}
7397 	vm_object_unlock(upl->map_object);
7398 
7399 	/*
7400 	 * hold a reference for the mapping
7401 	 */
7402 	upl->ref_count++;
7403 	upl->flags |= UPL_PAGE_LIST_MAPPED;
7404 	upl->kaddr = (vm_offset_t) *dst_addr;
7405 	assert(upl->kaddr == *dst_addr);
7406 
7407 	if (isVectorUPL) {
7408 		goto process_upl_to_enter;
7409 	}
7410 
7411 	if (!isVectorUPL) {
7412 		vm_map_offset_t addr_adjustment;
7413 
7414 		addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7415 		if (addr_adjustment) {
7416 			assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7417 			DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7418 			*dst_addr += addr_adjustment;
7419 		}
7420 	}
7421 
7422 	upl_unlock(upl);
7423 
7424 	return KERN_SUCCESS;
7425 }
7426 
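/*
 * Convenience wrapper (descriptive only): maps the entire UPL (offset 0,
 * adjusted size) into the given map with VM_PROT_DEFAULT.  The mapping
 * takes its own reference on the UPL and must eventually be balanced by
 * vm_map_remove_upl().
 */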
7427 kern_return_t
7428 vm_map_enter_upl(
7429 	vm_map_t                map,
7430 	upl_t                   upl,
7431 	vm_map_offset_t         *dst_addr)
7432 {
7433 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7434 	return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7435 }
7436 
7437 /*
7438  * Internal routine to remove a UPL mapping from a VM map.
7439  *
7440  * XXX - This should just be doable through a standard
7441  * vm_map_remove() operation.  Otherwise, implicit clean-up
7442  * of the target map won't be able to correctly remove
7443  * these (and release the reference on the UPL).  Having
7444  * to do this means we can't map these into user-space
7445  * maps yet.
7446  */
7447 kern_return_t
7448 vm_map_remove_upl_range(
7449 	vm_map_t        map,
7450 	upl_t           upl,
7451 	__unused vm_object_offset_t    offset_to_unmap,
7452 	__unused vm_size_t             size_to_unmap)
7453 {
7454 	vm_address_t    addr;
7455 	upl_size_t      size;
7456 	int             isVectorUPL = 0, curr_upl = 0;
7457 	upl_t           vector_upl = NULL;
7458 
7459 	if (upl == UPL_NULL) {
7460 		return KERN_INVALID_ARGUMENT;
7461 	}
7462 
7463 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7464 		int     unmapped = 0, valid_upls = 0;
7465 		vector_upl = upl;
7466 		upl_lock(vector_upl);
7467 		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7468 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7469 			if (upl == NULL) {
7470 				continue;
7471 			}
7472 			valid_upls++;
7473 			if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7474 				unmapped++;
7475 			}
7476 		}
7477 
7478 		if (unmapped) {
7479 			if (unmapped != valid_upls) {
7480 				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
7481 			} else {
7482 				upl_unlock(vector_upl);
7483 				return KERN_FAILURE;
7484 			}
7485 		}
7486 		curr_upl = 0;
7487 	} else {
7488 		upl_lock(upl);
7489 	}
7490 
7491 process_upl_to_remove:
7492 	if (isVectorUPL) {
7493 		if (curr_upl == vector_upl_max_upls(vector_upl)) {
7494 			vm_map_t v_upl_submap;
7495 			vm_offset_t v_upl_submap_dst_addr;
7496 			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7497 
7498 			kmem_free_guard(map, v_upl_submap_dst_addr,
7499 			    vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
7500 			vm_map_deallocate(v_upl_submap);
7501 			upl_unlock(vector_upl);
7502 			return KERN_SUCCESS;
7503 		}
7504 
7505 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7506 		if (upl == NULL) {
7507 			goto process_upl_to_remove;
7508 		}
7509 	}
7510 
7511 	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7512 		addr = upl->kaddr;
7513 		size = upl->u_mapped_size;
7514 
7515 		assert(upl->ref_count > 1);
7516 		upl->ref_count--;               /* removing mapping ref */
7517 
7518 		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7519 		upl->kaddr = (vm_offset_t) 0;
7520 		upl->u_mapped_size = 0;
7521 
7522 		if (isVectorUPL) {
7523 			/*
7524 			 * If it's a Vectored UPL, we'll be removing the entire
7525 			 * submap anyway, so no need to remove individual UPL
7526 			 * element mappings from within the submap
7527 			 */
7528 			goto process_upl_to_remove;
7529 		}
7530 
7531 		upl_unlock(upl);
7532 
7533 		vm_map_remove(map,
7534 		    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
7535 		    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
7536 		return KERN_SUCCESS;
7537 	}
7538 	upl_unlock(upl);
7539 
7540 	return KERN_FAILURE;
7541 }
7542 
7543 kern_return_t
7544 vm_map_remove_upl(
7545 	vm_map_t        map,
7546 	upl_t           upl)
7547 {
7548 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7549 	return vm_map_remove_upl_range(map, upl, 0, upl_size);
7550 }
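
/*
 * Hypothetical pairing sketch (not part of the build): a UPL that has been
 * entered into the kernel map is typically accessed through the returned
 * address and then torn down again, e.g.:
 *
 *	vm_map_offset_t kaddr;
 *
 *	if (vm_map_enter_upl(kernel_map, upl, &kaddr) == KERN_SUCCESS) {
 *		... access the pages through kaddr ...
 *		vm_map_remove_upl(kernel_map, upl);
 *	}
 */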
7551 
7552 void
7553 iopl_valid_data(
7554 	upl_t    upl,
7555 	vm_tag_t tag)
7556 {
7557 	vm_object_t     object;
7558 	vm_offset_t     offset;
7559 	vm_page_t       m, nxt_page = VM_PAGE_NULL;
7560 	upl_size_t      size;
7561 	int             wired_count = 0;
7562 
7563 	if (upl == NULL) {
7564 		panic("iopl_valid_data: NULL upl");
7565 	}
7566 	if (vector_upl_is_valid(upl)) {
7567 		panic("iopl_valid_data: vector upl");
7568 	}
7569 	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
7570 		panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
7571 	}
7572 
7573 	object = upl->map_object;
7574 
7575 	if (is_kernel_object(object) || object == compressor_object) {
7576 		panic("iopl_valid_data: object == kernel or compressor");
7577 	}
7578 
7579 	if (object->purgable == VM_PURGABLE_VOLATILE ||
7580 	    object->purgable == VM_PURGABLE_EMPTY) {
7581 		panic("iopl_valid_data: object %p purgable %d",
7582 		    object, object->purgable);
7583 	}
7584 
7585 	size = upl_adjusted_size(upl, PAGE_MASK);
7586 
7587 	vm_object_lock(object);
7588 	VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
7589 
7590 	bool whole_object;
7591 
7592 	if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
7593 		nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
7594 		whole_object = true;
7595 	} else {
7596 		offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
7597 		whole_object = false;
7598 	}
7599 
7600 	while (size) {
7601 		if (whole_object) {
7602 			if (nxt_page != VM_PAGE_NULL) {
7603 				m = nxt_page;
7604 				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7605 			}
7606 		} else {
7607 			m = vm_page_lookup(object, offset);
7608 			offset += PAGE_SIZE;
7609 
7610 			if (m == VM_PAGE_NULL) {
7611 				panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
7612 			}
7613 		}
7614 		if (m->vmp_busy) {
7615 			if (!m->vmp_absent) {
7616 				panic("iopl_valid_data: busy page w/o absent");
7617 			}
7618 
7619 			if (m->vmp_pageq.next || m->vmp_pageq.prev) {
7620 				panic("iopl_valid_data: busy+absent page on page queue");
7621 			}
7622 			if (m->vmp_reusable) {
7623 				panic("iopl_valid_data: %p is reusable", m);
7624 			}
7625 
7626 			m->vmp_absent = FALSE;
7627 			m->vmp_dirty = TRUE;
7628 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7629 			assert(m->vmp_wire_count == 0);
7630 			m->vmp_wire_count++;
7631 			assert(m->vmp_wire_count);
7632 			if (m->vmp_wire_count == 1) {
7633 				m->vmp_q_state = VM_PAGE_IS_WIRED;
7634 				wired_count++;
7635 			} else {
7636 				panic("iopl_valid_data: %p already wired", m);
7637 			}
7638 
7639 
7640 			vm_page_wakeup_done(object, m);
7641 		}
7642 		size -= PAGE_SIZE;
7643 	}
7644 	if (wired_count) {
7645 		VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
7646 		assert(object->resident_page_count >= object->wired_page_count);
7647 
7648 		/* no need to adjust purgeable accounting for this object: */
7649 		assert(object->purgable != VM_PURGABLE_VOLATILE);
7650 		assert(object->purgable != VM_PURGABLE_EMPTY);
7651 
7652 		vm_page_lockspin_queues();
7653 		vm_page_wire_count += wired_count;
7654 		vm_page_unlock_queues();
7655 	}
7656 	VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
7657 	vm_object_unlock(object);
7658 }
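
/*
 * Summary (descriptive only): iopl_valid_data() is used on an IO-wire UPL
 * whose pages were grabbed as busy/absent placeholders; once the caller has
 * filled them with valid data, it converts each such page to a resident,
 * dirty, wired page and updates the global and per-object wired counts
 * under the given tag.
 */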
7659 
7660 
7661 void
7662 vm_object_set_pmap_cache_attr(
7663 	vm_object_t             object,
7664 	upl_page_info_array_t   user_page_list,
7665 	unsigned int            num_pages,
7666 	boolean_t               batch_pmap_op)
7667 {
7668 	unsigned int    cache_attr = 0;
7669 
7670 	cache_attr = object->wimg_bits & VM_WIMG_MASK;
7671 	assert(user_page_list);
7672 	if (!HAS_DEFAULT_CACHEABILITY(cache_attr)) {
7673 		PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
7674 	}
7675 }
7676 
7677 
7678 static bool
7679 vm_object_iopl_wire_full(
7680 	vm_object_t             object,
7681 	upl_t                   upl,
7682 	upl_page_info_array_t   user_page_list,
7683 	upl_control_flags_t     cntrl_flags,
7684 	vm_tag_t                tag)
7685 {
7686 	vm_page_t       dst_page;
7687 	unsigned int    entry;
7688 	int             page_count;
7689 	int             delayed_unlock = 0;
7690 	boolean_t       retval = TRUE;
7691 	ppnum_t         phys_page;
7692 
7693 	vm_object_lock_assert_exclusive(object);
7694 	assert(object->purgable != VM_PURGABLE_VOLATILE);
7695 	assert(object->purgable != VM_PURGABLE_EMPTY);
7696 	assert(object->pager == NULL);
7697 	assert(object->vo_copy == NULL);
7698 	assert(object->shadow == NULL);
7699 
7700 	page_count = object->resident_page_count;
7701 	dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
7702 
7703 	vm_page_lock_queues();
7704 
7705 	while (page_count--) {
7706 		if (dst_page->vmp_busy ||
7707 		    vm_page_is_fictitious(dst_page) ||
7708 		    dst_page->vmp_absent ||
7709 		    VMP_ERROR_GET(dst_page) ||
7710 		    dst_page->vmp_cleaning ||
7711 		    dst_page->vmp_restart ||
7712 		    dst_page->vmp_laundry) {
7713 			retval = FALSE;
7714 			goto done;
7715 		}
7716 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
7717 			retval = FALSE;
7718 			goto done;
7719 		}
7720 		dst_page->vmp_reference = TRUE;
7721 
7722 		vm_page_wire(dst_page, tag, FALSE);
7723 
7724 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7725 			SET_PAGE_DIRTY(dst_page, FALSE);
7726 		}
7727 		entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
7728 		assert(entry >= 0 && entry < object->resident_page_count);
7729 		bitmap_set(upl->lite_list, entry);
7730 
7731 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7732 
7733 		if (phys_page > upl->highest_page) {
7734 			upl->highest_page = phys_page;
7735 		}
7736 
7737 		if (user_page_list) {
7738 			user_page_list[entry].phys_addr = phys_page;
7739 			user_page_list[entry].absent    = dst_page->vmp_absent;
7740 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
7741 			user_page_list[entry].free_when_done   = dst_page->vmp_free_when_done;
7742 			user_page_list[entry].precious  = dst_page->vmp_precious;
7743 			user_page_list[entry].device    = FALSE;
7744 			user_page_list[entry].speculative = FALSE;
7745 			user_page_list[entry].cs_validated = FALSE;
7746 			user_page_list[entry].cs_tainted = FALSE;
7747 			user_page_list[entry].cs_nx     = FALSE;
7748 			user_page_list[entry].needed    = FALSE;
7749 			user_page_list[entry].mark      = FALSE;
7750 		}
7751 		if (delayed_unlock++ > 256) {
7752 			delayed_unlock = 0;
7753 			lck_mtx_yield(&vm_page_queue_lock);
7754 
7755 			VM_CHECK_MEMORYSTATUS;
7756 		}
7757 		dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
7758 	}
7759 done:
7760 	vm_page_unlock_queues();
7761 
7762 	VM_CHECK_MEMORYSTATUS;
7763 
7764 	return retval;
7765 }
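
/*
 * Note (descriptive only): this is the "full" fast path, used when every
 * page of the object is already resident and the object has no pager,
 * copy or shadow.  Any page that is busy, fictitious, absent, errored,
 * cleaning, restarted or in the laundry sends the caller back to the
 * slow path (retval == FALSE).
 */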
7766 
7767 
7768 static kern_return_t
7769 vm_object_iopl_wire_empty(
7770 	vm_object_t             object,
7771 	upl_t                   upl,
7772 	upl_page_info_array_t   user_page_list,
7773 	upl_control_flags_t     cntrl_flags,
7774 	vm_tag_t                tag,
7775 	vm_object_offset_t     *dst_offset,
7776 	int                     page_count,
7777 	int                    *page_grab_count)
7778 {
7779 	vm_page_t       dst_page;
7780 	boolean_t       no_zero_fill = FALSE;
7781 	int             interruptible;
7782 	int             pages_wired = 0;
7783 	int             pages_inserted = 0;
7784 	int             entry = 0;
7785 	uint64_t        delayed_ledger_update = 0;
7786 	kern_return_t   ret = KERN_SUCCESS;
7787 	int             grab_options;
7788 	ppnum_t         phys_page;
7789 
7790 	vm_object_lock_assert_exclusive(object);
7791 	assert(object->purgable != VM_PURGABLE_VOLATILE);
7792 	assert(object->purgable != VM_PURGABLE_EMPTY);
7793 	assert(object->pager == NULL);
7794 	assert(object->vo_copy == NULL);
7795 	assert(object->shadow == NULL);
7796 
7797 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
7798 		interruptible = THREAD_ABORTSAFE;
7799 	} else {
7800 		interruptible = THREAD_UNINT;
7801 	}
7802 
7803 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
7804 		no_zero_fill = TRUE;
7805 	}
7806 
7807 	grab_options = 0;
7808 #if CONFIG_SECLUDED_MEMORY
7809 	if (object->can_grab_secluded) {
7810 		grab_options |= VM_PAGE_GRAB_SECLUDED;
7811 	}
7812 #endif /* CONFIG_SECLUDED_MEMORY */
7813 
7814 	while (page_count--) {
7815 		while ((dst_page = vm_page_grab_options(grab_options))
7816 		    == VM_PAGE_NULL) {
7817 			OSAddAtomic(page_count, &vm_upl_wait_for_pages);
7818 
7819 			VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
7820 
7821 			if (vm_page_wait(interruptible) == FALSE) {
7822 				/*
7823 				 * interrupted case
7824 				 */
7825 				OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7826 
7827 				VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
7828 
7829 				ret = MACH_SEND_INTERRUPTED;
7830 				goto done;
7831 			}
7832 			OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7833 
7834 			VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
7835 		}
7836 
7837 		dst_page->vmp_absent = no_zero_fill;
7838 		dst_page->vmp_reference = TRUE;
7839 
7840 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7841 			SET_PAGE_DIRTY(dst_page, FALSE);
7842 		}
7843 		if (dst_page->vmp_absent == FALSE) {
7844 			assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
7845 			assert(dst_page->vmp_wire_count == 0);
7846 			dst_page->vmp_wire_count++;
7847 			dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
7848 			assert(dst_page->vmp_wire_count);
7849 			pages_wired++;
7850 
7851 
7852 			vm_page_wakeup_done(object, dst_page);
7853 		}
7854 		pages_inserted++;
7855 
7856 		vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
7857 
7858 		if (no_zero_fill == FALSE) {
7859 			vm_page_zero_fill(
7860 				dst_page
7861 				);
7862 		}
7863 
7864 		bitmap_set(upl->lite_list, entry);
7865 
7866 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7867 
7868 		if (phys_page > upl->highest_page) {
7869 			upl->highest_page = phys_page;
7870 		}
7871 
7872 		if (user_page_list) {
7873 			user_page_list[entry].phys_addr = phys_page;
7874 			user_page_list[entry].absent    = dst_page->vmp_absent;
7875 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
7876 			user_page_list[entry].free_when_done    = FALSE;
7877 			user_page_list[entry].precious  = FALSE;
7878 			user_page_list[entry].device    = FALSE;
7879 			user_page_list[entry].speculative = FALSE;
7880 			user_page_list[entry].cs_validated = FALSE;
7881 			user_page_list[entry].cs_tainted = FALSE;
7882 			user_page_list[entry].cs_nx     = FALSE;
7883 			user_page_list[entry].needed    = FALSE;
7884 			user_page_list[entry].mark      = FALSE;
7885 		}
7886 		entry++;
7887 		*dst_offset += PAGE_SIZE_64;
7888 	}
7889 done:
7890 	if (pages_wired) {
7891 		vm_page_lockspin_queues();
7892 		vm_page_wire_count += pages_wired;
7893 		vm_page_unlock_queues();
7894 	}
7895 	if (pages_inserted) {
7896 		if (object->internal) {
7897 			OSAddAtomic(pages_inserted, &vm_page_internal_count);
7898 		} else {
7899 			OSAddAtomic(pages_inserted, &vm_page_external_count);
7900 		}
7901 	}
7902 	if (delayed_ledger_update) {
7903 		task_t          owner;
7904 		int             ledger_idx_volatile;
7905 		int             ledger_idx_nonvolatile;
7906 		int             ledger_idx_volatile_compressed;
7907 		int             ledger_idx_nonvolatile_compressed;
7908 		int             ledger_idx_composite;
7909 		int             ledger_idx_external_wired;
7910 		boolean_t       do_footprint;
7911 
7912 		owner = VM_OBJECT_OWNER(object);
7913 		assert(owner);
7914 
7915 		vm_object_ledger_tag_ledgers(object,
7916 		    &ledger_idx_volatile,
7917 		    &ledger_idx_nonvolatile,
7918 		    &ledger_idx_volatile_compressed,
7919 		    &ledger_idx_nonvolatile_compressed,
7920 		    &ledger_idx_composite,
7921 		    &ledger_idx_external_wired,
7922 		    &do_footprint);
7923 
7924 		if (object->internal) {
7925 			/* more non-volatile bytes */
7926 			ledger_credit(owner->ledger,
7927 			    ledger_idx_nonvolatile,
7928 			    delayed_ledger_update);
7929 			if (do_footprint) {
7930 				/* more footprint */
7931 				ledger_credit(owner->ledger,
7932 				    task_ledgers.phys_footprint,
7933 				    delayed_ledger_update);
7934 			} else if (ledger_idx_composite != -1) {
7935 				ledger_credit(owner->ledger,
7936 				    ledger_idx_composite,
7937 				    delayed_ledger_update);
7938 			}
7939 		} else {
7940 			/* more external wired bytes */
7941 			ledger_credit(owner->ledger,
7942 			    ledger_idx_external_wired,
7943 			    delayed_ledger_update);
7944 			if (do_footprint) {
7945 				/* more footprint */
7946 				ledger_credit(owner->ledger,
7947 				    task_ledgers.phys_footprint,
7948 				    delayed_ledger_update);
7949 			} else if (ledger_idx_composite != -1) {
7950 				ledger_credit(owner->ledger,
7951 				    ledger_idx_composite,
7952 				    delayed_ledger_update);
7953 			}
7954 		}
7955 	}
7956 
7957 	assert(page_grab_count);
7958 	*page_grab_count = pages_inserted;
7959 
7960 	return ret;
7961 }
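
/*
 * Note (descriptive only): in the "empty" fast path above, per-page ledger
 * charging is deferred via the delayed_ledger_update accumulator passed to
 * vm_page_insert_internal().  The total is credited once at the end to the
 * owner's nonvolatile (internal) or external-wired ledger, plus either the
 * phys_footprint or a composite ledger, rather than being charged per page.
 */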
7962 
7963 
7964 kern_return_t
7965 vm_object_iopl_request(
7966 	vm_object_t             object,
7967 	vm_object_offset_t      offset,
7968 	upl_size_t              size,
7969 	upl_t                   *upl_ptr,
7970 	upl_page_info_array_t   user_page_list,
7971 	unsigned int            *page_list_count,
7972 	upl_control_flags_t     cntrl_flags,
7973 	vm_tag_t                tag)
7974 {
7975 	vm_page_t               dst_page;
7976 	vm_object_offset_t      dst_offset;
7977 	upl_size_t              xfer_size;
7978 	upl_t                   upl = NULL;
7979 	unsigned int            entry;
7980 	int                     no_zero_fill = FALSE;
7981 	unsigned int            size_in_pages;
7982 	int                     page_grab_count = 0;
7983 	u_int32_t               psize;
7984 	kern_return_t           ret;
7985 	vm_prot_t               prot;
7986 	struct vm_object_fault_info fault_info = {};
7987 	struct  vm_page_delayed_work    dw_array;
7988 	struct  vm_page_delayed_work    *dwp, *dwp_start;
7989 	bool                    dwp_finish_ctx = TRUE;
7990 	int                     dw_count;
7991 	int                     dw_limit;
7992 	int                     dw_index;
7993 	boolean_t               caller_lookup;
7994 	int                     io_tracking_flag = 0;
7995 	int                     interruptible;
7996 	ppnum_t                 phys_page;
7997 
7998 	boolean_t               set_cache_attr_needed = FALSE;
7999 	boolean_t               free_wired_pages = FALSE;
8000 	boolean_t               fast_path_empty_req = FALSE;
8001 	boolean_t               fast_path_full_req = FALSE;
8002 
8003 	task_t                  task = current_task();
8004 
8005 	dwp_start = dwp = NULL;
8006 
8007 	vm_object_offset_t original_offset = offset;
8008 	upl_size_t original_size = size;
8009 
8010 //	DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
8011 
8012 	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
8013 	offset = vm_object_trunc_page(offset);
8014 	if (size != original_size || offset != original_offset) {
8015 		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
8016 	}
8017 
8018 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
8019 		/*
8020 		 * For forward compatibility's sake,
8021 		 * reject any unknown flag.
8022 		 */
8023 		return KERN_INVALID_VALUE;
8024 	}
8025 	if (vm_lopage_needed == FALSE) {
8026 		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
8027 	}
8028 
8029 	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
8030 		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
8031 			return KERN_INVALID_VALUE;
8032 		}
8033 
8034 		if (object->phys_contiguous) {
8035 			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
8036 				return KERN_INVALID_ADDRESS;
8037 			}
8038 
8039 			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
8040 				return KERN_INVALID_ADDRESS;
8041 			}
8042 		}
8043 	}
8044 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8045 		no_zero_fill = TRUE;
8046 	}
8047 
8048 	if (cntrl_flags & UPL_COPYOUT_FROM) {
8049 		prot = VM_PROT_READ;
8050 	} else {
8051 		prot = VM_PROT_READ | VM_PROT_WRITE;
8052 	}
8053 
8054 	if ((!object->internal) && (object->paging_offset != 0)) {
8055 		panic("vm_object_iopl_request: external object with non-zero paging offset");
8056 	}
8057 
8058 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
8059 
8060 #if CONFIG_IOSCHED || UPL_DEBUG
8061 	if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) {
8062 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
8063 	}
8064 #endif
8065 
8066 #if CONFIG_IOSCHED
8067 	if (object->io_tracking) {
8068 		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
8069 		if (!is_kernel_object(object)) {
8070 			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
8071 		}
8072 	}
8073 #endif
8074 
8075 	if (object->phys_contiguous) {
8076 		psize = PAGE_SIZE;
8077 	} else {
8078 		psize = size;
8079 
8080 		dw_count = 0;
8081 		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8082 		dwp_start = vm_page_delayed_work_get_ctx();
8083 		if (dwp_start == NULL) {
8084 			dwp_start = &dw_array;
8085 			dw_limit = 1;
8086 			dwp_finish_ctx = FALSE;
8087 		}
8088 
8089 		dwp = dwp_start;
8090 	}
8091 
8092 	if (cntrl_flags & UPL_SET_INTERNAL) {
8093 		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8094 		user_page_list = size ? upl->page_list : NULL;
8095 	} else {
8096 		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8097 	}
8098 	if (user_page_list) {
8099 		user_page_list[0].device = FALSE;
8100 	}
8101 	*upl_ptr = upl;
8102 
8103 	if (cntrl_flags & UPL_NOZEROFILLIO) {
8104 		DTRACE_VM4(upl_nozerofillio,
8105 		    vm_object_t, object,
8106 		    vm_object_offset_t, offset,
8107 		    upl_size_t, size,
8108 		    upl_t, upl);
8109 	}
8110 
8111 	upl->map_object = object;
8112 	upl->u_offset = original_offset;
8113 	upl->u_size = original_size;
8114 
8115 	size_in_pages = size / PAGE_SIZE;
8116 
8117 	if (is_kernel_object(object) &&
8118 	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
8119 		upl->flags |= UPL_KERNEL_OBJECT;
8120 #if UPL_DEBUG
8121 		vm_object_lock(object);
8122 #else
8123 		vm_object_lock_shared(object);
8124 #endif
8125 	} else {
8126 		vm_object_lock(object);
8127 		vm_object_activity_begin(object);
8128 	}
8129 	/*
8130 	 * paging in progress also protects the paging_offset
8131 	 */
8132 	upl->u_offset = original_offset + object->paging_offset;
8133 
8134 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
8135 		/*
8136 		 * The user requested that access to the pages in this UPL
8137 		 * be blocked until the UPL is committed or aborted.
8138 		 */
8139 		upl->flags |= UPL_ACCESS_BLOCKED;
8140 	}
8141 
8142 #if CONFIG_IOSCHED || UPL_DEBUG
8143 	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8144 		vm_object_activity_begin(object);
8145 		queue_enter(&object->uplq, upl, upl_t, uplq);
8146 	}
8147 #endif
8148 
8149 	if (object->phys_contiguous) {
8150 		if (upl->flags & UPL_ACCESS_BLOCKED) {
8151 			assert(!object->blocked_access);
8152 			object->blocked_access = TRUE;
8153 		}
8154 
8155 		vm_object_unlock(object);
8156 
8157 		/*
8158 		 * don't need any shadow mappings for this one
8159 		 * since it is already I/O memory
8160 		 */
8161 		upl->flags |= UPL_DEVICE_MEMORY;
8162 
8163 		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
8164 
8165 		if (user_page_list) {
8166 			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
8167 			user_page_list[0].device = TRUE;
8168 		}
8169 		if (page_list_count != NULL) {
8170 			if (upl->flags & UPL_INTERNAL) {
8171 				*page_list_count = 0;
8172 			} else {
8173 				*page_list_count = 1;
8174 			}
8175 		}
8176 
8177 		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8178 		if (task != NULL) {
8179 			counter_add(&task->pages_grabbed_iopl, page_grab_count);
8180 		}
8181 		return KERN_SUCCESS;
8182 	}
8183 	if (!is_kernel_object(object) && object != compressor_object) {
8184 		/*
8185 		 * Protect user space from future COW operations
8186 		 */
8187 #if VM_OBJECT_TRACKING_OP_TRUESHARE
8188 		if (!object->true_share &&
8189 		    vm_object_tracking_btlog) {
8190 			btlog_record(vm_object_tracking_btlog, object,
8191 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
8192 			    btref_get(__builtin_frame_address(0), 0));
8193 		}
8194 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
8195 
8196 		vm_object_lock_assert_exclusive(object);
8197 		VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
8198 
8199 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
8200 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
8201 		}
8202 	}
8203 
8204 	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
8205 	    object->vo_copy != VM_OBJECT_NULL) {
8206 		/*
8207 		 * Honor copy-on-write obligations
8208 		 *
8209 		 * The caller is gathering these pages and
8210 		 * might modify their contents.  We need to
8211 		 * make sure that the copy object has its own
8212 		 * private copies of these pages before we let
8213 		 * the caller modify them.
8214 		 *
8215 		 * NOTE: someone else could map the original object
8216 		 * after we've done this copy-on-write here, and they
8217 		 * could then see an inconsistent picture of the memory
8218 		 * while it's being modified via the UPL.  To prevent this,
8219 		 * we would have to block access to these pages until the
8220 		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
8221 		 * code path for that...
8222 		 */
8223 		vm_object_update(object,
8224 		    offset,
8225 		    size,
8226 		    NULL,
8227 		    NULL,
8228 		    FALSE,              /* should_return */
8229 		    MEMORY_OBJECT_COPY_SYNC,
8230 		    VM_PROT_NO_CHANGE);
8231 		VM_PAGEOUT_DEBUG(iopl_cow, 1);
8232 		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
8233 	}
8234 	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
8235 	    object->purgable != VM_PURGABLE_VOLATILE &&
8236 	    object->purgable != VM_PURGABLE_EMPTY &&
8237 	    object->vo_copy == NULL &&
8238 	    size == object->vo_size &&
8239 	    offset == 0 &&
8240 	    object->shadow == NULL &&
8241 	    object->pager == NULL) {
8242 		if (object->resident_page_count == size_in_pages) {
8243 			assert(object != compressor_object);
8244 			assert(!is_kernel_object(object));
8245 			fast_path_full_req = TRUE;
8246 		} else if (object->resident_page_count == 0) {
8247 			assert(object != compressor_object);
8248 			assert(!is_kernel_object(object));
8249 			fast_path_empty_req = TRUE;
8250 			set_cache_attr_needed = TRUE;
8251 		}
8252 	}
8253 
8254 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
8255 		interruptible = THREAD_ABORTSAFE;
8256 	} else {
8257 		interruptible = THREAD_UNINT;
8258 	}
8259 
8260 	entry = 0;
8261 
8262 	xfer_size = size;
8263 	dst_offset = offset;
8264 
8265 	if (fast_path_full_req) {
8266 		if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
8267 			goto finish;
8268 		}
8269 		/*
8270 		 * we couldn't complete the processing of this request on the fast path
8271 		 * so fall through to the slow path and finish up
8272 		 */
8273 	} else if (fast_path_empty_req) {
8274 		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8275 			ret = KERN_MEMORY_ERROR;
8276 			goto return_err;
8277 		}
8278 		ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
8279 		    cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
8280 
8281 		if (ret) {
8282 			free_wired_pages = TRUE;
8283 			goto return_err;
8284 		}
8285 		goto finish;
8286 	}
8287 
8288 	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
8289 	fault_info.lo_offset = offset;
8290 	fault_info.hi_offset = offset + xfer_size;
8291 	fault_info.mark_zf_absent = TRUE;
8292 	fault_info.interruptible = interruptible;
8293 	fault_info.batch_pmap_op = TRUE;
8294 
8295 	while (xfer_size) {
8296 		vm_fault_return_t       result;
8297 
8298 		dwp->dw_mask = 0;
8299 
8300 		if (fast_path_full_req) {
8301 			/*
8302 			 * if we get here, it means that we ran into a page
8303 			 * state we couldn't handle in the fast path and
8304 			 * bailed out to the slow path... since the order
8305 			 * we look at pages is different between the 2 paths,
8306 			 * the following check is needed to determine whether
8307 			 * this page was already processed in the fast path
8308 			 */
8309 			if (bitmap_test(upl->lite_list, entry)) {
8310 				goto skip_page;
8311 			}
8312 		}
8313 		dst_page = vm_page_lookup(object, dst_offset);
8314 
8315 		if (dst_page == VM_PAGE_NULL ||
8316 		    dst_page->vmp_busy ||
8317 		    VMP_ERROR_GET(dst_page) ||
8318 		    dst_page->vmp_restart ||
8319 		    dst_page->vmp_absent ||
8320 		    vm_page_is_fictitious(dst_page)) {
8321 			if (is_kernel_object(object)) {
8322 				panic("vm_object_iopl_request: missing/bad page in kernel object");
8323 			}
8324 			if (object == compressor_object) {
8325 				panic("vm_object_iopl_request: missing/bad page in compressor object");
8326 			}
8327 
8328 			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8329 				ret = KERN_MEMORY_ERROR;
8330 				goto return_err;
8331 			}
8332 
8333 			if (dst_page != VM_PAGE_NULL &&
8334 			    dst_page->vmp_busy) {
8335 				wait_result_t wait_result;
8336 				vm_object_lock_assert_exclusive(object);
8337 				wait_result = vm_page_sleep(object, dst_page,
8338 				    interruptible, LCK_SLEEP_DEFAULT);
8339 				if (wait_result == THREAD_AWAKENED ||
8340 				    wait_result == THREAD_RESTART) {
8341 					continue;
8342 				}
8343 				ret = MACH_SEND_INTERRUPTED;
8344 				goto return_err;
8345 			}
8346 
8347 			set_cache_attr_needed = TRUE;
8348 
8349 			/*
8350 			 * We just looked up the page and the result remains valid
8351 			 * until the object lock is released, so send it to
8352 			 * vm_fault_page() (as "dst_page"), to avoid having to
8353 			 * look it up again there.
8354 			 */
8355 			caller_lookup = TRUE;
8356 
8357 			do {
8358 				vm_page_t       top_page;
8359 				kern_return_t   error_code;
8360 
8361 				fault_info.cluster_size = xfer_size;
8362 				vm_object_paging_begin(object);
8363 
8364 				result = vm_fault_page(object, dst_offset,
8365 				    prot | VM_PROT_WRITE, FALSE,
8366 				    caller_lookup,
8367 				    &prot, &dst_page, &top_page,
8368 				    (int *)0,
8369 				    &error_code, no_zero_fill,
8370 				    &fault_info);
8371 
8372 				/* our lookup is no longer valid at this point */
8373 				caller_lookup = FALSE;
8374 
8375 				switch (result) {
8376 				case VM_FAULT_SUCCESS:
8377 					page_grab_count++;
8378 
8379 					if (!dst_page->vmp_absent) {
8380 						vm_page_wakeup_done(object, dst_page);
8381 					} else {
8382 						/*
8383 						 * we only get back an absent page if we
8384 						 * requested that it not be zero-filled
8385 						 * because we are about to fill it via I/O
8386 						 *
8387 						 * absent pages should be left BUSY
8388 						 * to prevent them from being faulted
8389 						 * into an address space before we've
8390 						 * had a chance to complete the I/O on
8391 						 * them since they may contain info that
8392 						 * shouldn't be seen by the faulting task
8393 						 */
8394 					}
8395 					/*
8396 					 *	Release paging references and
8397 					 *	top-level placeholder page, if any.
8398 					 */
8399 					if (top_page != VM_PAGE_NULL) {
8400 						vm_object_t local_object;
8401 
8402 						local_object = VM_PAGE_OBJECT(top_page);
8403 
8404 						/*
8405 						 * comparing 2 packed pointers
8406 						 */
8407 						if (top_page->vmp_object != dst_page->vmp_object) {
8408 							vm_object_lock(local_object);
8409 							VM_PAGE_FREE(top_page);
8410 							vm_object_paging_end(local_object);
8411 							vm_object_unlock(local_object);
8412 						} else {
8413 							VM_PAGE_FREE(top_page);
8414 							vm_object_paging_end(local_object);
8415 						}
8416 					}
8417 					vm_object_paging_end(object);
8418 					break;
8419 
8420 				case VM_FAULT_RETRY:
8421 					vm_object_lock(object);
8422 					break;
8423 
8424 				case VM_FAULT_MEMORY_SHORTAGE:
8425 					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
8426 
8427 					VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8428 
8429 					if (vm_page_wait(interruptible)) {
8430 						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8431 
8432 						VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8433 						vm_object_lock(object);
8434 
8435 						break;
8436 					}
8437 					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8438 
8439 					VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8440 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
8441 					OS_FALLTHROUGH;
8442 
8443 				case VM_FAULT_INTERRUPTED:
8444 					error_code = MACH_SEND_INTERRUPTED;
8445 					OS_FALLTHROUGH;
8446 				case VM_FAULT_MEMORY_ERROR:
8447 memory_error:
8448 					ret = (error_code ? error_code: KERN_MEMORY_ERROR);
8449 
8450 					vm_object_lock(object);
8451 					goto return_err;
8452 
8453 				case VM_FAULT_SUCCESS_NO_VM_PAGE:
8454 					/* success but no page: fail */
8455 					vm_object_paging_end(object);
8456 					vm_object_unlock(object);
8457 					goto memory_error;
8458 
8459 				default:
8460 					panic("vm_object_iopl_request: unexpected error"
8461 					    " 0x%x from vm_fault_page()\n", result);
8462 				}
8463 			} while (result != VM_FAULT_SUCCESS);
8464 		}
8465 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8466 
8467 		if (upl->flags & UPL_KERNEL_OBJECT) {
8468 			goto record_phys_addr;
8469 		}
8470 
8471 		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
8472 			dst_page->vmp_busy = TRUE;
8473 			goto record_phys_addr;
8474 		}
8475 
8476 		if (dst_page->vmp_cleaning) {
8477 			/*
8478 			 * Someone else is cleaning this page in place.
8479 			 * In theory, we should be able to  proceed and use this
8480 			 * In theory, we could proceed and use this page, but
8481 			 * upl_commit_range() will likely clear the "busy" bit
8482 			 * when the cleaning completes even though it didn't set
8483 			 * it, which would clear our "busy" bit and open us up
8484 			 * to race conditions.
8485 			 * then try again.
8486 			 */
8487 			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
8488 			vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
8489 			continue;
8490 		}
8491 		if (dst_page->vmp_laundry) {
8492 			vm_pageout_steal_laundry(dst_page, FALSE);
8493 		}
8494 
8495 		if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
8496 		    phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
8498 			vm_page_t       new_page;
8499 			int             refmod;
8500 
8501 			/*
8502 			 * support devices that can't DMA above 32 bits
8503 			 * by substituting pages from a pool of low address
8504 			 * memory for any pages we find above the 4G mark.
8505 			 * We can't substitute if the page is already wired because
8506 			 * we don't know whether that physical address has been
8507 			 * handed out to some other 64-bit capable DMA device to use.
8508 			 */
8509 			if (VM_PAGE_WIRED(dst_page)) {
8510 				ret = KERN_PROTECTION_FAILURE;
8511 				goto return_err;
8512 			}
8513 
8515 			new_page = vm_page_grablo();
8517 
8518 			if (new_page == VM_PAGE_NULL) {
8519 				ret = KERN_RESOURCE_SHORTAGE;
8520 				goto return_err;
8521 			}
8522 			/*
8523 			 * from here until the vm_page_replace completes
8524 			 * we musn't drop the object lock... we don't
8525 			 * we mustn't drop the object lock... we don't
8526 			 * it after we disconnect it... we want the fault
8527 			 * to find the new page being substituted.
8528 			 */
8529 			if (dst_page->vmp_pmapped) {
8530 				refmod = pmap_disconnect(phys_page);
8531 			} else {
8532 				refmod = 0;
8533 			}
8534 
8535 			if (!dst_page->vmp_absent) {
8536 				vm_page_copy(dst_page, new_page);
8537 			}
8538 
8539 			new_page->vmp_reference = dst_page->vmp_reference;
8540 			new_page->vmp_dirty     = dst_page->vmp_dirty;
8541 			new_page->vmp_absent    = dst_page->vmp_absent;
8542 
8543 			if (refmod & VM_MEM_REFERENCED) {
8544 				new_page->vmp_reference = TRUE;
8545 			}
8546 			if (refmod & VM_MEM_MODIFIED) {
8547 				SET_PAGE_DIRTY(new_page, FALSE);
8548 			}
8549 
8550 			vm_page_replace(new_page, object, dst_offset);
8551 
8552 			dst_page = new_page;
8553 			/*
8554 			 * vm_page_grablo returned the page marked
8555 			 * BUSY... we don't need a PAGE_WAKEUP_DONE
8556 			 * here, because we've never dropped the object lock
8557 			 */
8558 			if (!dst_page->vmp_absent) {
8559 				dst_page->vmp_busy = FALSE;
8560 			}
8561 
8562 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8563 		}
8564 		if (!dst_page->vmp_busy) {
8565 			dwp->dw_mask |= DW_vm_page_wire;
8566 		}
8567 
8568 		if (cntrl_flags & UPL_BLOCK_ACCESS) {
8569 			/*
8570 			 * Mark the page "busy" to block any future page fault
8571 			 * on this page in addition to wiring it.
8572 			 * We'll also remove the mapping
8573 			 * of all these pages before leaving this routine.
8574 			 */
8575 			assert(!vm_page_is_fictitious(dst_page));
8576 			dst_page->vmp_busy = TRUE;
8577 		}
8578 		/*
8579 		 * expect the page to be used
8580 		 * page queues lock must be held to set 'reference'
8581 		 */
8582 		dwp->dw_mask |= DW_set_reference;
8583 
8584 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8585 			SET_PAGE_DIRTY(dst_page, TRUE);
8586 			/*
8587 			 * Page belonging to a code-signed object is about to
8588 			 * be written. Mark it tainted and disconnect it from
8589 			 * all pmaps so processes have to fault it back in and
8590 			 * deal with the tainted bit.
8591 			 */
8592 			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
8593 				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
8594 				vm_page_iopl_tainted++;
8595 				if (dst_page->vmp_pmapped) {
8596 					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
8597 					if (refmod & VM_MEM_REFERENCED) {
8598 						dst_page->vmp_reference = TRUE;
8599 					}
8600 				}
8601 			}
8602 		}
8603 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8604 			pmap_sync_page_attributes_phys(phys_page);
8605 			dst_page->vmp_written_by_kernel = FALSE;
8606 		}
8607 
8608 record_phys_addr:
8609 		if (dst_page->vmp_busy) {
8610 			upl->flags |= UPL_HAS_BUSY;
8611 		}
8612 
8613 		bitmap_set(upl->lite_list, entry);
8614 
8615 		if (phys_page > upl->highest_page) {
8616 			upl->highest_page = phys_page;
8617 		}
8618 
8619 		if (user_page_list) {
8620 			user_page_list[entry].phys_addr = phys_page;
8621 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
8622 			user_page_list[entry].absent    = dst_page->vmp_absent;
8623 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
8624 			user_page_list[entry].precious  = dst_page->vmp_precious;
8625 			user_page_list[entry].device    = FALSE;
8626 			user_page_list[entry].needed    = FALSE;
8627 			if (dst_page->vmp_clustered == TRUE) {
8628 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
8629 			} else {
8630 				user_page_list[entry].speculative = FALSE;
8631 			}
8632 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
8633 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
8634 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
8635 			user_page_list[entry].mark      = FALSE;
8636 		}
8637 		if (!is_kernel_object(object) && object != compressor_object) {
8638 			/*
8639 			 * someone is explicitly grabbing this page...
8640 			 * update clustered and speculative state
8641 			 *
8642 			 */
8643 			if (dst_page->vmp_clustered) {
8644 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
8645 			}
8646 		}
8647 skip_page:
8648 		entry++;
8649 		dst_offset += PAGE_SIZE_64;
8650 		xfer_size -= PAGE_SIZE;
8651 
8652 		if (dwp->dw_mask) {
8653 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
8654 
8655 			if (dw_count >= dw_limit) {
8656 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8657 
8658 				dwp = dwp_start;
8659 				dw_count = 0;
8660 			}
8661 		}
8662 	}
8663 	assert(entry == size_in_pages);
8664 
8665 	if (dw_count) {
8666 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8667 		dwp = dwp_start;
8668 		dw_count = 0;
8669 	}
8670 finish:
8671 	if (user_page_list && set_cache_attr_needed == TRUE) {
8672 		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
8673 	}
8674 
8675 	if (page_list_count != NULL) {
8676 		if (upl->flags & UPL_INTERNAL) {
8677 			*page_list_count = 0;
8678 		} else if (*page_list_count > size_in_pages) {
8679 			*page_list_count = size_in_pages;
8680 		}
8681 	}
8682 	vm_object_unlock(object);
8683 
8684 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
8685 		/*
8686 		 * We've marked all the pages "busy" so that future
8687 		 * page faults will block.
8688 		 * Now remove the mapping for these pages, so that they
8689 		 * can't be accessed without causing a page fault.
8690 		 */
8691 		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
8692 		    PMAP_NULL,
8693 		    PAGE_SIZE,
8694 		    0, VM_PROT_NONE);
8695 		assert(!object->blocked_access);
8696 		object->blocked_access = TRUE;
8697 	}
8698 
8699 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8700 	if (task != NULL) {
8701 		counter_add(&task->pages_grabbed_iopl, page_grab_count);
8702 	}
8703 
8704 	if (dwp_start && dwp_finish_ctx) {
8705 		vm_page_delayed_work_finish_ctx(dwp_start);
8706 		dwp_start = dwp = NULL;
8707 	}
8708 
8709 	return KERN_SUCCESS;
8710 
8711 return_err:
8712 	dw_index = 0;
8713 
8714 	for (; offset < dst_offset; offset += PAGE_SIZE) {
8715 		boolean_t need_unwire;
8716 		bool need_wakeup;
8717 
8718 		dst_page = vm_page_lookup(object, offset);
8719 
8720 		if (dst_page == VM_PAGE_NULL) {
8721 			panic("vm_object_iopl_request: Wired page missing.");
8722 		}
8723 
8724 		/*
8725 		 * if we've already processed this page in an earlier
8726 		 * dw_do_work, we need to undo the wiring... we will
8727 		 * leave the dirty and reference bits on if they
8728 		 * were set, since we don't have a good way of knowing
8729 		 * what the previous state was and we won't get here
8730 		 * under any normal circumstances... we will always
8731 		 * clear BUSY and wake up any waiters via vm_page_free
8732 		 * or PAGE_WAKEUP_DONE
8733 		 */
8734 		need_unwire = TRUE;
8735 
8736 		need_wakeup = false;
8737 		if (dw_count) {
8738 			if ((dwp_start)[dw_index].dw_m == dst_page) {
8739 				/*
8740 				 * still in the deferred work list
8741 				 * which means we haven't yet called
8742 				 * vm_page_wire on this page
8743 				 */
8744 				need_unwire = FALSE;
8745 
8746 				if (dst_page->vmp_busy &&
8747 				    ((dwp_start)[dw_index].dw_mask & DW_clear_busy)) {
8748 					/*
8749 					 * It's our own "busy" bit, so we need to clear it
8750 					 * now and wake up waiters below.
8751 					 */
8752 					dst_page->vmp_busy = false;
8753 					need_wakeup = true;
8754 				}
8755 
8756 				dw_index++;
8757 				dw_count--;
8758 			}
8759 		}
8760 		vm_page_lock_queues();
8761 
8762 		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
8763 			vm_page_free(dst_page);
8764 
8765 			need_unwire = FALSE;
8766 		} else {
8767 			if (need_unwire == TRUE) {
8768 				vm_page_unwire(dst_page, TRUE);
8769 			}
8770 			if (dst_page->vmp_busy) {
8771 				/* not our "busy" or we would have cleared it above */
8772 				assert(!need_wakeup);
8773 			}
8774 			if (need_wakeup) {
8775 				assert(!dst_page->vmp_busy);
8776 				vm_page_wakeup(object, dst_page);
8777 			}
8778 		}
8779 		vm_page_unlock_queues();
8780 
8781 		if (need_unwire == TRUE) {
8782 			counter_inc(&vm_statistics_reactivations);
8783 		}
8784 	}
8785 #if UPL_DEBUG
8786 	upl->upl_state = 2;
8787 #endif
8788 	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8789 		vm_object_activity_end(object);
8790 		vm_object_collapse(object, 0, TRUE);
8791 	}
8792 	vm_object_unlock(object);
8793 	upl_destroy(upl);
8794 
8795 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
8796 	if (task != NULL) {
8797 		counter_add(&task->pages_grabbed_iopl, page_grab_count);
8798 	}
8799 
8800 	if (dwp_start && dwp_finish_ctx) {
8801 		vm_page_delayed_work_finish_ctx(dwp_start);
8802 		dwp_start = dwp = NULL;
8803 	}
8804 	return ret;
8805 }
8806 
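/*
 * upl_transpose:
 *	Exchange the backing VM objects of two (non-vector) UPLs that each
 *	cover their object in full and are the same size.  On success the
 *	two UPLs end up pointing at each other's original object, and any
 *	per-object UPL tracking queues are updated to match.
 */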
8807 kern_return_t
8808 upl_transpose(
8809 	upl_t           upl1,
8810 	upl_t           upl2)
8811 {
8812 	kern_return_t           retval;
8813 	boolean_t               upls_locked;
8814 	vm_object_t             object1, object2;
8815 
8816 	/* LD: Should mapped UPLs be eligible for a transpose? */
8817 	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
8818 		return KERN_INVALID_ARGUMENT;
8819 	}
8820 
8821 	upls_locked = FALSE;
8822 
8823 	/*
8824 	 * Since we need to lock both UPLs at the same time,
8825 	 * avoid deadlocks by always taking locks in the same order.
8826 	 */
8827 	if (upl1 < upl2) {
8828 		upl_lock(upl1);
8829 		upl_lock(upl2);
8830 	} else {
8831 		upl_lock(upl2);
8832 		upl_lock(upl1);
8833 	}
8834 	upls_locked = TRUE;     /* the UPLs will need to be unlocked */
8835 
8836 	object1 = upl1->map_object;
8837 	object2 = upl2->map_object;
8838 
8839 	if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
8840 	    upl1->u_size != upl2->u_size) {
8841 		/*
8842 		 * We deal only with full objects, not subsets.
8843 		 * That's because we exchange the entire backing store info
8844 		 * for the objects: pager, resident pages, etc...  We can't do
8845 		 * only part of it.
8846 		 */
8847 		retval = KERN_INVALID_VALUE;
8848 		goto done;
8849 	}
8850 
8851 	/*
8852 	 * Transpose the VM objects' backing store.
8853 	 */
8854 	retval = vm_object_transpose(object1, object2,
8855 	    upl_adjusted_size(upl1, PAGE_MASK));
8856 
8857 	if (retval == KERN_SUCCESS) {
8858 		/*
8859 		 * Make each UPL point to the correct VM object, i.e. the
8860 		 * object holding the pages that the UPL refers to...
8861 		 */
8862 #if CONFIG_IOSCHED || UPL_DEBUG
8863 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
8864 			vm_object_lock(object1);
8865 			vm_object_lock(object2);
8866 		}
8867 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8868 			queue_remove(&object1->uplq, upl1, upl_t, uplq);
8869 		}
8870 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8871 			queue_remove(&object2->uplq, upl2, upl_t, uplq);
8872 		}
8873 #endif
8874 		upl1->map_object = object2;
8875 		upl2->map_object = object1;
8876 
8877 #if CONFIG_IOSCHED || UPL_DEBUG
8878 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8879 			queue_enter(&object2->uplq, upl1, upl_t, uplq);
8880 		}
8881 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8882 			queue_enter(&object1->uplq, upl2, upl_t, uplq);
8883 		}
8884 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
8885 			vm_object_unlock(object2);
8886 			vm_object_unlock(object1);
8887 		}
8888 #endif
8889 	}
8890 
8891 done:
8892 	/*
8893 	 * Cleanup.
8894 	 */
8895 	if (upls_locked) {
8896 		upl_unlock(upl1);
8897 		upl_unlock(upl2);
8898 		upls_locked = FALSE;
8899 	}
8900 
8901 	return retval;
8902 }
8903 
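/*
 * upl_range_needed:
 *	Mark "count" entries of an internal UPL's page list, starting at
 *	"index", as "needed" by the caller.  No-op for external UPLs or
 *	non-positive counts.
 */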
8904 void
8905 upl_range_needed(
8906 	upl_t           upl,
8907 	int             index,
8908 	int             count)
8909 {
8910 	int             size_in_pages;
8911 
8912 	if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
8913 		return;
8914 	}
8915 
8916 	size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8917 
8918 	while (count-- && index < size_in_pages) {
8919 		upl->page_list[index++].needed = TRUE;
8920 	}
8921 }
8922 
8923 
8924 /*
8925  * Reserve of virtual addresses in the kernel address space.
8926  * We need to map the physical pages in the kernel, so that we
8927  * can call the code-signing or slide routines with a kernel
8928  * virtual address.  We keep this pool of pre-allocated kernel
8929  * virtual addresses so that we don't have to scan the kernel's
8930  * virtual address space each time we need to work with
8931  * a physical page.
8932  */
8933 SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
8934 #define VM_PAGING_NUM_PAGES     64
8935 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
8936 bool            vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
8937 int             vm_paging_max_index = 0;
8938 int             vm_paging_page_waiter = 0;
8939 int             vm_paging_page_waiter_total = 0;
8940 
8941 unsigned long   vm_paging_no_kernel_page = 0;
8942 unsigned long   vm_paging_objects_mapped = 0;
8943 unsigned long   vm_paging_pages_mapped = 0;
8944 unsigned long   vm_paging_objects_mapped_slow = 0;
8945 unsigned long   vm_paging_pages_mapped_slow = 0;
8946 
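/*
 * vm_paging_map_init:
 *	Startup-time reservation of VM_PAGING_NUM_PAGES pages of pageable
 *	kernel virtual space backing the pre-allocated pool described above.
 */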
8947 __startup_func
8948 static void
8949 vm_paging_map_init(void)
8950 {
8951 	kmem_alloc(kernel_map, &vm_paging_base_address,
8952 	    ptoa(VM_PAGING_NUM_PAGES),
8953 	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
8954 	    VM_KERN_MEMORY_NONE);
8955 }
8956 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
8957 
8958 /*
8959  * vm_paging_map_object:
8960  *	Maps part of a VM object's pages in the kernel
8961  *      virtual address space, using the pre-allocated
8962  *	kernel virtual addresses, if possible.
8963  * Context:
8964  *      The VM object is locked.  This lock will get
8965  *      dropped and re-acquired though, so the caller
8966  *      must make sure the VM object is kept alive
8967  *	(by holding a VM map that has a reference
8968  *      on it, for example, or taking an extra reference).
8969  *      The page should also be kept busy to prevent
8970  *	it from being reclaimed.
8971  */
8972 kern_return_t
8973 vm_paging_map_object(
8974 	vm_page_t               page,
8975 	vm_object_t             object,
8976 	vm_object_offset_t      offset,
8977 	vm_prot_t               protection,
8978 	boolean_t               can_unlock_object,
8979 	vm_map_size_t           *size,          /* IN/OUT */
8980 	vm_map_offset_t         *address,       /* OUT */
8981 	boolean_t               *need_unmap)    /* OUT */
8982 {
8983 	kern_return_t           kr;
8984 	vm_map_offset_t         page_map_offset;
8985 	vm_map_size_t           map_size;
8986 	vm_object_offset_t      object_offset;
8987 	int                     i;
8988 
8989 	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
8990 		/* use permanent 1-to-1 kernel mapping of physical memory ? */
8991 		*address = (vm_map_offset_t)
8992 		    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
8993 		*need_unmap = FALSE;
8994 		return KERN_SUCCESS;
8995 
8996 		assert(page->vmp_busy);
8997 		/*
8998 		 * Use one of the pre-allocated kernel virtual addresses
8999 		 * and just enter the VM page in the kernel address space
9000 		 * at that virtual address.
9001 		 */
9002 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9003 
9004 		/*
9005 		 * Try and find an available kernel virtual address
9006 		 * from our pre-allocated pool.
9007 		 */
9008 		page_map_offset = 0;
9009 		for (;;) {
9010 			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
9011 				if (vm_paging_page_inuse[i] == FALSE) {
9012 					page_map_offset =
9013 					    vm_paging_base_address +
9014 					    (i * PAGE_SIZE);
9015 					break;
9016 				}
9017 			}
9018 			if (page_map_offset != 0) {
9019 				/* found a space to map our page ! */
9020 				break;
9021 			}
9022 
9023 			if (can_unlock_object) {
9024 				/*
9025 				 * If we can afford to unlock the VM object,
9026 				 * let's take the slow path now...
9027 				 */
9028 				break;
9029 			}
9030 			/*
9031 			 * We can't afford to unlock the VM object, so
9032 			 * let's wait for a space to become available...
9033 			 */
9034 			vm_paging_page_waiter_total++;
9035 			vm_paging_page_waiter++;
9036 			kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
9037 			if (kr == THREAD_WAITING) {
9038 				simple_unlock(&vm_paging_lock);
9039 				kr = thread_block(THREAD_CONTINUE_NULL);
9040 				simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9041 			}
9042 			vm_paging_page_waiter--;
9043 			/* ... and try again */
9044 		}
9045 
9046 		if (page_map_offset != 0) {
9047 			/*
9048 			 * We found a kernel virtual address;
9049 			 * map the physical page to that virtual address.
9050 			 */
9051 			if (i > vm_paging_max_index) {
9052 				vm_paging_max_index = i;
9053 			}
9054 			vm_paging_page_inuse[i] = TRUE;
9055 			simple_unlock(&vm_paging_lock);
9056 
9057 			page->vmp_pmapped = TRUE;
9058 
9059 			/*
9060 			 * Keep the VM object locked over the PMAP_ENTER
9061 			 * and the actual use of the page by the kernel,
9062 			 * or this pmap mapping might get undone by a
9063 			 * vm_object_pmap_protect() call...
9064 			 */
9065 			kr = pmap_enter_check(kernel_pmap,
9066 			    page_map_offset,
9067 			    page,
9068 			    protection,
9069 			    VM_PROT_NONE,
9070 			    TRUE);
9071 			assert(kr == KERN_SUCCESS);
9072 			vm_paging_objects_mapped++;
9073 			vm_paging_pages_mapped++;
9074 			*address = page_map_offset;
9075 			*need_unmap = TRUE;
9076 
9077 #if KASAN
9078 			kasan_notify_address(page_map_offset, PAGE_SIZE);
9079 #endif
9080 
9081 			/* all done and mapped, ready to use ! */
9082 			return KERN_SUCCESS;
9083 		}
9084 
9085 		/*
9086 		 * We ran out of pre-allocated kernel virtual
9087 		 * addresses.  Just map the page in the kernel
9088 		 * the slow and regular way.
9089 		 */
9090 		vm_paging_no_kernel_page++;
9091 		simple_unlock(&vm_paging_lock);
9092 	}
9093 
9094 	if (!can_unlock_object) {
9095 		*address = 0;
9096 		*size = 0;
9097 		*need_unmap = FALSE;
9098 		return KERN_NOT_SUPPORTED;
9099 	}
9100 
9101 	object_offset = vm_object_trunc_page(offset);
9102 	map_size = vm_map_round_page(*size,
9103 	    VM_MAP_PAGE_MASK(kernel_map));
9104 
9105 	/*
9106 	 * Try and map the required range of the object
9107 	 * in the kernel_map. Given that allocation is
9108 	 * for pageable memory, it shouldn't contain
9109 	 * pointers and is mapped into the data range.
9110 	 */
9111 
9112 	vm_object_reference_locked(object);     /* for the map entry */
9113 	vm_object_unlock(object);
9114 
9115 	kr = vm_map_enter(kernel_map,
9116 	    address,
9117 	    map_size,
9118 	    0,
9119 	    VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
9120 	    object,
9121 	    object_offset,
9122 	    FALSE,
9123 	    protection,
9124 	    VM_PROT_ALL,
9125 	    VM_INHERIT_NONE);
9126 	if (kr != KERN_SUCCESS) {
9127 		*address = 0;
9128 		*size = 0;
9129 		*need_unmap = FALSE;
9130 		vm_object_deallocate(object);   /* for the map entry */
9131 		vm_object_lock(object);
9132 		return kr;
9133 	}
9134 
9135 	*size = map_size;
9136 
9137 	/*
9138 	 * Enter the mapped pages in the page table now.
9139 	 */
9140 	vm_object_lock(object);
9141 	/*
9142 	 * VM object must be kept locked from before PMAP_ENTER()
9143 	 * until after the kernel is done accessing the page(s).
9144 	 * Otherwise, the pmap mappings in the kernel could be
9145 	 * undone by a call to vm_object_pmap_protect().
9146 	 */
9147 
9148 	for (page_map_offset = 0;
9149 	    map_size != 0;
9150 	    map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
9151 		page = vm_page_lookup(object, offset + page_map_offset);
9152 		if (page == VM_PAGE_NULL) {
9153 			printf("vm_paging_map_object: no page !?");
9154 			vm_object_unlock(object);
9155 			vm_map_remove(kernel_map, *address, *size);
9156 			*address = 0;
9157 			*size = 0;
9158 			*need_unmap = FALSE;
9159 			vm_object_lock(object);
9160 			return KERN_MEMORY_ERROR;
9161 		}
9162 		page->vmp_pmapped = TRUE;
9163 
9164 		kr = pmap_enter_check(kernel_pmap,
9165 		    *address + page_map_offset,
9166 		    page,
9167 		    protection,
9168 		    VM_PROT_NONE,
9169 		    TRUE);
9170 		assert(kr == KERN_SUCCESS);
9171 #if KASAN
9172 		kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
9173 #endif
9174 	}
9175 
9176 	vm_paging_objects_mapped_slow++;
9177 	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
9178 
9179 	*need_unmap = TRUE;
9180 
9181 	return KERN_SUCCESS;
9182 }
9183 
9184 /*
9185  * vm_paging_unmap_object:
9186  *	Unmaps part of a VM object's pages from the kernel
9187  *      virtual address space.
9188  * Context:
9189  *      The VM object is locked.  This lock will get
9190  *      dropped and re-acquired though.
9191  */
9192 void
9193 vm_paging_unmap_object(
9194 	vm_object_t     object,
9195 	vm_map_offset_t start,
9196 	vm_map_offset_t end)
9197 {
9198 	int             i;
9199 
9200 	if ((vm_paging_base_address == 0) ||
9201 	    (start < vm_paging_base_address) ||
9202 	    (end > (vm_paging_base_address
9203 	    + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
9204 		/*
9205 		 * We didn't use our pre-allocated pool of
9206 		 * kernel virtual addresses.  Deallocate the
9207 		 * virtual memory.
9208 		 */
9209 		if (object != VM_OBJECT_NULL) {
9210 			vm_object_unlock(object);
9211 		}
9212 		vm_map_remove(kernel_map, start, end);
9213 		if (object != VM_OBJECT_NULL) {
9214 			vm_object_lock(object);
9215 		}
9216 	} else {
9217 		/*
9218 		 * We used a kernel virtual address from our
9219 		 * pre-allocated pool.  Put it back in the pool
9220 		 * for next time.
9221 		 */
9222 		assert(end - start == PAGE_SIZE);
9223 		i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
9224 		assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
9225 
9226 		/* undo the pmap mapping */
9227 		pmap_remove(kernel_pmap, start, end);
9228 
9229 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9230 		vm_paging_page_inuse[i] = FALSE;
9231 		if (vm_paging_page_waiter) {
9232 			thread_wakeup(&vm_paging_page_waiter);
9233 		}
9234 		simple_unlock(&vm_paging_lock);
9235 	}
9236 }
9237 
9238 
9239 /*
9240  * page->vmp_object must be locked
9241  */
9242 void
9243 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
9244 {
9245 	if (!queues_locked) {
9246 		vm_page_lockspin_queues();
9247 	}
9248 
9249 	page->vmp_free_when_done = FALSE;
9250 	/*
9251 	 * need to drop the laundry count...
9252 	 * we may also need to remove it
9253 	 * from the I/O paging queue...
9254 	 * vm_pageout_throttle_up handles both cases
9255 	 *
9256 	 * the laundry and pageout_queue flags are cleared...
9257 	 */
9258 	vm_pageout_throttle_up(page);
9259 
9260 	if (!queues_locked) {
9261 		vm_page_unlock_queues();
9262 	}
9263 }
9264 
9265 #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
9266 
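/*
 * vector_upl_create:
 *	Allocate a container UPL that can aggregate up to "max_upls"
 *	sub-UPLs (capped at VECTOR_UPL_ELEMENTS_UPPER_LIMIT), anchored at
 *	"upl_offset".  The per-element iostate entries start out zeroed.
 */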
9267 upl_t
9268 vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
9269 {
9270 	int i = 0;
9271 	upl_t   upl;
9272 
9273 	assert(max_upls > 0);
9274 	if (max_upls == 0) {
9275 		return NULL;
9276 	}
9277 
9278 	if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
9279 		max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
9280 	}
9281 	vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
9282 
9283 	upl = upl_create(0, UPL_VECTOR, 0);
9284 	upl->vector_upl = vector_upl;
9285 	upl->u_offset = upl_offset;
9286 	vector_upl->size = 0;
9287 	vector_upl->offset = upl_offset;
9288 	vector_upl->invalid_upls = 0;
9289 	vector_upl->num_upls = 0;
9290 	vector_upl->pagelist = NULL;
9291 	vector_upl->max_upls = max_upls;
9292 
9293 	for (i = 0; i < max_upls; i++) {
9294 		vector_upl->upls[i].iostate.size = 0;
9295 		vector_upl->upls[i].iostate.offset = 0;
9296 	}
9297 	return upl;
9298 }
9299 
9300 upl_size_t
9301 vector_upl_get_size(const upl_t upl)
9302 {
9303 	if (!vector_upl_is_valid(upl)) {
9304 		return upl_get_size(upl);
9305 	} else {
9306 		return round_page_32(upl->vector_upl->size);
9307 	}
9308 }
9309 
9310 uint32_t
9311 vector_upl_max_upls(const upl_t upl)
9312 {
9313 	if (!vector_upl_is_valid(upl)) {
9314 		return 0;
9315 	}
9316 	return ((vector_upl_t)(upl->vector_upl))->max_upls;
9317 }
9318 
9319 void
9320 vector_upl_deallocate(upl_t upl)
9321 {
9322 	vector_upl_t vector_upl = upl->vector_upl;
9323 
9324 	assert(vector_upl_is_valid(upl));
9325 
9326 	if (vector_upl->invalid_upls != vector_upl->num_upls) {
9327 		panic("Deallocating non-empty Vectored UPL");
9328 	}
9329 	uint32_t max_upls = vector_upl->max_upls;
9330 	kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
9331 	kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
9332 	upl->vector_upl = NULL;
9333 }
9334 
9335 boolean_t
9336 vector_upl_is_valid(upl_t upl)
9337 {
9338 	return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
9339 }
9340 
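/*
 * vector_upl_set_subupl:
 *	With a non-zero "io_size", add "subupl" as the next element of the
 *	vector UPL, growing the vector's size by io_size (rounded up to at
 *	least one page).  With io_size == 0, mark the existing element
 *	invalid; returns TRUE once every element has been invalidated, so
 *	the caller knows the vector UPL can be torn down.
 */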
9341 boolean_t
9342 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
9343 {
9344 	if (vector_upl_is_valid(upl)) {
9345 		vector_upl_t vector_upl = upl->vector_upl;
9346 
9347 		if (vector_upl) {
9348 			if (subupl) {
9349 				if (io_size) {
9350 					if (io_size < PAGE_SIZE) {
9351 						io_size = PAGE_SIZE;
9352 					}
9353 					subupl->vector_upl = (void*)vector_upl;
9354 					vector_upl->upls[vector_upl->num_upls++].elem = subupl;
9355 					vector_upl->size += io_size;
9356 					upl->u_size += io_size;
9357 				} else {
9358 					uint32_t i = 0, invalid_upls = 0;
9359 					for (i = 0; i < vector_upl->num_upls; i++) {
9360 						if (vector_upl->upls[i].elem == subupl) {
9361 							break;
9362 						}
9363 					}
9364 					if (i == vector_upl->num_upls) {
9365 						panic("Trying to remove sub-upl when none exists");
9366 					}
9367 
9368 					vector_upl->upls[i].elem = NULL;
9369 					invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
9370 					    relaxed);
9371 					if (invalid_upls == vector_upl->num_upls) {
9372 						return TRUE;
9373 					} else {
9374 						return FALSE;
9375 					}
9376 				}
9377 			} else {
9378 				panic("vector_upl_set_subupl was passed a NULL upl element");
9379 			}
9380 		} else {
9381 			panic("vector_upl_set_subupl was passed a non-vectored upl");
9382 		}
9383 	} else {
9384 		panic("vector_upl_set_subupl was passed a NULL upl");
9385 	}
9386 
9387 	return FALSE;
9388 }
9389 
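/*
 * vector_upl_set_pagelist:
 *	Build the vector UPL's flat page list by concatenating the page
 *	lists of all its sub-UPLs, and track the highest physical page
 *	seen across them.
 */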
9390 void
9391 vector_upl_set_pagelist(upl_t upl)
9392 {
9393 	if (vector_upl_is_valid(upl)) {
9394 		uint32_t i = 0;
9395 		vector_upl_t vector_upl = upl->vector_upl;
9396 
9397 		if (vector_upl) {
9398 			vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
9399 
9400 			vector_upl->pagelist = kalloc_type(struct upl_page_info,
9401 			    atop(vector_upl->size), Z_WAITOK);
9402 
9403 			for (i = 0; i < vector_upl->num_upls; i++) {
9404 				cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
9405 				bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
9406 				pagelist_size += cur_upl_pagelist_size;
9407 				if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
9408 					upl->highest_page = vector_upl->upls[i].elem->highest_page;
9409 				}
9410 			}
9411 			assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
9412 		} else {
9413 			panic("vector_upl_set_pagelist was passed a non-vectored upl");
9414 		}
9415 	} else {
9416 		panic("vector_upl_set_pagelist was passed a NULL upl");
9417 	}
9418 }
9419 
9420 upl_t
9421 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
9422 {
9423 	if (vector_upl_is_valid(upl)) {
9424 		vector_upl_t vector_upl = upl->vector_upl;
9425 		if (vector_upl) {
9426 			if (index < vector_upl->num_upls) {
9427 				return vector_upl->upls[index].elem;
9428 			}
9429 		} else {
9430 			panic("vector_upl_subupl_byindex was passed a non-vectored upl");
9431 		}
9432 	}
9433 	return NULL;
9434 }
9435 
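/*
 * vector_upl_subupl_byoffset:
 *	Map an (offset, size) range within the vector UPL to the sub-UPL
 *	whose iostate covers that offset, trimming *upl_size and rebasing
 *	*upl_offset so they are relative to that sub-UPL.  Returns NULL if
 *	the matching element has already been committed or aborted.
 */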
9436 upl_t
9437 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
9438 {
9439 	if (vector_upl_is_valid(upl)) {
9440 		uint32_t i = 0;
9441 		vector_upl_t vector_upl = upl->vector_upl;
9442 
9443 		if (vector_upl) {
9444 			upl_t subupl = NULL;
9445 			vector_upl_iostates_t subupl_state;
9446 
9447 			for (i = 0; i < vector_upl->num_upls; i++) {
9448 				subupl = vector_upl->upls[i].elem;
9449 				subupl_state = vector_upl->upls[i].iostate;
9450 				if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
9451 					/* We could have been passed an offset/size pair that belongs
9452 					 * to a UPL element that has already been committed/aborted.
9453 					 * If so, return NULL.
9454 					 */
9455 					if (subupl == NULL) {
9456 						return NULL;
9457 					}
9458 					if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
9459 						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
9460 						if (*upl_size > subupl_state.size) {
9461 							*upl_size = subupl_state.size;
9462 						}
9463 					}
9464 					if (*upl_offset >= subupl_state.offset) {
9465 						*upl_offset -= subupl_state.offset;
9466 					} else if (i) {
9467 						panic("Vector UPL offset miscalculation");
9468 					}
9469 					return subupl;
9470 				}
9471 			}
9472 		} else {
9473 			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
9474 		}
9475 	}
9476 	return NULL;
9477 }
9478 
9479 void
9480 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
9481 {
9482 	*v_upl_submap = NULL;
9483 
9484 	if (vector_upl_is_valid(upl)) {
9485 		vector_upl_t vector_upl = upl->vector_upl;
9486 		if (vector_upl) {
9487 			*v_upl_submap = vector_upl->submap;
9488 			*submap_dst_addr = vector_upl->submap_dst_addr;
9489 		} else {
9490 			panic("vector_upl_get_submap was passed a non-vectored UPL");
9491 		}
9492 	} else {
9493 		panic("vector_upl_get_submap was passed a null UPL");
9494 	}
9495 }
9496 
9497 void
9498 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
9499 {
9500 	if (vector_upl_is_valid(upl)) {
9501 		vector_upl_t vector_upl = upl->vector_upl;
9502 		if (vector_upl) {
9503 			vector_upl->submap = submap;
9504 			vector_upl->submap_dst_addr = submap_dst_addr;
9505 		} else {
9506 			panic("vector_upl_get_submap was passed a non-vectored UPL");
9507 		}
9508 	} else {
9509 		panic("vector_upl_get_submap was passed a NULL UPL");
9510 	}
9511 }
9512 
9513 void
9514 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
9515 {
9516 	if (vector_upl_is_valid(upl)) {
9517 		uint32_t i = 0;
9518 		vector_upl_t vector_upl = upl->vector_upl;
9519 
9520 		if (vector_upl) {
9521 			for (i = 0; i < vector_upl->num_upls; i++) {
9522 				if (vector_upl->upls[i].elem == subupl) {
9523 					break;
9524 				}
9525 			}
9526 
9527 			if (i == vector_upl->num_upls) {
9528 				panic("setting sub-upl iostate when none exists");
9529 			}
9530 
9531 			vector_upl->upls[i].iostate.offset = offset;
9532 			if (size < PAGE_SIZE) {
9533 				size = PAGE_SIZE;
9534 			}
9535 			vector_upl->upls[i].iostate.size = size;
9536 		} else {
9537 			panic("vector_upl_set_iostate was passed a non-vectored UPL");
9538 		}
9539 	} else {
9540 		panic("vector_upl_set_iostate was passed a NULL UPL");
9541 	}
9542 }
9543 
9544 void
9545 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
9546 {
9547 	if (vector_upl_is_valid(upl)) {
9548 		uint32_t i = 0;
9549 		vector_upl_t vector_upl = upl->vector_upl;
9550 
9551 		if (vector_upl) {
9552 			for (i = 0; i < vector_upl->num_upls; i++) {
9553 				if (vector_upl->upls[i].elem == subupl) {
9554 					break;
9555 				}
9556 			}
9557 
9558 			if (i == vector_upl->num_upls) {
9559 				panic("getting sub-upl iostate when none exists");
9560 			}
9561 
9562 			*offset = vector_upl->upls[i].iostate.offset;
9563 			*size = vector_upl->upls[i].iostate.size;
9564 		} else {
9565 			panic("vector_upl_get_iostate was passed a non-vectored UPL");
9566 		}
9567 	} else {
9568 		panic("vector_upl_get_iostate was passed a NULL UPL");
9569 	}
9570 }
9571 
9572 void
9573 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
9574 {
9575 	if (vector_upl_is_valid(upl)) {
9576 		vector_upl_t vector_upl = upl->vector_upl;
9577 		if (vector_upl) {
9578 			if (index < vector_upl->num_upls) {
9579 				*offset = vector_upl->upls[index].iostate.offset;
9580 				*size = vector_upl->upls[index].iostate.size;
9581 			} else {
9582 				*offset = *size = 0;
9583 			}
9584 		} else {
9585 			panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
9586 		}
9587 	} else {
9588 		panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
9589 	}
9590 }
9591 
9592 void *
9593 upl_get_internal_vectorupl(upl_t upl)
9594 {
9595 	return upl->vector_upl;
9596 }
9597 
9598 upl_page_info_t *
9599 upl_get_internal_vectorupl_pagelist(upl_t upl)
9600 {
9601 	return upl->vector_upl->pagelist;
9602 }
9603 
9604 upl_page_info_t *
9605 upl_get_internal_page_list(upl_t upl)
9606 {
9607 	return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
9608 }
9609 
9610 void
9611 upl_clear_dirty(
9612 	upl_t           upl,
9613 	boolean_t       value)
9614 {
9615 	if (value) {
9616 		upl->flags |= UPL_CLEAR_DIRTY;
9617 	} else {
9618 		upl->flags &= ~UPL_CLEAR_DIRTY;
9619 	}
9620 }
9621 
9622 void
9623 upl_set_referenced(
9624 	upl_t           upl,
9625 	boolean_t       value)
9626 {
9627 	upl_lock(upl);
9628 	if (value) {
9629 		upl->ext_ref_count++;
9630 	} else {
9631 		if (!upl->ext_ref_count) {
9632 			panic("upl_set_referenced not %p", upl);
9633 		}
9634 		upl->ext_ref_count--;
9635 	}
9636 	upl_unlock(upl);
9637 }
9638 
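/*
 * upl_set_map_exclusive / upl_clear_map_exclusive:
 *	Serialize use of the UPL's mapping address: the setter sleeps until
 *	no other thread owns it and then records the current thread's ctid
 *	as owner; the clearer drops ownership and wakes any waiters.
 */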
9639 void
9640 upl_set_map_exclusive(upl_t upl)
9641 {
9642 	upl_lock(upl);
9643 	while (upl->map_addr_owner) {
9644 		upl->flags |= UPL_MAP_EXCLUSIVE_WAIT;
9645 		upl_lock_sleep(upl, &upl->map_addr_owner, ctid_get_thread(upl->map_addr_owner));
9646 	}
9647 	upl->map_addr_owner = thread_get_ctid(current_thread());
9648 	upl_unlock(upl);
9649 }
9650 
9651 void
9652 upl_clear_map_exclusive(upl_t upl)
9653 {
9654 	assert(upl->map_addr_owner == thread_get_ctid(current_thread()));
9655 	upl_lock(upl);
9656 	if (upl->flags & UPL_MAP_EXCLUSIVE_WAIT) {
9657 		upl->flags &= ~UPL_MAP_EXCLUSIVE_WAIT;
9658 		upl_wakeup(&upl->map_addr_owner);
9659 	}
9660 	upl->map_addr_owner = 0;
9661 	upl_unlock(upl);
9662 }
9663 
9664 #if CONFIG_IOSCHED
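/*
 * upl_set_blkno:
 *	Record the disk block number and I/O size for every page of the UPL
 *	covered by [upl_offset, upl_offset + io_size), for use by the I/O
 *	reprioritization code.  Only meaningful for UPLs created with
 *	expedite support.
 */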
9665 void
9666 upl_set_blkno(
9667 	upl_t           upl,
9668 	vm_offset_t     upl_offset,
9669 	int             io_size,
9670 	int64_t         blkno)
9671 {
9672 	int i, j;
9673 	if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
9674 		return;
9675 	}
9676 
9677 	assert(upl->upl_reprio_info != 0);
9678 	for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
9679 		UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
9680 	}
9681 }
9682 #endif
9683 
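/*
 * memoryshot:
 *	When vm_debug_events is enabled, emit a KERNEL_DEBUG tracepoint for
 *	"event" carrying a snapshot of the active, inactive, free,
 *	speculative and throttled page counts.
 */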
9684 void inline
9685 memoryshot(unsigned int event, unsigned int control)
9686 {
9687 	if (vm_debug_events) {
9688 		KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
9689 		    vm_page_active_count, vm_page_inactive_count,
9690 		    vm_page_free_count, vm_page_speculative_count,
9691 		    vm_page_throttled_count);
9692 	} else {
9693 		(void) event;
9694 		(void) control;
9695 	}
9696 }
9697 
9698 #ifdef MACH_BSD
9699 
9700 boolean_t
9701 upl_device_page(upl_page_info_t *upl)
9702 {
9703 	return UPL_DEVICE_PAGE(upl);
9704 }
9705 boolean_t
9706 upl_page_present(upl_page_info_t *upl, int index)
9707 {
9708 	return UPL_PAGE_PRESENT(upl, index);
9709 }
9710 boolean_t
9711 upl_speculative_page(upl_page_info_t *upl, int index)
9712 {
9713 	return UPL_SPECULATIVE_PAGE(upl, index);
9714 }
9715 boolean_t
9716 upl_dirty_page(upl_page_info_t *upl, int index)
9717 {
9718 	return UPL_DIRTY_PAGE(upl, index);
9719 }
9720 boolean_t
9721 upl_valid_page(upl_page_info_t *upl, int index)
9722 {
9723 	return UPL_VALID_PAGE(upl, index);
9724 }
9725 ppnum_t
9726 upl_phys_page(upl_page_info_t *upl, int index)
9727 {
9728 	return UPL_PHYS_PAGE(upl, index);
9729 }
9730 
9731 void
9732 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
9733 {
9734 	upl[index].mark = v;
9735 }
9736 
9737 boolean_t
9738 upl_page_get_mark(upl_page_info_t *upl, int index)
9739 {
9740 	return upl[index].mark;
9741 }
9742 
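/*
 * vm_countdirtypages:
 *	Debug helper: walk the inactive, throttled and anonymous queues,
 *	then the active queue, printing per-group counts of dirty,
 *	pageout (free_when_done) and precious pages.
 */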
9743 void
9744 vm_countdirtypages(void)
9745 {
9746 	vm_page_t m;
9747 	int dpages;
9748 	int pgopages;
9749 	int precpages;
9750 
9751 
9752 	dpages = 0;
9753 	pgopages = 0;
9754 	precpages = 0;
9755 
9756 	vm_page_lock_queues();
9757 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
9758 	do {
9759 		if (m == (vm_page_t)0) {
9760 			break;
9761 		}
9762 
9763 		if (m->vmp_dirty) {
9764 			dpages++;
9765 		}
9766 		if (m->vmp_free_when_done) {
9767 			pgopages++;
9768 		}
9769 		if (m->vmp_precious) {
9770 			precpages++;
9771 		}
9772 
9773 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9774 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9775 		if (m == (vm_page_t)0) {
9776 			break;
9777 		}
9778 	} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
9779 	vm_page_unlock_queues();
9780 
9781 	vm_page_lock_queues();
9782 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
9783 	do {
9784 		if (m == (vm_page_t)0) {
9785 			break;
9786 		}
9787 
9788 		dpages++;
9789 		assert(m->vmp_dirty);
9790 		assert(!m->vmp_free_when_done);
9791 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9792 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9793 		if (m == (vm_page_t)0) {
9794 			break;
9795 		}
9796 	} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
9797 	vm_page_unlock_queues();
9798 
9799 	vm_page_lock_queues();
9800 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
9801 	do {
9802 		if (m == (vm_page_t)0) {
9803 			break;
9804 		}
9805 
9806 		if (m->vmp_dirty) {
9807 			dpages++;
9808 		}
9809 		if (m->vmp_free_when_done) {
9810 			pgopages++;
9811 		}
9812 		if (m->vmp_precious) {
9813 			precpages++;
9814 		}
9815 
9816 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9817 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9818 		if (m == (vm_page_t)0) {
9819 			break;
9820 		}
9821 	} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
9822 	vm_page_unlock_queues();
9823 
9824 	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
9825 
9826 	dpages = 0;
9827 	pgopages = 0;
9828 	precpages = 0;
9829 
9830 	vm_page_lock_queues();
9831 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
9832 
9833 	do {
9834 		if (m == (vm_page_t)0) {
9835 			break;
9836 		}
9837 		if (m->vmp_dirty) {
9838 			dpages++;
9839 		}
9840 		if (m->vmp_free_when_done) {
9841 			pgopages++;
9842 		}
9843 		if (m->vmp_precious) {
9844 			precpages++;
9845 		}
9846 
9847 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9848 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9849 		if (m == (vm_page_t)0) {
9850 			break;
9851 		}
9852 	} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
9853 	vm_page_unlock_queues();
9854 
9855 	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
9856 }
9857 #endif /* MACH_BSD */
9858 
9859 
9860 #if CONFIG_IOSCHED
9861 int
9862 upl_get_cached_tier(upl_t  upl)
9863 {
9864 	assert(upl);
9865 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
9866 		return upl->upl_priority;
9867 	}
9868 	return -1;
9869 }
9870 #endif /* CONFIG_IOSCHED */
9871 
9872 
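/*
 * upl_callout_iodone:
 *	If an I/O completion context was attached to this UPL via
 *	upl_set_iodone(), invoke its io_done callback with the stored
 *	context and error code.
 */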
9873 void
9874 upl_callout_iodone(upl_t upl)
9875 {
9876 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
9877 
9878 	if (upl_ctx) {
9879 		void    (*iodone_func)(void *, int) = upl_ctx->io_done;
9880 
9881 		assert(upl_ctx->io_done);
9882 
9883 		(*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
9884 	}
9885 }
9886 
9887 void
9888 upl_set_iodone(upl_t upl, void *upl_iodone)
9889 {
9890 	upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
9891 }
9892 
9893 void
9894 upl_set_iodone_error(upl_t upl, int error)
9895 {
9896 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
9897 
9898 	if (upl_ctx) {
9899 		upl_ctx->io_error = error;
9900 	}
9901 }
9902 
9903 
9904 ppnum_t
9905 upl_get_highest_page(
9906 	upl_t                      upl)
9907 {
9908 	return upl->highest_page;
9909 }
9910 
9911 upl_size_t
9912 upl_get_size(
9913 	upl_t                      upl)
9914 {
9915 	return upl_adjusted_size(upl, PAGE_MASK);
9916 }
9917 
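/*
 * upl_adjusted_size:
 *	A UPL's u_offset/u_size need not be aligned to the given page mask;
 *	return the size of the aligned region that fully covers the UPL's
 *	range.
 */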
9918 upl_size_t
9919 upl_adjusted_size(
9920 	upl_t upl,
9921 	vm_map_offset_t pgmask)
9922 {
9923 	vm_object_offset_t start_offset, end_offset;
9924 
9925 	start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
9926 	end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
9927 
9928 	return (upl_size_t)(end_offset - start_offset);
9929 }
9930 
9931 vm_object_offset_t
9932 upl_adjusted_offset(
9933 	upl_t upl,
9934 	vm_map_offset_t pgmask)
9935 {
9936 	return trunc_page_mask_64(upl->u_offset, pgmask);
9937 }
9938 
9939 vm_object_offset_t
9940 upl_get_data_offset(
9941 	upl_t upl)
9942 {
9943 	return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
9944 }
9945 
9946 upl_t
9947 upl_associated_upl(upl_t upl)
9948 {
9949 	return upl->associated_upl;
9950 }
9951 
9952 void
9953 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
9954 {
9955 	upl->associated_upl = associated_upl;
9956 }
9957 
9958 struct vnode *
9959 upl_lookup_vnode(upl_t upl)
9960 {
9961 	if (!upl->map_object->internal) {
9962 		return vnode_pager_lookup_vnode(upl->map_object->pager);
9963 	} else {
9964 		return NULL;
9965 	}
9966 }
9967 
9968 boolean_t
9969 upl_has_wired_pages(upl_t upl)
9970 {
9971 	return (upl->flags & UPL_HAS_WIRED) ? TRUE : FALSE;
9972 }
9973 
9974 #if UPL_DEBUG
9975 kern_return_t
9976 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
9977 {
9978 	upl->ubc_alias1 = alias1;
9979 	upl->ubc_alias2 = alias2;
9980 	return KERN_SUCCESS;
9981 }
9982 int
9983 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
9984 {
9985 	if (al) {
9986 		*al = upl->ubc_alias1;
9987 	}
9988 	if (al2) {
9989 		*al2 = upl->ubc_alias2;
9990 	}
9991 	return KERN_SUCCESS;
9992 }
9993 #endif /* UPL_DEBUG */
9994 
9995 #if VM_PRESSURE_EVENTS
9996 /*
9997  * Upward trajectory.
9998  */
9999 
10000 boolean_t
10001 VM_PRESSURE_NORMAL_TO_WARNING(void)
10002 {
10003 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10004 		/* Available pages below our threshold */
10005 		uint32_t available_pages = memorystatus_get_available_page_count();
10006 		if (available_pages < memorystatus_get_soft_memlimit_page_shortage_threshold()) {
10007 #if CONFIG_FREEZE
10008 			/* No frozen processes to kill */
10009 			if (memorystatus_frozen_count == 0) {
10010 				/* Not enough suspended processes available. */
10011 				if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
10012 					return TRUE;
10013 				}
10014 			}
10015 #else /* CONFIG_FREEZE */
10016 			return TRUE;
10017 #endif /* CONFIG_FREEZE */
10018 		}
10019 		return FALSE;
10020 	} else {
10021 		return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
10022 	}
10023 }
10024 
10025 boolean_t
10026 VM_PRESSURE_WARNING_TO_CRITICAL(void)
10027 {
10028 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10029 		/* Available pages below our threshold */
10030 		uint32_t available_pages = memorystatus_get_available_page_count();
10031 		return available_pages < memorystatus_get_critical_page_shortage_threshold();
10032 	} else {
10033 		return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10034 	}
10035 }
10036 
10037 /*
10038  * Downward trajectory.
10039  */
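/*
 * Each downward transition requires extra headroom relative to the matching upward
 * trigger (115% of the page-shortage thresholds here, and 1.2x / 1.4x of the
 * compressor thresholds versus 1.0x / 1.2x on the way up), which helps keep the
 * pressure level from flapping when the counts hover near a single cut-over point.
 */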
10040 boolean_t
10041 VM_PRESSURE_WARNING_TO_NORMAL(void)
10042 {
10043 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10044 		/* Available pages above our threshold */
10045 		uint32_t available_pages = memorystatus_get_available_page_count();
10046 		uint32_t target_threshold = (((115 * memorystatus_get_soft_memlimit_page_shortage_threshold()) / 100));
10047 		return available_pages > target_threshold;
10048 	} else {
10049 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
10050 	}
10051 }
10052 
10053 boolean_t
10054 VM_PRESSURE_CRITICAL_TO_WARNING(void)
10055 {
10056 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10057 		uint32_t available_pages = memorystatus_get_available_page_count();
10058 		uint32_t target_threshold = (((115 * memorystatus_get_critical_page_shortage_threshold()) / 100));
10059 		return available_pages > target_threshold;
10060 	} else {
10061 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10062 	}
10063 }
10064 #endif /* VM_PRESSURE_EVENTS */
10065 
10066 #if DEVELOPMENT || DEBUG
10067 bool compressor_running_perf_test;
10068 uint64_t compressor_perf_test_pages_processed;
10069 
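/*
 * move_pages_to_queue() walks [start_addr, start_addr + buffer_size) in the given map,
 * pulls each resident page off whatever paging queue it is on and appends it to `queue`,
 * reporting the count in *pages_moved.  Only anonymous memory in the top-level map and
 * object qualifies (no submaps, wired entries, shadowed or external objects), and the
 * map's page size must match the kernel's; anything else fails the call with
 * KERN_INVALID_ARGUMENT.  Used by the compressor perf test below to stage benchmark pages.
 */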
10070 static kern_return_t
10071 move_pages_to_queue(
10072 	vm_map_t map,
10073 	user_addr_t start_addr,
10074 	size_t buffer_size,
10075 	vm_page_queue_head_t *queue,
10076 	size_t *pages_moved)
10077 {
10078 	kern_return_t err = KERN_SUCCESS;
10079 	vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
10080 	boolean_t addr_in_map = FALSE;
10081 	user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
10082 	vm_object_t curr_object = VM_OBJECT_NULL;
10083 	*pages_moved = 0;
10084 
10085 
10086 	if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
10087 		/*
10088 		 * We don't currently support benchmarking maps with a different page size
10089 		 * than the kernel.
10090 		 */
10091 		return KERN_INVALID_ARGUMENT;
10092 	}
10093 
10094 	if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
10095 		return KERN_INVALID_ARGUMENT;
10096 	}
10097 
10098 	vm_map_lock_read(map);
10099 	curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
10100 	end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
10101 
10102 
10103 	while (curr_addr < end_addr) {
10104 		addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
10105 		if (!addr_in_map) {
10106 			err = KERN_INVALID_ARGUMENT;
10107 			break;
10108 		}
10109 		curr_object = VME_OBJECT(curr_entry);
10110 		if (curr_object) {
10111 			vm_object_lock(curr_object);
10112 			/* We really only want anonymous memory that's in the top level map and object here. */
10113 			if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
10114 			    curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
10115 				err = KERN_INVALID_ARGUMENT;
10116 				vm_object_unlock(curr_object);
10117 				break;
10118 			}
10119 			vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
10120 			vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
10121 			    (curr_entry->vme_start + VME_OFFSET(curr_entry));
10122 			vm_map_offset_t curr_offset = start_offset;
10123 			vm_page_t curr_page;
10124 			while (curr_offset < end_offset) {
10125 				curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
10126 				if (curr_page != VM_PAGE_NULL) {
10127 					vm_page_lock_queues();
10128 					if (curr_page->vmp_laundry) {
10129 						vm_pageout_steal_laundry(curr_page, TRUE);
10130 					}
10131 					/*
10132 					 * We've already factored out pages in the laundry, which
10133 					 * means this page can't be on the pageout queue, so it's
10134 					 * safe to do the vm_page_queues_remove.
10135 					 */
10136 					bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
10137 					vm_page_queues_remove(curr_page, TRUE);
10138 					if (donate) {
10139 						/*
10140 						 * The compressor needs to see this bit to know
10141 						 * where this page needs to land. Also if stolen,
10142 						 * this bit helps put the page back in the right
10143 						 * special queue where it belongs.
10144 						 */
10145 						curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
10146 					}
10147 					// Clear the referenced bit so we ensure this gets paged out
10148 					curr_page->vmp_reference = false;
10149 					if (curr_page->vmp_pmapped) {
10150 						pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
10151 						    VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
10152 					}
10153 					vm_page_queue_enter(queue, curr_page, vmp_pageq);
10154 					vm_page_unlock_queues();
10155 					*pages_moved += 1;
10156 				}
10157 				curr_offset += PAGE_SIZE_64;
10158 				curr_addr += PAGE_SIZE_64;
10159 			}
10160 		}
10161 		vm_object_unlock(curr_object);
10162 	}
10163 	vm_map_unlock_read(map);
10164 	return err;
10165 }
10166 
10167 /*
10168  * Local queue for processing benchmark pages.
10169  * Can't be allocated on the stack because the pointer has to
10170  * be packable.
10171  */
10172 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
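/*
 * run_compressor_perf_test() times how long the compressor takes to process the
 * caller-supplied buffer: the buffer's resident pages are moved onto the private queue
 * above, handed to vm_pageout_page_queue() for the compressor, and the elapsed time,
 * bytes submitted and growth of the compressed pool are returned.  Only one instance
 * may run at a time, it must be driven from a user task, and on failure any pages
 * already moved are put back on the active queue.
 */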
10173 kern_return_t
10174 run_compressor_perf_test(
10175 	user_addr_t buf,
10176 	size_t buffer_size,
10177 	uint64_t *time,
10178 	uint64_t *bytes_compressed,
10179 	uint64_t *compressor_growth)
10180 {
10181 	kern_return_t err = KERN_SUCCESS;
10182 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10183 		return KERN_NOT_SUPPORTED;
10184 	}
10185 	if (current_task() == kernel_task) {
10186 		return KERN_INVALID_ARGUMENT;
10187 	}
10188 	vm_page_lock_queues();
10189 	if (compressor_running_perf_test) {
10190 		/* Only run one instance of the benchmark at a time. */
10191 		vm_page_unlock_queues();
10192 		return KERN_RESOURCE_SHORTAGE;
10193 	}
10194 	vm_page_unlock_queues();
10195 	size_t page_count = 0;
10196 	vm_map_t map;
10197 	vm_page_t p, next;
10198 	uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
10199 	uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
10200 	*bytes_compressed = *compressor_growth = 0;
10201 
10202 	vm_page_queue_init(&compressor_perf_test_queue);
10203 	map = current_task()->map;
10204 	err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
10205 	if (err != KERN_SUCCESS) {
10206 		goto out;
10207 	}
10208 
10209 	vm_page_lock_queues();
10210 	compressor_running_perf_test = true;
10211 	compressor_perf_test_pages_processed = 0;
10212 	/*
10213 	 * At this point the compressor threads should only process the benchmark queue,
10214 	 * so we can look at the difference in c_segment_compressed_bytes across the run
10215 	 * to determine how many compressed bytes we ended up using.
10216 	 */
10217 	compressed_bytes_start = os_atomic_load(&c_segment_compressed_bytes, relaxed);
10218 	vm_page_unlock_queues();
10219 
10220 	page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);
10221 
10222 	vm_page_lock_queues();
10223 	compressor_perf_test_start = mach_absolute_time();
10224 
10225 	// Wake up the first compressor I/O thread
10226 	sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
10227 	    pgo_iothread_internal_state[0].pgo_iothread);
10228 
10229 	/*
10230 	 * Depending on when this test is run, we could overshoot or be right on the mark
10231 	 * with our page_count, so the comparison is of the _less than_ variety.
10232 	 */
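	/*
	 * Standard assert_wait()/thread_block() sleep: drop the page-queues lock around
	 * the block and re-take it afterwards.  The compressor side is expected to bump
	 * compressor_perf_test_pages_processed and post a wakeup on that event as it
	 * retires benchmark pages (that accounting lives elsewhere in the pageout path).
	 */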
10233 	while (compressor_perf_test_pages_processed < page_count) {
10234 		assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
10235 		vm_page_unlock_queues();
10236 		thread_block(THREAD_CONTINUE_NULL);
10237 		vm_page_lock_queues();
10238 	}
10239 	compressor_perf_test_end = mach_absolute_time();
10240 	compressed_bytes_end = os_atomic_load(&c_segment_compressed_bytes, relaxed);
10241 	vm_page_unlock_queues();
10242 
10243 
10244 out:
10245 	/*
10246 	 * If we errored out above, then we could still have some pages
10247 	 * on the local queue. Make sure to put them back on the active queue before
10248 	 * returning so they're not orphaned.
10249 	 */
10250 	vm_page_lock_queues();
10251 	absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
10252 	p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
10253 	while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
10254 		next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);
10255 
10256 		vm_page_enqueue_active(p, FALSE);
10257 		p = next;
10258 	}
10259 
10260 	compressor_running_perf_test = false;
10261 	vm_page_unlock_queues();
10262 	if (err == KERN_SUCCESS) {
10263 		*bytes_compressed = page_count * PAGE_SIZE_64;
10264 		*compressor_growth = compressed_bytes_end - compressed_bytes_start;
10265 	}
10266 
10267 	/*
10268 	 * pageout_scan will consider waking the compactor swapper
10269 	 * before it blocks. Do the same thing here before we return
10270 	 * to ensure that back to back benchmark runs can't overly fragment the
10271 	 * compressor pool.
10272 	 */
10273 	vm_consider_waking_compactor_swapper();
10274 	return err;
10275 }
10276 #endif /* DEVELOPMENT || DEBUG */
10277