xref: /xnu-10002.1.13/osfmk/vm/vm_pageout.c (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_pageout.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	The proverbial page-out daemon.
64  */
65 
66 #include <stdint.h>
67 #include <ptrauth.h>
68 
69 #include <debug.h>
70 
71 #include <mach/mach_types.h>
72 #include <mach/memory_object.h>
73 #include <mach/mach_host_server.h>
74 #include <mach/upl.h>
75 #include <mach/vm_map.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/sdt.h>
79 
80 #include <kern/kern_types.h>
81 #include <kern/counter.h>
82 #include <kern/host_statistics.h>
83 #include <kern/machine.h>
84 #include <kern/misc_protos.h>
85 #include <kern/sched.h>
86 #include <kern/thread.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/policy_internal.h>
90 #include <kern/thread_group.h>
91 
92 #include <os/log.h>
93 
94 #include <sys/kdebug_triage.h>
95 
96 #include <machine/vm_tuning.h>
97 #include <machine/commpage.h>
98 
99 #include <vm/pmap.h>
100 #include <vm/vm_compressor_pager.h>
101 #include <vm/vm_fault.h>
102 #include <vm/vm_map_internal.h>
103 #include <vm/vm_object.h>
104 #include <vm/vm_page.h>
105 #include <vm/vm_pageout.h>
106 #include <vm/vm_protos.h> /* must be last */
107 #include <vm/memory_object.h>
108 #include <vm/vm_purgeable_internal.h>
109 #include <vm/vm_shared_region.h>
110 #include <vm/vm_compressor.h>
111 
112 #include <san/kasan.h>
113 
114 #if CONFIG_PHANTOM_CACHE
115 #include <vm/vm_phantom_cache.h>
116 #endif
117 
118 #if UPL_DEBUG
119 #include <libkern/OSDebug.h>
120 #endif
121 
122 extern int cs_debug;
123 
124 #if CONFIG_MBUF_MCACHE
125 extern void mbuf_drain(boolean_t);
126 #endif /* CONFIG_MBUF_MCACHE */
127 
128 #if VM_PRESSURE_EVENTS
129 #if CONFIG_JETSAM
130 extern unsigned int memorystatus_available_pages;
131 extern unsigned int memorystatus_available_pages_pressure;
132 extern unsigned int memorystatus_available_pages_critical;
133 #else /* CONFIG_JETSAM */
134 extern uint64_t memorystatus_available_pages;
135 extern uint64_t memorystatus_available_pages_pressure;
136 extern uint64_t memorystatus_available_pages_critical;
137 #endif /* CONFIG_JETSAM */
138 
139 extern unsigned int memorystatus_frozen_count;
140 extern unsigned int memorystatus_suspended_count;
141 extern vm_pressure_level_t memorystatus_vm_pressure_level;
142 
143 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
144 extern uint32_t memorystatus_jetsam_fg_band_waiters;
145 
146 void vm_pressure_response(void);
147 extern void consider_vm_pressure_events(void);
148 
149 #define MEMORYSTATUS_SUSPENDED_THRESHOLD  4
150 #endif /* VM_PRESSURE_EVENTS */
151 
152 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
153 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
154 #if CONFIG_VPS_DYNAMIC_PRIO
155 TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
156 #else
157 const bool vps_dynamic_priority_enabled = false;
158 #endif
159 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
160 
161 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
162 #if !XNU_TARGET_OS_OSX
163 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
164 #else /* !XNU_TARGET_OS_OSX */
165 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
166 #endif /* !XNU_TARGET_OS_OSX */
167 #endif
168 
169 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
170 #define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
171 #endif
172 
173 #ifndef VM_PAGE_LAUNDRY_MAX
174 #define VM_PAGE_LAUNDRY_MAX     128UL   /* maximum pageouts on a given pageout queue */
175 #endif  /* VM_PAGE_LAUNDRY_MAX */
176 
177 #ifndef VM_PAGEOUT_BURST_WAIT
178 #define VM_PAGEOUT_BURST_WAIT   1       /* milliseconds */
179 #endif  /* VM_PAGEOUT_BURST_WAIT */
180 
181 #ifndef VM_PAGEOUT_EMPTY_WAIT
182 #define VM_PAGEOUT_EMPTY_WAIT   50      /* milliseconds */
183 #endif  /* VM_PAGEOUT_EMPTY_WAIT */
184 
185 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
186 #define VM_PAGEOUT_DEADLOCK_WAIT 100    /* milliseconds */
187 #endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
188 
189 #ifndef VM_PAGEOUT_IDLE_WAIT
190 #define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
191 #endif  /* VM_PAGEOUT_IDLE_WAIT */
192 
193 #ifndef VM_PAGEOUT_SWAP_WAIT
194 #define VM_PAGEOUT_SWAP_WAIT    10      /* milliseconds */
195 #endif  /* VM_PAGEOUT_SWAP_WAIT */
196 
197 
198 #ifndef VM_PAGE_SPECULATIVE_TARGET
199 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
200 #endif /* VM_PAGE_SPECULATIVE_TARGET */
201 
202 
203 /*
204  *	To obtain a reasonable LRU approximation, the inactive queue
205  *	needs to be large enough to give pages on it a chance to be
206  *	referenced a second time.  This macro defines the fraction
207  *	of active+inactive pages that should be inactive.
208  *	The pageout daemon uses it to update vm_page_inactive_target.
209  *
210  *	If vm_page_free_count falls below vm_page_free_target and
211  *	vm_page_inactive_count is below vm_page_inactive_target,
212  *	then the pageout daemon starts running.
213  */
214 
215 #ifndef VM_PAGE_INACTIVE_TARGET
216 #define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
217 #endif  /* VM_PAGE_INACTIVE_TARGET */
218 
219 /*
220  *	Once the pageout daemon starts running, it keeps going
221  *	until vm_page_free_count meets or exceeds vm_page_free_target.
222  */
223 
224 #ifndef VM_PAGE_FREE_TARGET
225 #if !XNU_TARGET_OS_OSX
226 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
227 #else /* !XNU_TARGET_OS_OSX */
228 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
229 #endif /* !XNU_TARGET_OS_OSX */
230 #endif  /* VM_PAGE_FREE_TARGET */
231 
232 
233 /*
234  *	The pageout daemon always starts running once vm_page_free_count
235  *	falls below vm_page_free_min.
236  */
237 
238 #ifndef VM_PAGE_FREE_MIN
239 #if !XNU_TARGET_OS_OSX
240 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
241 #else /* !XNU_TARGET_OS_OSX */
242 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
243 #endif /* !XNU_TARGET_OS_OSX */
244 #endif  /* VM_PAGE_FREE_MIN */
245 
246 #if !XNU_TARGET_OS_OSX
247 #define VM_PAGE_FREE_RESERVED_LIMIT     100
248 #define VM_PAGE_FREE_MIN_LIMIT          1500
249 #define VM_PAGE_FREE_TARGET_LIMIT       2000
250 #else /* !XNU_TARGET_OS_OSX */
251 #define VM_PAGE_FREE_RESERVED_LIMIT     1700
252 #define VM_PAGE_FREE_MIN_LIMIT          3500
253 #define VM_PAGE_FREE_TARGET_LIMIT       4000
254 #endif /* !XNU_TARGET_OS_OSX */
255 
256 /*
257  *	When vm_page_free_count falls below vm_page_free_reserved,
258  *	only vm-privileged threads can allocate pages.  vm-privilege
259  *	allows the pageout daemon and default pager (and any other
260  *	associated threads needed for default pageout) to continue
261  *	operation by dipping into the reserved pool of pages.
262  */
263 
264 #ifndef VM_PAGE_FREE_RESERVED
265 #define VM_PAGE_FREE_RESERVED(n)        \
266 	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
267 #endif  /* VM_PAGE_FREE_RESERVED */
268 
269 /*
270  *	When we dequeue pages from the inactive list, they are
271  *	reactivated (ie, put back on the active queue) if referenced.
272  *	However, it is possible to starve the free list if other
273  *	processors are referencing pages faster than we can turn off
274  *	the referenced bit.  So we limit the number of reactivations
275  *	we will make per call of vm_pageout_scan().
276  */
277 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
278 
279 #ifndef VM_PAGE_REACTIVATE_LIMIT
280 #if !XNU_TARGET_OS_OSX
281 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
282 #else /* !XNU_TARGET_OS_OSX */
283 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
284 #endif /* !XNU_TARGET_OS_OSX */
285 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
286 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
287 
288 int vm_pageout_protect_realtime = true;
289 
290 extern boolean_t hibernate_cleaning_in_progress;
291 
292 struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
293 struct pgo_iothread_state pgo_iothread_external_state;
294 
295 #if VM_PRESSURE_EVENTS
296 void vm_pressure_thread(void);
297 
298 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
299 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
300 
301 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
302 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
303 #endif
304 
305 static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
306 static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
307 static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);
308 
309 extern void vm_pageout_continue(void);
310 extern void vm_pageout_scan(void);
311 
312 boolean_t vm_pageout_running = FALSE;
313 
314 uint32_t vm_page_upl_tainted = 0;
315 uint32_t vm_page_iopl_tainted = 0;
316 
317 #if XNU_TARGET_OS_OSX
318 static boolean_t vm_pageout_waiter  = FALSE;
319 #endif /* XNU_TARGET_OS_OSX */
320 
321 
322 #if DEVELOPMENT || DEBUG
323 struct vm_pageout_debug vm_pageout_debug;
324 #endif
325 struct vm_pageout_vminfo vm_pageout_vminfo;
326 struct vm_pageout_state  vm_pageout_state;
327 struct vm_config         vm_config;
328 
329 struct  vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
330 struct  vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
331 #if DEVELOPMENT || DEBUG
332 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
333 #endif /* DEVELOPMENT || DEBUG */
334 
335 int         vm_upl_wait_for_pages = 0;
336 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
337 
338 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
339 
340 int     vm_debug_events = 0;
341 
342 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
343 
344 #if CONFIG_MEMORYSTATUS
345 extern boolean_t memorystatus_kill_on_VM_page_shortage(void);
346 
347 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
348 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
349 
350 #endif
351 
352 #if __AMP__
353 
354 
355 /*
356  * Bind compressor threads to e-cores unless there are multiple non-e clusters
357  */
358 #if (MAX_CPU_CLUSTERS > 2)
359 #define VM_COMPRESSOR_EBOUND_DEFAULT false
360 #else
361 #define VM_COMPRESSOR_EBOUND_DEFAULT true
362 #endif
363 
364 TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
365 int vm_pgo_pbound = 0;
366 extern void thread_bind_cluster_type(thread_t, char, bool);
367 
368 #endif /* __AMP__ */
369 
370 
371 /*
372  *	Routine:	vm_pageout_object_terminate
373  *	Purpose:
374  *		Destroy the pageout_object, and perform all of the
375  *		required cleanup actions.
376  *
377  *	In/Out conditions:
378  *		The object must be locked, and will be returned locked.
379  */
380 void
vm_pageout_object_terminate(vm_object_t object)381 vm_pageout_object_terminate(
382 	vm_object_t     object)
383 {
384 	vm_object_t     shadow_object;
385 
386 	/*
387 	 * Deal with the deallocation (last reference) of a pageout object
388 	 * (used for cleaning-in-place) by dropping the paging references/
389 	 * freeing pages in the original object.
390 	 */
391 
392 	assert(object->pageout);
393 	shadow_object = object->shadow;
394 	vm_object_lock(shadow_object);
395 
396 	while (!vm_page_queue_empty(&object->memq)) {
397 		vm_page_t               p, m;
398 		vm_object_offset_t      offset;
399 
400 		p = (vm_page_t) vm_page_queue_first(&object->memq);
401 
402 		assert(p->vmp_private);
403 		assert(p->vmp_free_when_done);
404 		p->vmp_free_when_done = FALSE;
405 		assert(!p->vmp_cleaning);
406 		assert(!p->vmp_laundry);
407 
408 		offset = p->vmp_offset;
409 		VM_PAGE_FREE(p);
410 		p = VM_PAGE_NULL;
411 
412 		m = vm_page_lookup(shadow_object,
413 		    offset + object->vo_shadow_offset);
414 
415 		if (m == VM_PAGE_NULL) {
416 			continue;
417 		}
418 
419 		assert((m->vmp_dirty) || (m->vmp_precious) ||
420 		    (m->vmp_busy && m->vmp_cleaning));
421 
422 		/*
423 		 * Handle the trusted pager throttle.
424 		 * Also decrement the burst throttle (if external).
425 		 */
426 		vm_page_lock_queues();
427 		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
428 			vm_pageout_throttle_up(m);
429 		}
430 
431 		/*
432 		 * Handle the "target" page(s). These pages are to be freed if
433 		 * successfully cleaned. Target pages are always busy, and are
434 		 * wired exactly once. The initial target pages are not mapped,
435 		 * (so cannot be referenced or modified) but converted target
436 		 * pages may have been modified between the selection as an
437 		 * adjacent page and conversion to a target.
438 		 */
439 		if (m->vmp_free_when_done) {
440 			assert(m->vmp_busy);
441 			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
442 			assert(m->vmp_wire_count == 1);
443 			m->vmp_cleaning = FALSE;
444 			m->vmp_free_when_done = FALSE;
445 			/*
446 			 * Revoke all access to the page. Since the object is
447 			 * locked, and the page is busy, this prevents the page
448 			 * from being dirtied after the pmap_disconnect() call
449 			 * returns.
450 			 *
451 			 * Since the page is left "dirty" but "not modifed", we
452 			 * can detect whether the page was redirtied during
453 			 * pageout by checking the modify state.
454 			 */
455 			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
456 				SET_PAGE_DIRTY(m, FALSE);
457 			} else {
458 				m->vmp_dirty = FALSE;
459 			}
460 
461 			if (m->vmp_dirty) {
462 				vm_page_unwire(m, TRUE);        /* reactivates */
463 				counter_inc(&vm_statistics_reactivations);
464 				PAGE_WAKEUP_DONE(m);
465 			} else {
466 				vm_page_free(m);  /* clears busy, etc. */
467 			}
468 			vm_page_unlock_queues();
469 			continue;
470 		}
471 		/*
472 		 * Handle the "adjacent" pages. These pages were cleaned in
473 		 * place, and should be left alone.
474 		 * If prep_pin_count is nonzero, then someone is using the
475 		 * page, so make it active.
476 		 */
477 		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
478 			if (m->vmp_reference) {
479 				vm_page_activate(m);
480 			} else {
481 				vm_page_deactivate(m);
482 			}
483 		}
484 		if (m->vmp_overwriting) {
485 			/*
486 			 * the (COPY_OUT_FROM == FALSE) request_page_list case
487 			 */
488 			if (m->vmp_busy) {
489 				/*
490 				 * We do not re-set m->vmp_dirty !
491 				 * The page was busy so no extraneous activity
492 				 * could have occurred. COPY_INTO is a read into the
493 				 * new pages. CLEAN_IN_PLACE does actually write
494 				 * out the pages but handling outside of this code
495 				 * will take care of resetting dirty. We clear the
496 				 * modify however for the Programmed I/O case.
497 				 */
498 				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
499 
500 				m->vmp_busy = FALSE;
501 				m->vmp_absent = FALSE;
502 			} else {
503 				/*
504 				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
505 				 * Occurs when the original page was wired
506 				 * at the time of the list request
507 				 */
508 				assert(VM_PAGE_WIRED(m));
509 				vm_page_unwire(m, TRUE);        /* reactivates */
510 			}
511 			m->vmp_overwriting = FALSE;
512 		} else {
513 			m->vmp_dirty = FALSE;
514 		}
515 		m->vmp_cleaning = FALSE;
516 
517 		/*
518 		 * Wakeup any thread waiting for the page to be un-cleaning.
519 		 */
520 		PAGE_WAKEUP(m);
521 		vm_page_unlock_queues();
522 	}
523 	/*
524 	 * Account for the paging reference taken in vm_paging_object_allocate.
525 	 */
526 	vm_object_activity_end(shadow_object);
527 	vm_object_unlock(shadow_object);
528 
529 	assert(object->ref_count == 0);
530 	assert(object->paging_in_progress == 0);
531 	assert(object->activity_in_progress == 0);
532 	assert(object->resident_page_count == 0);
533 	return;
534 }
535 
536 /*
537  * Routine:	vm_pageclean_setup
538  *
539  * Purpose:	setup a page to be cleaned (made non-dirty), but not
540  *		necessarily flushed from the VM page cache.
541  *		This is accomplished by cleaning in place.
542  *
543  *		The page must not be busy, and new_object
544  *		must be locked.
545  *
546  */
547 static void
vm_pageclean_setup(vm_page_t m,vm_page_t new_m,vm_object_t new_object,vm_object_offset_t new_offset)548 vm_pageclean_setup(
549 	vm_page_t               m,
550 	vm_page_t               new_m,
551 	vm_object_t             new_object,
552 	vm_object_offset_t      new_offset)
553 {
554 	assert(!m->vmp_busy);
555 #if 0
556 	assert(!m->vmp_cleaning);
557 #endif
558 
559 	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
560 
561 	/*
562 	 * Mark original page as cleaning in place.
563 	 */
564 	m->vmp_cleaning = TRUE;
565 	SET_PAGE_DIRTY(m, FALSE);
566 	m->vmp_precious = FALSE;
567 
568 	/*
569 	 * Convert the fictitious page to a private shadow of
570 	 * the real page.
571 	 */
572 	assert(new_m->vmp_fictitious);
573 	assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
574 	new_m->vmp_fictitious = FALSE;
575 	new_m->vmp_private = TRUE;
576 	new_m->vmp_free_when_done = TRUE;
577 	VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
578 
579 	vm_page_lockspin_queues();
580 	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
581 	vm_page_unlock_queues();
582 
583 	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
584 	assert(!new_m->vmp_wanted);
585 	new_m->vmp_busy = FALSE;
586 }
587 
588 /*
589  *	Routine:	vm_pageout_initialize_page
590  *	Purpose:
591  *		Causes the specified page to be initialized in
592  *		the appropriate memory object. This routine is used to push
593  *		pages into a copy-object when they are modified in the
594  *		permanent object.
595  *
596  *		The page is moved to a temporary object and paged out.
597  *
598  *	In/out conditions:
599  *		The page in question must not be on any pageout queues.
600  *		The object to which it belongs must be locked.
601  *		The page must be busy, but not hold a paging reference.
602  *
603  *	Implementation:
604  *		Move this page to a completely new object.
605  */
606 void
vm_pageout_initialize_page(vm_page_t m)607 vm_pageout_initialize_page(
608 	vm_page_t       m)
609 {
610 	vm_object_t             object;
611 	vm_object_offset_t      paging_offset;
612 	memory_object_t         pager;
613 
614 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
615 
616 	object = VM_PAGE_OBJECT(m);
617 
618 	assert(m->vmp_busy);
619 	assert(object->internal);
620 
621 	/*
622 	 *	Verify that we really want to clean this page
623 	 */
624 	assert(!m->vmp_absent);
625 	assert(m->vmp_dirty);
626 
627 	/*
628 	 *	Create a paging reference to let us play with the object.
629 	 */
630 	paging_offset = m->vmp_offset + object->paging_offset;
631 
632 	if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
633 		panic("reservation without pageout?"); /* alan */
634 
635 		VM_PAGE_FREE(m);
636 		vm_object_unlock(object);
637 
638 		return;
639 	}
640 
641 	/*
642 	 * If there's no pager, then we can't clean the page.  This should
643 	 * never happen since this should be a copy object and therefore not
644 	 * an external object, so the pager should always be there.
645 	 */
646 
647 	pager = object->pager;
648 
649 	if (pager == MEMORY_OBJECT_NULL) {
650 		panic("missing pager for copy object");
651 
652 		VM_PAGE_FREE(m);
653 		return;
654 	}
655 
656 	/*
657 	 * set the page for future call to vm_fault_list_request
658 	 */
659 	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
660 	SET_PAGE_DIRTY(m, FALSE);
661 
662 	/*
663 	 * keep the object from collapsing or terminating
664 	 */
665 	vm_object_paging_begin(object);
666 	vm_object_unlock(object);
667 
668 	/*
669 	 *	Write the data to its pager.
670 	 *	Note that the data is passed by naming the new object,
671 	 *	not a virtual address; the pager interface has been
672 	 *	manipulated to use the "internal memory" data type.
673 	 *	[The object reference from its allocation is donated
674 	 *	to the eventual recipient.]
675 	 */
676 	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
677 
678 	vm_object_lock(object);
679 	vm_object_paging_end(object);
680 }
681 
682 
683 /*
684  * vm_pageout_cluster:
685  *
686  * Given a page, queue it to the appropriate I/O thread,
687  * which will page it out and attempt to clean adjacent pages
688  * in the same operation.
689  *
690  * The object and queues must be locked. We will take a
691  * paging reference to prevent deallocation or collapse when we
692  * release the object lock back at the call site.  The I/O thread
693  * is responsible for consuming this reference
694  *
695  * The page must not be on any pageout queue.
696  */
697 #if DEVELOPMENT || DEBUG
698 vmct_stats_t vmct_stats;
699 
700 int32_t vmct_active = 0;
701 uint64_t vm_compressor_epoch_start = 0;
702 uint64_t vm_compressor_epoch_stop = 0;
703 
704 typedef enum vmct_state_t {
705 	VMCT_IDLE,
706 	VMCT_AWAKENED,
707 	VMCT_ACTIVE,
708 } vmct_state_t;
709 vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
710 #endif
711 
712 
713 
714 static void
vm_pageout_cluster_to_queue(vm_page_t m,struct vm_pageout_queue * q)715 vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
716 {
717 	vm_object_t object = VM_PAGE_OBJECT(m);
718 
719 	VM_PAGE_CHECK(m);
720 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
721 	vm_object_lock_assert_exclusive(object);
722 
723 	/*
724 	 * Make sure it's OK to page this out.
725 	 */
726 	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
727 	assert(!m->vmp_cleaning && !m->vmp_laundry);
728 	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
729 
730 	/*
731 	 * protect the object from collapse or termination
732 	 */
733 	vm_object_activity_begin(object);
734 
735 
736 	/*
737 	 * pgo_laundry count is tied to the laundry bit
738 	 */
739 	m->vmp_laundry = TRUE;
740 	q->pgo_laundry++;
741 
742 	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
743 	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
744 
745 	// the benchmark queue will be woken up independently by the benchmark itself
746 	if (
747 		object->internal == TRUE
748 #if DEVELOPMENT || DEBUG
749 		&& q != &vm_pageout_queue_benchmark
750 #endif
751 		) {
752 		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
753 		m->vmp_busy = TRUE;
754 		// Wake up the first compressor thread. It will wake subsequent threads if necessary.
755 		sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup, pgo_iothread_internal_state[0].pgo_iothread);
756 	} else {
757 		sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
758 	}
759 	VM_PAGE_CHECK(m);
760 }
761 
762 void
vm_pageout_cluster(vm_page_t m)763 vm_pageout_cluster(vm_page_t m)
764 {
765 	struct          vm_pageout_queue *q;
766 	vm_object_t     object = VM_PAGE_OBJECT(m);
767 	if (object->internal) {
768 		q = &vm_pageout_queue_internal;
769 	} else {
770 		q = &vm_pageout_queue_external;
771 	}
772 	vm_pageout_cluster_to_queue(m, q);
773 }
774 
775 
776 /*
777  * A page is back from laundry or we are stealing it back from
778  * the laundering state.  See if there are some pages waiting to
779  * go to laundry and if we can let some of them go now.
780  *
781  * Object and page queues must be locked.
782  */
783 void
vm_pageout_throttle_up(vm_page_t m)784 vm_pageout_throttle_up(
785 	vm_page_t       m)
786 {
787 	struct vm_pageout_queue *q;
788 	vm_object_t      m_object;
789 
790 	m_object = VM_PAGE_OBJECT(m);
791 
792 	assert(m_object != VM_OBJECT_NULL);
793 	assert(!is_kernel_object(m_object));
794 
795 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
796 	vm_object_lock_assert_exclusive(m_object);
797 
798 	if (m_object->internal == TRUE) {
799 		q = &vm_pageout_queue_internal;
800 	} else {
801 		q = &vm_pageout_queue_external;
802 	}
803 
804 	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
805 		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
806 		m->vmp_q_state = VM_PAGE_NOT_ON_Q;
807 
808 		VM_PAGE_ZERO_PAGEQ_ENTRY(m);
809 
810 		vm_object_activity_end(m_object);
811 
812 		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
813 	}
814 	if (m->vmp_laundry == TRUE) {
815 		m->vmp_laundry = FALSE;
816 		q->pgo_laundry--;
817 
818 		if (q->pgo_throttled == TRUE) {
819 			q->pgo_throttled = FALSE;
820 			thread_wakeup((event_t) &q->pgo_laundry);
821 		}
822 		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
823 			q->pgo_draining = FALSE;
824 			thread_wakeup((event_t) (&q->pgo_laundry + 1));
825 		}
826 		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
827 	}
828 }
829 
830 
831 static void
vm_pageout_throttle_up_batch(struct vm_pageout_queue * q,int batch_cnt)832 vm_pageout_throttle_up_batch(
833 	struct vm_pageout_queue *q,
834 	int             batch_cnt)
835 {
836 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
837 
838 	VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
839 
840 	q->pgo_laundry -= batch_cnt;
841 
842 	if (q->pgo_throttled == TRUE) {
843 		q->pgo_throttled = FALSE;
844 		thread_wakeup((event_t) &q->pgo_laundry);
845 	}
846 	if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
847 		q->pgo_draining = FALSE;
848 		thread_wakeup((event_t) (&q->pgo_laundry + 1));
849 	}
850 }
851 
852 
853 
854 /*
855  * VM memory pressure monitoring.
856  *
857  * vm_pageout_scan() keeps track of the number of pages it considers and
858  * reclaims, in the currently active vm_pageout_stats[vm_pageout_stat_now].
859  *
860  * compute_memory_pressure() is called every second from compute_averages()
861  * and moves "vm_pageout_stat_now" forward, to start accumulating the number
862  * of reclaimed pages in a new vm_pageout_stats[] bucket.
863  *
864  * mach_vm_pressure_monitor() collects past statistics about memory pressure.
865  * The caller provides the number of seconds ("nsecs") worth of statistics
866  * it wants, up to 30 seconds.
867  * It computes the number of pages reclaimed in the past "nsecs" seconds and
868  * also returns the number of pages the system still needs to reclaim at this
869  * moment in time.
870  */
/*
 * Number of buckets in the vm_pageout_stats[] ring.
 *
 * mach_vm_pressure_monitor() converts seconds into buckets by multiplying
 * by 8, so DEVELOPMENT/DEBUG kernels keep 30 seconds of history and other
 * kernels keep 1 second; the extra bucket is presumably the one currently
 * being filled (TODO confirm).
 *
 * The expansion is fully parenthesized so the macro evaluates correctly
 * inside larger expressions (e.g. "2 * VM_PAGEOUT_STAT_SIZE" or
 * "x % VM_PAGEOUT_STAT_SIZE").
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif

/*
 * One bucket of pageout statistics.  vm_pageout_stats[] is used as a ring
 * indexed by vm_pageout_stat_now: record_memory_pressure() zeroes the next
 * bucket and advances the index, and derives the memory pressure value from
 * the freed_* counters of the previous bucket.
 *
 * Field order is part of the layout; do not reorder.
 */
struct vm_pageout_stat {
	/* page counts (unsigned long) */
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	/* event counters (unsigned int) */
	unsigned int pages_grabbed;
	unsigned int pages_freed;

	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	/* summed by record_memory_pressure() into vm_memory_pressure */
	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;

	unsigned int vm_page_realtime_count;
	unsigned int forcereclaimed_sharedcache;
	unsigned int forcereclaimed_realtime;
	unsigned int protected_sharedcache;
	unsigned int protected_realtime;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];

/* index of the bucket currently being accumulated into */
unsigned int vm_pageout_stat_now = 0;

/* ring neighbours of bucket "i", wrapping at both ends */
#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? (VM_PAGEOUT_STAT_SIZE - 1) : ((i) - 1))
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == (VM_PAGEOUT_STAT_SIZE - 1)) ? 0 : ((i) + 1))
941 
942 #if VM_PAGE_BUCKETS_CHECK
943 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
944 #endif /* VM_PAGE_BUCKETS_CHECK */
945 
946 
947 void
948 record_memory_pressure(void);
949 void
record_memory_pressure(void)950 record_memory_pressure(void)
951 {
952 	unsigned int vm_pageout_next;
953 
954 #if VM_PAGE_BUCKETS_CHECK
955 	/* check the consistency of VM page buckets at regular interval */
956 	static int counter = 0;
957 	if ((++counter % vm_page_buckets_check_interval) == 0) {
958 		vm_page_buckets_check();
959 	}
960 #endif /* VM_PAGE_BUCKETS_CHECK */
961 
962 	vm_pageout_state.vm_memory_pressure =
963 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
964 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
965 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
966 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
967 
968 	commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
969 
970 	/* move "now" forward */
971 	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
972 
973 	bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
974 
975 	vm_pageout_stat_now = vm_pageout_next;
976 }
977 
978 
979 /*
980  * IMPORTANT
981  * mach_vm_ctl_page_free_wanted() is called indirectly, via
982  * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
983  * it must be safe in the restricted stackshot context. Locks and/or
984  * blocking are not allowable.
985  */
986 unsigned int
mach_vm_ctl_page_free_wanted(void)987 mach_vm_ctl_page_free_wanted(void)
988 {
989 	unsigned int page_free_target, page_free_count, page_free_wanted;
990 
991 	page_free_target = vm_page_free_target;
992 	page_free_count = vm_page_free_count;
993 	if (page_free_target > page_free_count) {
994 		page_free_wanted = page_free_target - page_free_count;
995 	} else {
996 		page_free_wanted = 0;
997 	}
998 
999 	return page_free_wanted;
1000 }
1001 
1002 
1003 /*
1004  * IMPORTANT:
1005  * mach_vm_pressure_monitor() is called when taking a stackshot, with
1006  * wait_for_pressure FALSE, so that code path must remain safe in the
1007  * restricted stackshot context. No blocking or locks are allowable.
1008  * on that code path.
1009  */
1010 
1011 kern_return_t
mach_vm_pressure_monitor(boolean_t wait_for_pressure,unsigned int nsecs_monitored,unsigned int * pages_reclaimed_p,unsigned int * pages_wanted_p)1012 mach_vm_pressure_monitor(
1013 	boolean_t       wait_for_pressure,
1014 	unsigned int    nsecs_monitored,
1015 	unsigned int    *pages_reclaimed_p,
1016 	unsigned int    *pages_wanted_p)
1017 {
1018 	wait_result_t   wr;
1019 	unsigned int    vm_pageout_then, vm_pageout_now;
1020 	unsigned int    pages_reclaimed;
1021 	unsigned int    units_of_monitor;
1022 
1023 	units_of_monitor = 8 * nsecs_monitored;
1024 	/*
1025 	 * We don't take the vm_page_queue_lock here because we don't want
1026 	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1027 	 * thread when it's trying to reclaim memory.  We don't need fully
1028 	 * accurate monitoring anyway...
1029 	 */
1030 
1031 	if (wait_for_pressure) {
1032 		/* wait until there's memory pressure */
1033 		while (vm_page_free_count >= vm_page_free_target) {
1034 			wr = assert_wait((event_t) &vm_page_free_wanted,
1035 			    THREAD_INTERRUPTIBLE);
1036 			if (wr == THREAD_WAITING) {
1037 				wr = thread_block(THREAD_CONTINUE_NULL);
1038 			}
1039 			if (wr == THREAD_INTERRUPTED) {
1040 				return KERN_ABORTED;
1041 			}
1042 			if (wr == THREAD_AWAKENED) {
1043 				/*
1044 				 * The memory pressure might have already
1045 				 * been relieved but let's not block again
1046 				 * and let's report that there was memory
1047 				 * pressure at some point.
1048 				 */
1049 				break;
1050 			}
1051 		}
1052 	}
1053 
1054 	/* provide the number of pages the system wants to reclaim */
1055 	if (pages_wanted_p != NULL) {
1056 		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
1057 	}
1058 
1059 	if (pages_reclaimed_p == NULL) {
1060 		return KERN_SUCCESS;
1061 	}
1062 
1063 	/* provide number of pages reclaimed in the last "nsecs_monitored" */
1064 	vm_pageout_now = vm_pageout_stat_now;
1065 	pages_reclaimed = 0;
1066 	for (vm_pageout_then =
1067 	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1068 	    vm_pageout_then != vm_pageout_now &&
1069 	    units_of_monitor-- != 0;
1070 	    vm_pageout_then =
1071 	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1072 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1073 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1074 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1075 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1076 	}
1077 	*pages_reclaimed_p = pages_reclaimed;
1078 
1079 	return KERN_SUCCESS;
1080 }
1081 
1082 
1083 
1084 #if DEVELOPMENT || DEBUG
1085 
1086 static void
1087 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1088 
1089 /*
1090  * condition variable used to make sure there is
1091  * only a single sweep going on at a time
1092  */
1093 bool vm_pageout_disconnect_all_pages_active = false;
1094 
1095 void
vm_pageout_disconnect_all_pages()1096 vm_pageout_disconnect_all_pages()
1097 {
1098 	vm_page_lock_queues();
1099 
1100 	if (vm_pageout_disconnect_all_pages_active) {
1101 		vm_page_unlock_queues();
1102 		return;
1103 	}
1104 	vm_pageout_disconnect_all_pages_active = true;
1105 
1106 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled,
1107 	    vm_page_throttled_count);
1108 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous,
1109 	    vm_page_anonymous_count);
1110 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive,
1111 	    (vm_page_inactive_count - vm_page_anonymous_count));
1112 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active,
1113 	    vm_page_active_count);
1114 #ifdef CONFIG_SECLUDED_MEMORY
1115 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded,
1116 	    vm_page_secluded_count);
1117 #endif /* CONFIG_SECLUDED_MEMORY */
1118 	vm_page_unlock_queues();
1119 
1120 	vm_pageout_disconnect_all_pages_active = false;
1121 }
1122 
/*
 * Visit up to 'qcount' pages taken from the front of page queue 'q'.
 * For each page whose object could be locked and that is not in one of
 * the excluded states, pmap_disconnect() is called on its physical page
 * if the page is marked vmp_pmapped.  Every visited page is then
 * removed from 'q' and entered on it again.
 *
 * The page queues lock is dropped and re-taken while pausing after a
 * failed object lock attempt, and yielded periodically.
 *
 * q       page queue to walk
 * qcount  maximum number of pages to visit
 */
1123 /* NB: assumes the page_queues lock is held on entry, returns with page queue lock held */
1124 void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t * q,int qcount)1125 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1126 {
1127 	vm_page_t       m;
	/* t_object: object whose lock we last failed to get; l_object: object we hold locked */
1128 	vm_object_t     t_object = NULL;
1129 	vm_object_t     l_object = NULL;
1130 	vm_object_t     m_object = NULL;
1131 	int             delayed_unlock = 0;
1132 	int             try_failed_count = 0;
	/* the three counters below are only reported in the closing trace point */
1133 	int             disconnected_count = 0;
1134 	int             paused_count = 0;
1135 	int             object_locked_count = 0;
1136 
1137 	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1138 	    DBG_FUNC_START),
1139 	    q, qcount);
1140 
1141 	while (qcount && !vm_page_queue_empty(q)) {
1142 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1143 
1144 		m = (vm_page_t) vm_page_queue_first(q);
1145 		m_object = VM_PAGE_OBJECT(m);
1146 
1147 		/*
1148 		 * check to see if we currently are working
1149 		 * with the same object... if so, we've
1150 		 * already got the lock
1151 		 */
1152 		if (m_object != l_object) {
1153 			/*
1154 			 * the object associated with candidate page is
1155 			 * different from the one we were just working
1156 			 * with... dump the lock if we still own it
1157 			 */
1158 			if (l_object != NULL) {
1159 				vm_object_unlock(l_object);
1160 				l_object = NULL;
1161 			}
			/* a different object than the one we last failed on: restart the retry count */
1162 			if (m_object != t_object) {
1163 				try_failed_count = 0;
1164 			}
1165 
1166 			/*
1167 			 * Try to lock object; since we've already got the
1168 			 * page queues lock, we can only 'try' for this one.
1169 			 * if the 'try' fails, we need to do a mutex_pause
1170 			 * to allow the owner of the object lock a chance to
1171 			 * run...
1172 			 */
1173 			if (!vm_object_lock_try_scan(m_object)) {
				/*
				 * give up on this page after more than 20 failed
				 * attempts; it is requeued below without its
				 * object ever having been locked
				 */
1174 				if (try_failed_count > 20) {
1175 					goto reenter_pg_on_q;
1176 				}
1177 				vm_page_unlock_queues();
1178 				mutex_pause(try_failed_count++);
1179 				vm_page_lock_queues();
1180 				delayed_unlock = 0;
1181 
1182 				paused_count++;
1183 
1184 				t_object = m_object;
1185 				continue;
1186 			}
1187 			object_locked_count++;
1188 
1189 			l_object = m_object;
1190 		}
1191 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry ||
1192 		    m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) ||
1193 		    m->vmp_free_when_done) {
1194 			/*
1195 			 * put it back on the head of its queue
1196 			 */
			/*
			 * NOTE(review): the code at reenter_pg_on_q uses
			 * vm_page_queue_enter(); confirm which end of the
			 * queue that inserts at.
			 */
1197 			goto reenter_pg_on_q;
1198 		}
1199 		if (m->vmp_pmapped == TRUE) {
1200 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1201 
1202 			disconnected_count++;
1203 		}
1204 reenter_pg_on_q:
1205 		vm_page_queue_remove(q, m, vmp_pageq);
1206 		vm_page_queue_enter(q, m, vmp_pageq);
1207 
1208 		qcount--;
1209 		try_failed_count = 0;
1210 
		/* periodically release the object lock and yield the page queues lock */
1211 		if (delayed_unlock++ > 128) {
1212 			if (l_object != NULL) {
1213 				vm_object_unlock(l_object);
1214 				l_object = NULL;
1215 			}
1216 			lck_mtx_yield(&vm_page_queue_lock);
1217 			delayed_unlock = 0;
1218 		}
1219 	}
1220 	if (l_object != NULL) {
1221 		vm_object_unlock(l_object);
1222 		l_object = NULL;
1223 	}
1224 
1225 	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1226 	    DBG_FUNC_END),
1227 	    q, disconnected_count, object_locked_count, paused_count);
1228 }
1229 
1230 extern char* proc_best_name(struct proc* proc);
1231 
1232 int
vm_toggle_task_selfdonate_pages(task_t task)1233 vm_toggle_task_selfdonate_pages(task_t task)
1234 {
1235 	int state = 0;
1236 	if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1237 		printf("VM Donation mode is OFF on the system\n");
1238 		return state;
1239 	}
1240 	if (task != kernel_task) {
1241 		task_lock(task);
1242 		if (!task->donates_own_pages) {
1243 			printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1244 			task->donates_own_pages = true;
1245 			state = 1;
1246 		} else if (task->donates_own_pages) {
1247 			printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1248 			task->donates_own_pages = false;
1249 			state = 0;
1250 		}
1251 		task_unlock(task);
1252 	}
1253 	return state;
1254 }
1255 #endif /* DEVELOPMENT || DEBUG */
1256 
1257 void
vm_task_set_selfdonate_pages(task_t task,bool donate)1258 vm_task_set_selfdonate_pages(task_t task, bool donate)
1259 {
1260 	assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
1261 	assert(task != kernel_task);
1262 
1263 	task_lock(task);
1264 	task->donates_own_pages = donate;
1265 	task_unlock(task);
1266 }
1267 
1268 
1269 
1270 static size_t
1271 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1272 
1273 /*
1274  * condition variable used to make sure there is
1275  * only a single sweep going on at a time
1276  */
1277 boolean_t       vm_pageout_anonymous_pages_active = FALSE;
1278 
1279 
1280 void
vm_pageout_anonymous_pages()1281 vm_pageout_anonymous_pages()
1282 {
1283 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1284 		vm_page_lock_queues();
1285 
1286 		if (vm_pageout_anonymous_pages_active == TRUE) {
1287 			vm_page_unlock_queues();
1288 			return;
1289 		}
1290 		vm_pageout_anonymous_pages_active = TRUE;
1291 		vm_page_unlock_queues();
1292 
1293 		vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1294 		vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1295 		vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1296 
1297 		if (VM_CONFIG_SWAP_IS_PRESENT) {
1298 			vm_consider_swapping();
1299 		}
1300 
1301 		vm_page_lock_queues();
1302 		vm_pageout_anonymous_pages_active = FALSE;
1303 		vm_page_unlock_queues();
1304 	}
1305 }
1306 
1307 
/*
 * Scan at most 'qcount' pages taken from the front of page queue 'q'
 * and hand the dirty or precious pages of internal objects to a
 * pageout queue through vm_pageout_cluster_to_queue().
 *
 * Along the way:
 *  - pages that are clean and not precious once their mappings have
 *    been removed are freed with VM_PAGE_FREE();
 *  - pages found referenced get their reference cleared and are put
 *    back on 'q';
 *  - pages of non-internal or dead objects, and pages that are
 *    cleaning, in the laundry, busy, absent, in error or marked
 *    free-when-done, are put back on 'q' untouched.
 *
 * The page queues lock is taken and released by this function; it is
 * also dropped temporarily around the blocking operations below.
 *
 * q          page queue to scan
 * qcount     maximum number of pages to process
 * perf_test  on DEVELOPMENT || DEBUG kernels, target
 *            vm_pageout_queue_benchmark (with pgo_maxlaundry raised to
 *            qcount) instead of vm_pageout_queue_internal; in all
 *            builds it also selects vm_page_queue_remove() rather than
 *            vm_page_queues_remove() for taking the page off its queue
 *
 * Returns the number of pages passed to vm_pageout_cluster_to_queue().
 */
1308 size_t
vm_pageout_page_queue(vm_page_queue_head_t * q,size_t qcount,bool perf_test)1309 vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
1310 {
1311 	vm_page_t       m;
	/* t_object: object whose lock we last failed to get; l_object: object we hold locked */
1312 	vm_object_t     t_object = NULL;
1313 	vm_object_t     l_object = NULL;
1314 	vm_object_t     m_object = NULL;
1315 	int             delayed_unlock = 0;
1316 	int             try_failed_count = 0;
1317 	int             refmod_state;
1318 	int             pmap_options;
1319 	struct          vm_pageout_queue *iq;
1320 	ppnum_t         phys_page;
1321 	size_t          pages_moved = 0;
1322 
1323 
1324 	iq = &vm_pageout_queue_internal;
1325 
1326 	vm_page_lock_queues();
1327 
1328 #if DEVELOPMENT || DEBUG
1329 	if (perf_test) {
1330 		iq = &vm_pageout_queue_benchmark;
1331 		// ensure the benchmark queue isn't throttled
1332 		iq->pgo_maxlaundry = (unsigned int) qcount;
1333 	}
1334 #endif /* DEVELOPMENT ||DEBUG */
1335 
1336 	while (qcount && !vm_page_queue_empty(q)) {
1337 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1338 
		/*
		 * the target pageout queue is throttled: let go of the
		 * object lock, mark the queue as draining and sleep on
		 * the queue's drain event before looking at it again
		 */
1339 		if (VM_PAGE_Q_THROTTLED(iq)) {
1340 			if (l_object != NULL) {
1341 				vm_object_unlock(l_object);
1342 				l_object = NULL;
1343 			}
1344 			iq->pgo_draining = TRUE;
1345 
1346 			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1347 			vm_page_unlock_queues();
1348 
1349 			thread_block(THREAD_CONTINUE_NULL);
1350 
1351 			vm_page_lock_queues();
1352 			delayed_unlock = 0;
1353 			continue;
1354 		}
1355 		m = (vm_page_t) vm_page_queue_first(q);
1356 		m_object = VM_PAGE_OBJECT(m);
1357 
1358 		/*
1359 		 * check to see if we currently are working
1360 		 * with the same object... if so, we've
1361 		 * already got the lock
1362 		 */
1363 		if (m_object != l_object) {
			/* only pages of internal objects are of interest here */
1364 			if (!m_object->internal) {
1365 				goto reenter_pg_on_q;
1366 			}
1367 
1368 			/*
1369 			 * the object associated with candidate page is
1370 			 * different from the one we were just working
1371 			 * with... dump the lock if we still own it
1372 			 */
1373 			if (l_object != NULL) {
1374 				vm_object_unlock(l_object);
1375 				l_object = NULL;
1376 			}
1377 			if (m_object != t_object) {
1378 				try_failed_count = 0;
1379 			}
1380 
1381 			/*
1382 			 * Try to lock object; since we've already got the
1383 			 * page queues lock, we can only 'try' for this one.
1384 			 * if the 'try' fails, we need to do a mutex_pause
1385 			 * to allow the owner of the object lock a chance to
1386 			 * run...
1387 			 */
1388 			if (!vm_object_lock_try_scan(m_object)) {
				/* give up on this page after more than 20 failed attempts */
1389 				if (try_failed_count > 20) {
1390 					goto reenter_pg_on_q;
1391 				}
1392 				vm_page_unlock_queues();
1393 				mutex_pause(try_failed_count++);
1394 				vm_page_lock_queues();
1395 				delayed_unlock = 0;
1396 
1397 				t_object = m_object;
1398 				continue;
1399 			}
1400 			l_object = m_object;
1401 		}
1402 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
1403 			/*
1404 			 * page is not to be cleaned
1405 			 * put it back on the head of its queue
1406 			 */
1407 			goto reenter_pg_on_q;
1408 		}
1409 		phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1410 
		/* pick up reference/modify state recorded by the pmap layer */
1411 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
1412 			refmod_state = pmap_get_refmod(phys_page);
1413 
1414 			if (refmod_state & VM_MEM_REFERENCED) {
1415 				m->vmp_reference = TRUE;
1416 			}
1417 			if (refmod_state & VM_MEM_MODIFIED) {
1418 				SET_PAGE_DIRTY(m, FALSE);
1419 			}
1420 		}
		/* referenced page: clear the reference and leave the page on the queue */
1421 		if (m->vmp_reference == TRUE) {
1422 			m->vmp_reference = FALSE;
1423 			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1424 			goto reenter_pg_on_q;
1425 		}
		/* remove the page's mappings, noting whether it was modified through them */
1426 		if (m->vmp_pmapped == TRUE) {
1427 			if (m->vmp_dirty || m->vmp_precious) {
1428 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
1429 			} else {
1430 				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1431 			}
1432 			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1433 			if (refmod_state & VM_MEM_MODIFIED) {
1434 				SET_PAGE_DIRTY(m, FALSE);
1435 			}
1436 		}
1437 
		/* clean and not precious: free the page instead of paging it out */
1438 		if (!m->vmp_dirty && !m->vmp_precious) {
1439 			vm_page_unlock_queues();
1440 			VM_PAGE_FREE(m);
1441 			vm_page_lock_queues();
1442 			delayed_unlock = 0;
1443 
1444 			goto next_pg;
1445 		}
		/* the object needs an initialized pager before its pages can be queued */
1446 		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1447 			if (!m_object->pager_initialized) {
1448 				vm_page_unlock_queues();
1449 
1450 				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1451 
1452 				if (!m_object->pager_initialized) {
1453 					vm_object_compressor_pager_create(m_object);
1454 				}
1455 
1456 				vm_page_lock_queues();
1457 				delayed_unlock = 0;
1458 			}
1459 			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1460 				goto reenter_pg_on_q;
1461 			}
1462 			/*
1463 			 * vm_object_compressor_pager_create will drop the object lock
1464 			 * which means 'm' may no longer be valid to use
1465 			 */
			/* restart the loop without consuming 'qcount' */
1466 			continue;
1467 		}
1468 
1469 		if (!perf_test) {
1470 			/*
1471 			 * we've already factored out pages in the laundry which
1472 			 * means this page can't be on the pageout queue so it's
1473 			 * safe to do the vm_page_queues_remove
1474 			 */
1475 			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
1476 			vm_page_queues_remove(m, TRUE);
1477 			if (donate) {
1478 				/*
1479 				 * The compressor needs to see this bit to know
1480 				 * where this page needs to land. Also if stolen,
1481 				 * this bit helps put the page back in the right
1482 				 * special queue where it belongs.
1483 				 */
1484 				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
1485 			}
1486 		} else {
1487 			vm_page_queue_remove(q, m, vmp_pageq);
1488 		}
1489 
1490 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1491 
1492 		vm_pageout_cluster_to_queue(m, iq);
1493 
1494 		pages_moved++;
1495 		goto next_pg;
1496 
1497 reenter_pg_on_q:
1498 		vm_page_queue_remove(q, m, vmp_pageq);
1499 		vm_page_queue_enter(q, m, vmp_pageq);
1500 next_pg:
1501 		qcount--;
1502 		try_failed_count = 0;
1503 
		/* periodically release the object lock and yield the page queues lock */
1504 		if (delayed_unlock++ > 128) {
1505 			if (l_object != NULL) {
1506 				vm_object_unlock(l_object);
1507 				l_object = NULL;
1508 			}
1509 			lck_mtx_yield(&vm_page_queue_lock);
1510 			delayed_unlock = 0;
1511 		}
1512 	}
1513 	if (l_object != NULL) {
1514 		vm_object_unlock(l_object);
1515 		l_object = NULL;
1516 	}
1517 	vm_page_unlock_queues();
1518 	return pages_moved;
1519 }
1520 
1521 
1522 
1523 /*
1524  * function in BSD to apply I/O throttle to the pageout thread
1525  */
1526 extern void vm_pageout_io_throttle(void);
1527 
/*
 * Statement macro: asserts that page (m) belongs to object (obj) and,
 * if the page is marked reusable or the object is marked all_reusable,
 * calls vm_object_reuse_pages() on the one-page range starting at the
 * page's offset.  Both arguments are evaluated more than once.
 */
1528 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
1529 	MACRO_BEGIN                                                     \
1530 	/* \
1531 	 * If a "reusable" page somehow made it back into \
1532 	 * the active queue, it's been re-used and is not \
1533 	 * quite re-usable. \
1534 	 * If the VM object was "all_reusable", consider it \
1535 	 * as "all re-used" instead of converting it to \
1536 	 * "partially re-used", which could be expensive. \
1537 	 */                                                             \
1538 	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
1539 	if ((m)->vmp_reusable ||                                        \
1540 	    (obj)->all_reusable) {                                      \
1541 	        vm_object_reuse_pages((obj),                            \
1542 	                              (m)->vmp_offset,                  \
1543 	                              (m)->vmp_offset + PAGE_SIZE_64,   \
1544 	                              FALSE);                           \
1545 	}                                                               \
1546 	MACRO_END
1547 
1548 
/*
 * NOTE(review): presumably bounds on how many pages are processed
 * before the page queues lock is released; the users of these limits
 * are not in this part of the file -- confirm there.
 */
1549 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT         64
1550 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024
1551 
/* NOTE(review): presumably the values taken by flow_control.state -- confirm at the point of use */
1552 #define FCS_IDLE                0
1553 #define FCS_DELAYED             1
1554 #define FCS_DEADLOCK_DETECTED   2
1555 
/* a state value paired with a timestamp */
1556 struct flow_control {
1557 	int             state;
1558 	mach_timespec_t ts;
1559 };
1560 
1561 
/* global counters, initialized to zero; they are updated elsewhere in the file */
1562 uint64_t vm_pageout_rejected_bq_internal = 0;
1563 uint64_t vm_pageout_rejected_bq_external = 0;
1564 uint64_t vm_pageout_skipped_bq_internal = 0;
1565 uint64_t vm_pageout_skipped_bq_external = 0;
1566 
1567 #define ANONS_GRABBED_LIMIT     2
1568 
1569 
/* prototype of a helper that is itself compiled out */
1570 #if 0
1571 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1572 #endif
1573 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1574 
/*
 * Values for the 'action' argument of vm_pageout_prepare_to_block():
 * what to do, if anything, while the locks are dropped.
 */
1575 #define VM_PAGEOUT_PB_NO_ACTION                         0
1576 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1577 #define VM_PAGEOUT_PB_THREAD_YIELD                      2
1578 
1579 
/*
 * Compiled out (#if 0).
 *
 * If the local free list is non-empty: drops the page queues lock,
 * frees the list with vm_page_free_list() (bracketed by trace events),
 * resets the list and its count, and re-takes the lock.  Otherwise it
 * just yields the page queues lock.  In both cases *delayed_unlock is
 * set to 1.
 */
1580 #if 0
1581 static void
1582 vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
1583 {
1584 	if (*local_freeq) {
1585 		vm_page_unlock_queues();
1586 
1587 		VM_DEBUG_CONSTANT_EVENT(
1588 			vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1589 			vm_page_free_count, 0, 0, 1);
1590 
1591 		vm_page_free_list(*local_freeq, TRUE);
1592 
1593 		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1594 		    vm_page_free_count, *local_freed, 0, 1);
1595 
1596 		*local_freeq = NULL;
1597 		*local_freed = 0;
1598 
1599 		vm_page_lock_queues();
1600 	} else {
1601 		lck_mtx_yield(&vm_page_queue_lock);
1602 	}
1603 	*delayed_unlock = 1;
1604 }
1605 #endif
1606 
1607 
1608 static void
vm_pageout_prepare_to_block(vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int action)1609 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1610     vm_page_t *local_freeq, int *local_freed, int action)
1611 {
1612 	vm_page_unlock_queues();
1613 
1614 	if (*object != NULL) {
1615 		vm_object_unlock(*object);
1616 		*object = NULL;
1617 	}
1618 	if (*local_freeq) {
1619 		vm_page_free_list(*local_freeq, TRUE);
1620 
1621 		*local_freeq = NULL;
1622 		*local_freed = 0;
1623 	}
1624 	*delayed_unlock = 1;
1625 
1626 	switch (action) {
1627 	case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1628 		vm_consider_waking_compactor_swapper();
1629 		break;
1630 	case VM_PAGEOUT_PB_THREAD_YIELD:
1631 		thread_yield_internal(1);
1632 		break;
1633 	case VM_PAGEOUT_PB_NO_ACTION:
1634 	default:
1635 		break;
1636 	}
1637 	vm_page_lock_queues();
1638 }
1639 
1640 
/*
 * Counter values seen by the previous update_vm_info() call;
 * update_vm_info() subtracts them from the current values to obtain
 * per-sample deltas.
 */
1641 static struct vm_pageout_vminfo last;
1642 
/* vm_page_grab_count as read by the previous update_vm_info() call */
1643 uint64_t last_vm_page_pages_grabbed = 0;
1644 
1645 extern  uint32_t c_segment_pages_compressed;
1646 
1647 extern uint64_t shared_region_pager_reclaimed;
1648 extern struct memory_object_pager_ops shared_region_pager_ops;
1649 
1650 void
update_vm_info(void)1651 update_vm_info(void)
1652 {
1653 	unsigned long tmp;
1654 	uint64_t tmp64;
1655 
1656 	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1657 	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1658 	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1659 	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1660 
1661 	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1662 	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1663 	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1664 
1665 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1666 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1667 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1668 	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1669 	vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1670 
1671 	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1672 	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1673 	last.vm_pageout_considered_page = tmp;
1674 
1675 	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1676 	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1677 	last.vm_pageout_compressions = tmp64;
1678 
1679 	tmp = vm_pageout_vminfo.vm_compressor_failed;
1680 	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1681 	last.vm_compressor_failed = tmp;
1682 
1683 	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1684 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1685 	last.vm_compressor_pages_grabbed = tmp64;
1686 
1687 	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1688 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1689 	last.vm_phantom_cache_found_ghost = tmp;
1690 
1691 	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1692 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1693 	last.vm_phantom_cache_added_ghost = tmp;
1694 
1695 	tmp64 = counter_load(&vm_page_grab_count);
1696 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1697 	last_vm_page_pages_grabbed = tmp64;
1698 
1699 	tmp = vm_pageout_vminfo.vm_page_pages_freed;
1700 	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1701 	last.vm_page_pages_freed = tmp;
1702 
1703 	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1704 		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1705 		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1706 		last.vm_pageout_pages_evicted = tmp;
1707 
1708 		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1709 		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1710 		last.vm_pageout_pages_purged = tmp;
1711 
1712 		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1713 		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1714 		last.vm_pageout_freed_speculative = tmp;
1715 
1716 		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1717 		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1718 		last.vm_pageout_freed_external = tmp;
1719 
1720 		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1721 		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1722 		last.vm_pageout_inactive_referenced = tmp;
1723 
1724 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1725 		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1726 		last.vm_pageout_scan_inactive_throttled_external = tmp;
1727 
1728 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1729 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1730 		last.vm_pageout_inactive_dirty_external = tmp;
1731 
1732 		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1733 		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1734 		last.vm_pageout_freed_cleaned = tmp;
1735 
1736 		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1737 		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1738 		last.vm_pageout_inactive_nolock = tmp;
1739 
1740 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1741 		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1742 		last.vm_pageout_scan_inactive_throttled_internal = tmp;
1743 
1744 		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1745 		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1746 		last.vm_pageout_skipped_external = tmp;
1747 
1748 		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1749 		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1750 		last.vm_pageout_skipped_internal = tmp;
1751 
1752 		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1753 		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1754 		last.vm_pageout_reactivation_limit_exceeded = tmp;
1755 
1756 		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1757 		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1758 		last.vm_pageout_inactive_force_reclaim = tmp;
1759 
1760 		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1761 		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1762 		last.vm_pageout_freed_internal = tmp;
1763 
1764 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1765 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1766 		last.vm_pageout_considered_bq_internal = tmp;
1767 
1768 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1769 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1770 		last.vm_pageout_considered_bq_external = tmp;
1771 
1772 		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1773 		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1774 		last.vm_pageout_filecache_min_reactivated = tmp;
1775 
1776 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1777 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1778 		last.vm_pageout_inactive_dirty_internal = tmp;
1779 
1780 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1781 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1782 		last.vm_pageout_forcereclaimed_sharedcache = tmp;
1783 
1784 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1785 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1786 		last.vm_pageout_forcereclaimed_realtime = tmp;
1787 
1788 		tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1789 		vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1790 		last.vm_pageout_protected_sharedcache = tmp;
1791 
1792 		tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1793 		vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1794 		last.vm_pageout_protected_realtime = tmp;
1795 	}
1796 
1797 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
1798 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1799 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1800 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1801 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
1802 	    0);
1803 
1804 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
1805 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1806 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1807 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
1808 	    0,
1809 	    0);
1810 
1811 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
1812 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1813 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1814 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1815 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
1816 	    0);
1817 
1818 	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1819 	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1820 	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1821 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
1822 		    vm_pageout_stats[vm_pageout_stat_now].considered,
1823 		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1824 		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
1825 		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
1826 		    0);
1827 
1828 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
1829 		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1830 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1831 		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1832 		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
1833 		    0);
1834 
1835 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
1836 		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1837 		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1838 		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1839 		    vm_pageout_stats[vm_pageout_stat_now].skipped_external,
1840 		    0);
1841 
1842 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
1843 		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1844 		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1845 		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1846 		    vm_pageout_stats[vm_pageout_stat_now].freed_internal,
1847 		    0);
1848 
1849 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
1850 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1851 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1852 		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1853 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
1854 		    0);
1855 
1856 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO10)) | DBG_FUNC_NONE,
1857 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1858 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1859 		    vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1860 		    vm_pageout_stats[vm_pageout_stat_now].protected_realtime,
1861 		    0);
1862 	}
1863 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
1864 	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1865 	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1866 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1867 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
1868 	    0);
1869 
1870 	record_memory_pressure();
1871 }
1872 
1873 extern boolean_t hibernation_vmqueues_inspection;
1874 
1875 /*
1876  * Return values for functions called by vm_pageout_scan
1877  * that control its flow.
1878  *
1879  * PROCEED -- vm_pageout_scan will keep making forward progress.
1880  * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1881  * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1882  */
1883 
1884 #define VM_PAGEOUT_SCAN_PROCEED                 (0)
1885 #define VM_PAGEOUT_SCAN_DONE_RETURN             (1)
1886 #define VM_PAGEOUT_SCAN_NEXT_ITERATION          (2)
1887 
1888 /*
1889  * This function is called only from vm_pageout_scan and
1890  * it moves overflow secluded pages (one-at-a-time) to the
1891  * batched 'local' free Q or active Q.
1892  */
1893 static void
vps_deal_with_secluded_page_overflow(vm_page_t * local_freeq,int * local_freed)1894 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1895 {
1896 #if CONFIG_SECLUDED_MEMORY
1897 	/*
1898 	 * Deal with secluded_q overflow.
1899 	 */
1900 	if (vm_page_secluded_count > vm_page_secluded_target) {
1901 		vm_page_t secluded_page;
1902 
1903 		/*
1904 		 * SECLUDED_AGING_BEFORE_ACTIVE:
1905 		 * Excess secluded pages go to the active queue and
1906 		 * will later go to the inactive queue.
1907 		 */
1908 		assert((vm_page_secluded_count_free +
1909 		    vm_page_secluded_count_inuse) ==
1910 		    vm_page_secluded_count);
1911 		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1912 		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1913 
1914 		vm_page_queues_remove(secluded_page, FALSE);
1915 		assert(!secluded_page->vmp_fictitious);
1916 		assert(!VM_PAGE_WIRED(secluded_page));
1917 
1918 		if (secluded_page->vmp_object == 0) {
1919 			/* transfer to free queue */
1920 			assert(secluded_page->vmp_busy);
1921 			secluded_page->vmp_snext = *local_freeq;
1922 			*local_freeq = secluded_page;
1923 			*local_freed += 1;
1924 		} else {
1925 			/* transfer to head of active queue */
1926 			vm_page_enqueue_active(secluded_page, FALSE);
1927 			secluded_page = VM_PAGE_NULL;
1928 		}
1929 	}
1930 #else /* CONFIG_SECLUDED_MEMORY */
1931 
1932 #pragma unused(local_freeq)
1933 #pragma unused(local_freed)
1934 
1935 	return;
1936 
1937 #endif /* CONFIG_SECLUDED_MEMORY */
1938 }
1939 
1940 /*
1941  * This function is called only from vm_pageout_scan and
1942  * it initializes the loop targets for vm_pageout_scan().
1943  */
1944 static void
vps_init_page_targets(void)1945 vps_init_page_targets(void)
1946 {
1947 	/*
1948 	 * LD TODO: Other page targets should be calculated here too.
1949 	 */
1950 	vm_page_anonymous_min = vm_page_inactive_target / 20;
1951 
1952 	if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1953 		vm_pageout_state.vm_page_speculative_percentage = 50;
1954 	} else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1955 		vm_pageout_state.vm_page_speculative_percentage = 1;
1956 	}
1957 
1958 	vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1959 	    vm_page_inactive_count);
1960 }
1961 
1962 /*
1963  * This function is called only from vm_pageout_scan and
1964  * it purges a single VM object at-a-time and will either
1965  * make vm_pageout_scan() restart the loop or keeping moving forward.
1966  */
1967 static int
vps_purge_object()1968 vps_purge_object()
1969 {
1970 	int             force_purge;
1971 
1972 	assert(available_for_purge >= 0);
1973 	force_purge = 0; /* no force-purging */
1974 
1975 #if VM_PRESSURE_EVENTS
1976 	vm_pressure_level_t pressure_level;
1977 
1978 	pressure_level = memorystatus_vm_pressure_level;
1979 
1980 	if (pressure_level > kVMPressureNormal) {
1981 		if (pressure_level >= kVMPressureCritical) {
1982 			force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1983 		} else if (pressure_level >= kVMPressureUrgent) {
1984 			force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
1985 		} else if (pressure_level >= kVMPressureWarning) {
1986 			force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1987 		}
1988 	}
1989 #endif /* VM_PRESSURE_EVENTS */
1990 
1991 	if (available_for_purge || force_purge) {
1992 		memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
1993 
1994 		VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1995 		if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
1996 			VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
1997 			VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1998 			memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1999 
2000 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2001 		}
2002 		VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2003 		memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2004 	}
2005 
2006 	return VM_PAGEOUT_SCAN_PROCEED;
2007 }
2008 
2009 /*
2010  * This function is called only from vm_pageout_scan and
2011  * it will try to age the next speculative Q if the oldest
2012  * one is empty.
2013  */
2014 static int
vps_age_speculative_queue(boolean_t force_speculative_aging)2015 vps_age_speculative_queue(boolean_t force_speculative_aging)
2016 {
2017 #define DELAY_SPECULATIVE_AGE   1000
2018 
2019 	/*
2020 	 * try to pull pages from the aging bins...
2021 	 * see vm_page.h for an explanation of how
2022 	 * this mechanism works
2023 	 */
2024 	boolean_t                       can_steal = FALSE;
2025 	int                             num_scanned_queues;
2026 	static int                      delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/
2027 	mach_timespec_t                 ts;
2028 	struct vm_speculative_age_q     *aq;
2029 	struct vm_speculative_age_q     *sq;
2030 
2031 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2032 
2033 	aq = &vm_page_queue_speculative[speculative_steal_index];
2034 
2035 	num_scanned_queues = 0;
2036 	while (vm_page_queue_empty(&aq->age_q) &&
2037 	    num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2038 		speculative_steal_index++;
2039 
2040 		if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2041 			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2042 		}
2043 
2044 		aq = &vm_page_queue_speculative[speculative_steal_index];
2045 	}
2046 
2047 	if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
2048 		/*
2049 		 * XXX We've scanned all the speculative
2050 		 * queues but still haven't found one
2051 		 * that is not empty, even though
2052 		 * vm_page_speculative_count is not 0.
2053 		 */
2054 		if (!vm_page_queue_empty(&sq->age_q)) {
2055 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2056 		}
2057 #if DEVELOPMENT || DEBUG
2058 		panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2059 #endif
2060 		/* readjust... */
2061 		vm_page_speculative_count = 0;
2062 		/* ... and continue */
2063 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2064 	}
2065 
2066 	if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
2067 		can_steal = TRUE;
2068 	} else {
2069 		if (!delay_speculative_age) {
2070 			mach_timespec_t ts_fully_aged;
2071 
2072 			ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
2073 			ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
2074 			    * 1000 * NSEC_PER_USEC;
2075 
2076 			ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2077 
2078 			clock_sec_t sec;
2079 			clock_nsec_t nsec;
2080 			clock_get_system_nanotime(&sec, &nsec);
2081 			ts.tv_sec = (unsigned int) sec;
2082 			ts.tv_nsec = nsec;
2083 
2084 			if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
2085 				can_steal = TRUE;
2086 			} else {
2087 				delay_speculative_age++;
2088 			}
2089 		} else {
2090 			delay_speculative_age++;
2091 			if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
2092 				delay_speculative_age = 0;
2093 			}
2094 		}
2095 	}
2096 	if (can_steal == TRUE) {
2097 		vm_page_speculate_ageit(aq);
2098 	}
2099 
2100 	return VM_PAGEOUT_SCAN_PROCEED;
2101 }
2102 
2103 /*
2104  * This function is called only from vm_pageout_scan and
2105  * it evicts a single VM object from the cache.
2106  */
2107 static int inline
vps_object_cache_evict(vm_object_t * object_to_unlock)2108 vps_object_cache_evict(vm_object_t *object_to_unlock)
2109 {
2110 	static int                      cache_evict_throttle = 0;
2111 	struct vm_speculative_age_q     *sq;
2112 
2113 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2114 
2115 	if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2116 		int     pages_evicted;
2117 
2118 		if (*object_to_unlock != NULL) {
2119 			vm_object_unlock(*object_to_unlock);
2120 			*object_to_unlock = NULL;
2121 		}
2122 		KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
2123 
2124 		pages_evicted = vm_object_cache_evict(100, 10);
2125 
2126 		KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
2127 
2128 		if (pages_evicted) {
2129 			vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2130 
2131 			VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2132 			    vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2133 			memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2134 
2135 			/*
2136 			 * we just freed up to 100 pages,
2137 			 * so go back to the top of the main loop
2138 			 * and re-evaulate the memory situation
2139 			 */
2140 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2141 		} else {
2142 			cache_evict_throttle = 1000;
2143 		}
2144 	}
2145 	if (cache_evict_throttle) {
2146 		cache_evict_throttle--;
2147 	}
2148 
2149 	return VM_PAGEOUT_SCAN_PROCEED;
2150 }
2151 
2152 
2153 /*
2154  * This function is called only from vm_pageout_scan and
2155  * it calculates the filecache min. that needs to be maintained
2156  * as we start to steal pages.
2157  */
2158 static void
vps_calculate_filecache_min(void)2159 vps_calculate_filecache_min(void)
2160 {
2161 	int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2162 
2163 #if CONFIG_JETSAM
2164 	/*
2165 	 * don't let the filecache_min fall below 15% of available memory
2166 	 * on systems with an active compressor that isn't nearing its
2167 	 * limits w/r to accepting new data
2168 	 *
2169 	 * on systems w/o the compressor/swapper, the filecache is always
2170 	 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2171 	 * since most (if not all) of the anonymous pages are in the
2172 	 * throttled queue (which isn't counted as available) which
2173 	 * effectively disables this filter
2174 	 */
2175 	if (vm_compressor_low_on_space() || divisor == 0) {
2176 		vm_pageout_state.vm_page_filecache_min = 0;
2177 	} else {
2178 		vm_pageout_state.vm_page_filecache_min =
2179 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2180 	}
2181 #else
2182 	if (vm_compressor_out_of_space() || divisor == 0) {
2183 		vm_pageout_state.vm_page_filecache_min = 0;
2184 	} else {
2185 		/*
2186 		 * don't let the filecache_min fall below the specified critical level
2187 		 */
2188 		vm_pageout_state.vm_page_filecache_min =
2189 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2190 	}
2191 #endif
2192 	if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2193 		vm_pageout_state.vm_page_filecache_min = 0;
2194 	}
2195 }
2196 
2197 /*
2198  * This function is called only from vm_pageout_scan and
2199  * it updates the flow control time to detect if VM pageoutscan
2200  * isn't making progress.
2201  */
2202 static void
vps_flow_control_reset_deadlock_timer(struct flow_control * flow_control)2203 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2204 {
2205 	mach_timespec_t ts;
2206 	clock_sec_t sec;
2207 	clock_nsec_t nsec;
2208 
2209 	ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2210 	ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2211 	clock_get_system_nanotime(&sec, &nsec);
2212 	flow_control->ts.tv_sec = (unsigned int) sec;
2213 	flow_control->ts.tv_nsec = nsec;
2214 	ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2215 
2216 	flow_control->state = FCS_DELAYED;
2217 
2218 	vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2219 }
2220 
2221 /*
2222  * This function is called only from vm_pageout_scan and
2223  * it is the flow control logic of VM pageout scan which
2224  * controls if it should block and for how long.
2225  * Any blocking of vm_pageout_scan happens ONLY in this function.
2226  */
2227 static int
vps_flow_control(struct flow_control * flow_control,int * anons_grabbed,vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int * vm_pageout_deadlock_target,unsigned int inactive_burst_count)2228 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2229     vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2230 {
2231 	boolean_t       exceeded_burst_throttle = FALSE;
2232 	unsigned int    msecs = 0;
2233 	uint32_t        inactive_external_count;
2234 	mach_timespec_t ts;
2235 	struct  vm_pageout_queue *iq;
2236 	struct  vm_pageout_queue *eq;
2237 	struct  vm_speculative_age_q *sq;
2238 
2239 	iq = &vm_pageout_queue_internal;
2240 	eq = &vm_pageout_queue_external;
2241 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2242 
2243 	/*
2244 	 * Sometimes we have to pause:
2245 	 *	1) No inactive pages - nothing to do.
2246 	 *	2) Loop control - no acceptable pages found on the inactive queue
2247 	 *         within the last vm_pageout_burst_inactive_throttle iterations
2248 	 *	3) Flow control - default pageout queue is full
2249 	 */
2250 	if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2251 	    vm_page_queue_empty(&vm_page_queue_anonymous) &&
2252 	    vm_page_queue_empty(&vm_page_queue_cleaned) &&
2253 	    vm_page_queue_empty(&sq->age_q)) {
2254 		VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2255 		msecs = vm_pageout_state.vm_pageout_empty_wait;
2256 	} else if (inactive_burst_count >=
2257 	    MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2258 	    (vm_page_inactive_count +
2259 	    vm_page_speculative_count))) {
2260 		VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2261 		msecs = vm_pageout_state.vm_pageout_burst_wait;
2262 
2263 		exceeded_burst_throttle = TRUE;
2264 	} else if (VM_PAGE_Q_THROTTLED(iq) &&
2265 	    VM_DYNAMIC_PAGING_ENABLED()) {
2266 		clock_sec_t sec;
2267 		clock_nsec_t nsec;
2268 
2269 		switch (flow_control->state) {
2270 		case FCS_IDLE:
2271 			if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2272 			    vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2273 				/*
2274 				 * since the compressor is running independently of vm_pageout_scan
2275 				 * let's not wait for it just yet... as long as we have a healthy supply
2276 				 * of filecache pages to work with, let's keep stealing those.
2277 				 */
2278 				inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2279 
2280 				if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2281 				    (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2282 					*anons_grabbed = ANONS_GRABBED_LIMIT;
2283 					VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2284 					return VM_PAGEOUT_SCAN_PROCEED;
2285 				}
2286 			}
2287 
2288 			vps_flow_control_reset_deadlock_timer(flow_control);
2289 			msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2290 
2291 			break;
2292 
2293 		case FCS_DELAYED:
2294 			clock_get_system_nanotime(&sec, &nsec);
2295 			ts.tv_sec = (unsigned int) sec;
2296 			ts.tv_nsec = nsec;
2297 
2298 			if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2299 				/*
2300 				 * the pageout thread for the default pager is potentially
2301 				 * deadlocked since the
2302 				 * default pager queue has been throttled for more than the
2303 				 * allowable time... we need to move some clean pages or dirty
2304 				 * pages belonging to the external pagers if they aren't throttled
2305 				 * vm_page_free_wanted represents the number of threads currently
2306 				 * blocked waiting for pages... we'll move one page for each of
2307 				 * these plus a fixed amount to break the logjam... once we're done
2308 				 * moving this number of pages, we'll re-enter the FSC_DELAYED state
2309 				 * with a new timeout target since we have no way of knowing
2310 				 * whether we've broken the deadlock except through observation
2311 				 * of the queue associated with the default pager... we need to
2312 				 * stop moving pages and allow the system to run to see what
2313 				 * state it settles into.
2314 				 */
2315 
2316 				*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2317 				    vm_page_free_wanted + vm_page_free_wanted_privileged;
2318 				VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2319 				flow_control->state = FCS_DEADLOCK_DETECTED;
2320 				thread_wakeup(VM_PAGEOUT_GC_EVENT);
2321 				return VM_PAGEOUT_SCAN_PROCEED;
2322 			}
2323 			/*
2324 			 * just resniff instead of trying
2325 			 * to compute a new delay time... we're going to be
2326 			 * awakened immediately upon a laundry completion,
2327 			 * so we won't wait any longer than necessary
2328 			 */
2329 			msecs = vm_pageout_state.vm_pageout_idle_wait;
2330 			break;
2331 
2332 		case FCS_DEADLOCK_DETECTED:
2333 			if (*vm_pageout_deadlock_target) {
2334 				return VM_PAGEOUT_SCAN_PROCEED;
2335 			}
2336 
2337 			vps_flow_control_reset_deadlock_timer(flow_control);
2338 			msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2339 
2340 			break;
2341 		}
2342 	} else {
2343 		/*
2344 		 * No need to pause...
2345 		 */
2346 		return VM_PAGEOUT_SCAN_PROCEED;
2347 	}
2348 
2349 	vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2350 
2351 	vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2352 	    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2353 
2354 	if (vm_page_free_count >= vm_page_free_target) {
2355 		/*
2356 		 * we're here because
2357 		 *  1) someone else freed up some pages while we had
2358 		 *     the queues unlocked above
2359 		 * and we've hit one of the 3 conditions that
2360 		 * cause us to pause the pageout scan thread
2361 		 *
2362 		 * since we already have enough free pages,
2363 		 * let's avoid stalling and return normally
2364 		 *
2365 		 * before we return, make sure the pageout I/O threads
2366 		 * are running throttled in case there are still requests
2367 		 * in the laundry... since we have enough free pages
2368 		 * we don't need the laundry to be cleaned in a timely
2369 		 * fashion... so let's avoid interfering with foreground
2370 		 * activity
2371 		 *
2372 		 * we don't want to hold vm_page_queue_free_lock when
2373 		 * calling vm_pageout_adjust_eq_iothrottle (since it
2374 		 * may cause other locks to be taken), we do the intitial
2375 		 * check outside of the lock.  Once we take the lock,
2376 		 * we recheck the condition since it may have changed.
2377 		 * if it has, no problem, we will make the threads
2378 		 * non-throttled before actually blocking
2379 		 */
2380 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
2381 	}
2382 	vm_free_page_lock();
2383 
2384 	if (vm_page_free_count >= vm_page_free_target &&
2385 	    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2386 		return VM_PAGEOUT_SCAN_DONE_RETURN;
2387 	}
2388 	vm_free_page_unlock();
2389 
2390 	if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2391 		/*
2392 		 * we're most likely about to block due to one of
2393 		 * the 3 conditions that cause vm_pageout_scan to
2394 		 * not be able to make forward progress w/r
2395 		 * to providing new pages to the free queue,
2396 		 * so unthrottle the I/O threads in case we
2397 		 * have laundry to be cleaned... it needs
2398 		 * to be completed ASAP.
2399 		 *
2400 		 * even if we don't block, we want the io threads
2401 		 * running unthrottled since the sum of free +
2402 		 * clean pages is still under our free target
2403 		 */
2404 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2405 	}
2406 	if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2407 		/*
2408 		 * if we get here we're below our free target and
2409 		 * we're stalling due to a full laundry queue or
2410 		 * we don't have any inactive pages other then
2411 		 * those in the clean queue...
2412 		 * however, we have pages on the clean queue that
2413 		 * can be moved to the free queue, so let's not
2414 		 * stall the pageout scan
2415 		 */
2416 		flow_control->state = FCS_IDLE;
2417 		return VM_PAGEOUT_SCAN_PROCEED;
2418 	}
2419 	if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2420 		flow_control->state = FCS_IDLE;
2421 		return VM_PAGEOUT_SCAN_PROCEED;
2422 	}
2423 
2424 	VM_CHECK_MEMORYSTATUS;
2425 
2426 	if (flow_control->state != FCS_IDLE) {
2427 		VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2428 	}
2429 
2430 	iq->pgo_throttled = TRUE;
2431 	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2432 
2433 	vm_page_unlock_queues();
2434 
2435 	assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2436 
2437 	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2438 	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2439 	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2440 
2441 	thread_block(THREAD_CONTINUE_NULL);
2442 
2443 	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2444 	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2445 	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2446 
2447 	vm_page_lock_queues();
2448 
2449 	iq->pgo_throttled = FALSE;
2450 
2451 	vps_init_page_targets();
2452 
2453 	return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2454 }
2455 
2456 extern boolean_t vm_darkwake_mode;
2457 /*
2458  * This function is called only from vm_pageout_scan and
2459  * it will find and return the most appropriate page to be
2460  * reclaimed.
2461  */
2462 static int
vps_choose_victim_page(vm_page_t * victim_page,int * anons_grabbed,boolean_t * grab_anonymous,boolean_t force_anonymous,boolean_t * is_page_from_bg_q,unsigned int * reactivated_this_call)2463 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2464     boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2465 {
2466 	vm_page_t                       m = NULL;
2467 	vm_object_t                     m_object = VM_OBJECT_NULL;
2468 	uint32_t                        inactive_external_count;
2469 	struct vm_speculative_age_q     *sq;
2470 	struct vm_pageout_queue         *iq;
2471 	int                             retval = VM_PAGEOUT_SCAN_PROCEED;
2472 
2473 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2474 	iq = &vm_pageout_queue_internal;
2475 
2476 	*is_page_from_bg_q = FALSE;
2477 
2478 	m = NULL;
2479 	m_object = VM_OBJECT_NULL;
2480 
2481 	if (VM_DYNAMIC_PAGING_ENABLED()) {
2482 		assert(vm_page_throttled_count == 0);
2483 		assert(vm_page_queue_empty(&vm_page_queue_throttled));
2484 	}
2485 
2486 	/*
2487 	 * Try for a clean-queue inactive page.
2488 	 * These are pages that vm_pageout_scan tried to steal earlier, but
2489 	 * were dirty and had to be cleaned.  Pick them up now that they are clean.
2490 	 */
2491 	if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2492 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2493 
2494 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2495 
2496 		goto found_page;
2497 	}
2498 
2499 	/*
2500 	 * The next most eligible pages are ones we paged in speculatively,
2501 	 * but which have not yet been touched and have been aged out.
2502 	 */
2503 	if (!vm_page_queue_empty(&sq->age_q)) {
2504 		m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2505 
2506 		assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2507 
2508 		if (!m->vmp_dirty || force_anonymous == FALSE) {
2509 			goto found_page;
2510 		} else {
2511 			m = NULL;
2512 		}
2513 	}
2514 
2515 #if !CONFIG_JETSAM
2516 	if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2517 		if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2518 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2519 			assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2520 			goto found_page;
2521 		}
2522 	}
2523 #endif /* !CONFIG_JETSAM */
2524 
2525 	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2526 		vm_object_t     bg_m_object = NULL;
2527 
2528 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2529 
2530 		bg_m_object = VM_PAGE_OBJECT(m);
2531 
2532 		if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2533 			/*
2534 			 * This page is on the background queue
2535 			 * but not on a pageable queue OR is busy during
2536 			 * darkwake mode when the target is artificially lowered.
2537 			 * If it is busy during darkwake mode, and we don't skip it,
2538 			 * we will just swing back around and try again with the same
2539 			 * queue and might hit the same page or its neighbor in a
2540 			 * similar state. Both of these are transient states and will
2541 			 * get resolved, but, at this point let's ignore this page.
2542 			 */
2543 			if (vm_darkwake_mode && m->vmp_busy) {
2544 				if (bg_m_object->internal) {
2545 					vm_pageout_skipped_bq_internal++;
2546 				} else {
2547 					vm_pageout_skipped_bq_external++;
2548 				}
2549 			}
2550 		} else if (force_anonymous == FALSE || bg_m_object->internal) {
2551 			if (bg_m_object->internal &&
2552 			    (VM_PAGE_Q_THROTTLED(iq) ||
2553 			    vm_compressor_out_of_space() == TRUE ||
2554 			    vm_page_free_count < (vm_page_free_reserved / 4))) {
2555 				vm_pageout_skipped_bq_internal++;
2556 			} else {
2557 				*is_page_from_bg_q = TRUE;
2558 
2559 				if (bg_m_object->internal) {
2560 					vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2561 				} else {
2562 					vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2563 				}
2564 				goto found_page;
2565 			}
2566 		}
2567 	}
2568 
2569 	inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2570 
2571 	if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2572 	    (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2573 		*grab_anonymous = TRUE;
2574 		*anons_grabbed = 0;
2575 
2576 		if (VM_CONFIG_SWAP_IS_ACTIVE) {
2577 			vm_pageout_vminfo.vm_pageout_skipped_external++;
2578 		} else {
2579 			if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2580 				/*
2581 				 * No swap and we are in dangerously low levels of free memory.
2582 				 * If we keep going ahead with anonymous pages, we are going to run into a situation
2583 				 * where the compressor will be stuck waiting for free pages (if it isn't already).
2584 				 *
2585 				 * So, pick a file backed page...
2586 				 */
2587 				*grab_anonymous = FALSE;
2588 				*anons_grabbed = ANONS_GRABBED_LIMIT;
2589 				vm_pageout_vminfo.vm_pageout_skipped_internal++;
2590 			}
2591 		}
2592 		goto want_anonymous;
2593 	}
2594 	*grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2595 
2596 #if CONFIG_JETSAM
2597 	/* If the file-backed pool has accumulated
2598 	 * significantly more pages than the jetsam
2599 	 * threshold, prefer to reclaim those
2600 	 * inline to minimise compute overhead of reclaiming
2601 	 * anonymous pages.
2602 	 * This calculation does not account for the CPU local
2603 	 * external page queues, as those are expected to be
2604 	 * much smaller relative to the global pools.
2605 	 */
2606 
2607 	struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2608 
2609 	if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2610 		if (vm_page_pageable_external_count >
2611 		    vm_pageout_state.vm_page_filecache_min) {
2612 			if ((vm_page_pageable_external_count *
2613 			    vm_pageout_memorystatus_fb_factor_dr) >
2614 			    (memorystatus_available_pages_critical *
2615 			    vm_pageout_memorystatus_fb_factor_nr)) {
2616 				*grab_anonymous = FALSE;
2617 
2618 				VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2619 			}
2620 		}
2621 		if (*grab_anonymous) {
2622 			VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2623 		}
2624 	}
2625 #endif /* CONFIG_JETSAM */
2626 
2627 want_anonymous:
2628 	if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2629 		if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2630 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2631 
2632 			assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2633 			*anons_grabbed = 0;
2634 
2635 			if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2636 				if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2637 					if ((++(*reactivated_this_call) % 100)) {
2638 						vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2639 
2640 						vm_page_activate(m);
2641 						counter_inc(&vm_statistics_reactivations);
2642 #if DEVELOPMENT || DEBUG
2643 						if (*is_page_from_bg_q == TRUE) {
2644 							if (m_object->internal) {
2645 								vm_pageout_rejected_bq_internal++;
2646 							} else {
2647 								vm_pageout_rejected_bq_external++;
2648 							}
2649 						}
2650 #endif /* DEVELOPMENT || DEBUG */
2651 						vm_pageout_state.vm_pageout_inactive_used++;
2652 
2653 						m = NULL;
2654 						retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2655 
2656 						goto found_page;
2657 					}
2658 
2659 					/*
2660 					 * steal 1 of the file backed pages even if
2661 					 * we are under the limit that has been set
2662 					 * for a healthy filecache
2663 					 */
2664 				}
2665 			}
2666 			goto found_page;
2667 		}
2668 	}
2669 	if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2670 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2671 
2672 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2673 		*anons_grabbed += 1;
2674 
2675 		goto found_page;
2676 	}
2677 
2678 	m = NULL;
2679 
2680 found_page:
2681 	*victim_page = m;
2682 
2683 	return retval;
2684 }
2685 
2686 /*
2687  * This function is called only from vm_pageout_scan and
2688  * it will put a page back on the active/inactive queue
2689  * if we can't reclaim it for some reason.
2690  */
2691 static void
vps_requeue_page(vm_page_t m,int page_prev_q_state,__unused boolean_t page_from_bg_q)2692 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2693 {
2694 	if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2695 		vm_page_enqueue_inactive(m, FALSE);
2696 	} else {
2697 		vm_page_activate(m);
2698 	}
2699 
2700 #if DEVELOPMENT || DEBUG
2701 	vm_object_t m_object = VM_PAGE_OBJECT(m);
2702 
2703 	if (page_from_bg_q == TRUE) {
2704 		if (m_object->internal) {
2705 			vm_pageout_rejected_bq_internal++;
2706 		} else {
2707 			vm_pageout_rejected_bq_external++;
2708 		}
2709 	}
2710 #endif /* DEVELOPMENT || DEBUG */
2711 }
2712 
2713 /*
2714  * This function is called only from vm_pageout_scan and
2715  * it will try to grab the victim page's VM object (m_object)
2716  * which differs from the previous victim page's object (object).
2717  */
2718 static int
vps_switch_object(vm_page_t m,vm_object_t m_object,vm_object_t * object,int page_prev_q_state,boolean_t avoid_anon_pages,boolean_t page_from_bg_q)2719 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2720 {
2721 	struct vm_speculative_age_q *sq;
2722 
2723 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2724 
2725 	/*
2726 	 * the object associated with candidate page is
2727 	 * different from the one we were just working
2728 	 * with... dump the lock if we still own it
2729 	 */
2730 	if (*object != NULL) {
2731 		vm_object_unlock(*object);
2732 		*object = NULL;
2733 	}
2734 	/*
2735 	 * Try to lock object; since we've alread got the
2736 	 * page queues lock, we can only 'try' for this one.
2737 	 * if the 'try' fails, we need to do a mutex_pause
2738 	 * to allow the owner of the object lock a chance to
2739 	 * run... otherwise, we're likely to trip over this
2740 	 * object in the same state as we work our way through
2741 	 * the queue... clumps of pages associated with the same
2742 	 * object are fairly typical on the inactive and active queues
2743 	 */
2744 	if (!vm_object_lock_try_scan(m_object)) {
2745 		vm_page_t m_want = NULL;
2746 
2747 		vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2748 
2749 		if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2750 			VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2751 		}
2752 
2753 		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2754 
2755 		m->vmp_reference = FALSE;
2756 
2757 		if (!m_object->object_is_shared_cache) {
2758 			/*
2759 			 * don't apply this optimization if this is the shared cache
2760 			 * object, it's too easy to get rid of very hot and important
2761 			 * pages...
2762 			 * m->vmp_object must be stable since we hold the page queues lock...
2763 			 * we can update the scan_collisions field sans the object lock
2764 			 * since it is a separate field and this is the only spot that does
2765 			 * a read-modify-write operation and it is never executed concurrently...
2766 			 * we can asynchronously set this field to 0 when creating a UPL, so it
2767 			 * is possible for the value to be a bit non-determistic, but that's ok
2768 			 * since it's only used as a hint
2769 			 */
2770 			m_object->scan_collisions = 1;
2771 		}
2772 		if (page_from_bg_q) {
2773 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2774 		} else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2775 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2776 		} else if (!vm_page_queue_empty(&sq->age_q)) {
2777 			m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2778 		} else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2779 		    !vm_page_queue_empty(&vm_page_queue_inactive)) {
2780 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2781 		} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2782 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2783 		}
2784 
2785 		/*
2786 		 * this is the next object we're going to be interested in
2787 		 * try to make sure its available after the mutex_pause
2788 		 * returns control
2789 		 */
2790 		if (m_want) {
2791 			vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2792 		}
2793 
2794 		vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2795 
2796 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2797 	} else {
2798 		*object = m_object;
2799 		vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2800 	}
2801 
2802 	return VM_PAGEOUT_SCAN_PROCEED;
2803 }
2804 
2805 /*
2806  * This function is called only from vm_pageout_scan and
2807  * it notices that pageout scan may be rendered ineffective
2808  * due to a FS deadlock and will jetsam a process if possible.
2809  * If jetsam isn't supported, it'll move the page to the active
2810  * queue to try and get some different pages pushed onwards so
2811  * we can try to get out of this scenario.
2812  */
2813 static void
vps_deal_with_throttled_queues(vm_page_t m,vm_object_t * object,uint32_t * vm_pageout_inactive_external_forced_reactivate_limit,int * delayed_unlock,boolean_t * force_anonymous,__unused boolean_t is_page_from_bg_q)2814 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2815     int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2816 {
2817 	struct  vm_pageout_queue *eq;
2818 	vm_object_t cur_object = VM_OBJECT_NULL;
2819 
2820 	cur_object = *object;
2821 
2822 	eq = &vm_pageout_queue_external;
2823 
2824 	if (cur_object->internal == FALSE) {
2825 		/*
2826 		 * we need to break up the following potential deadlock case...
2827 		 *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2828 		 *  b) The thread doing the writing is waiting for pages while holding the truncate lock
2829 		 *  c) Most of the pages in the inactive queue belong to this file.
2830 		 *
2831 		 * we are potentially in this deadlock because...
2832 		 *  a) the external pageout queue is throttled
2833 		 *  b) we're done with the active queue and moved on to the inactive queue
2834 		 *  c) we've got a dirty external page
2835 		 *
2836 		 * since we don't know the reason for the external pageout queue being throttled we
2837 		 * must suspect that we are deadlocked, so move the current page onto the active queue
2838 		 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2839 		 *
2840 		 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2841 		 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2842 		 * pool the next time we select a victim page... if we can make enough new free pages,
2843 		 * the deadlock will break, the external pageout queue will empty and it will no longer
2844 		 * be throttled
2845 		 *
2846 		 * if we have jetsam configured, keep a count of the pages reactivated this way so
2847 		 * that we can try to find clean pages in the active/inactive queues before
2848 		 * deciding to jetsam a process
2849 		 */
2850 		vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2851 
2852 		vm_page_check_pageable_safe(m);
2853 		assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2854 		vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2855 		m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2856 		vm_page_active_count++;
2857 		vm_page_pageable_external_count++;
2858 
2859 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2860 
2861 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2862 
2863 #pragma unused(force_anonymous)
2864 
2865 		*vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2866 
2867 		if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2868 			*vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2869 			/*
2870 			 * Possible deadlock scenario so request jetsam action
2871 			 */
2872 
2873 			assert(cur_object);
2874 			vm_object_unlock(cur_object);
2875 
2876 			cur_object = VM_OBJECT_NULL;
2877 
2878 			/*
2879 			 * VM pageout scan needs to know we have dropped this lock and so set the
2880 			 * object variable we got passed in to NULL.
2881 			 */
2882 			*object = VM_OBJECT_NULL;
2883 
2884 			vm_page_unlock_queues();
2885 
2886 			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
2887 			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2888 
2889 			/* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
2890 			if (memorystatus_kill_on_VM_page_shortage() == TRUE) {
2891 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
2892 			}
2893 
2894 			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
2895 			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2896 
2897 			vm_page_lock_queues();
2898 			*delayed_unlock = 1;
2899 		}
2900 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2901 
2902 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2903 #pragma unused(delayed_unlock)
2904 
2905 		*force_anonymous = TRUE;
2906 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2907 	} else {
2908 		vm_page_activate(m);
2909 		counter_inc(&vm_statistics_reactivations);
2910 
2911 #if DEVELOPMENT || DEBUG
2912 		if (is_page_from_bg_q == TRUE) {
2913 			if (cur_object->internal) {
2914 				vm_pageout_rejected_bq_internal++;
2915 			} else {
2916 				vm_pageout_rejected_bq_external++;
2917 			}
2918 		}
2919 #endif /* DEVELOPMENT || DEBUG */
2920 
2921 		vm_pageout_state.vm_pageout_inactive_used++;
2922 	}
2923 }
2924 
2925 
2926 void
vm_page_balance_inactive(int max_to_move)2927 vm_page_balance_inactive(int max_to_move)
2928 {
2929 	vm_page_t m;
2930 
2931 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2932 
2933 	if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2934 		/*
2935 		 * It is likely that the hibernation code path is
2936 		 * dealing with these very queues as we are about
2937 		 * to move pages around in/from them and completely
2938 		 * change the linkage of the pages.
2939 		 *
2940 		 * And so we skip the rebalancing of these queues.
2941 		 */
2942 		return;
2943 	}
2944 	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2945 	    vm_page_inactive_count +
2946 	    vm_page_speculative_count);
2947 
2948 	while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2949 		VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2950 
2951 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2952 
2953 		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2954 		assert(!m->vmp_laundry);
2955 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
2956 		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2957 
2958 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2959 
2960 		/*
2961 		 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2962 		 *
2963 		 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2964 		 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2965 		 * new reference happens. If no futher references happen on the page after that remote TLB flushes
2966 		 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2967 		 * by pageout_scan, which is just fine since the last reference would have happened quite far
2968 		 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2969 		 * have happened before we moved the page
2970 		 */
2971 		if (m->vmp_pmapped == TRUE) {
2972 			/*
2973 			 * We might be holding the page queue lock as a
2974 			 * spin lock and clearing the "referenced" bit could
2975 			 * take a while if there are lots of mappings of
2976 			 * that page, so make sure we acquire the lock as
2977 			 * as mutex to avoid a spinlock timeout.
2978 			 */
2979 			vm_page_lockconvert_queues();
2980 			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2981 		}
2982 
2983 		/*
2984 		 * The page might be absent or busy,
2985 		 * but vm_page_deactivate can handle that.
2986 		 * FALSE indicates that we don't want a H/W clear reference
2987 		 */
2988 		vm_page_deactivate_internal(m, FALSE);
2989 	}
2990 }
2991 
2992 /*
2993  *	vm_pageout_scan does the dirty work for the pageout daemon.
2994  *	It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2995  *	held and vm_page_free_wanted == 0.
2996  */
2997 void
vm_pageout_scan(void)2998 vm_pageout_scan(void)
2999 {
3000 	unsigned int loop_count = 0;
3001 	unsigned int inactive_burst_count = 0;
3002 	unsigned int reactivated_this_call;
3003 	unsigned int reactivate_limit;
3004 	vm_page_t   local_freeq = NULL;
3005 	int         local_freed = 0;
3006 	int         delayed_unlock;
3007 	int         delayed_unlock_limit = 0;
3008 	int         refmod_state = 0;
3009 	int     vm_pageout_deadlock_target = 0;
3010 	struct  vm_pageout_queue *iq;
3011 	struct  vm_pageout_queue *eq;
3012 	struct  vm_speculative_age_q *sq;
3013 	struct  flow_control    flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
3014 	boolean_t inactive_throttled = FALSE;
3015 	vm_object_t     object = NULL;
3016 	uint32_t        inactive_reclaim_run;
3017 	boolean_t       grab_anonymous = FALSE;
3018 	boolean_t       force_anonymous = FALSE;
3019 	boolean_t       force_speculative_aging = FALSE;
3020 	int             anons_grabbed = 0;
3021 	int             page_prev_q_state = 0;
3022 	boolean_t       page_from_bg_q = FALSE;
3023 	uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
3024 	vm_object_t     m_object = VM_OBJECT_NULL;
3025 	int             retval = 0;
3026 	boolean_t       lock_yield_check = FALSE;
3027 
3028 
3029 	VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
3030 	    vm_pageout_vminfo.vm_pageout_freed_speculative,
3031 	    vm_pageout_state.vm_pageout_inactive_clean,
3032 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3033 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3034 
3035 	flow_control.state = FCS_IDLE;
3036 	iq = &vm_pageout_queue_internal;
3037 	eq = &vm_pageout_queue_external;
3038 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3039 
3040 	/* Ask the pmap layer to return any pages it no longer needs. */
3041 	pmap_release_pages_fast();
3042 
3043 	vm_page_lock_queues();
3044 
3045 	delayed_unlock = 1;
3046 
3047 	/*
3048 	 *	Calculate the max number of referenced pages on the inactive
3049 	 *	queue that we will reactivate.
3050 	 */
3051 	reactivated_this_call = 0;
3052 	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3053 	    vm_page_inactive_count);
3054 	inactive_reclaim_run = 0;
3055 
3056 	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3057 
3058 	/*
3059 	 *	We must limit the rate at which we send pages to the pagers
3060 	 *	so that we don't tie up too many pages in the I/O queues.
3061 	 *	We implement a throttling mechanism using the laundry count
3062 	 *      to limit the number of pages outstanding to the default
3063 	 *	and external pagers.  We can bypass the throttles and look
3064 	 *	for clean pages if the pageout queues don't drain in a timely
3065 	 *	fashion since this may indicate that the pageout paths are
3066 	 *	stalled waiting for memory, which only we can provide.
3067 	 */
3068 
3069 	vps_init_page_targets();
3070 	assert(object == NULL);
3071 	assert(delayed_unlock != 0);
3072 
3073 	for (;;) {
3074 		vm_page_t m;
3075 
3076 		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3077 
3078 		if (lock_yield_check) {
3079 			lock_yield_check = FALSE;
3080 
3081 			if (delayed_unlock++ > delayed_unlock_limit) {
3082 				vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3083 				    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3084 			} else if (vm_pageout_scan_wants_object) {
3085 				vm_page_unlock_queues();
3086 				mutex_pause(0);
3087 				vm_page_lock_queues();
3088 			} else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3089 				VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3090 			}
3091 		}
3092 
3093 		if (vm_upl_wait_for_pages < 0) {
3094 			vm_upl_wait_for_pages = 0;
3095 		}
3096 
3097 		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3098 
3099 		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3100 			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3101 		}
3102 
3103 		vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3104 
3105 		assert(delayed_unlock);
3106 
3107 		/*
3108 		 * maintain our balance
3109 		 */
3110 		vm_page_balance_inactive(1);
3111 
3112 
3113 		/**********************************************************************
3114 		* above this point we're playing with the active and secluded queues
3115 		* below this point we're playing with the throttling mechanisms
3116 		* and the inactive queue
3117 		**********************************************************************/
3118 
3119 		if (vm_page_free_count + local_freed >= vm_page_free_target) {
3120 			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3121 
3122 			vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3123 			    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3124 			/*
3125 			 * make sure the pageout I/O threads are running
3126 			 * throttled in case there are still requests
3127 			 * in the laundry... since we have met our targets
3128 			 * we don't need the laundry to be cleaned in a timely
3129 			 * fashion... so let's avoid interfering with foreground
3130 			 * activity
3131 			 */
3132 			vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3133 
3134 			vm_free_page_lock();
3135 
3136 			if ((vm_page_free_count >= vm_page_free_target) &&
3137 			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3138 				/*
3139 				 * done - we have met our target *and*
3140 				 * there is no one waiting for a page.
3141 				 */
3142 return_from_scan:
3143 				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3144 
3145 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3146 				    vm_pageout_state.vm_pageout_inactive,
3147 				    vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3148 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
3149 				    vm_pageout_vminfo.vm_pageout_freed_speculative,
3150 				    vm_pageout_state.vm_pageout_inactive_clean,
3151 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3152 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3153 
3154 				return;
3155 			}
3156 			vm_free_page_unlock();
3157 		}
3158 
3159 		/*
3160 		 * Before anything, we check if we have any ripe volatile
3161 		 * objects around. If so, try to purge the first object.
3162 		 * If the purge fails, fall through to reclaim a page instead.
3163 		 * If the purge succeeds, go back to the top and reevaluate
3164 		 * the new memory situation.
3165 		 */
3166 		retval = vps_purge_object();
3167 
3168 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3169 			/*
3170 			 * Success
3171 			 */
3172 			if (object != NULL) {
3173 				vm_object_unlock(object);
3174 				object = NULL;
3175 			}
3176 
3177 			lock_yield_check = FALSE;
3178 			continue;
3179 		}
3180 
3181 		/*
3182 		 * If our 'aged' queue is empty and we have some speculative pages
3183 		 * in the other queues, let's go through and see if we need to age
3184 		 * them.
3185 		 *
3186 		 * If we succeeded in aging a speculative Q or just that everything
3187 		 * looks normal w.r.t queue age and queue counts, we keep going onward.
3188 		 *
3189 		 * If, for some reason, we seem to have a mismatch between the spec.
3190 		 * page count and the page queues, we reset those variables and
3191 		 * restart the loop (LD TODO: Track this better?).
3192 		 */
3193 		if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3194 			retval = vps_age_speculative_queue(force_speculative_aging);
3195 
3196 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3197 				lock_yield_check = FALSE;
3198 				continue;
3199 			}
3200 		}
3201 		force_speculative_aging = FALSE;
3202 
3203 		/*
3204 		 * Check to see if we need to evict objects from the cache.
3205 		 *
3206 		 * Note: 'object' here doesn't have anything to do with
3207 		 * the eviction part. We just need to make sure we have dropped
3208 		 * any object lock we might be holding if we need to go down
3209 		 * into the eviction logic.
3210 		 */
3211 		retval = vps_object_cache_evict(&object);
3212 
3213 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3214 			lock_yield_check = FALSE;
3215 			continue;
3216 		}
3217 
3218 
3219 		/*
3220 		 * Calculate our filecache_min that will affect the loop
3221 		 * going forward.
3222 		 */
3223 		vps_calculate_filecache_min();
3224 
3225 		/*
3226 		 * LD TODO: Use a structure to hold all state variables for a single
3227 		 * vm_pageout_scan iteration and pass that structure to this function instead.
3228 		 */
3229 		retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3230 		    &delayed_unlock, &local_freeq, &local_freed,
3231 		    &vm_pageout_deadlock_target, inactive_burst_count);
3232 
3233 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3234 			if (loop_count >= vm_page_inactive_count) {
3235 				loop_count = 0;
3236 			}
3237 
3238 			inactive_burst_count = 0;
3239 
3240 			assert(object == NULL);
3241 			assert(delayed_unlock != 0);
3242 
3243 			lock_yield_check = FALSE;
3244 			continue;
3245 		} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3246 			goto return_from_scan;
3247 		}
3248 
3249 		flow_control.state = FCS_IDLE;
3250 
3251 		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3252 		    vm_pageout_inactive_external_forced_reactivate_limit);
3253 		loop_count++;
3254 		inactive_burst_count++;
3255 		vm_pageout_state.vm_pageout_inactive++;
3256 
3257 		/*
3258 		 * Choose a victim.
3259 		 */
3260 
3261 		m = NULL;
3262 		retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3263 
3264 		if (m == NULL) {
3265 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3266 				inactive_burst_count = 0;
3267 
3268 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3269 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3270 				}
3271 
3272 				lock_yield_check = TRUE;
3273 				continue;
3274 			}
3275 
3276 			/*
3277 			 * if we've gotten here, we have no victim page.
3278 			 * check to see if we've not finished balancing the queues
3279 			 * or we have a page on the aged speculative queue that we
3280 			 * skipped due to force_anonymous == TRUE.. or we have
3281 			 * speculative  pages that we can prematurely age... if
3282 			 * one of these cases we'll keep going, else panic
3283 			 */
3284 			force_anonymous = FALSE;
3285 			VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3286 
3287 			if (!vm_page_queue_empty(&sq->age_q)) {
3288 				lock_yield_check = TRUE;
3289 				continue;
3290 			}
3291 
3292 			if (vm_page_speculative_count) {
3293 				force_speculative_aging = TRUE;
3294 				lock_yield_check = TRUE;
3295 				continue;
3296 			}
3297 			panic("vm_pageout: no victim");
3298 
3299 			/* NOTREACHED */
3300 		}
3301 
3302 		assert(VM_PAGE_PAGEABLE(m));
3303 		m_object = VM_PAGE_OBJECT(m);
3304 		force_anonymous = FALSE;
3305 
3306 		page_prev_q_state = m->vmp_q_state;
3307 		/*
3308 		 * we just found this page on one of our queues...
3309 		 * it can't also be on the pageout queue, so safe
3310 		 * to call vm_page_queues_remove
3311 		 */
3312 		bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3313 		vm_page_queues_remove(m, TRUE);
3314 		if (donate) {
3315 			/*
3316 			 * The compressor needs to see this bit to know
3317 			 * where this page needs to land. Also if stolen,
3318 			 * this bit helps put the page back in the right
3319 			 * special queue where it belongs.
3320 			 */
3321 			m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3322 		}
3323 
3324 		assert(!m->vmp_laundry);
3325 		assert(!m->vmp_private);
3326 		assert(!m->vmp_fictitious);
3327 		assert(!is_kernel_object(m_object));
3328 		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
3329 
3330 		vm_pageout_vminfo.vm_pageout_considered_page++;
3331 
3332 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3333 
3334 		/*
3335 		 * check to see if we currently are working
3336 		 * with the same object... if so, we've
3337 		 * already got the lock
3338 		 */
3339 		if (m_object != object) {
3340 			boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3341 
3342 			/*
3343 			 * vps_switch_object() will always drop the 'object' lock first
3344 			 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3345 			 * either 'm_object' or NULL.
3346 			 */
3347 			retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3348 
3349 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3350 				lock_yield_check = TRUE;
3351 				continue;
3352 			}
3353 		}
3354 		assert(m_object == object);
3355 		assert(VM_PAGE_OBJECT(m) == m_object);
3356 
3357 		if (m->vmp_busy) {
3358 			/*
3359 			 *	Somebody is already playing with this page.
3360 			 *	Put it back on the appropriate queue
3361 			 *
3362 			 */
3363 			VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3364 
3365 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3366 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3367 			}
3368 
3369 			vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3370 
3371 			lock_yield_check = TRUE;
3372 			continue;
3373 		}
3374 
3375 		/*
3376 		 *   if (m->vmp_cleaning && !m->vmp_free_when_done)
3377 		 *	If already cleaning this page in place
3378 		 *	just leave it off the paging queues.
3379 		 *	We can leave the page mapped, and upl_commit_range
3380 		 *	will put it on the clean queue.
3381 		 *
3382 		 *   if (m->vmp_free_when_done && !m->vmp_cleaning)
3383 		 *	an msync INVALIDATE is in progress...
3384 		 *	this page has been marked for destruction
3385 		 *      after it has been cleaned,
3386 		 *      but not yet gathered into a UPL
3387 		 *	where 'cleaning' will be set...
3388 		 *	just leave it off the paging queues
3389 		 *
3390 		 *   if (m->vmp_free_when_done && m->vmp_cleaning)
3391 		 *	an msync INVALIDATE is in progress
3392 		 *	and the UPL has already gathered this page...
3393 		 *	just leave it off the paging queues
3394 		 */
3395 		if (m->vmp_free_when_done || m->vmp_cleaning) {
3396 			lock_yield_check = TRUE;
3397 			continue;
3398 		}
3399 
3400 
3401 		/*
3402 		 *	If it's absent, in error or the object is no longer alive,
3403 		 *	we can reclaim the page... in the no longer alive case,
3404 		 *	there are 2 states the page can be in that preclude us
3405 		 *	from reclaiming it - busy or cleaning - that we've already
3406 		 *	dealt with
3407 		 */
3408 		if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3409 		    (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3410 			if (m->vmp_absent) {
3411 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3412 			} else if (!object->alive ||
3413 			    (!object->internal &&
3414 			    object->pager == MEMORY_OBJECT_NULL)) {
3415 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3416 			} else {
3417 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3418 			}
3419 reclaim_page:
3420 			if (vm_pageout_deadlock_target) {
3421 				VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3422 				vm_pageout_deadlock_target--;
3423 			}
3424 
3425 			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3426 
3427 			if (object->internal) {
3428 				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3429 			} else {
3430 				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3431 			}
3432 			assert(!m->vmp_cleaning);
3433 			assert(!m->vmp_laundry);
3434 
3435 			if (!object->internal &&
3436 			    object->pager != NULL &&
3437 			    object->pager->mo_pager_ops == &shared_region_pager_ops) {
3438 				shared_region_pager_reclaimed++;
3439 			}
3440 
3441 			m->vmp_busy = TRUE;
3442 
3443 			/*
3444 			 * remove page from object here since we're already
3445 			 * behind the object lock... defer the rest of the work
3446 			 * we'd normally do in vm_page_free_prepare_object
3447 			 * until 'vm_page_free_list' is called
3448 			 */
3449 			if (m->vmp_tabled) {
3450 				vm_page_remove(m, TRUE);
3451 			}
3452 
3453 			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3454 			m->vmp_snext = local_freeq;
3455 			local_freeq = m;
3456 			local_freed++;
3457 
3458 			if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3459 				vm_pageout_vminfo.vm_pageout_freed_speculative++;
3460 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3461 				vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3462 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3463 				vm_pageout_vminfo.vm_pageout_freed_internal++;
3464 			} else {
3465 				vm_pageout_vminfo.vm_pageout_freed_external++;
3466 			}
3467 
3468 			inactive_burst_count = 0;
3469 
3470 			lock_yield_check = TRUE;
3471 			continue;
3472 		}
3473 		if (object->vo_copy == VM_OBJECT_NULL) {
3474 			/*
3475 			 * No one else can have any interest in this page.
3476 			 * If this is an empty purgable object, the page can be
3477 			 * reclaimed even if dirty.
3478 			 * If the page belongs to a volatile purgable object, we
3479 			 * reactivate it if the compressor isn't active.
3480 			 */
3481 			if (object->purgable == VM_PURGABLE_EMPTY) {
3482 				if (m->vmp_pmapped == TRUE) {
3483 					/* unmap the page */
3484 					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3485 					if (refmod_state & VM_MEM_MODIFIED) {
3486 						SET_PAGE_DIRTY(m, FALSE);
3487 					}
3488 				}
3489 				if (m->vmp_dirty || m->vmp_precious) {
3490 					/* we saved the cost of cleaning this page ! */
3491 					vm_page_purged_count++;
3492 				}
3493 				goto reclaim_page;
3494 			}
3495 
3496 			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3497 				/*
3498 				 * With the VM compressor, the cost of
3499 				 * reclaiming a page is much lower (no I/O),
3500 				 * so if we find a "volatile" page, it's better
3501 				 * to let it get compressed rather than letting
3502 				 * it occupy a full page until it gets purged.
3503 				 * So no need to check for "volatile" here.
3504 				 */
3505 			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
3506 				/*
3507 				 * Avoid cleaning a "volatile" page which might
3508 				 * be purged soon.
3509 				 */
3510 
3511 				/* if it's wired, we can't put it on our queue */
3512 				assert(!VM_PAGE_WIRED(m));
3513 
3514 				/* just stick it back on! */
3515 				reactivated_this_call++;
3516 
3517 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3518 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3519 				}
3520 
3521 				goto reactivate_page;
3522 			}
3523 		}
3524 		/*
3525 		 *	If it's being used, reactivate.
3526 		 *	(Fictitious pages are either busy or absent.)
3527 		 *	First, update the reference and dirty bits
3528 		 *	to make sure the page is unreferenced.
3529 		 */
3530 		refmod_state = -1;
3531 
3532 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3533 			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3534 
3535 			if (refmod_state & VM_MEM_REFERENCED) {
3536 				m->vmp_reference = TRUE;
3537 			}
3538 			if (refmod_state & VM_MEM_MODIFIED) {
3539 				SET_PAGE_DIRTY(m, FALSE);
3540 			}
3541 		}
3542 
3543 		if (m->vmp_reference || m->vmp_dirty) {
3544 			/* deal with a rogue "reusable" page */
3545 			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3546 		}
3547 
3548 		if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3549 			vm_pageout_state.vm_page_xpmapped_min = 0;
3550 		} else {
3551 			vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
3552 		}
3553 
3554 		if (!m->vmp_no_cache &&
3555 		    page_from_bg_q == FALSE &&
3556 		    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3557 		    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3558 			/*
3559 			 * The page we pulled off the inactive list has
3560 			 * been referenced.  It is possible for other
3561 			 * processors to be touching pages faster than we
3562 			 * can clear the referenced bit and traverse the
3563 			 * inactive queue, so we limit the number of
3564 			 * reactivations.
3565 			 */
3566 			if (++reactivated_this_call >= reactivate_limit &&
3567 			    !object->object_is_shared_cache &&
3568 			    !((m->vmp_realtime ||
3569 			    object->for_realtime) &&
3570 			    vm_pageout_protect_realtime)) {
3571 				vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3572 			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3573 				vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3574 				if (object->object_is_shared_cache) {
3575 					vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3576 				} else if (m->vmp_realtime ||
3577 				    object->for_realtime) {
3578 					vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3579 				}
3580 			} else {
3581 				uint32_t isinuse;
3582 
3583 				if (reactivated_this_call >= reactivate_limit) {
3584 					if (object->object_is_shared_cache) {
3585 						vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3586 					} else if ((m->vmp_realtime ||
3587 					    object->for_realtime) &&
3588 					    vm_pageout_protect_realtime) {
3589 						vm_pageout_vminfo.vm_pageout_protected_realtime++;
3590 					}
3591 				}
3592 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3593 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3594 				}
3595 
3596 				vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3597 reactivate_page:
3598 				if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3599 				    vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3600 					/*
3601 					 * no explict mappings of this object exist
3602 					 * and it's not open via the filesystem
3603 					 */
3604 					vm_page_deactivate(m);
3605 					VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3606 				} else {
3607 					/*
3608 					 * The page was/is being used, so put back on active list.
3609 					 */
3610 					vm_page_activate(m);
3611 					counter_inc(&vm_statistics_reactivations);
3612 					inactive_burst_count = 0;
3613 				}
3614 #if DEVELOPMENT || DEBUG
3615 				if (page_from_bg_q == TRUE) {
3616 					if (m_object->internal) {
3617 						vm_pageout_rejected_bq_internal++;
3618 					} else {
3619 						vm_pageout_rejected_bq_external++;
3620 					}
3621 				}
3622 #endif /* DEVELOPMENT || DEBUG */
3623 
3624 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3625 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3626 				}
3627 				vm_pageout_state.vm_pageout_inactive_used++;
3628 
3629 				lock_yield_check = TRUE;
3630 				continue;
3631 			}
3632 			/*
3633 			 * Make sure we call pmap_get_refmod() if it
3634 			 * wasn't already called just above, to update
3635 			 * the dirty bit.
3636 			 */
3637 			if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3638 				refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3639 				if (refmod_state & VM_MEM_MODIFIED) {
3640 					SET_PAGE_DIRTY(m, FALSE);
3641 				}
3642 			}
3643 		}
3644 
3645 		/*
3646 		 * we've got a candidate page to steal...
3647 		 *
3648 		 * m->vmp_dirty is up to date courtesy of the
3649 		 * preceding check for m->vmp_reference... if
3650 		 * we get here, then m->vmp_reference had to be
3651 		 * FALSE (or possibly "reactivate_limit" was
3652 		 * exceeded), but in either case we called
3653 		 * pmap_get_refmod() and updated both
3654 		 * m->vmp_reference and m->vmp_dirty
3655 		 *
3656 		 * if it's dirty or precious we need to
3657 		 * see if the target queue is throtttled
3658 		 * it if is, we need to skip over it by moving it back
3659 		 * to the end of the inactive queue
3660 		 */
3661 
3662 		inactive_throttled = FALSE;
3663 
3664 		if (m->vmp_dirty || m->vmp_precious) {
3665 			if (object->internal) {
3666 				if (VM_PAGE_Q_THROTTLED(iq)) {
3667 					inactive_throttled = TRUE;
3668 				}
3669 			} else if (VM_PAGE_Q_THROTTLED(eq)) {
3670 				inactive_throttled = TRUE;
3671 			}
3672 		}
3673 throttle_inactive:
3674 		if (!VM_DYNAMIC_PAGING_ENABLED() &&
3675 		    object->internal && m->vmp_dirty &&
3676 		    (object->purgable == VM_PURGABLE_DENY ||
3677 		    object->purgable == VM_PURGABLE_NONVOLATILE ||
3678 		    object->purgable == VM_PURGABLE_VOLATILE)) {
3679 			vm_page_check_pageable_safe(m);
3680 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3681 			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3682 			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3683 			vm_page_throttled_count++;
3684 
3685 			VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3686 
3687 			inactive_burst_count = 0;
3688 
3689 			lock_yield_check = TRUE;
3690 			continue;
3691 		}
3692 		if (inactive_throttled == TRUE) {
3693 			vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3694 			    &delayed_unlock, &force_anonymous, page_from_bg_q);
3695 
3696 			inactive_burst_count = 0;
3697 
3698 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3699 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3700 			}
3701 
3702 			lock_yield_check = TRUE;
3703 			continue;
3704 		}
3705 
3706 		/*
3707 		 * we've got a page that we can steal...
3708 		 * eliminate all mappings and make sure
3709 		 * we have the up-to-date modified state
3710 		 *
3711 		 * if we need to do a pmap_disconnect then we
3712 		 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3713 		 * provides the true state atomically... the
3714 		 * page was still mapped up to the pmap_disconnect
3715 		 * and may have been dirtied at the last microsecond
3716 		 *
3717 		 * Note that if 'pmapped' is FALSE then the page is not
3718 		 * and has not been in any map, so there is no point calling
3719 		 * pmap_disconnect().  m->vmp_dirty could have been set in anticipation
3720 		 * of likely usage of the page.
3721 		 */
3722 		if (m->vmp_pmapped == TRUE) {
3723 			int pmap_options;
3724 
3725 			/*
3726 			 * Don't count this page as going into the compressor
3727 			 * if any of these are true:
3728 			 * 1) compressed pager isn't enabled
3729 			 * 2) Freezer enabled device with compressed pager
3730 			 *    backend (exclusive use) i.e. most of the VM system
3731 			 *    (including vm_pageout_scan) has no knowledge of
3732 			 *    the compressor
3733 			 * 3) This page belongs to a file and hence will not be
3734 			 *    sent into the compressor
3735 			 */
3736 			if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3737 			    object->internal == FALSE) {
3738 				pmap_options = 0;
3739 			} else if (m->vmp_dirty || m->vmp_precious) {
3740 				/*
3741 				 * VM knows that this page is dirty (or
3742 				 * precious) and needs to be compressed
3743 				 * rather than freed.
3744 				 * Tell the pmap layer to count this page
3745 				 * as "compressed".
3746 				 */
3747 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
3748 			} else {
3749 				/*
3750 				 * VM does not know if the page needs to
3751 				 * be preserved but the pmap layer might tell
3752 				 * us if any mapping has "modified" it.
3753 				 * Let's the pmap layer to count this page
3754 				 * as compressed if and only if it has been
3755 				 * modified.
3756 				 */
3757 				pmap_options =
3758 				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3759 			}
3760 			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3761 			    pmap_options,
3762 			    NULL);
3763 			if (refmod_state & VM_MEM_MODIFIED) {
3764 				SET_PAGE_DIRTY(m, FALSE);
3765 			}
3766 		}
3767 
3768 		/*
3769 		 * reset our count of pages that have been reclaimed
3770 		 * since the last page was 'stolen'
3771 		 */
3772 		inactive_reclaim_run = 0;
3773 
3774 		/*
3775 		 *	If it's clean and not precious, we can free the page.
3776 		 */
3777 		if (!m->vmp_dirty && !m->vmp_precious) {
3778 			vm_pageout_state.vm_pageout_inactive_clean++;
3779 
3780 			/*
3781 			 * OK, at this point we have found a page we are going to free.
3782 			 */
3783 #if CONFIG_PHANTOM_CACHE
3784 			if (!object->internal) {
3785 				vm_phantom_cache_add_ghost(m);
3786 			}
3787 #endif
3788 			goto reclaim_page;
3789 		}
3790 
3791 		/*
3792 		 * The page may have been dirtied since the last check
3793 		 * for a throttled target queue (which may have been skipped
3794 		 * if the page was clean then).  With the dirty page
3795 		 * disconnected here, we can make one final check.
3796 		 */
3797 		if (object->internal) {
3798 			if (VM_PAGE_Q_THROTTLED(iq)) {
3799 				inactive_throttled = TRUE;
3800 			}
3801 		} else if (VM_PAGE_Q_THROTTLED(eq)) {
3802 			inactive_throttled = TRUE;
3803 		}
3804 
3805 		if (inactive_throttled == TRUE) {
3806 			goto throttle_inactive;
3807 		}
3808 
3809 #if VM_PRESSURE_EVENTS
3810 #if CONFIG_JETSAM
3811 
3812 		/*
3813 		 * If Jetsam is enabled, then the sending
3814 		 * of memory pressure notifications is handled
3815 		 * from the same thread that takes care of high-water
3816 		 * and other jetsams i.e. the memorystatus_thread.
3817 		 */
3818 
3819 #else /* CONFIG_JETSAM */
3820 
3821 		vm_pressure_response();
3822 
3823 #endif /* CONFIG_JETSAM */
3824 #endif /* VM_PRESSURE_EVENTS */
3825 
3826 		if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3827 			VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3828 		}
3829 
3830 		if (object->internal) {
3831 			vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3832 		} else {
3833 			vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3834 		}
3835 
3836 		/*
3837 		 * internal pages will go to the compressor...
3838 		 * external pages will go to the appropriate pager to be cleaned
3839 		 * and upon completion will end up on 'vm_page_queue_cleaned' which
3840 		 * is a preferred queue to steal from
3841 		 */
3842 		vm_pageout_cluster(m);
3843 		inactive_burst_count = 0;
3844 
3845 		/*
3846 		 * back to top of pageout scan loop
3847 		 */
3848 	}
3849 }
3850 
3851 
3852 void
vm_page_free_reserve(int pages)3853 vm_page_free_reserve(
3854 	int pages)
3855 {
3856 	int             free_after_reserve;
3857 
3858 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3859 		if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3860 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3861 		} else {
3862 			vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3863 		}
3864 	} else {
3865 		if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3866 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3867 		} else {
3868 			vm_page_free_reserved += pages;
3869 		}
3870 	}
3871 	free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3872 
3873 	vm_page_free_min = vm_page_free_reserved +
3874 	    VM_PAGE_FREE_MIN(free_after_reserve);
3875 
3876 	if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3877 		vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3878 	}
3879 
3880 	vm_page_free_target = vm_page_free_reserved +
3881 	    VM_PAGE_FREE_TARGET(free_after_reserve);
3882 
3883 	if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3884 		vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3885 	}
3886 
3887 	if (vm_page_free_target < vm_page_free_min + 5) {
3888 		vm_page_free_target = vm_page_free_min + 5;
3889 	}
3890 
3891 	vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3892 }
3893 
3894 /*
3895  *	vm_pageout is the high level pageout daemon.
3896  */
3897 
3898 void
vm_pageout_continue(void)3899 vm_pageout_continue(void)
3900 {
3901 	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3902 	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3903 
3904 	vm_free_page_lock();
3905 	vm_pageout_running = TRUE;
3906 	vm_free_page_unlock();
3907 
3908 	vm_pageout_scan();
3909 	/*
3910 	 * we hold both the vm_page_queue_free_lock
3911 	 * and the vm_page_queues_lock at this point
3912 	 */
3913 	assert(vm_page_free_wanted == 0);
3914 	assert(vm_page_free_wanted_privileged == 0);
3915 	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3916 
3917 	vm_pageout_running = FALSE;
3918 #if XNU_TARGET_OS_OSX
3919 	if (vm_pageout_waiter) {
3920 		vm_pageout_waiter = FALSE;
3921 		thread_wakeup((event_t)&vm_pageout_waiter);
3922 	}
3923 #endif /* XNU_TARGET_OS_OSX */
3924 
3925 	vm_free_page_unlock();
3926 	vm_page_unlock_queues();
3927 
3928 	thread_block((thread_continue_t)vm_pageout_continue);
3929 	/*NOTREACHED*/
3930 }
3931 
#if XNU_TARGET_OS_OSX
/*
 *	vm_pageout_wait:
 *
 *	Sleep until the pageout daemon is no longer running a scan
 *	(vm_pageout_running is FALSE) or until "deadline" expires.
 *
 *	Returns KERN_SUCCESS if the daemon was idle or became idle, and
 *	KERN_OPERATION_TIMED_OUT if a sleep ended with anything other than
 *	THREAD_AWAKENED.
 */
kern_return_t
vm_pageout_wait(uint64_t deadline)
{
	kern_return_t kr;

	vm_free_page_lock();
	for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
		/* ask vm_pageout_continue() to wake us when its scan finishes */
		vm_pageout_waiter = TRUE;
		if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
			    &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
			    (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
			kr = KERN_OPERATION_TIMED_OUT;
		}
	}
	vm_free_page_unlock();

	return kr;
}
#endif /* XNU_TARGET_OS_OSX */
3952 
3953 OS_NORETURN
3954 static void
vm_pageout_iothread_external_continue(struct pgo_iothread_state * ethr,__unused wait_result_t w)3955 vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
3956 {
3957 	vm_page_t       m = NULL;
3958 	vm_object_t     object;
3959 	vm_object_offset_t offset;
3960 	memory_object_t pager;
3961 	struct vm_pageout_queue *q = ethr->q;
3962 
3963 	/* On systems with a compressor, the external IO thread clears its
3964 	 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3965 	 * creation)
3966 	 */
3967 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3968 		current_thread()->options &= ~TH_OPT_VMPRIV;
3969 	}
3970 
3971 	sched_cond_ack(&(ethr->pgo_wakeup));
3972 
3973 	while (true) {
3974 		vm_page_lockspin_queues();
3975 
3976 		while (!vm_page_queue_empty(&q->pgo_pending)) {
3977 			q->pgo_busy = TRUE;
3978 			vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3979 
3980 			assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3981 			VM_PAGE_CHECK(m);
3982 			/*
3983 			 * grab a snapshot of the object and offset this
3984 			 * page is tabled in so that we can relookup this
3985 			 * page after we've taken the object lock - these
3986 			 * fields are stable while we hold the page queues lock
3987 			 * but as soon as we drop it, there is nothing to keep
3988 			 * this page in this object... we hold an activity_in_progress
3989 			 * on this object which will keep it from terminating
3990 			 */
3991 			object = VM_PAGE_OBJECT(m);
3992 			offset = m->vmp_offset;
3993 
3994 			m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3995 			VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3996 
3997 			vm_page_unlock_queues();
3998 
3999 			vm_object_lock(object);
4000 
4001 			m = vm_page_lookup(object, offset);
4002 
4003 			if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
4004 			    !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
4005 				/*
4006 				 * it's either the same page that someone else has
4007 				 * started cleaning (or it's finished cleaning or
4008 				 * been put back on the pageout queue), or
4009 				 * the page has been freed or we have found a
4010 				 * new page at this offset... in all of these cases
4011 				 * we merely need to release the activity_in_progress
4012 				 * we took when we put the page on the pageout queue
4013 				 */
4014 				vm_object_activity_end(object);
4015 				vm_object_unlock(object);
4016 
4017 				vm_page_lockspin_queues();
4018 				continue;
4019 			}
4020 			pager = object->pager;
4021 
4022 			if (pager == MEMORY_OBJECT_NULL) {
4023 				/*
4024 				 * This pager has been destroyed by either
4025 				 * memory_object_destroy or vm_object_destroy, and
4026 				 * so there is nowhere for the page to go.
4027 				 */
4028 				if (m->vmp_free_when_done) {
4029 					/*
4030 					 * Just free the page... VM_PAGE_FREE takes
4031 					 * care of cleaning up all the state...
4032 					 * including doing the vm_pageout_throttle_up
4033 					 */
4034 					VM_PAGE_FREE(m);
4035 				} else {
4036 					vm_page_lockspin_queues();
4037 
4038 					vm_pageout_throttle_up(m);
4039 					vm_page_activate(m);
4040 
4041 					vm_page_unlock_queues();
4042 
4043 					/*
4044 					 *	And we are done with it.
4045 					 */
4046 				}
4047 				vm_object_activity_end(object);
4048 				vm_object_unlock(object);
4049 
4050 				vm_page_lockspin_queues();
4051 				continue;
4052 			}
4053 	#if 0
4054 			/*
4055 			 * we don't hold the page queue lock
4056 			 * so this check isn't safe to make
4057 			 */
4058 			VM_PAGE_CHECK(m);
4059 	#endif
4060 			/*
4061 			 * give back the activity_in_progress reference we
4062 			 * took when we queued up this page and replace it
4063 			 * it with a paging_in_progress reference that will
4064 			 * also hold the paging offset from changing and
4065 			 * prevent the object from terminating
4066 			 */
4067 			vm_object_activity_end(object);
4068 			vm_object_paging_begin(object);
4069 			vm_object_unlock(object);
4070 
4071 			/*
4072 			 * Send the data to the pager.
4073 			 * any pageout clustering happens there
4074 			 */
4075 			memory_object_data_return(pager,
4076 			    m->vmp_offset + object->paging_offset,
4077 			    PAGE_SIZE,
4078 			    NULL,
4079 			    NULL,
4080 			    FALSE,
4081 			    FALSE,
4082 			    0);
4083 
4084 			vm_object_lock(object);
4085 			vm_object_paging_end(object);
4086 			vm_object_unlock(object);
4087 
4088 			vm_pageout_io_throttle();
4089 
4090 			vm_page_lockspin_queues();
4091 		}
4092 		q->pgo_busy = FALSE;
4093 
4094 		vm_page_unlock_queues();
4095 		sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
4096 	}
4097 	/*NOTREACHED*/
4098 }
4099 
4100 
/*
 * Number of compressed pages the internal pageout thread accumulates on
 * its local free list before handing them to vm_page_free_list().
 */
#define         MAX_FREE_BATCH          32

/*
 * Set via sysctl to record time accrued by this thread.
 */
uint32_t vm_compressor_time_thread;
4105 
4106 
4107 OS_NORETURN
4108 static void
vm_pageout_iothread_internal_continue(struct pgo_iothread_state * cq,__unused wait_result_t w)4109 vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
4110 {
4111 	struct vm_pageout_queue *q;
4112 	vm_page_t       m = NULL;
4113 	boolean_t       pgo_draining;
4114 	vm_page_t   local_q;
4115 	int         local_cnt;
4116 	vm_page_t   local_freeq = NULL;
4117 	int         local_freed = 0;
4118 	int         local_batch_size;
4119 #if DEVELOPMENT || DEBUG
4120 	int       ncomps = 0;
4121 	boolean_t marked_active = FALSE;
4122 	int       num_pages_processed = 0;
4123 #endif
4124 	void *chead = NULL;
4125 
4126 	KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
4127 
4128 	sched_cond_ack(&(cq->pgo_wakeup));
4129 
4130 	q = cq->q;
4131 
4132 	while (true) {
4133 #if DEVELOPMENT || DEBUG
4134 		bool benchmark_accounting = false;
4135 		/*
4136 		 * If we're running the compressor perf test, only process the benchmark pages.
4137 		 * We'll get back to our regular queue once the benchmark is done
4138 		 */
4139 		if (compressor_running_perf_test) {
4140 			q = cq->benchmark_q;
4141 			if (!vm_page_queue_empty(&q->pgo_pending)) {
4142 				benchmark_accounting = true;
4143 			} else {
4144 				q = cq->q;
4145 				benchmark_accounting = false;
4146 			}
4147 		}
4148 #endif /* DEVELOPMENT || DEBUG */
4149 
4150 #if __AMP__
4151 		if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
4152 			local_batch_size = (q->pgo_maxlaundry >> 3);
4153 			local_batch_size = MAX(local_batch_size, 16);
4154 		} else {
4155 			local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4156 		}
4157 #else
4158 		local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4159 #endif
4160 
4161 #if RECORD_THE_COMPRESSED_DATA
4162 		if (q->pgo_laundry) {
4163 			c_compressed_record_init();
4164 		}
4165 #endif
4166 		while (true) {
4167 			int     pages_left_on_q = 0;
4168 
4169 			local_cnt = 0;
4170 			local_q = NULL;
4171 
4172 			KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
4173 
4174 			vm_page_lock_queues();
4175 #if DEVELOPMENT || DEBUG
4176 			if (marked_active == FALSE) {
4177 				vmct_active++;
4178 				vmct_state[cq->id] = VMCT_ACTIVE;
4179 				marked_active = TRUE;
4180 				if (vmct_active == 1) {
4181 					vm_compressor_epoch_start = mach_absolute_time();
4182 				}
4183 			}
4184 #endif
4185 			KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4186 
4187 			KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
4188 
4189 			while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4190 				vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4191 				assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4192 				VM_PAGE_CHECK(m);
4193 
4194 				m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4195 				VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4196 				m->vmp_laundry = FALSE;
4197 
4198 				m->vmp_snext = local_q;
4199 				local_q = m;
4200 				local_cnt++;
4201 			}
4202 			if (local_q == NULL) {
4203 				break;
4204 			}
4205 
4206 			q->pgo_busy = TRUE;
4207 
4208 			if ((pgo_draining = q->pgo_draining) == FALSE) {
4209 				vm_pageout_throttle_up_batch(q, local_cnt);
4210 				pages_left_on_q = q->pgo_laundry;
4211 			} else {
4212 				pages_left_on_q = q->pgo_laundry - local_cnt;
4213 			}
4214 
4215 			vm_page_unlock_queues();
4216 
4217 #if !RECORD_THE_COMPRESSED_DATA
4218 			if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4219 				// wake up the next compressor thread
4220 				sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
4221 				    pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4222 			}
4223 #endif
4224 			KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
4225 
4226 			while (local_q) {
4227 				KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
4228 
4229 				m = local_q;
4230 				local_q = m->vmp_snext;
4231 				m->vmp_snext = NULL;
4232 
4233 				/*
4234 				 * Technically we need the pageq locks to manipulate this field.
4235 				 * However, this page has been removed from all queues and is only
4236 				 * known to this compressor thread dealing with this local queue.
4237 				 *
4238 				 * TODO LIONEL: Add a second localq that is the early localq and
4239 				 * put special pages like this one on that queue in the block above
4240 				 * under the pageq lock to avoid this 'works but not clean' logic.
4241 				 */
4242 				void *donate_queue_head;
4243 #if XNU_TARGET_OS_OSX
4244 				donate_queue_head = &cq->current_early_swapout_chead;
4245 #else /* XNU_TARGET_OS_OSX */
4246 				donate_queue_head = &cq->current_late_swapout_chead;
4247 #endif /* XNU_TARGET_OS_OSX */
4248 				if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4249 					m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4250 					chead = donate_queue_head;
4251 				} else {
4252 					chead = &cq->current_regular_swapout_chead;
4253 				}
4254 
4255 				if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4256 #if DEVELOPMENT || DEBUG
4257 					ncomps++;
4258 #endif
4259 					KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);
4260 
4261 					m->vmp_snext = local_freeq;
4262 					local_freeq = m;
4263 					local_freed++;
4264 
4265 					if (local_freed >= MAX_FREE_BATCH) {
4266 						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4267 
4268 						vm_page_free_list(local_freeq, TRUE);
4269 
4270 						local_freeq = NULL;
4271 						local_freed = 0;
4272 					}
4273 				}
4274 #if DEVELOPMENT || DEBUG
4275 				num_pages_processed++;
4276 #endif /* DEVELOPMENT || DEBUG */
4277 #if !CONFIG_JETSAM
4278 				while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4279 					kern_return_t   wait_result;
4280 					int             need_wakeup = 0;
4281 
4282 					if (local_freeq) {
4283 						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4284 
4285 						vm_page_free_list(local_freeq, TRUE);
4286 						local_freeq = NULL;
4287 						local_freed = 0;
4288 
4289 						continue;
4290 					}
4291 					vm_free_page_lock_spin();
4292 
4293 					if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4294 						if (vm_page_free_wanted_privileged++ == 0) {
4295 							need_wakeup = 1;
4296 						}
4297 						wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4298 
4299 						vm_free_page_unlock();
4300 
4301 						if (need_wakeup) {
4302 							thread_wakeup((event_t)&vm_page_free_wanted);
4303 						}
4304 
4305 						if (wait_result == THREAD_WAITING) {
4306 							thread_block(THREAD_CONTINUE_NULL);
4307 						}
4308 					} else {
4309 						vm_free_page_unlock();
4310 					}
4311 				}
4312 #endif
4313 			}
4314 			if (local_freeq) {
4315 				OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4316 
4317 				vm_page_free_list(local_freeq, TRUE);
4318 				local_freeq = NULL;
4319 				local_freed = 0;
4320 			}
4321 			if (pgo_draining == TRUE) {
4322 				vm_page_lockspin_queues();
4323 				vm_pageout_throttle_up_batch(q, local_cnt);
4324 				vm_page_unlock_queues();
4325 			}
4326 		}
4327 		KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
4328 
4329 		/*
4330 		 * queue lock is held and our q is empty
4331 		 */
4332 		q->pgo_busy = FALSE;
4333 #if DEVELOPMENT || DEBUG
4334 		if (marked_active == TRUE) {
4335 			vmct_active--;
4336 			vmct_state[cq->id] = VMCT_IDLE;
4337 
4338 			if (vmct_active == 0) {
4339 				vm_compressor_epoch_stop = mach_absolute_time();
4340 				assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4341 				    "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4342 				    vm_compressor_epoch_start, vm_compressor_epoch_stop);
4343 				/* This interval includes intervals where one or more
4344 				 * compressor threads were pre-empted
4345 				 */
4346 				vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4347 			}
4348 		}
4349 		if (compressor_running_perf_test && benchmark_accounting) {
4350 			/*
4351 			 * We could turn ON compressor_running_perf_test while still processing
4352 			 * regular non-benchmark pages. We shouldn't count them here else we
4353 			 * could overshoot. We might also still be populating that benchmark Q
4354 			 * and be under pressure. So we will go back to the regular queues. And
4355 			 * benchmark accounting will be off for that case too.
4356 			 */
4357 			compressor_perf_test_pages_processed += num_pages_processed;
4358 			thread_wakeup(&compressor_perf_test_pages_processed);
4359 		}
4360 #endif
4361 		vm_page_unlock_queues();
4362 #if DEVELOPMENT || DEBUG
4363 		if (__improbable(vm_compressor_time_thread)) {
4364 			vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
4365 			vmct_stats.vmct_pages[cq->id] += ncomps;
4366 			vmct_stats.vmct_iterations[cq->id]++;
4367 			if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
4368 				vmct_stats.vmct_maxpages[cq->id] = ncomps;
4369 			}
4370 			if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
4371 				vmct_stats.vmct_minpages[cq->id] = ncomps;
4372 			}
4373 		}
4374 #endif
4375 
4376 		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4377 #if DEVELOPMENT || DEBUG
4378 		if (compressor_running_perf_test && benchmark_accounting) {
4379 			/*
4380 			 * We've been exclusively compressing pages from the benchmark queue,
4381 			 * do 1 pass over the internal queue before blocking.
4382 			 */
4383 			continue;
4384 		}
4385 #endif
4386 
4387 		sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4388 	}
4389 	/*NOTREACHED*/
4390 }
4391 
4392 
4393 kern_return_t
vm_pageout_compress_page(void ** current_chead,char * scratch_buf,vm_page_t m)4394 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4395 {
4396 	vm_object_t     object;
4397 	memory_object_t pager;
4398 	int             compressed_count_delta;
4399 	kern_return_t   retval;
4400 
4401 	object = VM_PAGE_OBJECT(m);
4402 
4403 	assert(!m->vmp_free_when_done);
4404 	assert(!m->vmp_laundry);
4405 
4406 	pager = object->pager;
4407 
4408 	if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4409 		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
4410 
4411 		vm_object_lock(object);
4412 
4413 		/*
4414 		 * If there is no memory object for the page, create
4415 		 * one and hand it to the compression pager.
4416 		 */
4417 
4418 		if (!object->pager_initialized) {
4419 			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4420 		}
4421 		if (!object->pager_initialized) {
4422 			vm_object_compressor_pager_create(object);
4423 		}
4424 
4425 		pager = object->pager;
4426 
4427 		if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4428 			/*
4429 			 * Still no pager for the object,
4430 			 * or the pager has been destroyed.
4431 			 * Reactivate the page.
4432 			 *
4433 			 * Should only happen if there is no
4434 			 * compression pager
4435 			 */
4436 			PAGE_WAKEUP_DONE(m);
4437 
4438 			vm_page_lockspin_queues();
4439 			vm_page_activate(m);
4440 			VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4441 			vm_page_unlock_queues();
4442 
4443 			/*
4444 			 *	And we are done with it.
4445 			 */
4446 			vm_object_activity_end(object);
4447 			vm_object_unlock(object);
4448 
4449 			return KERN_FAILURE;
4450 		}
4451 		vm_object_unlock(object);
4452 
4453 		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
4454 	}
4455 	assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4456 	assert(object->activity_in_progress > 0);
4457 
4458 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4459 	if (m->vmp_unmodified_ro == true) {
4460 		os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed);
4461 	}
4462 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4463 
4464 	retval = vm_compressor_pager_put(
4465 		pager,
4466 		m->vmp_offset + object->paging_offset,
4467 		VM_PAGE_GET_PHYS_PAGE(m),
4468 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4469 		m->vmp_unmodified_ro,
4470 #else /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4471 		false,
4472 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4473 		current_chead,
4474 		scratch_buf,
4475 		&compressed_count_delta);
4476 
4477 	vm_object_lock(object);
4478 
4479 	assert(object->activity_in_progress > 0);
4480 	assert(VM_PAGE_OBJECT(m) == object);
4481 	assert( !VM_PAGE_WIRED(m));
4482 
4483 	vm_compressor_pager_count(pager,
4484 	    compressed_count_delta,
4485 	    FALSE,                       /* shared_lock */
4486 	    object);
4487 
4488 	if (retval == KERN_SUCCESS) {
4489 		/*
4490 		 * If the object is purgeable, its owner's
4491 		 * purgeable ledgers will be updated in
4492 		 * vm_page_remove() but the page still
4493 		 * contributes to the owner's memory footprint,
4494 		 * so account for it as such.
4495 		 */
4496 		if ((object->purgable != VM_PURGABLE_DENY ||
4497 		    object->vo_ledger_tag) &&
4498 		    object->vo_owner != NULL) {
4499 			/* one more compressed purgeable/tagged page */
4500 			vm_object_owner_compressed_update(object,
4501 			    compressed_count_delta);
4502 		}
4503 		counter_inc(&vm_statistics_compressions);
4504 
4505 		if (m->vmp_tabled) {
4506 			vm_page_remove(m, TRUE);
4507 		}
4508 	} else {
4509 		PAGE_WAKEUP_DONE(m);
4510 
4511 		vm_page_lockspin_queues();
4512 
4513 		vm_page_activate(m);
4514 		vm_pageout_vminfo.vm_compressor_failed++;
4515 
4516 		vm_page_unlock_queues();
4517 	}
4518 	vm_object_activity_end(object);
4519 	vm_object_unlock(object);
4520 
4521 	return retval;
4522 }
4523 
4524 
4525 static void
vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state * ethr,boolean_t req_lowpriority)4526 vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4527 {
4528 	uint32_t        policy;
4529 
4530 	if (hibernate_cleaning_in_progress == TRUE) {
4531 		req_lowpriority = FALSE;
4532 	}
4533 
4534 	if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4535 		vm_page_unlock_queues();
4536 
4537 		if (req_lowpriority == TRUE) {
4538 			policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4539 			DTRACE_VM(laundrythrottle);
4540 		} else {
4541 			policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4542 			DTRACE_VM(laundryunthrottle);
4543 		}
4544 		proc_set_thread_policy(ethr->pgo_iothread,
4545 		    TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4546 
4547 		vm_page_lock_queues();
4548 		ethr->q->pgo_lowpriority = req_lowpriority;
4549 	}
4550 }
4551 
4552 OS_NORETURN
4553 static void
vm_pageout_iothread_external(struct pgo_iothread_state * ethr,__unused wait_result_t w)4554 vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
4555 {
4556 	thread_t        self = current_thread();
4557 
4558 	self->options |= TH_OPT_VMPRIV;
4559 
4560 	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4561 
4562 	proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4563 	    TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4564 
4565 	vm_page_lock_queues();
4566 
4567 	vm_pageout_queue_external.pgo_lowpriority = TRUE;
4568 	vm_pageout_queue_external.pgo_inited = TRUE;
4569 
4570 	vm_page_unlock_queues();
4571 
4572 #if CONFIG_THREAD_GROUPS
4573 	thread_group_vm_add();
4574 #endif /* CONFIG_THREAD_GROUPS */
4575 
4576 	vm_pageout_iothread_external_continue(ethr, 0);
4577 	/*NOTREACHED*/
4578 }
4579 
4580 
4581 OS_NORETURN
4582 static void
vm_pageout_iothread_internal(struct pgo_iothread_state * cthr,__unused wait_result_t w)4583 vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
4584 {
4585 	thread_t        self = current_thread();
4586 
4587 	self->options |= TH_OPT_VMPRIV;
4588 
4589 	vm_page_lock_queues();
4590 
4591 	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4592 	vm_pageout_queue_internal.pgo_inited = TRUE;
4593 
4594 #if DEVELOPMENT || DEBUG
4595 	vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
4596 	vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
4597 	vm_pageout_queue_benchmark.pgo_busy = FALSE;
4598 #endif /* DEVELOPMENT || DEBUG */
4599 
4600 	vm_page_unlock_queues();
4601 
4602 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4603 		thread_vm_bind_group_add();
4604 	}
4605 
4606 #if CONFIG_THREAD_GROUPS
4607 	thread_group_vm_add();
4608 #endif /* CONFIG_THREAD_GROUPS */
4609 
4610 #if __AMP__
4611 	if (vm_compressor_ebound) {
4612 		/*
4613 		 * Use the soft bound option for vm_compressor to allow it to run on
4614 		 * P-cores if E-cluster is unavailable.
4615 		 */
4616 		thread_bind_cluster_type(self, 'E', true);
4617 	}
4618 #endif /* __AMP__ */
4619 
4620 	thread_set_thread_name(current_thread(), "VM_compressor");
4621 #if DEVELOPMENT || DEBUG
4622 	vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
4623 #endif
4624 	vm_pageout_iothread_internal_continue(cthr, 0);
4625 
4626 	/*NOTREACHED*/
4627 }
4628 
4629 kern_return_t
vm_set_buffer_cleanup_callout(boolean_t (* func)(int))4630 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4631 {
4632 	if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4633 		return KERN_SUCCESS;
4634 	} else {
4635 		return KERN_FAILURE; /* Already set */
4636 	}
4637 }
4638 
4639 extern boolean_t        memorystatus_manual_testing_on;
4640 extern unsigned int     memorystatus_level;
4641 
4642 
4643 #if VM_PRESSURE_EVENTS
4644 
4645 boolean_t vm_pressure_events_enabled = FALSE;
4646 
4647 extern uint64_t next_warning_notification_sent_at_ts;
4648 extern uint64_t next_critical_notification_sent_at_ts;
4649 
4650 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS    (30)    /* 30 minutes. */
4651 
4652 /*
4653  * The last time there was change in pressure level OR we forced a check
4654  * because the system is stuck in a non-normal pressure level.
4655  */
4656 uint64_t  vm_pressure_last_level_transition_abs = 0;
4657 
/*
 * This is how long (in minutes) the system waits 'stuck' in an unchanged
 * non-normal pressure level before resending notifications for that level.
 */
4662 int  vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4663 
4664 void
vm_pressure_response(void)4665 vm_pressure_response(void)
4666 {
4667 	vm_pressure_level_t     old_level = kVMPressureNormal;
4668 	int                     new_level = -1;
4669 	unsigned int            total_pages;
4670 	uint64_t                available_memory = 0;
4671 	uint64_t                curr_ts, abs_time_since_level_transition, time_in_ns;
4672 	bool                    force_check = false;
4673 	int                     time_in_mins;
4674 
4675 
4676 	if (vm_pressure_events_enabled == FALSE) {
4677 		return;
4678 	}
4679 
4680 #if !XNU_TARGET_OS_OSX
4681 
4682 	available_memory = (uint64_t) memorystatus_available_pages;
4683 
4684 #else /* !XNU_TARGET_OS_OSX */
4685 
4686 	available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4687 	memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4688 
4689 #endif /* !XNU_TARGET_OS_OSX */
4690 
4691 	total_pages = (unsigned int) atop_64(max_mem);
4692 #if CONFIG_SECLUDED_MEMORY
4693 	total_pages -= vm_page_secluded_count;
4694 #endif /* CONFIG_SECLUDED_MEMORY */
4695 	memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4696 
4697 	if (memorystatus_manual_testing_on) {
4698 		return;
4699 	}
4700 
4701 	curr_ts = mach_absolute_time();
4702 	abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4703 
4704 	absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4705 	time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4706 	force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
4707 
4708 	old_level = memorystatus_vm_pressure_level;
4709 
4710 	switch (memorystatus_vm_pressure_level) {
4711 	case kVMPressureNormal:
4712 	{
4713 		if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4714 			new_level = kVMPressureCritical;
4715 		} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4716 			new_level = kVMPressureWarning;
4717 		}
4718 		break;
4719 	}
4720 
4721 	case kVMPressureWarning:
4722 	case kVMPressureUrgent:
4723 	{
4724 		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4725 			new_level = kVMPressureNormal;
4726 		} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4727 			new_level = kVMPressureCritical;
4728 		} else if (force_check) {
4729 			new_level = kVMPressureWarning;
4730 			next_warning_notification_sent_at_ts = curr_ts;
4731 		}
4732 		break;
4733 	}
4734 
4735 	case kVMPressureCritical:
4736 	{
4737 		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4738 			new_level = kVMPressureNormal;
4739 		} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4740 			new_level = kVMPressureWarning;
4741 		} else if (force_check) {
4742 			new_level = kVMPressureCritical;
4743 			next_critical_notification_sent_at_ts = curr_ts;
4744 		}
4745 		break;
4746 	}
4747 
4748 	default:
4749 		return;
4750 	}
4751 
4752 	if (new_level != -1 || force_check) {
4753 		if (new_level != -1) {
4754 			memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4755 
4756 			if (new_level != (int) old_level) {
4757 				VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4758 				    new_level, old_level, 0, 0);
4759 			}
4760 		} else {
4761 			VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4762 			    new_level, old_level, force_check, 0);
4763 		}
4764 
4765 		if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
4766 			/*
4767 			 * We don't want to schedule a wakeup while hibernation is in progress
4768 			 * because that could collide with checks for non-monotonicity in the scheduler.
4769 			 * We do however do all the updates to memorystatus_vm_pressure_level because
4770 			 * we _might_ want to use that for decisions regarding which pages or how
4771 			 * many pages we want to dump in hibernation.
4772 			 */
4773 			return;
4774 		}
4775 
4776 		if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4777 			if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4778 				thread_wakeup(&vm_pressure_thread);
4779 			}
4780 
4781 			if (old_level != memorystatus_vm_pressure_level) {
4782 				thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4783 			}
4784 			vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4785 		}
4786 	}
4787 }
4788 #endif /* VM_PRESSURE_EVENTS */
4789 
4790 
4791 /**
4792  * Called by a kernel thread to ask if a number of pages may be wired.
4793  */
4794 kern_return_t
mach_vm_wire_level_monitor(int64_t requested_pages)4795 mach_vm_wire_level_monitor(int64_t requested_pages)
4796 {
4797 	if (requested_pages <= 0) {
4798 		return KERN_INVALID_ARGUMENT;
4799 	}
4800 
4801 	const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4802 	/**
4803 	 * Available pages can be negative in the case where more system memory is
4804 	 * wired than the threshold, so we must use a signed integer.
4805 	 */
4806 	const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4807 
4808 	if (requested_pages > available_pages) {
4809 		return KERN_RESOURCE_SHORTAGE;
4810 	}
4811 	return KERN_SUCCESS;
4812 }
4813 
4814 /*
4815  * Function called by a kernel thread to either get the current pressure level or
4816  * wait until memory pressure changes from a given level.
4817  */
4818 kern_return_t
mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure,__unused unsigned int * pressure_level)4819 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
4820 {
4821 #if !VM_PRESSURE_EVENTS
4822 
4823 	return KERN_FAILURE;
4824 
4825 #else /* VM_PRESSURE_EVENTS */
4826 
4827 	wait_result_t       wr = 0;
4828 	vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4829 
4830 	if (pressure_level == NULL) {
4831 		return KERN_INVALID_ARGUMENT;
4832 	}
4833 
4834 	if (*pressure_level == kVMPressureJetsam) {
4835 		if (!wait_for_pressure) {
4836 			return KERN_INVALID_ARGUMENT;
4837 		}
4838 
4839 		lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
4840 		wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
4841 		    THREAD_INTERRUPTIBLE);
4842 		if (wr == THREAD_WAITING) {
4843 			++memorystatus_jetsam_fg_band_waiters;
4844 			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4845 			wr = thread_block(THREAD_CONTINUE_NULL);
4846 		} else {
4847 			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4848 		}
4849 		if (wr != THREAD_AWAKENED) {
4850 			return KERN_ABORTED;
4851 		}
4852 		*pressure_level = kVMPressureJetsam;
4853 		return KERN_SUCCESS;
4854 	}
4855 
4856 	if (wait_for_pressure == TRUE) {
4857 		while (old_level == *pressure_level) {
4858 			wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4859 			    THREAD_INTERRUPTIBLE);
4860 			if (wr == THREAD_WAITING) {
4861 				wr = thread_block(THREAD_CONTINUE_NULL);
4862 			}
4863 			if (wr == THREAD_INTERRUPTED) {
4864 				return KERN_ABORTED;
4865 			}
4866 
4867 			if (wr == THREAD_AWAKENED) {
4868 				old_level = memorystatus_vm_pressure_level;
4869 			}
4870 		}
4871 	}
4872 
4873 	*pressure_level = old_level;
4874 	return KERN_SUCCESS;
4875 #endif /* VM_PRESSURE_EVENTS */
4876 }
4877 
#if VM_PRESSURE_EVENTS
/*
 * Body (and continuation) of the "VM_pressure" thread.
 *
 * The very first invocation only performs setup.  Every later invocation
 * (reached through the thread_block() continuation after a wakeup on
 * &vm_pressure_thread) runs consider_vm_pressure_events(), with
 * vm_pressure_thread_running set for the duration of that call.
 * Each invocation ends by waiting on &vm_pressure_thread again.
 */
void
vm_pressure_thread(void)
{
	static boolean_t setup_done = FALSE;

	if (setup_done == TRUE) {
		vm_pageout_state.vm_pressure_thread_running = TRUE;
		consider_vm_pressure_events();
		vm_pageout_state.vm_pressure_thread_running = FALSE;
	}

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	thread_set_thread_name(current_thread(), "VM_pressure");
	setup_done = TRUE;

	/* sleep until vm_pressure_response() (or another waker) signals us */
	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
	thread_block((thread_continue_t)vm_pressure_thread);
}
#endif /* VM_PRESSURE_EVENTS */
4900 
4901 
4902 /*
4903  * called once per-second via "compute_averages"
4904  */
4905 void
compute_pageout_gc_throttle(__unused void * arg)4906 compute_pageout_gc_throttle(__unused void *arg)
4907 {
4908 	if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4909 		vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4910 
4911 		thread_wakeup(VM_PAGEOUT_GC_EVENT);
4912 	}
4913 }
4914 
4915 /*
4916  * vm_pageout_garbage_collect can also be called when the zone allocator needs
4917  * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4918  * jetsams. We need to check if the zone map size is above its jetsam limit to
4919  * decide if this was indeed the case.
4920  *
4921  * We need to do this on a different thread because of the following reasons:
4922  *
4923  * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4924  * itself causing the system to hang. We perform synchronous jetsams if we're
4925  * leaking in the VM map entries zone, so the leaking process could be doing a
4926  * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4927  * jetsam itself. We also need the vm_map lock on the process termination path,
4928  * which would now lead the dying process to deadlock against itself.
4929  *
4930  * 2. The jetsam path might need to allocate zone memory itself. We could try
4931  * using the non-blocking variant of zalloc for this path, but we can still
4932  * end up trying to do a kmem_alloc when the zone maps are almost full.
4933  */
4934 __dead2
4935 void
vm_pageout_garbage_collect(void * step,wait_result_t wr __unused)4936 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4937 {
4938 	assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4939 
4940 	if (step == VM_PAGEOUT_GC_INIT) {
4941 		/* first time being called is not about GC */
4942 #if CONFIG_THREAD_GROUPS
4943 		thread_group_vm_add();
4944 #endif /* CONFIG_THREAD_GROUPS */
4945 	} else if (zone_map_nearing_exhaustion()) {
4946 		/*
4947 		 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4948 		 *
4949 		 * Bail out after calling zone_gc (which triggers the
4950 		 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4951 		 * operations that clear out a bunch of caches might allocate zone
4952 		 * memory themselves (for eg. vm_map operations would need VM map
4953 		 * entries). Since the zone map is almost full at this point, we
4954 		 * could end up with a panic. We just need to quickly jetsam a
4955 		 * process and exit here.
4956 		 *
4957 		 * It could so happen that we were woken up to relieve memory
4958 		 * pressure and the zone map also happened to be near its limit at
4959 		 * the time, in which case we'll skip out early. But that should be
4960 		 * ok; if memory pressure persists, the thread will simply be woken
4961 		 * up again.
4962 		 */
4963 		zone_gc(ZONE_GC_JETSAM);
4964 	} else {
4965 		/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4966 		boolean_t buf_large_zfree = FALSE;
4967 		boolean_t first_try = TRUE;
4968 
4969 		stack_collect();
4970 
4971 		consider_machine_collect();
4972 #if CONFIG_MBUF_MCACHE
4973 		mbuf_drain(FALSE);
4974 #endif /* CONFIG_MBUF_MCACHE */
4975 
4976 		do {
4977 			if (consider_buffer_cache_collect != NULL) {
4978 				buf_large_zfree = (*consider_buffer_cache_collect)(0);
4979 			}
4980 			if (first_try == TRUE || buf_large_zfree == TRUE) {
4981 				/*
4982 				 * zone_gc should be last, because the other operations
4983 				 * might return memory to zones.
4984 				 */
4985 				zone_gc(ZONE_GC_TRIM);
4986 			}
4987 			first_try = FALSE;
4988 		} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
4989 
4990 		consider_machine_adjust();
4991 	}
4992 
4993 	assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);
4994 
4995 	thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
4996 	__builtin_unreachable();
4997 }
4998 
4999 
5000 #if VM_PAGE_BUCKETS_CHECK
5001 #if VM_PAGE_FAKE_BUCKETS
5002 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
5003 #endif /* VM_PAGE_FAKE_BUCKETS */
5004 #endif /* VM_PAGE_BUCKETS_CHECK */
5005 
5006 
5007 
5008 void
vm_set_restrictions(unsigned int num_cpus)5009 vm_set_restrictions(unsigned int num_cpus)
5010 {
5011 	int vm_restricted_to_single_processor = 0;
5012 
5013 	if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
5014 		kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
5015 		vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
5016 	} else {
5017 		assert(num_cpus > 0);
5018 
5019 		if (num_cpus <= 3) {
5020 			/*
5021 			 * on systems with a limited number of CPUS, bind the
5022 			 * 4 major threads that can free memory and that tend to use
5023 			 * a fair bit of CPU under pressured conditions to a single processor.
5024 			 * This insures that these threads don't hog all of the available CPUs
5025 			 * (important for camera launch), while allowing them to run independently
5026 			 * w/r to locks... the 4 threads are
5027 			 * vm_pageout_scan,  vm_pageout_iothread_internal (compressor),
5028 			 * vm_compressor_swap_trigger_thread (minor and major compactions),
5029 			 * memorystatus_thread (jetsams).
5030 			 *
5031 			 * the first time the thread is run, it is responsible for checking the
5032 			 * state of vm_restricted_to_single_processor, and if TRUE it calls
5033 			 * thread_bind_master...  someday this should be replaced with a group
5034 			 * scheduling mechanism and KPI.
5035 			 */
5036 			vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5037 		} else {
5038 			vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5039 		}
5040 	}
5041 }
5042 
5043 /*
5044  * Set up vm_config based on the vm_compressor_mode.
5045  * Must run BEFORE the pageout thread starts up.
5046  */
5047 __startup_func
5048 void
vm_config_init(void)5049 vm_config_init(void)
5050 {
5051 	bzero(&vm_config, sizeof(vm_config));
5052 
5053 	switch (vm_compressor_mode) {
5054 	case VM_PAGER_DEFAULT:
5055 		printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
5056 		OS_FALLTHROUGH;
5057 
5058 	case VM_PAGER_COMPRESSOR_WITH_SWAP:
5059 		vm_config.compressor_is_present = TRUE;
5060 		vm_config.swap_is_present = TRUE;
5061 		vm_config.compressor_is_active = TRUE;
5062 		vm_config.swap_is_active = TRUE;
5063 		break;
5064 
5065 	case VM_PAGER_COMPRESSOR_NO_SWAP:
5066 		vm_config.compressor_is_present = TRUE;
5067 		vm_config.swap_is_present = TRUE;
5068 		vm_config.compressor_is_active = TRUE;
5069 		break;
5070 
5071 	case VM_PAGER_FREEZER_DEFAULT:
5072 		printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5073 		OS_FALLTHROUGH;
5074 
5075 	case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5076 		vm_config.compressor_is_present = TRUE;
5077 		vm_config.swap_is_present = TRUE;
5078 		break;
5079 
5080 	case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5081 		vm_config.compressor_is_present = TRUE;
5082 		vm_config.swap_is_present = TRUE;
5083 		vm_config.compressor_is_active = TRUE;
5084 		vm_config.freezer_swap_is_active = TRUE;
5085 		break;
5086 
5087 	case VM_PAGER_NOT_CONFIGURED:
5088 		break;
5089 
5090 	default:
5091 		printf("unknown compressor mode - %x\n", vm_compressor_mode);
5092 		break;
5093 	}
5094 }
5095 
5096 __startup_func
5097 static void
vm_pageout_create_gc_thread(void)5098 vm_pageout_create_gc_thread(void)
5099 {
5100 	thread_t thread;
5101 
5102 	if (kernel_thread_create(vm_pageout_garbage_collect,
5103 	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
5104 		panic("vm_pageout_garbage_collect: create failed");
5105 	}
5106 	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
5107 	if (thread->reserved_stack == 0) {
5108 		assert(thread->kernel_stack);
5109 		thread->reserved_stack = thread->kernel_stack;
5110 	}
5111 
5112 	/* thread is started in vm_pageout() */
5113 	vm_pageout_gc_thread = thread;
5114 }
5115 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5116 
5117 void
vm_pageout(void)5118 vm_pageout(void)
5119 {
5120 	thread_t        self = current_thread();
5121 	thread_t        thread;
5122 	kern_return_t   result;
5123 	spl_t           s;
5124 
5125 	/*
5126 	 * Set thread privileges.
5127 	 */
5128 	s = splsched();
5129 
5130 #if CONFIG_VPS_DYNAMIC_PRIO
5131 	if (vps_dynamic_priority_enabled) {
5132 		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
5133 		thread_set_eager_preempt(self);
5134 	} else {
5135 		sched_set_kernel_thread_priority(self, BASEPRI_VM);
5136 	}
5137 #else /* CONFIG_VPS_DYNAMIC_PRIO */
5138 	sched_set_kernel_thread_priority(self, BASEPRI_VM);
5139 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
5140 
5141 	thread_lock(self);
5142 	self->options |= TH_OPT_VMPRIV;
5143 	thread_unlock(self);
5144 
5145 	if (!self->reserved_stack) {
5146 		self->reserved_stack = self->kernel_stack;
5147 	}
5148 
5149 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
5150 	    !vps_dynamic_priority_enabled) {
5151 		thread_vm_bind_group_add();
5152 	}
5153 
5154 
5155 #if CONFIG_THREAD_GROUPS
5156 	thread_group_vm_add();
5157 #endif /* CONFIG_THREAD_GROUPS */
5158 
5159 #if __AMP__
5160 	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
5161 	if (vm_pgo_pbound) {
5162 		/*
5163 		 * Use the soft bound option for vm pageout to allow it to run on
5164 		 * E-cores if P-cluster is unavailable.
5165 		 */
5166 		thread_bind_cluster_type(self, 'P', true);
5167 	}
5168 #endif /* __AMP__ */
5169 
5170 	PE_parse_boot_argn("vmpgo_protect_realtime",
5171 	    &vm_pageout_protect_realtime,
5172 	    sizeof(vm_pageout_protect_realtime));
5173 	splx(s);
5174 
5175 	thread_set_thread_name(current_thread(), "VM_pageout_scan");
5176 
5177 	/*
5178 	 *	Initialize some paging parameters.
5179 	 */
5180 
5181 	vm_pageout_state.vm_pressure_thread_running = FALSE;
5182 	vm_pageout_state.vm_pressure_changed = FALSE;
5183 	vm_pageout_state.memorystatus_purge_on_warning = 2;
5184 	vm_pageout_state.memorystatus_purge_on_urgent = 5;
5185 	vm_pageout_state.memorystatus_purge_on_critical = 8;
5186 	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
5187 	vm_pageout_state.vm_page_speculative_percentage = 5;
5188 	vm_pageout_state.vm_page_speculative_target = 0;
5189 
5190 	vm_pageout_state.vm_pageout_swap_wait = 0;
5191 	vm_pageout_state.vm_pageout_idle_wait = 0;
5192 	vm_pageout_state.vm_pageout_empty_wait = 0;
5193 	vm_pageout_state.vm_pageout_burst_wait = 0;
5194 	vm_pageout_state.vm_pageout_deadlock_wait = 0;
5195 	vm_pageout_state.vm_pageout_deadlock_relief = 0;
5196 	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
5197 
5198 	vm_pageout_state.vm_pageout_inactive = 0;
5199 	vm_pageout_state.vm_pageout_inactive_used = 0;
5200 	vm_pageout_state.vm_pageout_inactive_clean = 0;
5201 
5202 	vm_pageout_state.vm_memory_pressure = 0;
5203 	vm_pageout_state.vm_page_filecache_min = 0;
5204 #if CONFIG_JETSAM
5205 	vm_pageout_state.vm_page_filecache_min_divisor = 70;
5206 	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
5207 #else
5208 	vm_pageout_state.vm_page_filecache_min_divisor = 27;
5209 	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
5210 #endif
5211 	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
5212 
5213 	vm_pageout_state.vm_pageout_considered_page_last = 0;
5214 
5215 	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
5216 		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
5217 	}
5218 
5219 	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
5220 		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
5221 	}
5222 
5223 	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
5224 		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
5225 	}
5226 
5227 	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
5228 		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
5229 	}
5230 
5231 	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
5232 		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
5233 	}
5234 
5235 	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
5236 		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
5237 	}
5238 
5239 	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
5240 		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
5241 	}
5242 	/*
5243 	 * even if we've already called vm_page_free_reserve
5244 	 * call it again here to insure that the targets are
5245 	 * accurately calculated (it uses vm_page_free_count_init)
5246 	 * calling it with an arg of 0 will not change the reserve
5247 	 * but will re-calculate free_min and free_target
5248 	 */
5249 	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
5250 		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
5251 	} else {
5252 		vm_page_free_reserve(0);
5253 	}
5254 
5255 	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
5256 	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5257 
5258 	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5259 	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5260 
5261 	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5262 
5263 #if DEVELOPMENT || DEBUG
5264 	bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
5265 	vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
5266 #endif /* DEVELOPMENT || DEBUG */
5267 
5268 
5269 	/* internal pageout thread started when default pager registered first time */
5270 	/* external pageout and garbage collection threads started here */
5271 	struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
5272 	ethr->id = 0;
5273 	ethr->q = &vm_pageout_queue_external;
5274 	ethr->current_early_swapout_chead = NULL;
5275 	ethr->current_regular_swapout_chead = NULL;
5276 	ethr->current_late_swapout_chead = NULL;
5277 	ethr->scratch_buf = NULL;
5278 #if DEVELOPMENT || DEBUG
5279 	ethr->benchmark_q = NULL;
5280 #endif /* DEVELOPMENT || DEBUG */
5281 	sched_cond_init(&(ethr->pgo_wakeup));
5282 
5283 	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
5284 	    (void *)ethr, BASEPRI_VM,
5285 	    &(ethr->pgo_iothread));
5286 	if (result != KERN_SUCCESS) {
5287 		panic("vm_pageout: Unable to create external thread (%d)\n", result);
5288 	}
5289 	thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");
5290 
5291 	thread_mtx_lock(vm_pageout_gc_thread );
5292 	thread_start(vm_pageout_gc_thread );
5293 	thread_mtx_unlock(vm_pageout_gc_thread);
5294 
5295 #if VM_PRESSURE_EVENTS
5296 	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5297 	    BASEPRI_DEFAULT,
5298 	    &thread);
5299 
5300 	if (result != KERN_SUCCESS) {
5301 		panic("vm_pressure_thread: create failed");
5302 	}
5303 
5304 	thread_deallocate(thread);
5305 #endif
5306 
5307 	vm_object_reaper_init();
5308 
5309 
5310 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5311 		vm_compressor_init();
5312 	}
5313 
5314 #if VM_PRESSURE_EVENTS
5315 	vm_pressure_events_enabled = TRUE;
5316 #endif /* VM_PRESSURE_EVENTS */
5317 
5318 #if CONFIG_PHANTOM_CACHE
5319 	vm_phantom_cache_init();
5320 #endif
5321 #if VM_PAGE_BUCKETS_CHECK
5322 #if VM_PAGE_FAKE_BUCKETS
5323 	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5324 	    (uint64_t) vm_page_fake_buckets_start,
5325 	    (uint64_t) vm_page_fake_buckets_end);
5326 	pmap_protect(kernel_pmap,
5327 	    vm_page_fake_buckets_start,
5328 	    vm_page_fake_buckets_end,
5329 	    VM_PROT_READ);
5330 //	*(char *) vm_page_fake_buckets_start = 'x';	/* panic! */
5331 #endif /* VM_PAGE_FAKE_BUCKETS */
5332 #endif /* VM_PAGE_BUCKETS_CHECK */
5333 
5334 #if VM_OBJECT_TRACKING
5335 	vm_object_tracking_init();
5336 #endif /* VM_OBJECT_TRACKING */
5337 
5338 #if __arm64__
5339 //	vm_tests();
5340 #endif /* __arm64__ */
5341 
5342 	vm_pageout_continue();
5343 
5344 	/*
5345 	 * Unreached code!
5346 	 *
5347 	 * The vm_pageout_continue() call above never returns, so the code below is never
5348 	 * executed.  We take advantage of this to declare several DTrace VM related probe
5349 	 * points that our kernel doesn't have an analog for.  These are probe points that
5350 	 * exist in Solaris and are in the DTrace documentation, so people may have written
5351 	 * scripts that use them.  Declaring the probe points here means their scripts will
5352 	 * compile and execute which we want for portability of the scripts, but since this
5353 	 * section of code is never reached, the probe points will simply never fire.  Yes,
5354 	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
5355 	 * Solaris specific VM events in mind, not portability to different VM implementations.
5356 	 */
5357 
5358 	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5359 	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5360 	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5361 	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5362 	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5363 	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5364 	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5365 	/*NOTREACHED*/
5366 }
5367 
5368 
5369 
5370 kern_return_t
vm_pageout_internal_start(void)5371 vm_pageout_internal_start(void)
5372 {
5373 	kern_return_t   result = KERN_SUCCESS;
5374 	host_basic_info_data_t hinfo;
5375 	vm_offset_t     buf, bufsize;
5376 
5377 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5378 
5379 	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5380 #define BSD_HOST 1
5381 	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5382 
5383 	assert(hinfo.max_cpus > 0);
5384 
5385 #if !XNU_TARGET_OS_OSX
5386 	vm_pageout_state.vm_compressor_thread_count = 1;
5387 #else /* !XNU_TARGET_OS_OSX */
5388 	if (hinfo.max_cpus > 4) {
5389 		vm_pageout_state.vm_compressor_thread_count = 2;
5390 	} else {
5391 		vm_pageout_state.vm_compressor_thread_count = 1;
5392 	}
5393 #endif /* !XNU_TARGET_OS_OSX */
5394 #if     __AMP__
5395 	if (vm_compressor_ebound) {
5396 		vm_pageout_state.vm_compressor_thread_count = 2;
5397 	}
5398 #endif
5399 	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5400 	    sizeof(vm_pageout_state.vm_compressor_thread_count));
5401 
5402 	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5403 		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5404 	}
5405 	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5406 		vm_pageout_state.vm_compressor_thread_count = 1;
5407 	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5408 		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5409 	}
5410 
5411 	vm_pageout_queue_internal.pgo_maxlaundry =
5412 	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5413 
5414 	PE_parse_boot_argn("vmpgoi_maxlaundry",
5415 	    &vm_pageout_queue_internal.pgo_maxlaundry,
5416 	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5417 
5418 #if DEVELOPMENT || DEBUG
5419 	// Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
5420 	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
5421 #endif /* DEVELOPMENT || DEBUG */
5422 
5423 	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5424 
5425 	kmem_alloc(kernel_map, &buf,
5426 	    bufsize * vm_pageout_state.vm_compressor_thread_count,
5427 	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
5428 	    VM_KERN_MEMORY_COMPRESSOR);
5429 
5430 	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5431 		struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
5432 		iq->id = i;
5433 		iq->q = &vm_pageout_queue_internal;
5434 		iq->current_early_swapout_chead = NULL;
5435 		iq->current_regular_swapout_chead = NULL;
5436 		iq->current_late_swapout_chead = NULL;
5437 		iq->scratch_buf = (char *)(buf + i * bufsize);
5438 #if DEVELOPMENT || DEBUG
5439 		iq->benchmark_q = &vm_pageout_queue_benchmark;
5440 #endif /* DEVELOPMENT || DEBUG */
5441 		sched_cond_init(&(iq->pgo_wakeup));
5442 		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5443 		    (void *)iq, BASEPRI_VM,
5444 		    &(iq->pgo_iothread));
5445 
5446 		if (result != KERN_SUCCESS) {
5447 			panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
5448 		}
5449 	}
5450 	return result;
5451 }
5452 
5453 #if CONFIG_IOSCHED
/*
 * To support I/O Expedite for compressed files we mark the upls with special flags.
 * The way decmpfs works is that we create a big upl which marks all the pages needed to
 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
 * by the req upl lock (the reverse link doesn't need synchronization since we never inspect
 * this link unless the real I/O upl is being destroyed).
 */
5465 
5466 
5467 static void
upl_set_decmp_info(upl_t upl,upl_t src_upl)5468 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5469 {
5470 	assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5471 
5472 	upl_lock(src_upl);
5473 	if (src_upl->decmp_io_upl) {
5474 		/*
5475 		 * If there is already an alive real I/O UPL, ignore this new UPL.
5476 		 * This case should rarely happen and even if it does, it just means
5477 		 * that we might issue a spurious expedite which the driver is expected
5478 		 * to handle.
5479 		 */
5480 		upl_unlock(src_upl);
5481 		return;
5482 	}
5483 	src_upl->decmp_io_upl = (void *)upl;
5484 	src_upl->ref_count++;
5485 
5486 	upl->flags |= UPL_DECMP_REAL_IO;
5487 	upl->decmp_io_upl = (void *)src_upl;
5488 	upl_unlock(src_upl);
5489 }
5490 #endif /* CONFIG_IOSCHED */
5491 
/*
 * Non-zero when UPL debugging is compiled in (UPL_DEBUG); the UPL code
 * consults this flag when deciding whether to track UPLs on their
 * VM object.
 */
int     upl_debug_enabled =
#if UPL_DEBUG
    1;
#else
    0;
#endif
5497 
5498 static upl_t
upl_create(int type,int flags,upl_size_t size)5499 upl_create(int type, int flags, upl_size_t size)
5500 {
5501 	uint32_t pages = (uint32_t)atop(round_page_32(size));
5502 	upl_t    upl;
5503 
5504 	assert(page_aligned(size));
5505 
5506 	/*
5507 	 * FIXME: this code assumes the allocation always succeeds,
5508 	 *        however `pages` can be up to MAX_UPL_SIZE.
5509 	 *
5510 	 *        The allocation size is above 32k (resp. 128k)
5511 	 *        on 16k pages (resp. 4k), which kalloc might fail
5512 	 *        to allocate.
5513 	 */
5514 	upl = kalloc_type(struct upl, struct upl_page_info,
5515 	    (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
5516 	if (type & UPL_CREATE_INTERNAL) {
5517 		flags |= UPL_INTERNAL;
5518 	}
5519 
5520 	if (type & UPL_CREATE_LITE) {
5521 		flags |= UPL_LITE;
5522 		if (pages) {
5523 			upl->lite_list = bitmap_alloc(pages);
5524 		}
5525 	}
5526 
5527 	upl->flags = flags;
5528 	upl->ref_count = 1;
5529 	upl_lock_init(upl);
5530 #if CONFIG_IOSCHED
5531 	if (type & UPL_CREATE_IO_TRACKING) {
5532 		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5533 	}
5534 
5535 	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5536 		/* Only support expedite on internal UPLs */
5537 		thread_t        curthread = current_thread();
5538 		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
5539 		    Z_WAITOK | Z_ZERO);
5540 		upl->flags |= UPL_EXPEDITE_SUPPORTED;
5541 		if (curthread->decmp_upl != NULL) {
5542 			upl_set_decmp_info(upl, curthread->decmp_upl);
5543 		}
5544 	}
5545 #endif
5546 #if CONFIG_IOSCHED || UPL_DEBUG
5547 	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5548 		upl->upl_creator = current_thread();
5549 		upl->flags |= UPL_TRACKED_BY_OBJECT;
5550 	}
5551 #endif
5552 
5553 #if UPL_DEBUG
5554 	upl->uple_create_btref = btref_get(__builtin_frame_address(0), 0);
5555 #endif /* UPL_DEBUG */
5556 
5557 	return upl;
5558 }
5559 
5560 static void
upl_destroy(upl_t upl)5561 upl_destroy(upl_t upl)
5562 {
5563 	uint32_t pages;
5564 
5565 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5566 
5567 	if (upl->ext_ref_count) {
5568 		panic("upl(%p) ext_ref_count", upl);
5569 	}
5570 
5571 #if CONFIG_IOSCHED
5572 	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5573 		upl_t src_upl;
5574 		src_upl = upl->decmp_io_upl;
5575 		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5576 		upl_lock(src_upl);
5577 		src_upl->decmp_io_upl = NULL;
5578 		upl_unlock(src_upl);
5579 		upl_deallocate(src_upl);
5580 	}
5581 #endif /* CONFIG_IOSCHED */
5582 
5583 #if CONFIG_IOSCHED || UPL_DEBUG
5584 	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5585 	    !(upl->flags & UPL_VECTOR)) {
5586 		vm_object_t     object;
5587 
5588 		if (upl->flags & UPL_SHADOWED) {
5589 			object = upl->map_object->shadow;
5590 		} else {
5591 			object = upl->map_object;
5592 		}
5593 
5594 		vm_object_lock(object);
5595 		queue_remove(&object->uplq, upl, upl_t, uplq);
5596 		vm_object_activity_end(object);
5597 		vm_object_collapse(object, 0, TRUE);
5598 		vm_object_unlock(object);
5599 	}
5600 #endif
5601 	/*
5602 	 * drop a reference on the map_object whether or
5603 	 * not a pageout object is inserted
5604 	 */
5605 	if (upl->flags & UPL_SHADOWED) {
5606 		vm_object_deallocate(upl->map_object);
5607 	}
5608 
5609 	if (upl->flags & UPL_DEVICE_MEMORY) {
5610 		pages = 1;
5611 	} else {
5612 		pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
5613 	}
5614 
5615 	upl_lock_destroy(upl);
5616 
5617 #if CONFIG_IOSCHED
5618 	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5619 		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
5620 	}
5621 #endif
5622 
5623 #if UPL_DEBUG
5624 	for (int i = 0; i < upl->upl_commit_index; i++) {
5625 		btref_put(upl->upl_commit_records[i].c_btref);
5626 	}
5627 	btref_put(upl->uple_create_btref);
5628 #endif /* UPL_DEBUG */
5629 
5630 	if ((upl->flags & UPL_LITE) && pages) {
5631 		bitmap_free(upl->lite_list, pages);
5632 	}
5633 	kfree_type(struct upl, struct upl_page_info,
5634 	    (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
5635 }
5636 
5637 void
upl_deallocate(upl_t upl)5638 upl_deallocate(upl_t upl)
5639 {
5640 	upl_lock(upl);
5641 
5642 	if (--upl->ref_count == 0) {
5643 		if (vector_upl_is_valid(upl)) {
5644 			vector_upl_deallocate(upl);
5645 		}
5646 		upl_unlock(upl);
5647 
5648 		if (upl->upl_iodone) {
5649 			upl_callout_iodone(upl);
5650 		}
5651 
5652 		upl_destroy(upl);
5653 	} else {
5654 		upl_unlock(upl);
5655 	}
5656 }
5657 
5658 #if CONFIG_IOSCHED
5659 void
upl_mark_decmp(upl_t upl)5660 upl_mark_decmp(upl_t upl)
5661 {
5662 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5663 		upl->flags |= UPL_DECMP_REQ;
5664 		upl->upl_creator->decmp_upl = (void *)upl;
5665 	}
5666 }
5667 
5668 void
upl_unmark_decmp(upl_t upl)5669 upl_unmark_decmp(upl_t upl)
5670 {
5671 	if (upl && (upl->flags & UPL_DECMP_REQ)) {
5672 		upl->upl_creator->decmp_upl = NULL;
5673 	}
5674 }
5675 
5676 #endif /* CONFIG_IOSCHED */
5677 
/*
 * True once a pageout queue's laundry count has reached 8/10 (computed
 * with integer arithmetic) of its pgo_maxlaundry limit.
 * Note: the argument is evaluated more than once.
 */
#define VM_PAGE_Q_BACKING_UP(pgo_q)     \
	((((pgo_q)->pgo_maxlaundry * 8) / 10) <= (pgo_q)->pgo_laundry)
5680 
5681 boolean_t must_throttle_writes(void);
5682 
5683 boolean_t
must_throttle_writes()5684 must_throttle_writes()
5685 {
5686 	if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5687 	    vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5688 		return TRUE;
5689 	}
5690 
5691 	return FALSE;
5692 }
5693 
5694 int vm_page_delayed_work_ctx_needed = 0;
5695 KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5696 
5697 __startup_func
5698 static void
vm_page_delayed_work_init_ctx(void)5699 vm_page_delayed_work_init_ctx(void)
5700 {
5701 	uint16_t min_delayed_work_ctx_allocated = 16;
5702 
5703 	/*
5704 	 * try really hard to always keep NCPU elements around in the zone
5705 	 * in order for the UPL code to almost always get an element.
5706 	 */
5707 	if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5708 		min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5709 	}
5710 
5711 	zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5712 }
5713 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5714 
5715 struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)5716 vm_page_delayed_work_get_ctx(void)
5717 {
5718 	struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5719 
5720 	dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5721 
5722 	if (__probable(dw_ctx)) {
5723 		dw_ctx->delayed_owner = current_thread();
5724 	} else {
5725 		vm_page_delayed_work_ctx_needed++;
5726 	}
5727 	return dw_ctx ? dw_ctx->dwp : NULL;
5728 }
5729 
5730 void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work * dwp)5731 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5732 {
5733 	struct  vm_page_delayed_work_ctx *ldw_ctx;
5734 
5735 	ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5736 	ldw_ctx->delayed_owner = NULL;
5737 
5738 	zfree(dw_ctx_zone, ldw_ctx);
5739 }
5740 
5741 /*
5742  *	Routine:	vm_object_upl_request
5743  *	Purpose:
 *		Cause the population of a portion of a vm_object.
 *		Depending on the nature of the request, the pages
 *		returned may contain valid data or be uninitialized.
 *		A page list structure, listing the physical pages,
 *		will be returned upon request.
 *		This function is called by the file system or any other
 *		supplier of backing store to a pager.
 *		IMPORTANT NOTE: The caller must still respect the relationship
 *		between the vm_object and its backing memory object.  The
 *		caller MUST NOT substitute changes in the backing file
 *		without first doing a memory_object_lock_request on the
 *		target range unless it is known that the pages are not
 *		shared with another entity at the pager level.
5757  *		Copy_in_to:
5758  *			if a page list structure is present
5759  *			return the mapped physical pages, where a
5760  *			page is not present, return a non-initialized
5761  *			one.  If the no_sync bit is turned on, don't
5762  *			call the pager unlock to synchronize with other
5763  *			possible copies of the page. Leave pages busy
5764  *			in the original object, if a page list structure
5765  *			was specified.  When a commit of the page list
5766  *			pages is done, the dirty bit will be set for each one.
5767  *		Copy_out_from:
5768  *			If a page list structure is present, return
5769  *			all mapped pages.  Where a page does not exist
5770  *			map a zero filled one. Leave pages busy in
5771  *			the original object.  If a page list structure
5772  *			is not specified, this call is a no-op.
5773  *
5774  *		Note:  access of default pager objects has a rather interesting
5775  *		twist.  The caller of this routine, presumably the file system
5776  *		page cache handling code, will never actually make a request
5777  *		against a default pager backed object.  Only the default
5778  *		pager will make requests on backing store related vm_objects
5779  *		In this way the default pager can maintain the relationship
5780  *		between backing store files (abstract memory objects) and
5781  *		the vm_objects (cache objects), they support.
5782  *
5783  */
5784 
5785 __private_extern__ kern_return_t
vm_object_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)5786 vm_object_upl_request(
5787 	vm_object_t             object,
5788 	vm_object_offset_t      offset,
5789 	upl_size_t              size,
5790 	upl_t                   *upl_ptr,
5791 	upl_page_info_array_t   user_page_list,
5792 	unsigned int            *page_list_count,
5793 	upl_control_flags_t     cntrl_flags,
5794 	vm_tag_t                tag)
5795 {
5796 	vm_page_t               dst_page = VM_PAGE_NULL;
5797 	vm_object_offset_t      dst_offset;
5798 	upl_size_t              xfer_size;
5799 	unsigned int            size_in_pages;
5800 	boolean_t               dirty;
5801 	boolean_t               hw_dirty;
5802 	upl_t                   upl = NULL;
5803 	unsigned int            entry;
5804 	vm_page_t               alias_page = NULL;
5805 	int                     refmod_state = 0;
5806 	vm_object_t             last_copy_object;
5807 	struct  vm_page_delayed_work    dw_array;
5808 	struct  vm_page_delayed_work    *dwp, *dwp_start;
5809 	bool                    dwp_finish_ctx = TRUE;
5810 	int                     dw_count;
5811 	int                     dw_limit;
5812 	int                     io_tracking_flag = 0;
5813 	int                     grab_options;
5814 	int                     page_grab_count = 0;
5815 	ppnum_t                 phys_page;
5816 	pmap_flush_context      pmap_flush_context_storage;
5817 	boolean_t               pmap_flushes_delayed = FALSE;
5818 #if DEVELOPMENT || DEBUG
5819 	task_t                  task = current_task();
5820 #endif /* DEVELOPMENT || DEBUG */
5821 
5822 	dwp_start = dwp = NULL;
5823 
5824 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
5825 		/*
5826 		 * For forward compatibility's sake,
5827 		 * reject any unknown flag.
5828 		 */
5829 		return KERN_INVALID_VALUE;
5830 	}
5831 	if ((!object->internal) && (object->paging_offset != 0)) {
5832 		panic("vm_object_upl_request: external object with non-zero paging offset");
5833 	}
5834 	if (object->phys_contiguous) {
5835 		panic("vm_object_upl_request: contiguous object specified");
5836 	}
5837 
5838 	assertf(page_aligned(offset) && page_aligned(size),
5839 	    "offset 0x%llx size 0x%x",
5840 	    offset, size);
5841 
5842 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5843 
5844 	dw_count = 0;
5845 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5846 	dwp_start = vm_page_delayed_work_get_ctx();
5847 	if (dwp_start == NULL) {
5848 		dwp_start = &dw_array;
5849 		dw_limit = 1;
5850 		dwp_finish_ctx = FALSE;
5851 	}
5852 
5853 	dwp = dwp_start;
5854 
5855 	if (size > MAX_UPL_SIZE_BYTES) {
5856 		size = MAX_UPL_SIZE_BYTES;
5857 	}
5858 
5859 	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5860 		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5861 	}
5862 
5863 #if CONFIG_IOSCHED || UPL_DEBUG
5864 	if (object->io_tracking || upl_debug_enabled) {
5865 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5866 	}
5867 #endif
5868 #if CONFIG_IOSCHED
5869 	if (object->io_tracking) {
5870 		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5871 	}
5872 #endif
5873 
5874 	if (cntrl_flags & UPL_SET_INTERNAL) {
5875 		if (cntrl_flags & UPL_SET_LITE) {
5876 			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5877 		} else {
5878 			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5879 		}
5880 		user_page_list = size ? upl->page_list : NULL;
5881 	} else {
5882 		if (cntrl_flags & UPL_SET_LITE) {
5883 			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5884 		} else {
5885 			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5886 		}
5887 	}
5888 	*upl_ptr = upl;
5889 
5890 	if (user_page_list) {
5891 		user_page_list[0].device = FALSE;
5892 	}
5893 
5894 	if (cntrl_flags & UPL_SET_LITE) {
5895 		upl->map_object = object;
5896 	} else {
5897 		upl->map_object = vm_object_allocate(size);
5898 		/*
5899 		 * No neeed to lock the new object: nobody else knows
5900 		 * about it yet, so it's all ours so far.
5901 		 */
5902 		upl->map_object->shadow = object;
5903 		upl->map_object->pageout = TRUE;
5904 		upl->map_object->can_persist = FALSE;
5905 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5906 		upl->map_object->vo_shadow_offset = offset;
5907 		upl->map_object->wimg_bits = object->wimg_bits;
5908 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
5909 		    "object %p shadow_offset 0x%llx",
5910 		    upl->map_object, upl->map_object->vo_shadow_offset);
5911 
5912 		alias_page = vm_page_grab_fictitious(TRUE);
5913 
5914 		upl->flags |= UPL_SHADOWED;
5915 	}
5916 	if (cntrl_flags & UPL_FOR_PAGEOUT) {
5917 		upl->flags |= UPL_PAGEOUT;
5918 	}
5919 
5920 	vm_object_lock(object);
5921 	vm_object_activity_begin(object);
5922 
5923 	grab_options = 0;
5924 #if CONFIG_SECLUDED_MEMORY
5925 	if (object->can_grab_secluded) {
5926 		grab_options |= VM_PAGE_GRAB_SECLUDED;
5927 	}
5928 #endif /* CONFIG_SECLUDED_MEMORY */
5929 
5930 	/*
5931 	 * we can lock in the paging_offset once paging_in_progress is set
5932 	 */
5933 	upl->u_size = size;
5934 	upl->u_offset = offset + object->paging_offset;
5935 
5936 #if CONFIG_IOSCHED || UPL_DEBUG
5937 	if (object->io_tracking || upl_debug_enabled) {
5938 		vm_object_activity_begin(object);
5939 		queue_enter(&object->uplq, upl, upl_t, uplq);
5940 	}
5941 #endif
5942 	if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
5943 		/*
5944 		 * Honor copy-on-write obligations
5945 		 *
5946 		 * The caller is gathering these pages and
5947 		 * might modify their contents.  We need to
5948 		 * make sure that the copy object has its own
5949 		 * private copies of these pages before we let
5950 		 * the caller modify them.
5951 		 */
5952 		vm_object_update(object,
5953 		    offset,
5954 		    size,
5955 		    NULL,
5956 		    NULL,
5957 		    FALSE,              /* should_return */
5958 		    MEMORY_OBJECT_COPY_SYNC,
5959 		    VM_PROT_NO_CHANGE);
5960 
5961 		VM_PAGEOUT_DEBUG(upl_cow, 1);
5962 		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
5963 	}
5964 	/*
5965 	 * remember which copy object we synchronized with
5966 	 */
5967 	last_copy_object = object->vo_copy;
5968 	entry = 0;
5969 
5970 	xfer_size = size;
5971 	dst_offset = offset;
5972 	size_in_pages = size / PAGE_SIZE;
5973 
5974 	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5975 	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
5976 		object->scan_collisions = 0;
5977 	}
5978 
5979 	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5980 		boolean_t       isSSD = FALSE;
5981 
5982 #if !XNU_TARGET_OS_OSX
5983 		isSSD = TRUE;
5984 #else /* !XNU_TARGET_OS_OSX */
5985 		vnode_pager_get_isSSD(object->pager, &isSSD);
5986 #endif /* !XNU_TARGET_OS_OSX */
5987 		vm_object_unlock(object);
5988 
5989 		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5990 
5991 		if (isSSD == TRUE) {
5992 			delay(1000 * size_in_pages);
5993 		} else {
5994 			delay(5000 * size_in_pages);
5995 		}
5996 		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5997 
5998 		vm_object_lock(object);
5999 	}
6000 
6001 	while (xfer_size) {
6002 		dwp->dw_mask = 0;
6003 
6004 		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
6005 			vm_object_unlock(object);
6006 			alias_page = vm_page_grab_fictitious(TRUE);
6007 			vm_object_lock(object);
6008 		}
6009 		if (cntrl_flags & UPL_COPYOUT_FROM) {
6010 			upl->flags |= UPL_PAGE_SYNC_DONE;
6011 
6012 			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
6013 			    dst_page->vmp_fictitious ||
6014 			    dst_page->vmp_absent ||
6015 			    VMP_ERROR_GET(dst_page) ||
6016 			    dst_page->vmp_cleaning ||
6017 			    (VM_PAGE_WIRED(dst_page))) {
6018 				if (user_page_list) {
6019 					user_page_list[entry].phys_addr = 0;
6020 				}
6021 
6022 				goto try_next_page;
6023 			}
6024 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6025 
6026 			/*
6027 			 * grab this up front...
6028 			 * a high percentange of the time we're going to
6029 			 * need the hardware modification state a bit later
6030 			 * anyway... so we can eliminate an extra call into
6031 			 * the pmap layer by grabbing it here and recording it
6032 			 */
6033 			if (dst_page->vmp_pmapped) {
6034 				refmod_state = pmap_get_refmod(phys_page);
6035 			} else {
6036 				refmod_state = 0;
6037 			}
6038 
6039 			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
6040 				/*
6041 				 * page is on inactive list and referenced...
6042 				 * reactivate it now... this gets it out of the
6043 				 * way of vm_pageout_scan which would have to
6044 				 * reactivate it upon tripping over it
6045 				 */
6046 				dwp->dw_mask |= DW_vm_page_activate;
6047 			}
6048 			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
6049 				/*
6050 				 * we're only asking for DIRTY pages to be returned
6051 				 */
6052 				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
6053 					/*
6054 					 * if we were the page stolen by vm_pageout_scan to be
6055 					 * cleaned (as opposed to a buddy being clustered in
6056 					 * or this request is not being driven by a PAGEOUT cluster
6057 					 * then we only need to check for the page being dirty or
6058 					 * precious to decide whether to return it
6059 					 */
6060 					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
6061 						goto check_busy;
6062 					}
6063 					goto dont_return;
6064 				}
6065 				/*
6066 				 * this is a request for a PAGEOUT cluster and this page
6067 				 * is merely along for the ride as a 'buddy'... not only
6068 				 * does it have to be dirty to be returned, but it also
6069 				 * can't have been referenced recently...
6070 				 */
6071 				if ((hibernate_cleaning_in_progress == TRUE ||
6072 				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
6073 				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
6074 				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
6075 					goto check_busy;
6076 				}
6077 dont_return:
6078 				/*
6079 				 * if we reach here, we're not to return
6080 				 * the page... go on to the next one
6081 				 */
6082 				if (dst_page->vmp_laundry == TRUE) {
6083 					/*
6084 					 * if we get here, the page is not 'cleaning' (filtered out above).
6085 					 * since it has been referenced, remove it from the laundry
6086 					 * so we don't pay the cost of an I/O to clean a page
6087 					 * we're just going to take back
6088 					 */
6089 					vm_page_lockspin_queues();
6090 
6091 					vm_pageout_steal_laundry(dst_page, TRUE);
6092 					vm_page_activate(dst_page);
6093 
6094 					vm_page_unlock_queues();
6095 				}
6096 				if (user_page_list) {
6097 					user_page_list[entry].phys_addr = 0;
6098 				}
6099 
6100 				goto try_next_page;
6101 			}
6102 check_busy:
6103 			if (dst_page->vmp_busy) {
6104 				if (cntrl_flags & UPL_NOBLOCK) {
6105 					if (user_page_list) {
6106 						user_page_list[entry].phys_addr = 0;
6107 					}
6108 					dwp->dw_mask = 0;
6109 
6110 					goto try_next_page;
6111 				}
6112 				/*
6113 				 * someone else is playing with the
6114 				 * page.  We will have to wait.
6115 				 */
6116 				PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6117 
6118 				continue;
6119 			}
6120 			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6121 				vm_page_lockspin_queues();
6122 
6123 				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6124 					/*
6125 					 * we've buddied up a page for a clustered pageout
6126 					 * that has already been moved to the pageout
6127 					 * queue by pageout_scan... we need to remove
6128 					 * it from the queue and drop the laundry count
6129 					 * on that queue
6130 					 */
6131 					vm_pageout_throttle_up(dst_page);
6132 				}
6133 				vm_page_unlock_queues();
6134 			}
6135 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6136 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6137 
6138 			if (phys_page > upl->highest_page) {
6139 				upl->highest_page = phys_page;
6140 			}
6141 
6142 			assert(!pmap_is_noencrypt(phys_page));
6143 
6144 			if (cntrl_flags & UPL_SET_LITE) {
6145 				unsigned int    pg_num;
6146 
6147 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6148 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6149 				bitmap_set(upl->lite_list, pg_num);
6150 
6151 				if (hw_dirty) {
6152 					if (pmap_flushes_delayed == FALSE) {
6153 						pmap_flush_context_init(&pmap_flush_context_storage);
6154 						pmap_flushes_delayed = TRUE;
6155 					}
6156 					pmap_clear_refmod_options(phys_page,
6157 					    VM_MEM_MODIFIED,
6158 					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6159 					    &pmap_flush_context_storage);
6160 				}
6161 
6162 				/*
6163 				 * Mark original page as cleaning
6164 				 * in place.
6165 				 */
6166 				dst_page->vmp_cleaning = TRUE;
6167 				dst_page->vmp_precious = FALSE;
6168 			} else {
6169 				/*
6170 				 * use pageclean setup, it is more
6171 				 * convenient even for the pageout
6172 				 * cases here
6173 				 */
6174 				vm_object_lock(upl->map_object);
6175 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6176 				vm_object_unlock(upl->map_object);
6177 
6178 				alias_page->vmp_absent = FALSE;
6179 				alias_page = NULL;
6180 			}
6181 			if (dirty) {
6182 				SET_PAGE_DIRTY(dst_page, FALSE);
6183 			} else {
6184 				dst_page->vmp_dirty = FALSE;
6185 			}
6186 
6187 			if (!dirty) {
6188 				dst_page->vmp_precious = TRUE;
6189 			}
6190 
6191 			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6192 				if (!VM_PAGE_WIRED(dst_page)) {
6193 					dst_page->vmp_free_when_done = TRUE;
6194 				}
6195 			}
6196 		} else {
6197 			if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != last_copy_object) {
6198 				/*
6199 				 * Honor copy-on-write obligations
6200 				 *
6201 				 * The copy object has changed since we
6202 				 * last synchronized for copy-on-write.
6203 				 * Another copy object might have been
6204 				 * inserted while we released the object's
6205 				 * lock.  Since someone could have seen the
6206 				 * original contents of the remaining pages
6207 				 * through that new object, we have to
6208 				 * synchronize with it again for the remaining
6209 				 * pages only.  The previous pages are "busy"
6210 				 * so they can not be seen through the new
6211 				 * mapping.  The new mapping will see our
6212 				 * upcoming changes for those previous pages,
6213 				 * but that's OK since they couldn't see what
6214 				 * was there before.  It's just a race anyway
6215 				 * and there's no guarantee of consistency or
6216 				 * atomicity.  We just don't want new mappings
6217 				 * to see both the *before* and *after* pages.
6218 				 */
6219 				if (object->vo_copy != VM_OBJECT_NULL) {
6220 					vm_object_update(
6221 						object,
6222 						dst_offset,/* current offset */
6223 						xfer_size, /* remaining size */
6224 						NULL,
6225 						NULL,
6226 						FALSE,     /* should_return */
6227 						MEMORY_OBJECT_COPY_SYNC,
6228 						VM_PROT_NO_CHANGE);
6229 
6230 					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6231 					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6232 				}
6233 				/*
6234 				 * remember the copy object we synced with
6235 				 */
6236 				last_copy_object = object->vo_copy;
6237 			}
6238 			dst_page = vm_page_lookup(object, dst_offset);
6239 
6240 			if (dst_page != VM_PAGE_NULL) {
6241 				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6242 					/*
6243 					 * skip over pages already present in the cache
6244 					 */
6245 					if (user_page_list) {
6246 						user_page_list[entry].phys_addr = 0;
6247 					}
6248 
6249 					goto try_next_page;
6250 				}
6251 				if (dst_page->vmp_fictitious) {
6252 					panic("need corner case for fictitious page");
6253 				}
6254 
6255 				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6256 					/*
6257 					 * someone else is playing with the
6258 					 * page.  We will have to wait.
6259 					 */
6260 					PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6261 
6262 					continue;
6263 				}
6264 				if (dst_page->vmp_laundry) {
6265 					vm_pageout_steal_laundry(dst_page, FALSE);
6266 				}
6267 			} else {
6268 				if (object->private) {
6269 					/*
6270 					 * This is a nasty wrinkle for users
6271 					 * of upl who encounter device or
6272 					 * private memory however, it is
6273 					 * unavoidable, only a fault can
6274 					 * resolve the actual backing
6275 					 * physical page by asking the
6276 					 * backing device.
6277 					 */
6278 					if (user_page_list) {
6279 						user_page_list[entry].phys_addr = 0;
6280 					}
6281 
6282 					goto try_next_page;
6283 				}
6284 				if (object->scan_collisions) {
6285 					/*
6286 					 * the pageout_scan thread is trying to steal
6287 					 * pages from this object, but has run into our
6288 					 * lock... grab 2 pages from the head of the object...
6289 					 * the first is freed on behalf of pageout_scan, the
6290 					 * 2nd is for our own use... we use vm_object_page_grab
6291 					 * in both cases to avoid taking pages from the free
6292 					 * list since we are under memory pressure and our
6293 					 * lock on this object is getting in the way of
6294 					 * relieving it
6295 					 */
6296 					dst_page = vm_object_page_grab(object);
6297 
6298 					if (dst_page != VM_PAGE_NULL) {
6299 						vm_page_release(dst_page,
6300 						    FALSE);
6301 					}
6302 
6303 					dst_page = vm_object_page_grab(object);
6304 				}
6305 				if (dst_page == VM_PAGE_NULL) {
6306 					/*
6307 					 * need to allocate a page
6308 					 */
6309 					dst_page = vm_page_grab_options(grab_options);
6310 					if (dst_page != VM_PAGE_NULL) {
6311 						page_grab_count++;
6312 					}
6313 				}
6314 				if (dst_page == VM_PAGE_NULL) {
6315 					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6316 						/*
6317 						 * we don't want to stall waiting for pages to come onto the free list
6318 						 * while we're already holding absent pages in this UPL
6319 						 * the caller will deal with the empty slots
6320 						 */
6321 						if (user_page_list) {
6322 							user_page_list[entry].phys_addr = 0;
6323 						}
6324 
6325 						goto try_next_page;
6326 					}
6327 					/*
6328 					 * no pages available... wait
6329 					 * then try again for the same
6330 					 * offset...
6331 					 */
6332 					vm_object_unlock(object);
6333 
6334 					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6335 
6336 					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6337 
6338 					VM_PAGE_WAIT();
6339 					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6340 
6341 					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6342 
6343 					vm_object_lock(object);
6344 
6345 					continue;
6346 				}
6347 				vm_page_insert(dst_page, object, dst_offset);
6348 
6349 				dst_page->vmp_absent = TRUE;
6350 				dst_page->vmp_busy = FALSE;
6351 
6352 				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6353 					/*
6354 					 * if UPL_RET_ONLY_ABSENT was specified,
6355 					 * then we're definitely setting up a
6356 					 * upl for a clustered read/pagein
6357 					 * operation... mark the pages as clustered
6358 					 * so upl_commit_range can put them on the
6359 					 * speculative list
6360 					 */
6361 					dst_page->vmp_clustered = TRUE;
6362 
6363 					if (!(cntrl_flags & UPL_FILE_IO)) {
6364 						counter_inc(&vm_statistics_pageins);
6365 					}
6366 				}
6367 			}
6368 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6369 
6370 			dst_page->vmp_overwriting = TRUE;
6371 
6372 			if (dst_page->vmp_pmapped) {
6373 				if (!(cntrl_flags & UPL_FILE_IO)) {
6374 					/*
6375 					 * eliminate all mappings from the
6376 					 * original object and its progeny
6377 					 */
6378 					refmod_state = pmap_disconnect(phys_page);
6379 				} else {
6380 					refmod_state = pmap_get_refmod(phys_page);
6381 				}
6382 			} else {
6383 				refmod_state = 0;
6384 			}
6385 
6386 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6387 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6388 
6389 			if (cntrl_flags & UPL_SET_LITE) {
6390 				unsigned int    pg_num;
6391 
6392 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6393 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6394 				bitmap_set(upl->lite_list, pg_num);
6395 
6396 				if (hw_dirty) {
6397 					pmap_clear_modify(phys_page);
6398 				}
6399 
6400 				/*
6401 				 * Mark original page as cleaning
6402 				 * in place.
6403 				 */
6404 				dst_page->vmp_cleaning = TRUE;
6405 				dst_page->vmp_precious = FALSE;
6406 			} else {
6407 				/*
6408 				 * use pageclean setup, it is more
6409 				 * convenient even for the pageout
6410 				 * cases here
6411 				 */
6412 				vm_object_lock(upl->map_object);
6413 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6414 				vm_object_unlock(upl->map_object);
6415 
6416 				alias_page->vmp_absent = FALSE;
6417 				alias_page = NULL;
6418 			}
6419 
6420 			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6421 				upl->flags &= ~UPL_CLEAR_DIRTY;
6422 				upl->flags |= UPL_SET_DIRTY;
6423 				dirty = TRUE;
6424 				/*
6425 				 * Page belonging to a code-signed object is about to
6426 				 * be written. Mark it tainted and disconnect it from
6427 				 * all pmaps so processes have to fault it back in and
6428 				 * deal with the tainted bit.
6429 				 */
6430 				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6431 					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6432 					vm_page_upl_tainted++;
6433 					if (dst_page->vmp_pmapped) {
6434 						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6435 						if (refmod_state & VM_MEM_REFERENCED) {
6436 							dst_page->vmp_reference = TRUE;
6437 						}
6438 					}
6439 				}
6440 			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6441 				/*
6442 				 * clean in place for read implies
6443 				 * that a write will be done on all
6444 				 * the pages that are dirty before
6445 				 * a upl commit is done.  The caller
6446 				 * is obligated to preserve the
6447 				 * contents of all pages marked dirty
6448 				 */
6449 				upl->flags |= UPL_CLEAR_DIRTY;
6450 			}
6451 			dst_page->vmp_dirty = dirty;
6452 
6453 			if (!dirty) {
6454 				dst_page->vmp_precious = TRUE;
6455 			}
6456 
6457 			if (!VM_PAGE_WIRED(dst_page)) {
6458 				/*
6459 				 * deny access to the target page while
6460 				 * it is being worked on
6461 				 */
6462 				dst_page->vmp_busy = TRUE;
6463 			} else {
6464 				dwp->dw_mask |= DW_vm_page_wire;
6465 			}
6466 
6467 			/*
6468 			 * We might be about to satisfy a fault which has been
6469 			 * requested. So no need for the "restart" bit.
6470 			 */
6471 			dst_page->vmp_restart = FALSE;
6472 			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6473 				/*
6474 				 * expect the page to be used
6475 				 */
6476 				dwp->dw_mask |= DW_set_reference;
6477 			}
6478 			if (cntrl_flags & UPL_PRECIOUS) {
6479 				if (object->internal) {
6480 					SET_PAGE_DIRTY(dst_page, FALSE);
6481 					dst_page->vmp_precious = FALSE;
6482 				} else {
6483 					dst_page->vmp_precious = TRUE;
6484 				}
6485 			} else {
6486 				dst_page->vmp_precious = FALSE;
6487 			}
6488 		}
6489 		if (dst_page->vmp_busy) {
6490 			upl->flags |= UPL_HAS_BUSY;
6491 		}
6492 
6493 		if (phys_page > upl->highest_page) {
6494 			upl->highest_page = phys_page;
6495 		}
6496 		assert(!pmap_is_noencrypt(phys_page));
6497 		if (user_page_list) {
6498 			user_page_list[entry].phys_addr = phys_page;
6499 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
6500 			user_page_list[entry].absent    = dst_page->vmp_absent;
6501 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
6502 			user_page_list[entry].precious  = dst_page->vmp_precious;
6503 			user_page_list[entry].device    = FALSE;
6504 			user_page_list[entry].needed    = FALSE;
6505 			if (dst_page->vmp_clustered == TRUE) {
6506 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6507 			} else {
6508 				user_page_list[entry].speculative = FALSE;
6509 			}
6510 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6511 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6512 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6513 			user_page_list[entry].mark      = FALSE;
6514 		}
6515 		/*
6516 		 * if UPL_RET_ONLY_ABSENT is set, then
6517 		 * we are working with a fresh page and we've
6518 		 * just set the clustered flag on it to
6519 		 * indicate that it was dragged in as part of a
6520 		 * speculative cluster... so leave it alone
6521 		 */
6522 		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6523 			/*
6524 			 * someone is explicitly grabbing this page...
6525 			 * update clustered and speculative state
6526 			 *
6527 			 */
6528 			if (dst_page->vmp_clustered) {
6529 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
6530 			}
6531 		}
6532 try_next_page:
6533 		if (dwp->dw_mask) {
6534 			if (dwp->dw_mask & DW_vm_page_activate) {
6535 				counter_inc(&vm_statistics_reactivations);
6536 			}
6537 
6538 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6539 
6540 			if (dw_count >= dw_limit) {
6541 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6542 
6543 				dwp = dwp_start;
6544 				dw_count = 0;
6545 			}
6546 		}
6547 		entry++;
6548 		dst_offset += PAGE_SIZE_64;
6549 		xfer_size -= PAGE_SIZE;
6550 	}
6551 	if (dw_count) {
6552 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6553 		dwp = dwp_start;
6554 		dw_count = 0;
6555 	}
6556 
6557 	if (alias_page != NULL) {
6558 		VM_PAGE_FREE(alias_page);
6559 	}
6560 	if (pmap_flushes_delayed == TRUE) {
6561 		pmap_flush(&pmap_flush_context_storage);
6562 	}
6563 
6564 	if (page_list_count != NULL) {
6565 		if (upl->flags & UPL_INTERNAL) {
6566 			*page_list_count = 0;
6567 		} else if (*page_list_count > entry) {
6568 			*page_list_count = entry;
6569 		}
6570 	}
6571 #if UPL_DEBUG
6572 	upl->upl_state = 1;
6573 #endif
6574 	vm_object_unlock(object);
6575 
6576 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6577 #if DEVELOPMENT || DEBUG
6578 	if (task != NULL) {
6579 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
6580 	}
6581 #endif /* DEVELOPMENT || DEBUG */
6582 
6583 	if (dwp_start && dwp_finish_ctx) {
6584 		vm_page_delayed_work_finish_ctx(dwp_start);
6585 		dwp_start = dwp = NULL;
6586 	}
6587 
6588 	return KERN_SUCCESS;
6589 }
6590 
6591 /*
6592  *	Routine:	vm_object_super_upl_request
6593  *	Purpose:
6594  *		Cause the population of a portion of a vm_object
6595  *		in much the same way as memory_object_upl_request.
6596  *		Depending on the nature of the request, the pages
6597  *		returned may contain valid data or be uninitialized.
6598  *		However, the region may be expanded up to the super
6599  *		cluster size provided.
6600  */
6601 
6602 __private_extern__ kern_return_t
vm_object_super_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_size_t super_cluster,upl_t * upl,upl_page_info_t * user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)6603 vm_object_super_upl_request(
6604 	vm_object_t object,
6605 	vm_object_offset_t      offset,
6606 	upl_size_t              size,
6607 	upl_size_t              super_cluster,
6608 	upl_t                   *upl,
6609 	upl_page_info_t         *user_page_list,
6610 	unsigned int            *page_list_count,
6611 	upl_control_flags_t     cntrl_flags,
6612 	vm_tag_t                tag)
6613 {
6614 	if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
6615 		return KERN_FAILURE;
6616 	}
6617 
6618 	assert(object->paging_in_progress);
6619 	offset = offset - object->paging_offset;
6620 
6621 	if (super_cluster > size) {
6622 		vm_object_offset_t      base_offset;
6623 		upl_size_t              super_size;
6624 		vm_object_size_t        super_size_64;
6625 
6626 		base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6627 		super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
6628 		super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6629 		super_size = (upl_size_t) super_size_64;
6630 		assert(super_size == super_size_64);
6631 
6632 		if (offset > (base_offset + super_size)) {
6633 			panic("vm_object_super_upl_request: Missed target pageout"
6634 			    " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6635 			    offset, base_offset, super_size, super_cluster,
6636 			    size, object->paging_offset);
6637 		}
6638 		/*
6639 		 * apparently there is a case where the vm requests a
6640 		 * page to be written out who's offset is beyond the
6641 		 * object size
6642 		 */
6643 		if ((offset + size) > (base_offset + super_size)) {
6644 			super_size_64 = (offset + size) - base_offset;
6645 			super_size = (upl_size_t) super_size_64;
6646 			assert(super_size == super_size_64);
6647 		}
6648 
6649 		offset = base_offset;
6650 		size = super_size;
6651 	}
6652 	return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6653 }
6654 
/*
 * Incremented by vm_map_create_upl() each time it goes on to build a UPL
 * from a map entry whose protection includes VM_PROT_EXECUTE.
 */
6655 int cs_executable_create_upl = 0;
/*
 * Prototyped locally; vm_map_create_upl() uses them to print the pid and
 * process name in its MACH_ASSERT diagnostic for that same case.
 */
6656 extern int proc_selfpid(void);
6657 extern char *proc_name_address(void *p);
6658 
/*
 *	Routine:	vm_map_create_upl
 *	Purpose:
 *		Create a UPL for the memory mapped at "offset" in "map".
 *		The map entry covering "offset" is looked up, submaps are
 *		descended into, entries marked "needs_copy" are pushed
 *		through vm_map_lookup_and_lock_object(), and the UPL itself
 *		is built by vm_object_iopl_request() on the entry's VM object.
 *
 *	Parameters:
 *		map		map in which "offset" is looked up
 *		offset		address in "map" where the UPL should start
 *		upl_size	in/out: requested size in bytes; may be reduced
 *				(end of the map entry, MAX_UPL_SIZE_BYTES for
 *				objects that are not physically contiguous)
 *		upl		out: receives the UPL; must not be NULL
 *		page_list	passed through to vm_object_iopl_request()
 *		count		passed through to vm_object_iopl_request()
 *		flags		in/out: the caller's UPL control flags on
 *				input; on the paths that reach the object,
 *				overwritten with UPL_DEV_MEMORY and/or
 *				UPL_PHYS_CONTIG (or 0) describing that object
 *		tag		VM tag handed to kmem_alloc() and
 *				vm_object_iopl_request()
 *
 *	Returns:
 *		KERN_INVALID_VALUE	a flag outside UPL_VALID_FLAGS was set
 *		KERN_INVALID_ARGUMENT	"upl" is NULL
 *		KERN_FAILURE		no map entry at "offset", or
 *					vm_map_lookup_and_lock_object() failed
 *		KERN_PROTECTION_FAILURE	UPL_COPYOUT_FROM is not set and the
 *					entry lacks VM_PROT_WRITE
 *		KERN_SUCCESS		for a UPL_QUERY_OBJECT_TYPE request
 *		otherwise the result of vm_object_iopl_request() (or, in the
 *		!XNU_TARGET_OS_OSX executable-mapping case, of kmem_alloc(),
 *		copyinmap() or the nested vm_map_create_upl() call).
 */
6659 kern_return_t
vm_map_create_upl(vm_map_t map,vm_map_address_t offset,upl_size_t * upl_size,upl_t * upl,upl_page_info_array_t page_list,unsigned int * count,upl_control_flags_t * flags,vm_tag_t tag)6660 vm_map_create_upl(
6661 	vm_map_t                map,
6662 	vm_map_address_t        offset,
6663 	upl_size_t              *upl_size,
6664 	upl_t                   *upl,
6665 	upl_page_info_array_t   page_list,
6666 	unsigned int            *count,
6667 	upl_control_flags_t     *flags,
6668 	vm_tag_t                tag)
6669 {
6670 	vm_map_entry_t          entry;
6671 	upl_control_flags_t     caller_flags;
6672 	int                     force_data_sync;
6673 	int                     sync_cow_data;
6674 	vm_object_t             local_object;
6675 	vm_map_offset_t         local_offset;
6676 	vm_map_offset_t         local_start;
6677 	kern_return_t           ret;
6678 	vm_map_address_t        original_offset;
6679 	vm_map_size_t           original_size, adjusted_size;
6680 	vm_map_offset_t         local_entry_start;
6681 	vm_object_offset_t      local_entry_offset;
6682 	vm_object_offset_t      offset_in_mapped_page;
	/* TRUE once "map" is a submap on which we hold our own reference */
6683 	boolean_t               release_map = FALSE;
6684 
	/*
	 * Restart point used after "map" has been switched to a submap:
	 * "offset" and "*upl_size" have been updated by then, so they are
	 * snapshotted again and the flags are re-derived.
	 */
6685 start_with_map:
6686 
6687 	original_offset = offset;
6688 	original_size = *upl_size;
6689 	adjusted_size = original_size;
6690 
6691 	caller_flags = *flags;
6692 
6693 	if (caller_flags & ~UPL_VALID_FLAGS) {
6694 		/*
6695 		 * For forward compatibility's sake,
6696 		 * reject any unknown flag.
6697 		 */
6698 		ret = KERN_INVALID_VALUE;
6699 		goto done;
6700 	}
	/*
	 * Each of these two requests is honored once: the flag is cleared
	 * right after its sync has been issued further down.
	 */
6701 	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6702 	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6703 
6704 	if (upl == NULL) {
6705 		ret = KERN_INVALID_ARGUMENT;
6706 		goto done;
6707 	}
6708 
	/*
	 * Retry point: paths that unlocked the map, or that got a non-zero
	 * return from vm_map_lock_read_to_write(), come back here to take
	 * the read lock and look the entry up again.
	 */
6709 REDISCOVER_ENTRY:
6710 	vm_map_lock_read(map);
6711 
6712 	if (!vm_map_lookup_entry(map, offset, &entry)) {
6713 		vm_map_unlock_read(map);
6714 		ret = KERN_FAILURE;
6715 		goto done;
6716 	}
6717 
	/* remembered for the executable-mapping path, which drops the map lock */
6718 	local_entry_start = entry->vme_start;
6719 	local_entry_offset = VME_OFFSET(entry);
6720 
6721 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6722 		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6723 	}
6724 
	/* never let the request extend past the end of this map entry */
6725 	if (entry->vme_end - original_offset < adjusted_size) {
6726 		adjusted_size = entry->vme_end - original_offset;
6727 		assert(adjusted_size > 0);
6728 		*upl_size = (upl_size_t) adjusted_size;
6729 		assert(*upl_size == adjusted_size);
6730 	}
6731 
	/* query only: report the object's type in "*flags"; no UPL is created */
6732 	if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6733 		*flags = 0;
6734 
6735 		if (!entry->is_sub_map &&
6736 		    VME_OBJECT(entry) != VM_OBJECT_NULL) {
6737 			if (VME_OBJECT(entry)->private) {
6738 				*flags = UPL_DEV_MEMORY;
6739 			}
6740 
6741 			if (VME_OBJECT(entry)->phys_contiguous) {
6742 				*flags |= UPL_PHYS_CONTIG;
6743 			}
6744 		}
6745 		vm_map_unlock_read(map);
6746 		ret = KERN_SUCCESS;
6747 		goto done;
6748 	}
6749 
	/*
	 * When the map's page size is smaller than PAGE_SIZE, widen the range
	 * to the map's page boundaries and remember how far into the first
	 * map page the caller's offset was; that distance is added back to
	 * "offset" (and taken off "*upl_size") before the UPL is created.
	 */
6750 	offset_in_mapped_page = 0;
6751 	if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6752 		offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6753 		*upl_size = (upl_size_t)
6754 		    (vm_map_round_page(original_offset + adjusted_size,
6755 		    VM_MAP_PAGE_MASK(map))
6756 		    - offset);
6757 
6758 		offset_in_mapped_page = original_offset - offset;
6759 		assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6760 
6761 		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6762 	}
6763 
6764 	if (!entry->is_sub_map) {
		/* cap the size, except for physically contiguous objects */
6765 		if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6766 		    !VME_OBJECT(entry)->phys_contiguous) {
6767 			if (*upl_size > MAX_UPL_SIZE_BYTES) {
6768 				*upl_size = MAX_UPL_SIZE_BYTES;
6769 			}
6770 		}
6771 
6772 		/*
6773 		 *      Create an object if necessary.
6774 		 */
6775 		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			/* changing the entry requires the map lock held for writing */
6776 			if (vm_map_lock_read_to_write(map)) {
6777 				goto REDISCOVER_ENTRY;
6778 			}
6779 
6780 			VME_OBJECT_SET(entry,
6781 			    vm_object_allocate((vm_size_t)
6782 			    vm_object_round_page((entry->vme_end - entry->vme_start))),
6783 			    false, 0);
6784 			VME_OFFSET_SET(entry, 0);
6785 			assert(entry->use_pmap);
6786 
6787 			vm_map_lock_write_to_read(map);
6788 		}
6789 
		/* without UPL_COPYOUT_FROM, the entry must allow VM_PROT_WRITE */
6790 		if (!(caller_flags & UPL_COPYOUT_FROM) &&
6791 		    !(entry->protection & VM_PROT_WRITE)) {
6792 			vm_map_unlock_read(map);
6793 			ret = KERN_PROTECTION_FAILURE;
6794 			goto done;
6795 		}
6796 	}
6797 
6798 #if !XNU_TARGET_OS_OSX
6799 	if (map->pmap != kernel_pmap &&
6800 	    (caller_flags & UPL_COPYOUT_FROM) &&
6801 	    (entry->protection & VM_PROT_EXECUTE) &&
6802 	    !(entry->protection & VM_PROT_WRITE)) {
6803 		vm_offset_t     kaddr;
6804 		vm_size_t       ksize;
6805 
6806 		/*
6807 		 * We're about to create a read-only UPL backed by
6808 		 * memory from an executable mapping.
6809 		 * Wiring the pages would result in the pages being copied
6810 		 * (due to the "MAP_PRIVATE" mapping) and no longer
6811 		 * code-signed, so no longer eligible for execution.
6812 		 * Instead, let's copy the data into a kernel buffer and
6813 		 * create the UPL from this kernel buffer.
6814 		 * The kernel buffer is then freed, leaving the UPL holding
6815 		 * the last reference on the VM object, so the memory will
6816 		 * be released when the UPL is committed.
6817 		 */
6818 
		/* "entry" must not be used once the map is unlocked */
6819 		vm_map_unlock_read(map);
6820 		entry = VM_MAP_ENTRY_NULL;
6821 		/* allocate kernel buffer */
6822 		ksize = round_page(*upl_size);
6823 		kaddr = 0;
6824 		ret = kmem_alloc(kernel_map, &kaddr, ksize,
6825 		    KMA_PAGEABLE | KMA_DATA, tag);
6826 		if (ret == KERN_SUCCESS) {
6827 			/* copyin the user data */
6828 			ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6829 		}
6830 		if (ret == KERN_SUCCESS) {
6831 			if (ksize > *upl_size) {
6832 				/* zero out the extra space in kernel buffer */
6833 				memset((void *)(kaddr + *upl_size),
6834 				    0,
6835 				    ksize - *upl_size);
6836 			}
6837 			/* create the UPL from the kernel buffer */
6838 			vm_object_offset_t      offset_in_object;
6839 			vm_object_offset_t      offset_in_object_page;
6840 
			/* uses the entry values saved before the map was unlocked */
6841 			offset_in_object = offset - local_entry_start + local_entry_offset;
6842 			offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6843 			assert(offset_in_object_page < PAGE_SIZE);
6844 			assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6845 			*upl_size -= offset_in_object_page + offset_in_mapped_page;
			/* nested call, this time against kernel_map and the copy */
6846 			ret = vm_map_create_upl(kernel_map,
6847 			    (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6848 			    upl_size, upl, page_list, count, flags, tag);
6849 		}
6850 		if (kaddr != 0) {
6851 			/* free the kernel buffer */
6852 			kmem_free(kernel_map, kaddr, ksize);
6853 			kaddr = 0;
6854 			ksize = 0;
6855 		}
6856 #if DEVELOPMENT || DEBUG
6857 		DTRACE_VM4(create_upl_from_executable,
6858 		    vm_map_t, map,
6859 		    vm_map_address_t, offset,
6860 		    upl_size_t, *upl_size,
6861 		    kern_return_t, ret);
6862 #endif /* DEVELOPMENT || DEBUG */
6863 		goto done;
6864 	}
6865 #endif /* !XNU_TARGET_OS_OSX */
6866 
	/* "local_object" is only assigned, and only read below, for non-submap entries */
6867 	if (!entry->is_sub_map) {
6868 		local_object = VME_OBJECT(entry);
6869 		assert(local_object != VM_OBJECT_NULL);
6870 	}
6871 
6872 	if (!entry->is_sub_map &&
6873 	    !entry->needs_copy &&
6874 	    *upl_size != 0 &&
6875 	    local_object->vo_size > *upl_size && /* partial UPL */
6876 	    entry->wired_count == 0 && /* No COW for entries that are wired */
6877 	    (map->pmap != kernel_pmap) && /* alias checks */
6878 	    (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6879 	    ||
6880 	    ( /* case 2 */
6881 		    local_object->internal &&
6882 		    (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6883 		    local_object->ref_count > 1))) {
6884 		vm_prot_t       prot;
6885 
6886 		/*
6887 		 * Case 1:
6888 		 * Set up the targeted range for copy-on-write to avoid
6889 		 * applying true_share/copy_delay to the entire object.
6890 		 *
6891 		 * Case 2:
6892 		 * This map entry covers only part of an internal
6893 		 * object.  There could be other map entries covering
6894 		 * other areas of this object and some of these map
6895 		 * entries could be marked as "needs_copy", which
6896 		 * assumes that the object is COPY_SYMMETRIC.
6897 		 * To avoid marking this object as COPY_DELAY and
6898 		 * "true_share", let's shadow it and mark the new
6899 		 * (smaller) object as "true_share" and COPY_DELAY.
6900 		 */
6901 
6902 		if (vm_map_lock_read_to_write(map)) {
6903 			goto REDISCOVER_ENTRY;
6904 		}
6905 		vm_map_lock_assert_exclusive(map);
6906 		assert(VME_OBJECT(entry) == local_object);
6907 
		/* narrow the entry to the map pages that the UPL range touches */
6908 		vm_map_clip_start(map,
6909 		    entry,
6910 		    vm_map_trunc_page(offset,
6911 		    VM_MAP_PAGE_MASK(map)));
6912 		vm_map_clip_end(map,
6913 		    entry,
6914 		    vm_map_round_page(offset + *upl_size,
6915 		    VM_MAP_PAGE_MASK(map)));
6916 		if ((entry->vme_end - offset) < *upl_size) {
6917 			*upl_size = (upl_size_t) (entry->vme_end - offset);
6918 			assert(*upl_size == entry->vme_end - offset);
6919 		}
6920 
		/* protection handed to vm_object_pmap_protect(): the entry's, minus write */
6921 		prot = entry->protection & ~VM_PROT_WRITE;
6922 		if (override_nx(map, VME_ALIAS(entry)) && prot) {
6923 			prot |= VM_PROT_EXECUTE;
6924 		}
6925 		vm_object_pmap_protect(local_object,
6926 		    VME_OFFSET(entry),
6927 		    entry->vme_end - entry->vme_start,
6928 		    ((entry->is_shared ||
6929 		    map->mapped_in_other_pmaps)
6930 		    ? PMAP_NULL
6931 		    : map->pmap),
6932 		    VM_MAP_PAGE_SIZE(map),
6933 		    entry->vme_start,
6934 		    prot);
6935 
6936 		assert(entry->wired_count == 0);
6937 
6938 		/*
6939 		 * Lock the VM object and re-check its status: if it's mapped
6940 		 * in another address space, we could still be racing with
6941 		 * another thread holding that other VM map exclusively.
6942 		 */
6943 		vm_object_lock(local_object);
6944 		if (local_object->true_share) {
6945 			/* object is already in proper state: no COW needed */
6946 			assert(local_object->copy_strategy !=
6947 			    MEMORY_OBJECT_COPY_SYMMETRIC);
6948 		} else {
6949 			/* not true_share: ask for copy-on-write below */
6950 			assert(local_object->copy_strategy ==
6951 			    MEMORY_OBJECT_COPY_SYMMETRIC);
6952 			entry->needs_copy = TRUE;
6953 		}
6954 		vm_object_unlock(local_object);
6955 
6956 		vm_map_lock_write_to_read(map);
6957 	}
6958 
6959 	if (entry->needs_copy) {
6960 		/*
6961 		 * Honor copy-on-write for COPY_SYMMETRIC
6962 		 * strategy.
6963 		 */
6964 		vm_map_t                local_map;
6965 		vm_object_t             object;
6966 		vm_object_offset_t      new_offset;
6967 		vm_prot_t               prot;
6968 		boolean_t               wired;
6969 		vm_map_version_t        version;
6970 		vm_map_t                real_map;
6971 		vm_prot_t               fault_type;
6972 
6973 		local_map = map;
6974 
6975 		if (caller_flags & UPL_COPYOUT_FROM) {
6976 			fault_type = VM_PROT_READ | VM_PROT_COPY;
6977 			vm_counters.create_upl_extra_cow++;
6978 			vm_counters.create_upl_extra_cow_pages +=
6979 			    (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6980 		} else {
6981 			fault_type = VM_PROT_WRITE;
6982 		}
		/*
		 * None of the values returned by this lookup are used: on
		 * success everything it handed back locked is unlocked again
		 * and the entry is looked up from scratch.
		 */
6983 		if (vm_map_lookup_and_lock_object(&local_map,
6984 		    offset, fault_type,
6985 		    OBJECT_LOCK_EXCLUSIVE,
6986 		    &version, &object,
6987 		    &new_offset, &prot, &wired,
6988 		    NULL,
6989 		    &real_map, NULL) != KERN_SUCCESS) {
6990 			if (fault_type == VM_PROT_WRITE) {
6991 				vm_counters.create_upl_lookup_failure_write++;
6992 			} else {
6993 				vm_counters.create_upl_lookup_failure_copy++;
6994 			}
6995 			vm_map_unlock_read(local_map);
6996 			ret = KERN_FAILURE;
6997 			goto done;
6998 		}
6999 		if (real_map != local_map) {
7000 			vm_map_unlock(real_map);
7001 		}
7002 		vm_map_unlock_read(local_map);
7003 
7004 		vm_object_unlock(object);
7005 
7006 		goto REDISCOVER_ENTRY;
7007 	}
7008 
7009 	if (entry->is_sub_map) {
7010 		vm_map_t        submap;
7011 
7012 		submap = VME_SUBMAP(entry);
7013 		local_start = entry->vme_start;
7014 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7015 
		/* take our own reference on the submap before unlocking its parent */
7016 		vm_map_reference(submap);
7017 		vm_map_unlock_read(map);
7018 
7019 		DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
7020 		offset += offset_in_mapped_page;
7021 		*upl_size -= offset_in_mapped_page;
7022 
		/* "map" is a submap referenced on an earlier pass: drop that reference */
7023 		if (release_map) {
7024 			vm_map_deallocate(map);
7025 		}
7026 		map = submap;
7027 		release_map = TRUE;
		/* rebase the address from the parent entry onto the submap */
7028 		offset = local_offset + (offset - local_start);
7029 		goto start_with_map;
7030 	}
7031 
7032 	if (sync_cow_data &&
7033 	    (VME_OBJECT(entry)->shadow ||
7034 	    VME_OBJECT(entry)->vo_copy)) {
7035 		local_object = VME_OBJECT(entry);
7036 		local_start = entry->vme_start;
7037 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7038 
		/* hold a reference on the object while the map is unlocked */
7039 		vm_object_reference(local_object);
7040 		vm_map_unlock_read(map);
7041 
7042 		if (local_object->shadow && local_object->vo_copy) {
7043 			vm_object_lock_request(local_object->shadow,
7044 			    ((vm_object_offset_t)
7045 			    ((offset - local_start) +
7046 			    local_offset) +
7047 			    local_object->vo_shadow_offset),
7048 			    *upl_size, FALSE,
7049 			    MEMORY_OBJECT_DATA_SYNC,
7050 			    VM_PROT_NO_CHANGE);
7051 		}
		/* done once: the next pass skips this block */
7052 		sync_cow_data = FALSE;
7053 		vm_object_deallocate(local_object);
7054 
7055 		goto REDISCOVER_ENTRY;
7056 	}
7057 	if (force_data_sync) {
7058 		local_object = VME_OBJECT(entry);
7059 		local_start = entry->vme_start;
7060 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7061 
		/* same pattern as above: reference, unlock the map, sync, retry */
7062 		vm_object_reference(local_object);
7063 		vm_map_unlock_read(map);
7064 
7065 		vm_object_lock_request(local_object,
7066 		    ((vm_object_offset_t)
7067 		    ((offset - local_start) +
7068 		    local_offset)),
7069 		    (vm_object_size_t)*upl_size,
7070 		    FALSE,
7071 		    MEMORY_OBJECT_DATA_SYNC,
7072 		    VM_PROT_NO_CHANGE);
7073 
7074 		force_data_sync = FALSE;
7075 		vm_object_deallocate(local_object);
7076 
7077 		goto REDISCOVER_ENTRY;
7078 	}
	/* from here on "*flags" is an output describing the backing object */
7079 	if (VME_OBJECT(entry)->private) {
7080 		*flags = UPL_DEV_MEMORY;
7081 	} else {
7082 		*flags = 0;
7083 	}
7084 
7085 	if (VME_OBJECT(entry)->phys_contiguous) {
7086 		*flags |= UPL_PHYS_CONTIG;
7087 	}
7088 
	/* capture what is needed from the entry before the map is unlocked */
7089 	local_object = VME_OBJECT(entry);
7090 	local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7091 	local_start = entry->vme_start;
7092 
7093 	/*
7094 	 * Wiring will copy the pages to the shadow object.
7095 	 * The shadow object will not be code-signed so
7096 	 * attempting to execute code from these copied pages
7097 	 * would trigger a code-signing violation.
7098 	 */
7099 	if (entry->protection & VM_PROT_EXECUTE) {
7100 #if MACH_ASSERT
7101 		printf("pid %d[%s] create_upl out of executable range from "
7102 		    "0x%llx to 0x%llx: side effects may include "
7103 		    "code-signing violations later on\n",
7104 		    proc_selfpid(),
7105 		    (get_bsdtask_info(current_task())
7106 		    ? proc_name_address(get_bsdtask_info(current_task()))
7107 		    : "?"),
7108 		    (uint64_t) entry->vme_start,
7109 		    (uint64_t) entry->vme_end);
7110 #endif /* MACH_ASSERT */
7111 		DTRACE_VM2(cs_executable_create_upl,
7112 		    uint64_t, (uint64_t)entry->vme_start,
7113 		    uint64_t, (uint64_t)entry->vme_end);
7114 		cs_executable_create_upl++;
7115 	}
7116 
7117 	vm_object_lock(local_object);
7118 
7119 	/*
7120 	 * Ensure that this object is "true_share" and "copy_delay" now,
7121 	 * while we're still holding the VM map lock.  After we unlock the map,
7122 	 * anything could happen to that mapping, including some copy-on-write
7123 	 * activity.  We need to make sure that the IOPL will point at the
7124 	 * same memory as the mapping.
7125 	 */
7126 	if (local_object->true_share) {
7127 		assert(local_object->copy_strategy !=
7128 		    MEMORY_OBJECT_COPY_SYMMETRIC);
7129 	} else if (!is_kernel_object(local_object) &&
7130 	    local_object != compressor_object &&
7131 	    !local_object->phys_contiguous) {
7132 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7133 		if (!local_object->true_share &&
7134 		    vm_object_tracking_btlog) {
7135 			btlog_record(vm_object_tracking_btlog, local_object,
7136 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
7137 			    btref_get(__builtin_frame_address(0), 0));
7138 		}
7139 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7140 		local_object->true_share = TRUE;
7141 		if (local_object->copy_strategy ==
7142 		    MEMORY_OBJECT_COPY_SYMMETRIC) {
7143 			local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7144 		}
7145 	}
7146 
	/* this reference is released after vm_object_iopl_request() returns */
7147 	vm_object_reference_locked(local_object);
7148 	vm_object_unlock(local_object);
7149 
7150 	vm_map_unlock_read(map);
7151 
	/* undo the map-page alignment: start the UPL at the caller's own offset */
7152 	offset += offset_in_mapped_page;
7153 	assert(*upl_size > offset_in_mapped_page);
7154 	*upl_size -= offset_in_mapped_page;
7155 
	/* the address is converted to an offset within the object for the request */
7156 	ret = vm_object_iopl_request(local_object,
7157 	    ((vm_object_offset_t)
7158 	    ((offset - local_start) + local_offset)),
7159 	    *upl_size,
7160 	    upl,
7161 	    page_list,
7162 	    count,
7163 	    caller_flags,
7164 	    tag);
7165 	vm_object_deallocate(local_object);
7166 
	/* common exit: drop the reference on a submap if we descended into one */
7167 done:
7168 	if (release_map) {
7169 		vm_map_deallocate(map);
7170 	}
7171 
7172 	return ret;
7173 }
7174 
7175 /*
7176  * Internal routine to enter a UPL into a VM map.
7177  *
7178  * JMM - This should just be doable through the standard
7179  * vm_map_enter() API.
7180  */
7181 kern_return_t
vm_map_enter_upl_range(vm_map_t map,upl_t upl,vm_object_offset_t offset_to_map,upl_size_t size_to_map,vm_prot_t prot_to_map,vm_map_offset_t * dst_addr)7182 vm_map_enter_upl_range(
7183 	vm_map_t                map,
7184 	upl_t                   upl,
7185 	vm_object_offset_t      offset_to_map,
7186 	upl_size_t              size_to_map,
7187 	vm_prot_t               prot_to_map,
7188 	vm_map_offset_t         *dst_addr)
7189 {
7190 	vm_map_size_t           size;
7191 	vm_object_offset_t      offset;
7192 	vm_map_offset_t         addr;
7193 	vm_page_t               m;
7194 	kern_return_t           kr;
7195 	int                     isVectorUPL = 0, curr_upl = 0;
7196 	upl_t                   vector_upl = NULL;
7197 	mach_vm_offset_t        vector_upl_dst_addr = 0;
7198 	vm_map_t                vector_upl_submap = NULL;
7199 	upl_offset_t            subupl_offset = 0;
7200 	upl_size_t              subupl_size = 0;
7201 
7202 	if (upl == UPL_NULL) {
7203 		return KERN_INVALID_ARGUMENT;
7204 	}
7205 
7206 	DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7207 	assert(map == kernel_map);
7208 
7209 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7210 		int mapped = 0, valid_upls = 0;
7211 		vector_upl = upl;
7212 
7213 		upl_lock(vector_upl);
7214 		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7215 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7216 			if (upl == NULL) {
7217 				continue;
7218 			}
7219 			valid_upls++;
7220 			if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7221 				mapped++;
7222 			}
7223 		}
7224 
7225 		if (mapped) {
7226 			if (mapped != valid_upls) {
7227 				panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7228 			} else {
7229 				upl_unlock(vector_upl);
7230 				return KERN_FAILURE;
7231 			}
7232 		}
7233 
7234 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7235 			panic("TODO4K: vector UPL not implemented");
7236 		}
7237 
7238 		vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7239 		    vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7240 		    VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7241 		    VM_KERN_MEMORY_NONE).kmr_submap;
7242 		map = vector_upl_submap;
7243 		vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7244 		curr_upl = 0;
7245 	} else {
7246 		upl_lock(upl);
7247 	}
7248 
7249 process_upl_to_enter:
7250 	if (isVectorUPL) {
7251 		if (curr_upl == vector_upl_max_upls(vector_upl)) {
7252 			*dst_addr = vector_upl_dst_addr;
7253 			upl_unlock(vector_upl);
7254 			return KERN_SUCCESS;
7255 		}
7256 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7257 		if (upl == NULL) {
7258 			goto process_upl_to_enter;
7259 		}
7260 
7261 		vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7262 		*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7263 	} else {
7264 		/*
7265 		 * check to see if already mapped
7266 		 */
7267 		if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7268 			upl_unlock(upl);
7269 			return KERN_FAILURE;
7270 		}
7271 	}
7272 
7273 	if ((!(upl->flags & UPL_SHADOWED)) &&
7274 	    ((upl->flags & UPL_HAS_BUSY) ||
7275 	    !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7276 		vm_object_t             object;
7277 		vm_page_t               alias_page;
7278 		vm_object_offset_t      new_offset;
7279 		unsigned int            pg_num;
7280 
7281 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7282 		object = upl->map_object;
7283 		upl->map_object = vm_object_allocate(vm_object_round_page(size));
7284 
7285 		vm_object_lock(upl->map_object);
7286 
7287 		upl->map_object->shadow = object;
7288 		upl->map_object->pageout = TRUE;
7289 		upl->map_object->can_persist = FALSE;
7290 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7291 		upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7292 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
7293 		    "object %p shadow_offset 0x%llx",
7294 		    upl->map_object,
7295 		    (uint64_t)upl->map_object->vo_shadow_offset);
7296 		upl->map_object->wimg_bits = object->wimg_bits;
7297 		offset = upl->map_object->vo_shadow_offset;
7298 		new_offset = 0;
7299 
7300 		upl->flags |= UPL_SHADOWED;
7301 
7302 		while (size) {
7303 			pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7304 			assert(pg_num == new_offset / PAGE_SIZE);
7305 
7306 			if (bitmap_test(upl->lite_list, pg_num)) {
7307 				alias_page = vm_page_grab_fictitious(TRUE);
7308 
7309 				vm_object_lock(object);
7310 
7311 				m = vm_page_lookup(object, offset);
7312 				if (m == VM_PAGE_NULL) {
7313 					panic("vm_upl_map: page missing");
7314 				}
7315 
7316 				/*
7317 				 * Convert the fictitious page to a private
7318 				 * shadow of the real page.
7319 				 */
7320 				assert(alias_page->vmp_fictitious);
7321 				alias_page->vmp_fictitious = FALSE;
7322 				alias_page->vmp_private = TRUE;
7323 				alias_page->vmp_free_when_done = TRUE;
7324 				/*
7325 				 * since m is a page in the upl it must
7326 				 * already be wired or BUSY, so it's
7327 				 * safe to assign the underlying physical
7328 				 * page to the alias
7329 				 */
7330 				VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7331 
7332 				vm_object_unlock(object);
7333 
7334 				vm_page_lockspin_queues();
7335 				vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7336 				vm_page_unlock_queues();
7337 
7338 				vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7339 
7340 				assert(!alias_page->vmp_wanted);
7341 				alias_page->vmp_busy = FALSE;
7342 				alias_page->vmp_absent = FALSE;
7343 			}
7344 			size -= PAGE_SIZE;
7345 			offset += PAGE_SIZE_64;
7346 			new_offset += PAGE_SIZE_64;
7347 		}
7348 		vm_object_unlock(upl->map_object);
7349 	}
7350 	if (upl->flags & UPL_SHADOWED) {
7351 		if (isVectorUPL) {
7352 			offset = 0;
7353 		} else {
7354 			offset = offset_to_map;
7355 		}
7356 	} else {
7357 		offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7358 		if (!isVectorUPL) {
7359 			offset += offset_to_map;
7360 		}
7361 	}
7362 
7363 	if (isVectorUPL) {
7364 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7365 	} else {
7366 		size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7367 	}
7368 
7369 	vm_object_reference(upl->map_object);
7370 
7371 	if (!isVectorUPL) {
7372 		*dst_addr = 0;
7373 		/*
7374 		 * NEED A UPL_MAP ALIAS
7375 		 */
7376 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7377 		    VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7378 		    upl->map_object, offset, FALSE,
7379 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7380 
7381 		if (kr != KERN_SUCCESS) {
7382 			vm_object_deallocate(upl->map_object);
7383 			upl_unlock(upl);
7384 			return kr;
7385 		}
7386 	} else {
7387 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7388 		    VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7389 		    upl->map_object, offset, FALSE,
7390 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7391 		if (kr) {
7392 			panic("vm_map_enter failed for a Vector UPL");
7393 		}
7394 	}
7395 	upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7396 	                                        /* this will have to be an increment rather than */
7397 	                                        /* an assignment. */
7398 	vm_object_lock(upl->map_object);
7399 
7400 	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7401 		m = vm_page_lookup(upl->map_object, offset);
7402 
7403 		if (m) {
7404 			m->vmp_pmapped = TRUE;
7405 
7406 			/*
7407 			 * CODE SIGNING ENFORCEMENT: page has been wpmapped,
7408 			 * but only in kernel space. If this was on a user map,
7409 			 * we'd have to set the wpmapped bit.
7410 			 */
7411 			/* m->vmp_wpmapped = TRUE; */
7412 			assert(map->pmap == kernel_pmap);
7413 
7414 			kr = pmap_enter_check(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE);
7415 
7416 			assert(kr == KERN_SUCCESS);
7417 #if KASAN
7418 			kasan_notify_address(addr, PAGE_SIZE_64);
7419 #endif
7420 		}
7421 		offset += PAGE_SIZE_64;
7422 	}
7423 	vm_object_unlock(upl->map_object);
7424 
7425 	/*
7426 	 * hold a reference for the mapping
7427 	 */
7428 	upl->ref_count++;
7429 	upl->flags |= UPL_PAGE_LIST_MAPPED;
7430 	upl->kaddr = (vm_offset_t) *dst_addr;
7431 	assert(upl->kaddr == *dst_addr);
7432 
7433 	if (isVectorUPL) {
7434 		goto process_upl_to_enter;
7435 	}
7436 
7437 	if (!isVectorUPL) {
7438 		vm_map_offset_t addr_adjustment;
7439 
7440 		addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7441 		if (addr_adjustment) {
7442 			assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7443 			DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7444 			*dst_addr += addr_adjustment;
7445 		}
7446 	}
7447 
7448 	upl_unlock(upl);
7449 
7450 	return KERN_SUCCESS;
7451 }
7452 
7453 kern_return_t
vm_map_enter_upl(vm_map_t map,upl_t upl,vm_map_offset_t * dst_addr)7454 vm_map_enter_upl(
7455 	vm_map_t                map,
7456 	upl_t                   upl,
7457 	vm_map_offset_t         *dst_addr)
7458 {
7459 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7460 	return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7461 }
7462 
7463 /*
7464  * Internal routine to remove a UPL mapping from a VM map.
7465  *
7466  * XXX - This should just be doable through a standard
7467  * vm_map_remove() operation.  Otherwise, implicit clean-up
7468  * of the target map won't be able to correctly remove
7469  * these (and release the reference on the UPL).  Having
7470  * to do this means we can't map these into user-space
7471  * maps yet.
7472  */
7473 kern_return_t
vm_map_remove_upl_range(vm_map_t map,upl_t upl,__unused vm_object_offset_t offset_to_unmap,__unused upl_size_t size_to_unmap)7474 vm_map_remove_upl_range(
7475 	vm_map_t        map,
7476 	upl_t           upl,
7477 	__unused vm_object_offset_t    offset_to_unmap,
7478 	__unused upl_size_t      size_to_unmap)
7479 {
7480 	vm_address_t    addr;
7481 	upl_size_t      size;
7482 	int             isVectorUPL = 0, curr_upl = 0;
7483 	upl_t           vector_upl = NULL;
7484 
7485 	if (upl == UPL_NULL) {
7486 		return KERN_INVALID_ARGUMENT;
7487 	}
7488 
7489 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7490 		int     unmapped = 0, valid_upls = 0;
7491 		vector_upl = upl;
7492 		upl_lock(vector_upl);
7493 		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7494 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7495 			if (upl == NULL) {
7496 				continue;
7497 			}
7498 			valid_upls++;
7499 			if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7500 				unmapped++;
7501 			}
7502 		}
7503 
7504 		if (unmapped) {
7505 			if (unmapped != valid_upls) {
7506 				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
7507 			} else {
7508 				upl_unlock(vector_upl);
7509 				return KERN_FAILURE;
7510 			}
7511 		}
7512 		curr_upl = 0;
7513 	} else {
7514 		upl_lock(upl);
7515 	}
7516 
7517 process_upl_to_remove:
7518 	if (isVectorUPL) {
7519 		if (curr_upl == vector_upl_max_upls(vector_upl)) {
7520 			vm_map_t v_upl_submap;
7521 			vm_offset_t v_upl_submap_dst_addr;
7522 			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7523 
7524 			kmem_free_guard(map, v_upl_submap_dst_addr,
7525 			    vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
7526 			vm_map_deallocate(v_upl_submap);
7527 			upl_unlock(vector_upl);
7528 			return KERN_SUCCESS;
7529 		}
7530 
7531 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7532 		if (upl == NULL) {
7533 			goto process_upl_to_remove;
7534 		}
7535 	}
7536 
7537 	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7538 		addr = upl->kaddr;
7539 		size = upl->u_mapped_size;
7540 
7541 		assert(upl->ref_count > 1);
7542 		upl->ref_count--;               /* removing mapping ref */
7543 
7544 		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7545 		upl->kaddr = (vm_offset_t) 0;
7546 		upl->u_mapped_size = 0;
7547 
7548 		if (isVectorUPL) {
7549 			/*
7550 			 * If it's a Vectored UPL, we'll be removing the entire
7551 			 * submap anyways, so no need to remove individual UPL
7552 			 * element mappings from within the submap
7553 			 */
7554 			goto process_upl_to_remove;
7555 		}
7556 
7557 		upl_unlock(upl);
7558 
7559 		vm_map_remove(map,
7560 		    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
7561 		    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
7562 		return KERN_SUCCESS;
7563 	}
7564 	upl_unlock(upl);
7565 
7566 	return KERN_FAILURE;
7567 }
7568 
7569 kern_return_t
vm_map_remove_upl(vm_map_t map,upl_t upl)7570 vm_map_remove_upl(
7571 	vm_map_t        map,
7572 	upl_t           upl)
7573 {
7574 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7575 	return vm_map_remove_upl_range(map, upl, 0, upl_size);
7576 }
7577 
7578 kern_return_t
upl_commit_range(upl_t upl,upl_offset_t offset,upl_size_t size,int flags,upl_page_info_t * page_list,mach_msg_type_number_t count,boolean_t * empty)7579 upl_commit_range(
7580 	upl_t                   upl,
7581 	upl_offset_t            offset,
7582 	upl_size_t              size,
7583 	int                     flags,
7584 	upl_page_info_t         *page_list,
7585 	mach_msg_type_number_t  count,
7586 	boolean_t               *empty)
7587 {
7588 	upl_size_t              xfer_size, subupl_size;
7589 	vm_object_t             shadow_object;
7590 	vm_object_t             object;
7591 	vm_object_t             m_object;
7592 	vm_object_offset_t      target_offset;
7593 	upl_offset_t            subupl_offset = offset;
7594 	int                     entry;
7595 	int                     occupied;
7596 	int                     clear_refmod = 0;
7597 	int                     pgpgout_count = 0;
7598 	struct  vm_page_delayed_work    dw_array;
7599 	struct  vm_page_delayed_work    *dwp, *dwp_start;
7600 	bool                    dwp_finish_ctx = TRUE;
7601 	int                     dw_count;
7602 	int                     dw_limit;
7603 	int                     isVectorUPL = 0;
7604 	upl_t                   vector_upl = NULL;
7605 	boolean_t               should_be_throttled = FALSE;
7606 
7607 	vm_page_t               nxt_page = VM_PAGE_NULL;
7608 	int                     fast_path_possible = 0;
7609 	int                     fast_path_full_commit = 0;
7610 	int                     throttle_page = 0;
7611 	int                     unwired_count = 0;
7612 	int                     local_queue_count = 0;
7613 	vm_page_t               first_local, last_local;
7614 	vm_object_offset_t      obj_start, obj_end, obj_offset;
7615 	kern_return_t           kr = KERN_SUCCESS;
7616 
7617 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7618 
7619 	dwp_start = dwp = NULL;
7620 
7621 	subupl_size = size;
7622 	*empty = FALSE;
7623 
7624 	if (upl == UPL_NULL) {
7625 		return KERN_INVALID_ARGUMENT;
7626 	}
7627 
7628 	dw_count = 0;
7629 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7630 	dwp_start = vm_page_delayed_work_get_ctx();
7631 	if (dwp_start == NULL) {
7632 		dwp_start = &dw_array;
7633 		dw_limit = 1;
7634 		dwp_finish_ctx = FALSE;
7635 	}
7636 
7637 	dwp = dwp_start;
7638 
7639 	if (count == 0) {
7640 		page_list = NULL;
7641 	}
7642 
7643 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7644 		vector_upl = upl;
7645 		upl_lock(vector_upl);
7646 	} else {
7647 		upl_lock(upl);
7648 	}
7649 
7650 process_upl_to_commit:
7651 
7652 	if (isVectorUPL) {
7653 		size = subupl_size;
7654 		offset = subupl_offset;
7655 		if (size == 0) {
7656 			upl_unlock(vector_upl);
7657 			kr = KERN_SUCCESS;
7658 			goto done;
7659 		}
7660 		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7661 		if (upl == NULL) {
7662 			upl_unlock(vector_upl);
7663 			kr = KERN_FAILURE;
7664 			goto done;
7665 		}
7666 		page_list = upl->page_list;
7667 		subupl_size -= size;
7668 		subupl_offset += size;
7669 	}
7670 
7671 #if UPL_DEBUG
7672 	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7673 		upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0);
7674 		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7675 		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7676 
7677 		upl->upl_commit_index++;
7678 	}
7679 #endif
7680 	if (upl->flags & UPL_DEVICE_MEMORY) {
7681 		xfer_size = 0;
7682 	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7683 		xfer_size = size;
7684 	} else {
7685 		if (!isVectorUPL) {
7686 			upl_unlock(upl);
7687 		} else {
7688 			upl_unlock(vector_upl);
7689 		}
7690 		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7691 		kr = KERN_FAILURE;
7692 		goto done;
7693 	}
7694 	if (upl->flags & UPL_SET_DIRTY) {
7695 		flags |= UPL_COMMIT_SET_DIRTY;
7696 	}
7697 	if (upl->flags & UPL_CLEAR_DIRTY) {
7698 		flags |= UPL_COMMIT_CLEAR_DIRTY;
7699 	}
7700 
7701 	object = upl->map_object;
7702 
7703 	if (upl->flags & UPL_SHADOWED) {
7704 		vm_object_lock(object);
7705 		shadow_object = object->shadow;
7706 	} else {
7707 		shadow_object = object;
7708 	}
7709 	entry = offset / PAGE_SIZE;
7710 	target_offset = (vm_object_offset_t)offset;
7711 
7712 	if (upl->flags & UPL_KERNEL_OBJECT) {
7713 		vm_object_lock_shared(shadow_object);
7714 	} else {
7715 		vm_object_lock(shadow_object);
7716 	}
7717 
7718 	VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7719 
7720 	if (upl->flags & UPL_ACCESS_BLOCKED) {
7721 		assert(shadow_object->blocked_access);
7722 		shadow_object->blocked_access = FALSE;
7723 		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7724 	}
7725 
7726 	if (shadow_object->code_signed) {
7727 		/*
7728 		 * CODE SIGNING:
7729 		 * If the object is code-signed, do not let this UPL tell
7730 		 * us if the pages are valid or not.  Let the pages be
7731 		 * validated by VM the normal way (when they get mapped or
7732 		 * copied).
7733 		 */
7734 		flags &= ~UPL_COMMIT_CS_VALIDATED;
7735 	}
7736 	if (!page_list) {
7737 		/*
7738 		 * No page list to get the code-signing info from !?
7739 		 */
7740 		flags &= ~UPL_COMMIT_CS_VALIDATED;
7741 	}
7742 	if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7743 		should_be_throttled = TRUE;
7744 	}
7745 
7746 	if ((upl->flags & UPL_IO_WIRE) &&
7747 	    !(flags & UPL_COMMIT_FREE_ABSENT) &&
7748 	    !isVectorUPL &&
7749 	    shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7750 	    shadow_object->purgable != VM_PURGABLE_EMPTY) {
7751 		if (!vm_page_queue_empty(&shadow_object->memq)) {
7752 			if (shadow_object->internal && size == shadow_object->vo_size) {
7753 				nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7754 				fast_path_full_commit = 1;
7755 			}
7756 			fast_path_possible = 1;
7757 
7758 			if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7759 			    (shadow_object->purgable == VM_PURGABLE_DENY ||
7760 			    shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7761 			    shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7762 				throttle_page = 1;
7763 			}
7764 		}
7765 	}
7766 	first_local = VM_PAGE_NULL;
7767 	last_local = VM_PAGE_NULL;
7768 
7769 	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7770 	obj_end = obj_start + xfer_size;
7771 	obj_start = vm_object_trunc_page(obj_start);
7772 	obj_end = vm_object_round_page(obj_end);
7773 	for (obj_offset = obj_start;
7774 	    obj_offset < obj_end;
7775 	    obj_offset += PAGE_SIZE) {
7776 		vm_page_t       t, m;
7777 
7778 		dwp->dw_mask = 0;
7779 		clear_refmod = 0;
7780 
7781 		m = VM_PAGE_NULL;
7782 
7783 		if (upl->flags & UPL_LITE) {
7784 			unsigned int    pg_num;
7785 
7786 			if (nxt_page != VM_PAGE_NULL) {
7787 				m = nxt_page;
7788 				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7789 				target_offset = m->vmp_offset;
7790 			}
7791 			pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7792 			assert(pg_num == target_offset / PAGE_SIZE);
7793 
7794 			if (bitmap_test(upl->lite_list, pg_num)) {
7795 				bitmap_clear(upl->lite_list, pg_num);
7796 
7797 				if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7798 					m = vm_page_lookup(shadow_object, obj_offset);
7799 				}
7800 			} else {
7801 				m = NULL;
7802 			}
7803 		}
7804 		if (upl->flags & UPL_SHADOWED) {
7805 			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7806 				t->vmp_free_when_done = FALSE;
7807 
7808 				VM_PAGE_FREE(t);
7809 
7810 				if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7811 					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7812 				}
7813 			}
7814 		}
7815 		if (m == VM_PAGE_NULL) {
7816 			goto commit_next_page;
7817 		}
7818 
7819 		m_object = VM_PAGE_OBJECT(m);
7820 
7821 		if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7822 			assert(m->vmp_busy);
7823 
7824 			dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7825 			goto commit_next_page;
7826 		}
7827 
7828 		if (flags & UPL_COMMIT_CS_VALIDATED) {
7829 			/*
7830 			 * CODE SIGNING:
7831 			 * Set the code signing bits according to
7832 			 * what the UPL says they should be.
7833 			 */
7834 			m->vmp_cs_validated |= page_list[entry].cs_validated;
7835 			m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7836 			m->vmp_cs_nx |= page_list[entry].cs_nx;
7837 		}
7838 		if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7839 			m->vmp_written_by_kernel = TRUE;
7840 		}
7841 
7842 		if (upl->flags & UPL_IO_WIRE) {
7843 			if (page_list) {
7844 				page_list[entry].phys_addr = 0;
7845 			}
7846 
7847 			if (flags & UPL_COMMIT_SET_DIRTY) {
7848 				SET_PAGE_DIRTY(m, FALSE);
7849 			} else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7850 				m->vmp_dirty = FALSE;
7851 
7852 				if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7853 				    m->vmp_cs_validated &&
7854 				    m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7855 					/*
7856 					 * CODE SIGNING:
7857 					 * This page is no longer dirty
7858 					 * but could have been modified,
7859 					 * so it will need to be
7860 					 * re-validated.
7861 					 */
7862 					m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7863 
7864 					VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7865 
7866 					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7867 				}
7868 				clear_refmod |= VM_MEM_MODIFIED;
7869 			}
7870 			if (upl->flags & UPL_ACCESS_BLOCKED) {
7871 				/*
7872 				 * We blocked access to the pages in this UPL.
7873 				 * Clear the "busy" bit and wake up any waiter
7874 				 * for this page.
7875 				 */
7876 				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7877 			}
7878 			if (fast_path_possible) {
7879 				assert(m_object->purgable != VM_PURGABLE_EMPTY);
7880 				assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7881 				if (m->vmp_absent) {
7882 					assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7883 					assert(m->vmp_wire_count == 0);
7884 					assert(m->vmp_busy);
7885 
7886 					m->vmp_absent = FALSE;
7887 					dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7888 				} else {
7889 					if (m->vmp_wire_count == 0) {
7890 						panic("wire_count == 0, m = %p, obj = %p", m, shadow_object);
7891 					}
7892 					assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7893 
7894 					/*
7895 					 * XXX FBDP need to update some other
7896 					 * counters here (purgeable_wired_count)
7897 					 * (ledgers), ...
7898 					 */
7899 					assert(m->vmp_wire_count > 0);
7900 					m->vmp_wire_count--;
7901 
7902 					if (m->vmp_wire_count == 0) {
7903 						m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7904 						unwired_count++;
7905 					}
7906 				}
7907 				if (m->vmp_wire_count == 0) {
7908 					assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7909 
7910 					if (last_local == VM_PAGE_NULL) {
7911 						assert(first_local == VM_PAGE_NULL);
7912 
7913 						last_local = m;
7914 						first_local = m;
7915 					} else {
7916 						assert(first_local != VM_PAGE_NULL);
7917 
7918 						m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7919 						first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7920 						first_local = m;
7921 					}
7922 					local_queue_count++;
7923 
7924 					if (throttle_page) {
7925 						m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7926 					} else {
7927 						if (flags & UPL_COMMIT_INACTIVATE) {
7928 							if (shadow_object->internal) {
7929 								m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7930 							} else {
7931 								m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7932 							}
7933 						} else {
7934 							m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7935 						}
7936 					}
7937 				}
7938 			} else {
7939 				if (flags & UPL_COMMIT_INACTIVATE) {
7940 					dwp->dw_mask |= DW_vm_page_deactivate_internal;
7941 					clear_refmod |= VM_MEM_REFERENCED;
7942 				}
7943 				if (m->vmp_absent) {
7944 					if (flags & UPL_COMMIT_FREE_ABSENT) {
7945 						dwp->dw_mask |= DW_vm_page_free;
7946 					} else {
7947 						m->vmp_absent = FALSE;
7948 						dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7949 
7950 						if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7951 							dwp->dw_mask |= DW_vm_page_activate;
7952 						}
7953 					}
7954 				} else {
7955 					dwp->dw_mask |= DW_vm_page_unwire;
7956 				}
7957 			}
7958 			goto commit_next_page;
7959 		}
7960 		assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7961 
7962 		if (page_list) {
7963 			page_list[entry].phys_addr = 0;
7964 		}
7965 
7966 		/*
7967 		 * make sure to clear the hardware
7968 		 * modify or reference bits before
7969 		 * releasing the BUSY bit on this page
7970 		 * otherwise we risk losing a legitimate
7971 		 * change of state
7972 		 */
7973 		if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7974 			m->vmp_dirty = FALSE;
7975 
7976 			clear_refmod |= VM_MEM_MODIFIED;
7977 		}
7978 		if (m->vmp_laundry) {
7979 			dwp->dw_mask |= DW_vm_pageout_throttle_up;
7980 		}
7981 
7982 		if (VM_PAGE_WIRED(m)) {
7983 			m->vmp_free_when_done = FALSE;
7984 		}
7985 
7986 		if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7987 		    m->vmp_cs_validated &&
7988 		    m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7989 			/*
7990 			 * CODE SIGNING:
7991 			 * This page is no longer dirty
7992 			 * but could have been modified,
7993 			 * so it will need to be
7994 			 * re-validated.
7995 			 */
7996 			m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7997 
7998 			VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7999 
8000 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8001 		}
8002 		if (m->vmp_overwriting) {
8003 			/*
8004 			 * the (COPY_OUT_FROM == FALSE) request_page_list case
8005 			 */
8006 			if (m->vmp_busy) {
8007 #if CONFIG_PHANTOM_CACHE
8008 				if (m->vmp_absent && !m_object->internal) {
8009 					dwp->dw_mask |= DW_vm_phantom_cache_update;
8010 				}
8011 #endif
8012 				m->vmp_absent = FALSE;
8013 
8014 				dwp->dw_mask |= DW_clear_busy;
8015 			} else {
8016 				/*
8017 				 * alternate (COPY_OUT_FROM == FALSE) page_list case
8018 				 * Occurs when the original page was wired
8019 				 * at the time of the list request
8020 				 */
8021 				assert(VM_PAGE_WIRED(m));
8022 
8023 				dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
8024 			}
8025 			m->vmp_overwriting = FALSE;
8026 		}
8027 		m->vmp_cleaning = FALSE;
8028 
8029 		if (m->vmp_free_when_done) {
8030 			/*
8031 			 * With the clean queue enabled, UPL_PAGEOUT should
8032 			 * no longer set the pageout bit. Its pages now go
8033 			 * to the clean queue.
8034 			 *
8035 			 * We don't use the cleaned Q anymore and so this
8036 			 * assert isn't correct. The code for the clean Q
8037 			 * still exists and might be used in the future. If we
8038 			 * go back to the cleaned Q, we will re-enable this
8039 			 * assert.
8040 			 *
8041 			 * assert(!(upl->flags & UPL_PAGEOUT));
8042 			 */
8043 			assert(!m_object->internal);
8044 
8045 			m->vmp_free_when_done = FALSE;
8046 
8047 			if ((flags & UPL_COMMIT_SET_DIRTY) ||
8048 			    (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
8049 				/*
8050 				 * page was re-dirtied after we started
8051 				 * the pageout... reactivate it since
8052 				 * we don't know whether the on-disk
8053 				 * copy matches what is now in memory
8054 				 */
8055 				SET_PAGE_DIRTY(m, FALSE);
8056 
8057 				dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
8058 
8059 				if (upl->flags & UPL_PAGEOUT) {
8060 					counter_inc(&vm_statistics_reactivations);
8061 					DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
8062 				}
8063 			} else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) {
8064 				/*
8065 				 * Someone else might still be handling this
8066 				 * page (vm_fault() for example), so let's not
8067 				 * free it or "un-busy" it!
8068 				 * Put that page in the "speculative" queue
8069 				 * for now (since we would otherwise have freed
8070 				 * it) and let whoever is keeping the page
8071 				 * "busy" move it if needed when they're done
8072 				 * with it.
8073 				 */
8074 				dwp->dw_mask |= DW_vm_page_speculate;
8075 			} else {
8076 				/*
8077 				 * page has been successfully cleaned
8078 				 * go ahead and free it for other use
8079 				 */
8080 				if (m_object->internal) {
8081 					DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
8082 				} else {
8083 					DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
8084 				}
8085 				m->vmp_dirty = FALSE;
8086 				if (!(upl->flags & UPL_HAS_BUSY)) {
8087 					assert(!m->vmp_busy);
8088 				}
8089 				m->vmp_busy = TRUE;
8090 
8091 				dwp->dw_mask |= DW_vm_page_free;
8092 			}
8093 			goto commit_next_page;
8094 		}
8095 		/*
8096 		 * It is a part of the semantic of COPYOUT_FROM
8097 		 * UPLs that a commit implies cache sync
8098 		 * between the vm page and the backing store
8099 		 * this can be used to strip the precious bit
8100 		 * as well as clean
8101 		 */
8102 		if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
8103 			m->vmp_precious = FALSE;
8104 		}
8105 
8106 		if (flags & UPL_COMMIT_SET_DIRTY) {
8107 			SET_PAGE_DIRTY(m, FALSE);
8108 		} else {
8109 			m->vmp_dirty = FALSE;
8110 		}
8111 
8112 		/* with the clean queue on, move *all* cleaned pages to the clean queue */
8113 		if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
8114 			pgpgout_count++;
8115 
8116 			counter_inc(&vm_statistics_pageouts);
8117 			DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
8118 
8119 			dwp->dw_mask |= DW_enqueue_cleaned;
8120 		} else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
8121 			/*
8122 			 * page coming back in from being 'frozen'...
8123 			 * it was dirty before it was frozen, so keep it so
8124 			 * the vm_page_activate will notice that it really belongs
8125 			 * on the throttle queue and put it there
8126 			 */
8127 			SET_PAGE_DIRTY(m, FALSE);
8128 			dwp->dw_mask |= DW_vm_page_activate;
8129 		} else {
8130 			if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
8131 				dwp->dw_mask |= DW_vm_page_deactivate_internal;
8132 				clear_refmod |= VM_MEM_REFERENCED;
8133 			} else if (!VM_PAGE_PAGEABLE(m)) {
8134 				if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
8135 					dwp->dw_mask |= DW_vm_page_speculate;
8136 				} else if (m->vmp_reference) {
8137 					dwp->dw_mask |= DW_vm_page_activate;
8138 				} else {
8139 					dwp->dw_mask |= DW_vm_page_deactivate_internal;
8140 					clear_refmod |= VM_MEM_REFERENCED;
8141 				}
8142 			}
8143 		}
8144 		if (upl->flags & UPL_ACCESS_BLOCKED) {
8145 			/*
8146 			 * We blocked access to the pages in this URL.
8147 			 * Clear the "busy" bit on this page before we
8148 			 * wake up any waiter.
8149 			 */
8150 			dwp->dw_mask |= DW_clear_busy;
8151 		}
8152 		/*
8153 		 * Wakeup any thread waiting for the page to be un-cleaning.
8154 		 */
8155 		dwp->dw_mask |= DW_PAGE_WAKEUP;
8156 
8157 commit_next_page:
8158 		if (clear_refmod) {
8159 			pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
8160 		}
8161 
8162 		target_offset += PAGE_SIZE_64;
8163 		xfer_size -= PAGE_SIZE;
8164 		entry++;
8165 
8166 		if (dwp->dw_mask) {
8167 			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8168 				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8169 
8170 				if (dw_count >= dw_limit) {
8171 					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8172 
8173 					dwp = dwp_start;
8174 					dw_count = 0;
8175 				}
8176 			} else {
8177 				if (dwp->dw_mask & DW_clear_busy) {
8178 					m->vmp_busy = FALSE;
8179 				}
8180 
8181 				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8182 					PAGE_WAKEUP(m);
8183 				}
8184 			}
8185 		}
8186 	}
8187 	if (dw_count) {
8188 		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8189 		dwp = dwp_start;
8190 		dw_count = 0;
8191 	}
8192 
8193 	if (fast_path_possible) {
8194 		assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
8195 		assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
8196 
8197 		if (local_queue_count || unwired_count) {
8198 			if (local_queue_count) {
8199 				vm_page_t       first_target;
8200 				vm_page_queue_head_t    *target_queue;
8201 
8202 				if (throttle_page) {
8203 					target_queue = &vm_page_queue_throttled;
8204 				} else {
8205 					if (flags & UPL_COMMIT_INACTIVATE) {
8206 						if (shadow_object->internal) {
8207 							target_queue = &vm_page_queue_anonymous;
8208 						} else {
8209 							target_queue = &vm_page_queue_inactive;
8210 						}
8211 					} else {
8212 						target_queue = &vm_page_queue_active;
8213 					}
8214 				}
8215 				/*
8216 				 * Transfer the entire local queue to a regular LRU page queues.
8217 				 */
8218 				vm_page_lockspin_queues();
8219 
8220 				first_target = (vm_page_t) vm_page_queue_first(target_queue);
8221 
8222 				if (vm_page_queue_empty(target_queue)) {
8223 					target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8224 				} else {
8225 					first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8226 				}
8227 
8228 				target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
8229 				first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
8230 				last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
8231 
8232 				/*
8233 				 * Adjust the global page counts.
8234 				 */
8235 				if (throttle_page) {
8236 					vm_page_throttled_count += local_queue_count;
8237 				} else {
8238 					if (flags & UPL_COMMIT_INACTIVATE) {
8239 						if (shadow_object->internal) {
8240 							vm_page_anonymous_count += local_queue_count;
8241 						}
8242 						vm_page_inactive_count += local_queue_count;
8243 
8244 						token_new_pagecount += local_queue_count;
8245 					} else {
8246 						vm_page_active_count += local_queue_count;
8247 					}
8248 
8249 					if (shadow_object->internal) {
8250 						vm_page_pageable_internal_count += local_queue_count;
8251 					} else {
8252 						vm_page_pageable_external_count += local_queue_count;
8253 					}
8254 				}
8255 			} else {
8256 				vm_page_lockspin_queues();
8257 			}
8258 			if (unwired_count) {
8259 				vm_page_wire_count -= unwired_count;
8260 				VM_CHECK_MEMORYSTATUS;
8261 			}
8262 			vm_page_unlock_queues();
8263 
8264 			VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
8265 		}
8266 	}
8267 
8268 	if (upl->flags & UPL_DEVICE_MEMORY) {
8269 		occupied = 0;
8270 	} else if (upl->flags & UPL_LITE) {
8271 		uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
8272 
8273 		occupied = !fast_path_full_commit &&
8274 		    !bitmap_is_empty(upl->lite_list, pages);
8275 	} else {
8276 		occupied = !vm_page_queue_empty(&upl->map_object->memq);
8277 	}
8278 	if (occupied == 0) {
8279 		/*
8280 		 * If this UPL element belongs to a Vector UPL and is
8281 		 * empty, then this is the right function to deallocate
8282 		 * it. So go ahead set the *empty variable. The flag
8283 		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8284 		 * should be considered relevant for the Vector UPL and not
8285 		 * the internal UPLs.
8286 		 */
8287 		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8288 			*empty = TRUE;
8289 		}
8290 
8291 		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8292 			/*
8293 			 * this is not a paging object
8294 			 * so we need to drop the paging reference
8295 			 * that was taken when we created the UPL
8296 			 * against this object
8297 			 */
8298 			vm_object_activity_end(shadow_object);
8299 			vm_object_collapse(shadow_object, 0, TRUE);
8300 		} else {
8301 			/*
8302 			 * we dontated the paging reference to
8303 			 * the map object... vm_pageout_object_terminate
8304 			 * will drop this reference
8305 			 */
8306 		}
8307 	}
8308 	VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
8309 	vm_object_unlock(shadow_object);
8310 	if (object != shadow_object) {
8311 		vm_object_unlock(object);
8312 	}
8313 
8314 	if (!isVectorUPL) {
8315 		upl_unlock(upl);
8316 	} else {
8317 		/*
8318 		 * If we completed our operations on an UPL that is
8319 		 * part of a Vectored UPL and if empty is TRUE, then
8320 		 * we should go ahead and deallocate this UPL element.
8321 		 * Then we check if this was the last of the UPL elements
8322 		 * within that Vectored UPL. If so, set empty to TRUE
8323 		 * so that in ubc_upl_commit_range or ubc_upl_commit, we
8324 		 * can go ahead and deallocate the Vector UPL too.
8325 		 */
8326 		if (*empty == TRUE) {
8327 			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
8328 			upl_deallocate(upl);
8329 		}
8330 		goto process_upl_to_commit;
8331 	}
8332 	if (pgpgout_count) {
8333 		DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
8334 	}
8335 
8336 	kr = KERN_SUCCESS;
8337 done:
8338 	if (dwp_start && dwp_finish_ctx) {
8339 		vm_page_delayed_work_finish_ctx(dwp_start);
8340 		dwp_start = dwp = NULL;
8341 	}
8342 
8343 	return kr;
8344 }
8345 
/*
 * upl_abort_range:
 *
 * Abort the portion [offset, offset + size) of a UPL.  For every page of the
 * range that is still recorded in the UPL, the page's in-flight state
 * (cleaning / overwriting / free_when_done, and "busy" where applicable) is
 * cleared and the page is then freed, activated, deactivated, LRU'd or
 * unwired, depending on the page's own state and on the UPL_ABORT_* bits
 * passed in "error".
 *
 * A UPL with UPL_IO_WIRE set is not processed here unless
 * UPL_ABORT_DUMP_PAGES was requested: it is handed to upl_commit_range()
 * with UPL_COMMIT_FREE_ABSENT instead.
 *
 * When "upl" is a vector UPL, the routine iterates (via the
 * process_upl_to_abort label) over each sub-UPL intersecting the range.
 *
 * Parameters:
 *	upl	UPL (or vector UPL) to abort; UPL_NULL is rejected.
 *	offset	start of the range, relative to the UPL.
 *	size	length of the range.
 *	error	UPL_ABORT_* flags selecting how pages are disposed of.
 *	empty	out: set to TRUE when the UPL no longer holds any page and
 *		either UPL_COMMIT_NOTIFY_EMPTY is set in upl->flags or this
 *		is part of a vector UPL (in which case it reflects the value
 *		returned by vector_upl_set_subupl()).  It is written
 *		unconditionally, before "upl" is checked, so it must be a
 *		valid pointer.
 *
 * Returns:
 *	KERN_INVALID_ARGUMENT	upl is UPL_NULL.
 *	KERN_FAILURE		no sub-UPL covers the requested offset, or the
 *				range extends past upl_adjusted_size().
 *	KERN_SUCCESS		otherwise (or whatever upl_commit_range()
 *				returns on the UPL_IO_WIRE hand-off path).
 *
 * Panics if UPL_ABORT_DUMP_PAGES is requested on a UPL_KERNEL_OBJECT UPL.
 */
8346 kern_return_t
upl_abort_range(upl_t upl,upl_offset_t offset,upl_size_t size,int error,boolean_t * empty)8347 upl_abort_range(
8348 	upl_t                   upl,
8349 	upl_offset_t            offset,
8350 	upl_size_t              size,
8351 	int                     error,
8352 	boolean_t               *empty)
8353 {
8354 	upl_size_t              xfer_size, subupl_size;
8355 	vm_object_t             shadow_object;
8356 	vm_object_t             object;
8357 	vm_object_offset_t      target_offset;
8358 	upl_offset_t            subupl_offset = offset;
8359 	int                     occupied;
8360 	struct  vm_page_delayed_work    dw_array;
8361 	struct  vm_page_delayed_work    *dwp, *dwp_start;
8362 	bool                    dwp_finish_ctx = TRUE;
8363 	int                     dw_count;
8364 	int                     dw_limit;
8365 	int                     isVectorUPL = 0;
8366 	upl_t                   vector_upl = NULL;
8367 	vm_object_offset_t      obj_start, obj_end, obj_offset;
8368 	kern_return_t           kr = KERN_SUCCESS;
8369 
8370 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);
8371 
8372 	dwp_start = dwp = NULL;
8373 
8374 	subupl_size = size;
8375 	*empty = FALSE;
8376 
8377 	if (upl == UPL_NULL) {
8378 		return KERN_INVALID_ARGUMENT;
8379 	}
8380 
	/*
	 * I/O-wired UPLs are aborted through the commit path unless the
	 * caller explicitly asked for the pages to be dumped.
	 */
8381 	if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
8382 		return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
8383 	}
8384 
8385 	dw_count = 0;
8386 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8387 	dwp_start = vm_page_delayed_work_get_ctx();
	/*
	 * No delayed-work context could be obtained: fall back to the single
	 * on-stack entry.  With dw_limit == 1 every queued page is flushed
	 * immediately, and the "done" path must not try to release a context.
	 */
8388 	if (dwp_start == NULL) {
8389 		dwp_start = &dw_array;
8390 		dw_limit = 1;
8391 		dwp_finish_ctx = FALSE;
8392 	}
8393 
8394 	dwp = dwp_start;
8395 
8396 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
8397 		vector_upl = upl;
8398 		upl_lock(vector_upl);
8399 	} else {
8400 		upl_lock(upl);
8401 	}
8402 
	/*
	 * For a vector UPL, each pass through this label handles one sub-UPL:
	 * subupl_offset/subupl_size track what is left of the caller's range,
	 * and the pass ends with a "goto process_upl_to_abort" until nothing
	 * remains.
	 */
8403 process_upl_to_abort:
8404 	if (isVectorUPL) {
8405 		size = subupl_size;
8406 		offset = subupl_offset;
8407 		if (size == 0) {
8408 			upl_unlock(vector_upl);
8409 			kr = KERN_SUCCESS;
8410 			goto done;
8411 		}
8412 		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
8413 		if (upl == NULL) {
8414 			upl_unlock(vector_upl);
8415 			kr = KERN_FAILURE;
8416 			goto done;
8417 		}
8418 		subupl_size -= size;
8419 		subupl_offset += size;
8420 	}
8421 
8422 	*empty = FALSE;
8423 
8424 #if UPL_DEBUG
8425 	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
8426 		upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0);
8427 		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
8428 		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
8429 		upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
8430 
8431 		upl->upl_commit_index++;
8432 	}
8433 #endif
	/*
	 * Device-memory UPLs have no per-page work to do (xfer_size == 0);
	 * otherwise the requested range must fit inside the UPL.
	 */
8434 	if (upl->flags & UPL_DEVICE_MEMORY) {
8435 		xfer_size = 0;
8436 	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
8437 		xfer_size = size;
8438 	} else {
8439 		if (!isVectorUPL) {
8440 			upl_unlock(upl);
8441 		} else {
8442 			upl_unlock(vector_upl);
8443 		}
8444 		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
8445 		kr = KERN_FAILURE;
8446 		goto done;
8447 	}
8448 	object = upl->map_object;
8449 
	/*
	 * For a shadowed UPL the map object is locked first and the pages of
	 * interest live in its shadow; otherwise both names refer to the
	 * same object.
	 */
8450 	if (upl->flags & UPL_SHADOWED) {
8451 		vm_object_lock(object);
8452 		shadow_object = object->shadow;
8453 	} else {
8454 		shadow_object = object;
8455 	}
8456 
8457 	target_offset = (vm_object_offset_t)offset;
8458 
8459 	if (upl->flags & UPL_KERNEL_OBJECT) {
8460 		vm_object_lock_shared(shadow_object);
8461 	} else {
8462 		vm_object_lock(shadow_object);
8463 	}
8464 
8465 	if (upl->flags & UPL_ACCESS_BLOCKED) {
8466 		assert(shadow_object->blocked_access);
8467 		shadow_object->blocked_access = FALSE;
8468 		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
8469 	}
8470 
8471 	if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
8472 		panic("upl_abort_range: kernel_object being DUMPED");
8473 	}
8474 
	/*
	 * Convert the UPL-relative range into page-aligned offsets within
	 * shadow_object, then visit it one page at a time.  target_offset
	 * advances in step and indexes the UPL's own per-page state.
	 */
8475 	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
8476 	obj_end = obj_start + xfer_size;
8477 	obj_start = vm_object_trunc_page(obj_start);
8478 	obj_end = vm_object_round_page(obj_end);
8479 	for (obj_offset = obj_start;
8480 	    obj_offset < obj_end;
8481 	    obj_offset += PAGE_SIZE) {
8482 		vm_page_t       t, m;
8483 		unsigned int    pg_num;
8484 		boolean_t       needed;
8485 
8486 		pg_num = (unsigned int) (target_offset / PAGE_SIZE);
8487 		assert(pg_num == target_offset / PAGE_SIZE);
8488 
8489 		needed = FALSE;
8490 
8491 		if (upl->flags & UPL_INTERNAL) {
8492 			needed = upl->page_list[pg_num].needed;
8493 		}
8494 
8495 		dwp->dw_mask = 0;
8496 		m = VM_PAGE_NULL;
8497 
		/*
		 * "Lite" UPLs track their pages in a bitmap: drop this page
		 * from the bitmap and (except for kernel-object UPLs) look
		 * the page up in the shadow object.
		 */
8498 		if (upl->flags & UPL_LITE) {
8499 			if (bitmap_test(upl->lite_list, pg_num)) {
8500 				bitmap_clear(upl->lite_list, pg_num);
8501 
8502 				if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8503 					m = vm_page_lookup(shadow_object, obj_offset);
8504 				}
8505 			}
8506 		}
		/*
		 * Shadowed UPLs hold a page "t" in the map object for each
		 * covered page: free it, and locate the real page in the
		 * shadow object if the bitmap path did not already do so.
		 */
8507 		if (upl->flags & UPL_SHADOWED) {
8508 			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
8509 				t->vmp_free_when_done = FALSE;
8510 
8511 				VM_PAGE_FREE(t);
8512 
8513 				if (m == VM_PAGE_NULL) {
8514 					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
8515 				}
8516 			}
8517 		}
8518 		if ((upl->flags & UPL_KERNEL_OBJECT)) {
8519 			goto abort_next_page;
8520 		}
8521 
8522 		if (m != VM_PAGE_NULL) {
8523 			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
8524 
8525 			if (m->vmp_absent) {
8526 				boolean_t must_free = TRUE;
8527 
8528 				/*
8529 				 * COPYOUT = FALSE case
8530 				 * check for error conditions which must
8531 				 * be passed back to the pages customer
8532 				 */
8533 				if (error & UPL_ABORT_RESTART) {
8534 					m->vmp_restart = TRUE;
8535 					m->vmp_absent = FALSE;
8536 					m->vmp_unusual = TRUE;
8537 					must_free = FALSE;
8538 				} else if (error & UPL_ABORT_UNAVAILABLE) {
8539 					m->vmp_restart = FALSE;
8540 					m->vmp_unusual = TRUE;
8541 					must_free = FALSE;
8542 				} else if (error & UPL_ABORT_ERROR) {
8543 					m->vmp_restart = FALSE;
8544 					m->vmp_absent = FALSE;
8545 					m->vmp_error = TRUE;
8546 					m->vmp_unusual = TRUE;
8547 					must_free = FALSE;
8548 				}
8549 				if (m->vmp_clustered && needed == FALSE) {
8550 					/*
8551 					 * This page was a part of a speculative
8552 					 * read-ahead initiated by the kernel
8553 					 * itself.  No one is expecting this
8554 					 * page and no one will clean up its
8555 					 * error state if it ever becomes valid
8556 					 * in the future.
8557 					 * We have to free it here.
8558 					 */
8559 					must_free = TRUE;
8560 				}
8561 				m->vmp_cleaning = FALSE;
8562 
8563 				if (m->vmp_overwriting && !m->vmp_busy) {
8564 					/*
8565 					 * this shouldn't happen since
8566 					 * this is an 'absent' page, but
8567 					 * it doesn't hurt to check for
8568 					 * the 'alternate' method of
8569 					 * stabilizing the page...
8570 					 * we will mark 'busy' to be cleared
8571 					 * in the following code which will
8572 					 * take care of the primary stabilization
8573 					 * method (i.e. setting 'busy' to TRUE)
8574 					 */
8575 					dwp->dw_mask |= DW_vm_page_unwire;
8576 				}
8577 				m->vmp_overwriting = FALSE;
8578 
8579 				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
8580 
8581 				if (must_free == TRUE) {
8582 					dwp->dw_mask |= DW_vm_page_free;
8583 				} else {
8584 					dwp->dw_mask |= DW_vm_page_activate;
8585 				}
8586 			} else {
8587 				/*
8588 				 * Handle the trusted pager throttle.
8589 				 */
8590 				if (m->vmp_laundry) {
8591 					dwp->dw_mask |= DW_vm_pageout_throttle_up;
8592 				}
8593 
8594 				if (upl->flags & UPL_ACCESS_BLOCKED) {
8595 					/*
8596 					 * We blocked access to the pages in this UPL.
8597 					 * Clear the "busy" bit and wake up any waiter
8598 					 * for this page.
8599 					 */
8600 					dwp->dw_mask |= DW_clear_busy;
8601 				}
8602 				if (m->vmp_overwriting) {
8603 					if (m->vmp_busy) {
8604 						dwp->dw_mask |= DW_clear_busy;
8605 					} else {
8606 						/*
8607 						 * deal with the 'alternate' method
8608 						 * of stabilizing the page...
8609 						 * we will either free the page
8610 						 * or mark 'busy' to be cleared
8611 						 * in the following code which will
8612 						 * take care of the primary stabilization
8613 						 * method (i.e. setting 'busy' to TRUE)
8614 						 */
8615 						dwp->dw_mask |= DW_vm_page_unwire;
8616 					}
8617 					m->vmp_overwriting = FALSE;
8618 				}
8619 				m->vmp_free_when_done = FALSE;
8620 				m->vmp_cleaning = FALSE;
8621 
8622 				if (error & UPL_ABORT_DUMP_PAGES) {
8623 					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8624 
8625 					dwp->dw_mask |= DW_vm_page_free;
8626 				} else {
8627 					if (!(dwp->dw_mask & DW_vm_page_unwire)) {
8628 						if (error & UPL_ABORT_REFERENCE) {
8629 							/*
8630 							 * we've been told to explicitly
8631 							 * reference this page... for
8632 							 * file I/O, this is done by
8633 							 * implementing an LRU on the inactive q
8634 							 */
8635 							dwp->dw_mask |= DW_vm_page_lru;
8636 						} else if (!VM_PAGE_PAGEABLE(m)) {
8637 							dwp->dw_mask |= DW_vm_page_deactivate_internal;
8638 						}
8639 					}
8640 					dwp->dw_mask |= DW_PAGE_WAKEUP;
8641 				}
8642 			}
8643 		}
8644 abort_next_page:
8645 		target_offset += PAGE_SIZE_64;
8646 		xfer_size -= PAGE_SIZE;
8647 
		/*
		 * Anything beyond "clear busy" / "wake up" is queued as
		 * delayed work and flushed in batches of dw_limit; those two
		 * simple actions on their own are performed right here.
		 */
8648 		if (dwp->dw_mask) {
8649 			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8650 				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8651 
8652 				if (dw_count >= dw_limit) {
8653 					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8654 
8655 					dwp = dwp_start;
8656 					dw_count = 0;
8657 				}
8658 			} else {
8659 				if (dwp->dw_mask & DW_clear_busy) {
8660 					m->vmp_busy = FALSE;
8661 				}
8662 
8663 				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8664 					PAGE_WAKEUP(m);
8665 				}
8666 			}
8667 		}
8668 	}
	/* flush whatever delayed work is still pending for this (sub-)UPL */
8669 	if (dw_count) {
8670 		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8671 		dwp = dwp_start;
8672 		dw_count = 0;
8673 	}
8674 
	/*
	 * Work out whether the UPL still references any page: never for
	 * device memory, via the bitmap for lite UPLs, otherwise via the map
	 * object's page queue.
	 */
8675 	if (upl->flags & UPL_DEVICE_MEMORY) {
8676 		occupied = 0;
8677 	} else if (upl->flags & UPL_LITE) {
8678 		uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
8679 
8680 		occupied = !bitmap_is_empty(upl->lite_list, pages);
8681 	} else {
8682 		occupied = !vm_page_queue_empty(&upl->map_object->memq);
8683 	}
8684 	if (occupied == 0) {
8685 		/*
8686 		 * If this UPL element belongs to a Vector UPL and is
8687 		 * empty, then this is the right function to deallocate
8688 		 * it. So go ahead set the *empty variable. The flag
8689 		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8690 		 * should be considered relevant for the Vector UPL and
8691 		 * not the internal UPLs.
8692 		 */
8693 		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8694 			*empty = TRUE;
8695 		}
8696 
8697 		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8698 			/*
8699 			 * this is not a paging object
8700 			 * so we need to drop the paging reference
8701 			 * that was taken when we created the UPL
8702 			 * against this object
8703 			 */
8704 			vm_object_activity_end(shadow_object);
8705 			vm_object_collapse(shadow_object, 0, TRUE);
8706 		} else {
8707 			/*
8708 			 * we donated the paging reference to
8709 			 * the map object... vm_pageout_object_terminate
8710 			 * will drop this reference
8711 			 */
8712 		}
8713 	}
8714 	vm_object_unlock(shadow_object);
8715 	if (object != shadow_object) {
8716 		vm_object_unlock(object);
8717 	}
8718 
8719 	if (!isVectorUPL) {
8720 		upl_unlock(upl);
8721 	} else {
8722 		/*
8723 		 * If we completed our operations on an UPL that is
8724 		 * part of a Vectored UPL and if empty is TRUE, then
8725 		 * we should go ahead and deallocate this UPL element.
8726 		 * Then we check if this was the last of the UPL elements
8727 		 * within that Vectored UPL. If so, set empty to TRUE
8728 		 * so that in ubc_upl_abort_range or ubc_upl_abort, we
8729 		 * can go ahead and deallocate the Vector UPL too.
8730 		 */
8731 		if (*empty == TRUE) {
8732 			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
8733 			upl_deallocate(upl);
8734 		}
8735 		goto process_upl_to_abort;
8736 	}
8737 
8738 	kr = KERN_SUCCESS;
8739 
8740 done:
	/* only release the delayed-work context if one was actually obtained */
8741 	if (dwp_start && dwp_finish_ctx) {
8742 		vm_page_delayed_work_finish_ctx(dwp_start);
8743 		dwp_start = dwp = NULL;
8744 	}
8745 
8746 	return kr;
8747 }
8748 
8749 
8750 kern_return_t
upl_abort(upl_t upl,int error)8751 upl_abort(
8752 	upl_t   upl,
8753 	int     error)
8754 {
8755 	boolean_t       empty;
8756 
8757 	if (upl == UPL_NULL) {
8758 		return KERN_INVALID_ARGUMENT;
8759 	}
8760 
8761 	return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8762 }
8763 
8764 
8765 /* an option on commit should be wire */
8766 kern_return_t
upl_commit(upl_t upl,upl_page_info_t * page_list,mach_msg_type_number_t count)8767 upl_commit(
8768 	upl_t                   upl,
8769 	upl_page_info_t         *page_list,
8770 	mach_msg_type_number_t  count)
8771 {
8772 	boolean_t       empty;
8773 
8774 	if (upl == UPL_NULL) {
8775 		return KERN_INVALID_ARGUMENT;
8776 	}
8777 
8778 	return upl_commit_range(upl, 0, upl->u_size, 0,
8779 	           page_list, count, &empty);
8780 }
8781 
8782 
8783 void
iopl_valid_data(upl_t upl,vm_tag_t tag)8784 iopl_valid_data(
8785 	upl_t    upl,
8786 	vm_tag_t tag)
8787 {
8788 	vm_object_t     object;
8789 	vm_offset_t     offset;
8790 	vm_page_t       m, nxt_page = VM_PAGE_NULL;
8791 	upl_size_t      size;
8792 	int             wired_count = 0;
8793 
8794 	if (upl == NULL) {
8795 		panic("iopl_valid_data: NULL upl");
8796 	}
8797 	if (vector_upl_is_valid(upl)) {
8798 		panic("iopl_valid_data: vector upl");
8799 	}
8800 	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
8801 		panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
8802 	}
8803 
8804 	object = upl->map_object;
8805 
8806 	if (is_kernel_object(object) || object == compressor_object) {
8807 		panic("iopl_valid_data: object == kernel or compressor");
8808 	}
8809 
8810 	if (object->purgable == VM_PURGABLE_VOLATILE ||
8811 	    object->purgable == VM_PURGABLE_EMPTY) {
8812 		panic("iopl_valid_data: object %p purgable %d",
8813 		    object, object->purgable);
8814 	}
8815 
8816 	size = upl_adjusted_size(upl, PAGE_MASK);
8817 
8818 	vm_object_lock(object);
8819 	VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
8820 
8821 	bool whole_object;
8822 
8823 	if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
8824 		nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
8825 		whole_object = true;
8826 	} else {
8827 		offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
8828 		whole_object = false;
8829 	}
8830 
8831 	while (size) {
8832 		if (whole_object) {
8833 			if (nxt_page != VM_PAGE_NULL) {
8834 				m = nxt_page;
8835 				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
8836 			}
8837 		} else {
8838 			m = vm_page_lookup(object, offset);
8839 			offset += PAGE_SIZE;
8840 
8841 			if (m == VM_PAGE_NULL) {
8842 				panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
8843 			}
8844 		}
8845 		if (m->vmp_busy) {
8846 			if (!m->vmp_absent) {
8847 				panic("iopl_valid_data: busy page w/o absent");
8848 			}
8849 
8850 			if (m->vmp_pageq.next || m->vmp_pageq.prev) {
8851 				panic("iopl_valid_data: busy+absent page on page queue");
8852 			}
8853 			if (m->vmp_reusable) {
8854 				panic("iopl_valid_data: %p is reusable", m);
8855 			}
8856 
8857 			m->vmp_absent = FALSE;
8858 			m->vmp_dirty = TRUE;
8859 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
8860 			assert(m->vmp_wire_count == 0);
8861 			m->vmp_wire_count++;
8862 			assert(m->vmp_wire_count);
8863 			if (m->vmp_wire_count == 1) {
8864 				m->vmp_q_state = VM_PAGE_IS_WIRED;
8865 				wired_count++;
8866 			} else {
8867 				panic("iopl_valid_data: %p already wired", m);
8868 			}
8869 
8870 			PAGE_WAKEUP_DONE(m);
8871 		}
8872 		size -= PAGE_SIZE;
8873 	}
8874 	if (wired_count) {
8875 		VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
8876 		assert(object->resident_page_count >= object->wired_page_count);
8877 
8878 		/* no need to adjust purgeable accounting for this object: */
8879 		assert(object->purgable != VM_PURGABLE_VOLATILE);
8880 		assert(object->purgable != VM_PURGABLE_EMPTY);
8881 
8882 		vm_page_lockspin_queues();
8883 		vm_page_wire_count += wired_count;
8884 		vm_page_unlock_queues();
8885 	}
8886 	VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
8887 	vm_object_unlock(object);
8888 }
8889 
8890 
8891 void
vm_object_set_pmap_cache_attr(vm_object_t object,upl_page_info_array_t user_page_list,unsigned int num_pages,boolean_t batch_pmap_op)8892 vm_object_set_pmap_cache_attr(
8893 	vm_object_t             object,
8894 	upl_page_info_array_t   user_page_list,
8895 	unsigned int            num_pages,
8896 	boolean_t               batch_pmap_op)
8897 {
8898 	unsigned int    cache_attr = 0;
8899 
8900 	cache_attr = object->wimg_bits & VM_WIMG_MASK;
8901 	assert(user_page_list);
8902 	if (cache_attr != VM_WIMG_USE_DEFAULT) {
8903 		PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8904 	}
8905 }
8906 
8907 
8908 static bool
vm_object_iopl_wire_full(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,upl_control_flags_t cntrl_flags,vm_tag_t tag)8909 vm_object_iopl_wire_full(
8910 	vm_object_t             object,
8911 	upl_t                   upl,
8912 	upl_page_info_array_t   user_page_list,
8913 	upl_control_flags_t     cntrl_flags,
8914 	vm_tag_t                tag)
8915 {
8916 	vm_page_t       dst_page;
8917 	unsigned int    entry;
8918 	int             page_count;
8919 	int             delayed_unlock = 0;
8920 	boolean_t       retval = TRUE;
8921 	ppnum_t         phys_page;
8922 
8923 	vm_object_lock_assert_exclusive(object);
8924 	assert(object->purgable != VM_PURGABLE_VOLATILE);
8925 	assert(object->purgable != VM_PURGABLE_EMPTY);
8926 	assert(object->pager == NULL);
8927 	assert(object->vo_copy == NULL);
8928 	assert(object->shadow == NULL);
8929 
8930 	page_count = object->resident_page_count;
8931 	dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
8932 
8933 	vm_page_lock_queues();
8934 
8935 	while (page_count--) {
8936 		if (dst_page->vmp_busy ||
8937 		    dst_page->vmp_fictitious ||
8938 		    dst_page->vmp_absent ||
8939 		    VMP_ERROR_GET(dst_page) ||
8940 		    dst_page->vmp_cleaning ||
8941 		    dst_page->vmp_restart ||
8942 		    dst_page->vmp_laundry) {
8943 			retval = FALSE;
8944 			goto done;
8945 		}
8946 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8947 			retval = FALSE;
8948 			goto done;
8949 		}
8950 		dst_page->vmp_reference = TRUE;
8951 
8952 		vm_page_wire(dst_page, tag, FALSE);
8953 
8954 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8955 			SET_PAGE_DIRTY(dst_page, FALSE);
8956 		}
8957 		entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
8958 		assert(entry >= 0 && entry < object->resident_page_count);
8959 		bitmap_set(upl->lite_list, entry);
8960 
8961 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8962 
8963 		if (phys_page > upl->highest_page) {
8964 			upl->highest_page = phys_page;
8965 		}
8966 
8967 		if (user_page_list) {
8968 			user_page_list[entry].phys_addr = phys_page;
8969 			user_page_list[entry].absent    = dst_page->vmp_absent;
8970 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
8971 			user_page_list[entry].free_when_done   = dst_page->vmp_free_when_done;
8972 			user_page_list[entry].precious  = dst_page->vmp_precious;
8973 			user_page_list[entry].device    = FALSE;
8974 			user_page_list[entry].speculative = FALSE;
8975 			user_page_list[entry].cs_validated = FALSE;
8976 			user_page_list[entry].cs_tainted = FALSE;
8977 			user_page_list[entry].cs_nx     = FALSE;
8978 			user_page_list[entry].needed    = FALSE;
8979 			user_page_list[entry].mark      = FALSE;
8980 		}
8981 		if (delayed_unlock++ > 256) {
8982 			delayed_unlock = 0;
8983 			lck_mtx_yield(&vm_page_queue_lock);
8984 
8985 			VM_CHECK_MEMORYSTATUS;
8986 		}
8987 		dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
8988 	}
8989 done:
8990 	vm_page_unlock_queues();
8991 
8992 	VM_CHECK_MEMORYSTATUS;
8993 
8994 	return retval;
8995 }
8996 
8997 
8998 static kern_return_t
vm_object_iopl_wire_empty(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,upl_control_flags_t cntrl_flags,vm_tag_t tag,vm_object_offset_t * dst_offset,int page_count,int * page_grab_count)8999 vm_object_iopl_wire_empty(
9000 	vm_object_t             object,
9001 	upl_t                   upl,
9002 	upl_page_info_array_t   user_page_list,
9003 	upl_control_flags_t     cntrl_flags,
9004 	vm_tag_t                tag,
9005 	vm_object_offset_t     *dst_offset,
9006 	int                     page_count,
9007 	int                    *page_grab_count)
9008 {
9009 	vm_page_t       dst_page;
9010 	boolean_t       no_zero_fill = FALSE;
9011 	int             interruptible;
9012 	int             pages_wired = 0;
9013 	int             pages_inserted = 0;
9014 	int             entry = 0;
9015 	uint64_t        delayed_ledger_update = 0;
9016 	kern_return_t   ret = KERN_SUCCESS;
9017 	int             grab_options;
9018 	ppnum_t         phys_page;
9019 
9020 	vm_object_lock_assert_exclusive(object);
9021 	assert(object->purgable != VM_PURGABLE_VOLATILE);
9022 	assert(object->purgable != VM_PURGABLE_EMPTY);
9023 	assert(object->pager == NULL);
9024 	assert(object->vo_copy == NULL);
9025 	assert(object->shadow == NULL);
9026 
9027 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9028 		interruptible = THREAD_ABORTSAFE;
9029 	} else {
9030 		interruptible = THREAD_UNINT;
9031 	}
9032 
9033 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9034 		no_zero_fill = TRUE;
9035 	}
9036 
9037 	grab_options = 0;
9038 #if CONFIG_SECLUDED_MEMORY
9039 	if (object->can_grab_secluded) {
9040 		grab_options |= VM_PAGE_GRAB_SECLUDED;
9041 	}
9042 #endif /* CONFIG_SECLUDED_MEMORY */
9043 
9044 	while (page_count--) {
9045 		while ((dst_page = vm_page_grab_options(grab_options))
9046 		    == VM_PAGE_NULL) {
9047 			OSAddAtomic(page_count, &vm_upl_wait_for_pages);
9048 
9049 			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9050 
9051 			if (vm_page_wait(interruptible) == FALSE) {
9052 				/*
9053 				 * interrupted case
9054 				 */
9055 				OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9056 
9057 				VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9058 
9059 				ret = MACH_SEND_INTERRUPTED;
9060 				goto done;
9061 			}
9062 			OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9063 
9064 			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9065 		}
9066 		if (no_zero_fill == FALSE) {
9067 			vm_page_zero_fill(dst_page);
9068 		} else {
9069 			dst_page->vmp_absent = TRUE;
9070 		}
9071 
9072 		dst_page->vmp_reference = TRUE;
9073 
9074 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9075 			SET_PAGE_DIRTY(dst_page, FALSE);
9076 		}
9077 		if (dst_page->vmp_absent == FALSE) {
9078 			assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
9079 			assert(dst_page->vmp_wire_count == 0);
9080 			dst_page->vmp_wire_count++;
9081 			dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
9082 			assert(dst_page->vmp_wire_count);
9083 			pages_wired++;
9084 			PAGE_WAKEUP_DONE(dst_page);
9085 		}
9086 		pages_inserted++;
9087 
9088 		vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
9089 
9090 		bitmap_set(upl->lite_list, entry);
9091 
9092 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9093 
9094 		if (phys_page > upl->highest_page) {
9095 			upl->highest_page = phys_page;
9096 		}
9097 
9098 		if (user_page_list) {
9099 			user_page_list[entry].phys_addr = phys_page;
9100 			user_page_list[entry].absent    = dst_page->vmp_absent;
9101 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
9102 			user_page_list[entry].free_when_done    = FALSE;
9103 			user_page_list[entry].precious  = FALSE;
9104 			user_page_list[entry].device    = FALSE;
9105 			user_page_list[entry].speculative = FALSE;
9106 			user_page_list[entry].cs_validated = FALSE;
9107 			user_page_list[entry].cs_tainted = FALSE;
9108 			user_page_list[entry].cs_nx     = FALSE;
9109 			user_page_list[entry].needed    = FALSE;
9110 			user_page_list[entry].mark      = FALSE;
9111 		}
9112 		entry++;
9113 		*dst_offset += PAGE_SIZE_64;
9114 	}
9115 done:
9116 	if (pages_wired) {
9117 		vm_page_lockspin_queues();
9118 		vm_page_wire_count += pages_wired;
9119 		vm_page_unlock_queues();
9120 	}
9121 	if (pages_inserted) {
9122 		if (object->internal) {
9123 			OSAddAtomic(pages_inserted, &vm_page_internal_count);
9124 		} else {
9125 			OSAddAtomic(pages_inserted, &vm_page_external_count);
9126 		}
9127 	}
9128 	if (delayed_ledger_update) {
9129 		task_t          owner;
9130 		int             ledger_idx_volatile;
9131 		int             ledger_idx_nonvolatile;
9132 		int             ledger_idx_volatile_compressed;
9133 		int             ledger_idx_nonvolatile_compressed;
9134 		boolean_t       do_footprint;
9135 
9136 		owner = VM_OBJECT_OWNER(object);
9137 		assert(owner);
9138 
9139 		vm_object_ledger_tag_ledgers(object,
9140 		    &ledger_idx_volatile,
9141 		    &ledger_idx_nonvolatile,
9142 		    &ledger_idx_volatile_compressed,
9143 		    &ledger_idx_nonvolatile_compressed,
9144 		    &do_footprint);
9145 
9146 		/* more non-volatile bytes */
9147 		ledger_credit(owner->ledger,
9148 		    ledger_idx_nonvolatile,
9149 		    delayed_ledger_update);
9150 		if (do_footprint) {
9151 			/* more footprint */
9152 			ledger_credit(owner->ledger,
9153 			    task_ledgers.phys_footprint,
9154 			    delayed_ledger_update);
9155 		}
9156 	}
9157 
9158 	assert(page_grab_count);
9159 	*page_grab_count = pages_inserted;
9160 
9161 	return ret;
9162 }
9163 
9164 
9165 
/*
 * vm_object_iopl_request:
 *
 * Create an I/O page list: a UPL (created with UPL_IO_WIRE) describing the
 * page-rounded range [offset, offset + size) of "object", returned through
 * "*upl_ptr".
 *
 * Overview of the visible flow:
 *  - unknown control flags and inconsistent UPL_NEED_32BIT_ADDR requests are
 *    rejected before anything is allocated or locked;
 *  - physically contiguous objects get a UPL flagged UPL_DEVICE_MEMORY with
 *    only user_page_list[0] filled in, and return right away;
 *  - otherwise the pages are wired either by one of the two fast paths
 *    (vm_object_iopl_wire_full() / vm_object_iopl_wire_empty()) or by the
 *    per-page loop, which calls vm_fault_page() for pages that are missing,
 *    busy, in error, restarting, absent or fictitious and batches the wiring
 *    through the delayed-work array;
 *  - on failure ("return_err") the pages in [offset, dst_offset) are unwired
 *    or freed and the UPL is destroyed.
 *
 * Parameters:
 *	object		object the pages belong to
 *	offset, size	requested range; widened to page boundaries here
 *	upl_ptr		out: the new UPL.  It is stored before any page work
 *			is done and is NOT reset on the error path, so it must
 *			not be used when the return value is not KERN_SUCCESS.
 *	user_page_list	optional array filled with one entry per page; replaced
 *			by upl->page_list when UPL_SET_INTERNAL is requested
 *	page_list_count	optional in/out: on success set to 0 for internal UPLs,
 *			to 1 for non-internal UPLs on physically contiguous
 *			objects, otherwise clamped down to the number of pages
 *	cntrl_flags	UPL_* control flags
 *	tag		passed on to the wiring helpers / delayed work
 *
 * Returns:
 *	KERN_SUCCESS
 *	KERN_INVALID_VALUE	unknown flag, or UPL_NEED_32BIT_ADDR without
 *				UPL_SET_IO_WIRE | UPL_SET_LITE
 *	KERN_INVALID_ADDRESS	UPL_NEED_32BIT_ADDR range check failed on a
 *				physically contiguous object
 *	KERN_MEMORY_ERROR	UPL_REQUEST_NO_FAULT and a page would need a
 *				fault, or vm_fault_page() failed without an
 *				error code
 *	MACH_SEND_INTERRUPTED	interrupted fault or page wait
 *	KERN_PROTECTION_FAILURE	a page that needs a low-memory substitute is
 *				already wired
 *	KERN_RESOURCE_SHORTAGE	vm_page_grablo() returned no page
 *	or the error code reported by vm_fault_page() /
 *	vm_object_iopl_wire_empty().
 *
 * Locking: the object lock is taken in here and dropped again on every
 * return path that follows.  The vm_object_activity_begin() done for
 * non-kernel-object UPLs is only balanced in this function on the error path.
 * NOTE(review): presumably the success case is balanced when the UPL is
 * committed or aborted -- confirm against the commit/abort code.
 */
9166 kern_return_t
vm_object_iopl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)9167 vm_object_iopl_request(
9168 	vm_object_t             object,
9169 	vm_object_offset_t      offset,
9170 	upl_size_t              size,
9171 	upl_t                   *upl_ptr,
9172 	upl_page_info_array_t   user_page_list,
9173 	unsigned int            *page_list_count,
9174 	upl_control_flags_t     cntrl_flags,
9175 	vm_tag_t                tag)
9176 {
9177 	vm_page_t               dst_page;
9178 	vm_object_offset_t      dst_offset;
9179 	upl_size_t              xfer_size;
9180 	upl_t                   upl = NULL;
9181 	unsigned int            entry;
9182 	int                     no_zero_fill = FALSE;
9183 	unsigned int            size_in_pages;
9184 	int                     page_grab_count = 0;
9185 	u_int32_t               psize;
9186 	kern_return_t           ret;
9187 	vm_prot_t               prot;
9188 	struct vm_object_fault_info fault_info = {};
9189 	struct  vm_page_delayed_work    dw_array;
9190 	struct  vm_page_delayed_work    *dwp, *dwp_start;
9191 	bool                    dwp_finish_ctx = TRUE;
9192 	int                     dw_count;
9193 	int                     dw_limit;
9194 	int                     dw_index;
9195 	boolean_t               caller_lookup;
9196 	int                     io_tracking_flag = 0;
9197 	int                     interruptible;
9198 	ppnum_t                 phys_page;
9199 
9200 	boolean_t               set_cache_attr_needed = FALSE;
9201 	boolean_t               free_wired_pages = FALSE;
9202 	boolean_t               fast_path_empty_req = FALSE;
9203 	boolean_t               fast_path_full_req = FALSE;
9204 
9205 #if DEVELOPMENT || DEBUG
9206 	task_t                  task = current_task();
9207 #endif /* DEVELOPMENT || DEBUG */
9208 
9209 	dwp_start = dwp = NULL;
9210 
9211 	vm_object_offset_t original_offset = offset;
9212 	upl_size_t original_size = size;
9213 
9214 //	DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
9215 
	/* widen the request to whole pages; the caller's values are kept for the UPL */
9216 	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
9217 	offset = vm_object_trunc_page(offset);
9218 	if (size != original_size || offset != original_offset) {
9219 		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
9220 	}
9221 
9222 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
9223 		/*
9224 		 * For forward compatibility's sake,
9225 		 * reject any unknown flag.
9226 		 */
9227 		return KERN_INVALID_VALUE;
9228 	}
9229 	if (vm_lopage_needed == FALSE) {
9230 		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
9231 	}
9232 
9233 	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
9234 		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
9235 			return KERN_INVALID_VALUE;
9236 		}
9237 
9238 		if (object->phys_contiguous) {
9239 			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
9240 				return KERN_INVALID_ADDRESS;
9241 			}
9242 
9243 			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
9244 				return KERN_INVALID_ADDRESS;
9245 			}
9246 		}
9247 	}
9248 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9249 		no_zero_fill = TRUE;
9250 	}
9251 
9252 	if (cntrl_flags & UPL_COPYOUT_FROM) {
9253 		prot = VM_PROT_READ;
9254 	} else {
9255 		prot = VM_PROT_READ | VM_PROT_WRITE;
9256 	}
9257 
9258 	if ((!object->internal) && (object->paging_offset != 0)) {
9259 		panic("vm_object_iopl_request: external object with non-zero paging offset");
9260 	}
9261 
9262 
9263 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
9264 
9265 #if CONFIG_IOSCHED || UPL_DEBUG
9266 	if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) {
9267 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
9268 	}
9269 #endif
9270 
9271 #if CONFIG_IOSCHED
9272 	if (object->io_tracking) {
9273 		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
9274 		if (!is_kernel_object(object)) {
9275 			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
9276 		}
9277 	}
9278 #endif
9279 
9280 	if (object->phys_contiguous) {
9281 		psize = PAGE_SIZE;
9282 	} else {
9283 		psize = size;
9284 
9285 		dw_count = 0;
9286 		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
9287 		dwp_start = vm_page_delayed_work_get_ctx();
9288 		if (dwp_start == NULL) {
			/* no delayed-work context: use the single on-stack entry, flushed after every page */
9289 			dwp_start = &dw_array;
9290 			dw_limit = 1;
9291 			dwp_finish_ctx = FALSE;
9292 		}
9293 
9294 		dwp = dwp_start;
9295 	}
9296 
9297 	if (cntrl_flags & UPL_SET_INTERNAL) {
9298 		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9299 		user_page_list = size ? upl->page_list : NULL;
9300 	} else {
9301 		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9302 	}
9303 	if (user_page_list) {
9304 		user_page_list[0].device = FALSE;
9305 	}
	/* published before any page work; not reset if we later bail out via return_err */
9306 	*upl_ptr = upl;
9307 
9308 	if (cntrl_flags & UPL_NOZEROFILLIO) {
9309 		DTRACE_VM4(upl_nozerofillio,
9310 		    vm_object_t, object,
9311 		    vm_object_offset_t, offset,
9312 		    upl_size_t, size,
9313 		    upl_t, upl);
9314 	}
9315 
9316 	upl->map_object = object;
9317 	upl->u_offset = original_offset;
9318 	upl->u_size = original_size;
9319 
9320 	size_in_pages = size / PAGE_SIZE;
9321 
9322 	if (is_kernel_object(object) &&
9323 	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
9324 		upl->flags |= UPL_KERNEL_OBJECT;
9325 #if UPL_DEBUG
9326 		vm_object_lock(object);
9327 #else
9328 		vm_object_lock_shared(object);
9329 #endif
9330 	} else {
9331 		vm_object_lock(object);
9332 		vm_object_activity_begin(object);
9333 	}
9334 	/*
9335 	 * paging in progress also protects the paging_offset
9336 	 */
9337 	upl->u_offset = original_offset + object->paging_offset;
9338 
9339 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
9340 		/*
9341 		 * The user requested that access to the pages in this UPL
9342 		 * be blocked until the UPL is committed or aborted.
9343 		 */
9344 		upl->flags |= UPL_ACCESS_BLOCKED;
9345 	}
9346 
9347 #if CONFIG_IOSCHED || UPL_DEBUG
9348 	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9349 		vm_object_activity_begin(object);
9350 		queue_enter(&object->uplq, upl, upl_t, uplq);
9351 	}
9352 #endif
9353 
9354 	if (object->phys_contiguous) {
9355 		if (upl->flags & UPL_ACCESS_BLOCKED) {
9356 			assert(!object->blocked_access);
9357 			object->blocked_access = TRUE;
9358 		}
9359 
9360 		vm_object_unlock(object);
9361 
9362 		/*
9363 		 * don't need any shadow mappings for this one
9364 		 * since it is already I/O memory
9365 		 */
9366 		upl->flags |= UPL_DEVICE_MEMORY;
9367 
9368 		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
9369 
9370 		if (user_page_list) {
9371 			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
9372 			user_page_list[0].device = TRUE;
9373 		}
9374 		if (page_list_count != NULL) {
9375 			if (upl->flags & UPL_INTERNAL) {
9376 				*page_list_count = 0;
9377 			} else {
9378 				*page_list_count = 1;
9379 			}
9380 		}
9381 
9382 		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9383 #if DEVELOPMENT || DEBUG
9384 		if (task != NULL) {
9385 			ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9386 		}
9387 #endif /* DEVELOPMENT || DEBUG */
9388 		return KERN_SUCCESS;
9389 	}
9390 	if (!is_kernel_object(object) && object != compressor_object) {
9391 		/*
9392 		 * Protect user space from future COW operations
9393 		 */
9394 #if VM_OBJECT_TRACKING_OP_TRUESHARE
9395 		if (!object->true_share &&
9396 		    vm_object_tracking_btlog) {
9397 			btlog_record(vm_object_tracking_btlog, object,
9398 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
9399 			    btref_get(__builtin_frame_address(0), 0));
9400 		}
9401 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
9402 
9403 		vm_object_lock_assert_exclusive(object);
9404 		object->true_share = TRUE;
9405 
9406 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
9407 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
9408 		}
9409 	}
9410 
9411 	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
9412 	    object->vo_copy != VM_OBJECT_NULL) {
9413 		/*
9414 		 * Honor copy-on-write obligations
9415 		 *
9416 		 * The caller is gathering these pages and
9417 		 * might modify their contents.  We need to
9418 		 * make sure that the copy object has its own
9419 		 * private copies of these pages before we let
9420 		 * the caller modify them.
9421 		 *
9422 		 * NOTE: someone else could map the original object
9423 		 * after we've done this copy-on-write here, and they
9424 		 * could then see an inconsistent picture of the memory
9425 		 * while it's being modified via the UPL.  To prevent this,
9426 		 * we would have to block access to these pages until the
9427 		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
9428 		 * code path for that...
9429 		 */
9430 		vm_object_update(object,
9431 		    offset,
9432 		    size,
9433 		    NULL,
9434 		    NULL,
9435 		    FALSE,              /* should_return */
9436 		    MEMORY_OBJECT_COPY_SYNC,
9437 		    VM_PROT_NO_CHANGE);
9438 		VM_PAGEOUT_DEBUG(iopl_cow, 1);
9439 		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
9440 	}
	/*
	 * The fast paths are only considered when the request covers the whole
	 * object (offset 0, size == vo_size) and the object has no copy, shadow
	 * or pager and is not volatile/empty purgeable: "full" when every page
	 * is resident, "empty" when none is.
	 */
9441 	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
9442 	    object->purgable != VM_PURGABLE_VOLATILE &&
9443 	    object->purgable != VM_PURGABLE_EMPTY &&
9444 	    object->vo_copy == NULL &&
9445 	    size == object->vo_size &&
9446 	    offset == 0 &&
9447 	    object->shadow == NULL &&
9448 	    object->pager == NULL) {
9449 		if (object->resident_page_count == size_in_pages) {
9450 			assert(object != compressor_object);
9451 			assert(!is_kernel_object(object));
9452 			fast_path_full_req = TRUE;
9453 		} else if (object->resident_page_count == 0) {
9454 			assert(object != compressor_object);
9455 			assert(!is_kernel_object(object));
9456 			fast_path_empty_req = TRUE;
9457 			set_cache_attr_needed = TRUE;
9458 		}
9459 	}
9460 
9461 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9462 		interruptible = THREAD_ABORTSAFE;
9463 	} else {
9464 		interruptible = THREAD_UNINT;
9465 	}
9466 
9467 	entry = 0;
9468 
9469 	xfer_size = size;
9470 	dst_offset = offset;
9471 
9472 	if (fast_path_full_req) {
9473 		if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
9474 			goto finish;
9475 		}
9476 		/*
9477 		 * we couldn't complete the processing of this request on the fast path
9478 		 * so fall through to the slow path and finish up
9479 		 */
9480 	} else if (fast_path_empty_req) {
9481 		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9482 			ret = KERN_MEMORY_ERROR;
9483 			goto return_err;
9484 		}
9485 		ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
9486 		    cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
9487 
9488 		if (ret) {
9489 			free_wired_pages = TRUE;
9490 			goto return_err;
9491 		}
9492 		goto finish;
9493 	}
9494 
9495 	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
9496 	fault_info.lo_offset = offset;
9497 	fault_info.hi_offset = offset + xfer_size;
9498 	fault_info.mark_zf_absent = TRUE;
9499 	fault_info.interruptible = interruptible;
9500 	fault_info.batch_pmap_op = TRUE;
9501 
9502 	while (xfer_size) {
9503 		vm_fault_return_t       result;
9504 
9505 		dwp->dw_mask = 0;
9506 
9507 		if (fast_path_full_req) {
9508 			/*
9509 			 * if we get here, it means that we ran into a page
9510 			 * state we couldn't handle in the fast path and
9511 			 * bailed out to the slow path... since the order
9512 			 * we look at pages is different between the 2 paths,
9513 			 * the following check is needed to determine whether
9514 			 * this page was already processed in the fast path
9515 			 */
9516 			if (bitmap_test(upl->lite_list, entry)) {
9517 				goto skip_page;
9518 			}
9519 		}
9520 		dst_page = vm_page_lookup(object, dst_offset);
9521 
9522 		if (dst_page == VM_PAGE_NULL ||
9523 		    dst_page->vmp_busy ||
9524 		    VMP_ERROR_GET(dst_page) ||
9525 		    dst_page->vmp_restart ||
9526 		    dst_page->vmp_absent ||
9527 		    dst_page->vmp_fictitious) {
9528 			if (is_kernel_object(object)) {
9529 				panic("vm_object_iopl_request: missing/bad page in kernel object");
9530 			}
9531 			if (object == compressor_object) {
9532 				panic("vm_object_iopl_request: missing/bad page in compressor object");
9533 			}
9534 
9535 			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9536 				ret = KERN_MEMORY_ERROR;
9537 				goto return_err;
9538 			}
9539 			set_cache_attr_needed = TRUE;
9540 
9541 			/*
9542 			 * We just looked up the page and the result remains valid
9543 			 * until the object lock is released, so send it to
9544 			 * vm_fault_page() (as "dst_page"), to avoid having to
9545 			 * look it up again there.
9546 			 */
9547 			caller_lookup = TRUE;
9548 
9549 			do {
9550 				vm_page_t       top_page;
9551 				kern_return_t   error_code;
9552 
9553 				fault_info.cluster_size = xfer_size;
9554 
9555 				vm_object_paging_begin(object);
9556 
9557 				result = vm_fault_page(object, dst_offset,
9558 				    prot | VM_PROT_WRITE, FALSE,
9559 				    caller_lookup,
9560 				    &prot, &dst_page, &top_page,
9561 				    (int *)0,
9562 				    &error_code, no_zero_fill,
9563 				    &fault_info);
9564 
9565 				/* our lookup is no longer valid at this point */
9566 				caller_lookup = FALSE;
9567 
9568 				switch (result) {
9569 				case VM_FAULT_SUCCESS:
9570 					page_grab_count++;
9571 
9572 					if (!dst_page->vmp_absent) {
9573 						PAGE_WAKEUP_DONE(dst_page);
9574 					} else {
9575 						/*
9576 						 * we only get back an absent page if we
9577 						 * requested that it not be zero-filled
9578 						 * because we are about to fill it via I/O
9579 						 *
9580 						 * absent pages should be left BUSY
9581 						 * to prevent them from being faulted
9582 						 * into an address space before we've
9583 						 * had a chance to complete the I/O on
9584 						 * them since they may contain info that
9585 						 * shouldn't be seen by the faulting task
9586 						 */
9587 					}
9588 					/*
9589 					 *	Release paging references and
9590 					 *	top-level placeholder page, if any.
9591 					 */
9592 					if (top_page != VM_PAGE_NULL) {
9593 						vm_object_t local_object;
9594 
9595 						local_object = VM_PAGE_OBJECT(top_page);
9596 
9597 						/*
9598 						 * comparing 2 packed pointers
9599 						 */
9600 						if (top_page->vmp_object != dst_page->vmp_object) {
9601 							vm_object_lock(local_object);
9602 							VM_PAGE_FREE(top_page);
9603 							vm_object_paging_end(local_object);
9604 							vm_object_unlock(local_object);
9605 						} else {
9606 							VM_PAGE_FREE(top_page);
9607 							vm_object_paging_end(local_object);
9608 						}
9609 					}
9610 					vm_object_paging_end(object);
9611 					break;
9612 
9613 				case VM_FAULT_RETRY:
9614 					vm_object_lock(object);
9615 					break;
9616 
9617 				case VM_FAULT_MEMORY_SHORTAGE:
9618 					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
9619 
9620 					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9621 
9622 					if (vm_page_wait(interruptible)) {
9623 						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9624 
9625 						VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9626 						vm_object_lock(object);
9627 
9628 						break;
9629 					}
9630 					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9631 
9632 					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9633 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
9634 					OS_FALLTHROUGH;
9635 
9636 				case VM_FAULT_INTERRUPTED:
9637 					error_code = MACH_SEND_INTERRUPTED;
9638 					OS_FALLTHROUGH;
9639 				case VM_FAULT_MEMORY_ERROR:
9640 memory_error:
9641 					ret = (error_code ? error_code: KERN_MEMORY_ERROR);
9642 
9643 					vm_object_lock(object);
9644 					goto return_err;
9645 
9646 				case VM_FAULT_SUCCESS_NO_VM_PAGE:
9647 					/* success but no page: fail */
9648 					vm_object_paging_end(object);
9649 					vm_object_unlock(object);
9650 					goto memory_error;
9651 
9652 				default:
9653 					panic("vm_object_iopl_request: unexpected error"
9654 					    " 0x%x from vm_fault_page()\n", result);
9655 				}
9656 			} while (result != VM_FAULT_SUCCESS);
9657 		}
9658 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9659 
9660 		if (upl->flags & UPL_KERNEL_OBJECT) {
9661 			goto record_phys_addr;
9662 		}
9663 
9664 		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
9665 			dst_page->vmp_busy = TRUE;
9666 			goto record_phys_addr;
9667 		}
9668 
9669 		if (dst_page->vmp_cleaning) {
9670 			/*
9671 			 * Someone else is cleaning this page in place.
9672 			 * In theory, we should be able to proceed and use this
9673 			 * page but they'll probably end up clearing the "busy"
9674 			 * bit on it in upl_commit_range() but they didn't set
9675 			 * it, so they would clear our "busy" bit and open
9676 			 * us to race conditions.
9677 			 * We'd better wait for the cleaning to complete and
9678 			 * then try again.
9679 			 */
9680 			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
9681 			PAGE_SLEEP(object, dst_page, THREAD_UNINT);
9682 			continue;
9683 		}
9684 		if (dst_page->vmp_laundry) {
9685 			vm_pageout_steal_laundry(dst_page, FALSE);
9686 		}
9687 
9688 		if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
9689 		    phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
9690 			vm_page_t       low_page;
9691 			int             refmod;
9692 
9693 			/*
9694 			 * support devices that can't DMA above 32 bits
9695 			 * by substituting pages from a pool of low address
9696 			 * memory for any pages we find above the 4G mark
9697 			 * can't substitute if the page is already wired because
9698 			 * we don't know whether that physical address has been
9699 			 * handed out to some other 64 bit capable DMA device to use
9700 			 */
9701 			if (VM_PAGE_WIRED(dst_page)) {
9702 				ret = KERN_PROTECTION_FAILURE;
9703 				goto return_err;
9704 			}
9705 			low_page = vm_page_grablo();
9706 
9707 			if (low_page == VM_PAGE_NULL) {
9708 				ret = KERN_RESOURCE_SHORTAGE;
9709 				goto return_err;
9710 			}
9711 			/*
9712 			 * from here until the vm_page_replace completes
9713 			 * we mustn't drop the object lock... we don't
9714 			 * want anyone refaulting this page in and using
9715 			 * it after we disconnect it... we want the fault
9716 			 * to find the new page being substituted.
9717 			 */
9718 			if (dst_page->vmp_pmapped) {
9719 				refmod = pmap_disconnect(phys_page);
9720 			} else {
9721 				refmod = 0;
9722 			}
9723 
9724 			if (!dst_page->vmp_absent) {
9725 				vm_page_copy(dst_page, low_page);
9726 			}
9727 
9728 			low_page->vmp_reference = dst_page->vmp_reference;
9729 			low_page->vmp_dirty     = dst_page->vmp_dirty;
9730 			low_page->vmp_absent    = dst_page->vmp_absent;
9731 
9732 			if (refmod & VM_MEM_REFERENCED) {
9733 				low_page->vmp_reference = TRUE;
9734 			}
9735 			if (refmod & VM_MEM_MODIFIED) {
9736 				SET_PAGE_DIRTY(low_page, FALSE);
9737 			}
9738 
9739 			vm_page_replace(low_page, object, dst_offset);
9740 
9741 			dst_page = low_page;
9742 			/*
9743 			 * vm_page_grablo returned the page marked
9744 			 * BUSY... we don't need a PAGE_WAKEUP_DONE
9745 			 * here, because we've never dropped the object lock
9746 			 */
9747 			if (!dst_page->vmp_absent) {
9748 				dst_page->vmp_busy = FALSE;
9749 			}
9750 
9751 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9752 		}
9753 		if (!dst_page->vmp_busy) {
9754 			dwp->dw_mask |= DW_vm_page_wire;
9755 		}
9756 
9757 		if (cntrl_flags & UPL_BLOCK_ACCESS) {
9758 			/*
9759 			 * Mark the page "busy" to block any future page fault
9760 			 * on this page in addition to wiring it.
9761 			 * We'll also remove the mapping
9762 			 * of all these pages before leaving this routine.
9763 			 */
9764 			assert(!dst_page->vmp_fictitious);
9765 			dst_page->vmp_busy = TRUE;
9766 		}
9767 		/*
9768 		 * expect the page to be used
9769 		 * page queues lock must be held to set 'reference'
9770 		 */
9771 		dwp->dw_mask |= DW_set_reference;
9772 
9773 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9774 			SET_PAGE_DIRTY(dst_page, TRUE);
9775 			/*
9776 			 * Page belonging to a code-signed object is about to
9777 			 * be written. Mark it tainted and disconnect it from
9778 			 * all pmaps so processes have to fault it back in and
9779 			 * deal with the tainted bit.
9780 			 */
9781 			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
9782 				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
9783 				vm_page_iopl_tainted++;
9784 				if (dst_page->vmp_pmapped) {
9785 					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
9786 					if (refmod & VM_MEM_REFERENCED) {
9787 						dst_page->vmp_reference = TRUE;
9788 					}
9789 				}
9790 			}
9791 		}
9792 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
9793 			pmap_sync_page_attributes_phys(phys_page);
9794 			dst_page->vmp_written_by_kernel = FALSE;
9795 		}
9796 
9797 record_phys_addr:
9798 		if (dst_page->vmp_busy) {
9799 			upl->flags |= UPL_HAS_BUSY;
9800 		}
9801 
9802 		bitmap_set(upl->lite_list, entry);
9803 
9804 		if (phys_page > upl->highest_page) {
9805 			upl->highest_page = phys_page;
9806 		}
9807 
9808 		if (user_page_list) {
9809 			user_page_list[entry].phys_addr = phys_page;
9810 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
9811 			user_page_list[entry].absent    = dst_page->vmp_absent;
9812 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
9813 			user_page_list[entry].precious  = dst_page->vmp_precious;
9814 			user_page_list[entry].device    = FALSE;
9815 			user_page_list[entry].needed    = FALSE;
9816 			if (dst_page->vmp_clustered == TRUE) {
9817 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
9818 			} else {
9819 				user_page_list[entry].speculative = FALSE;
9820 			}
9821 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
9822 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
9823 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
9824 			user_page_list[entry].mark      = FALSE;
9825 		}
9826 		if (!is_kernel_object(object) && object != compressor_object) {
9827 			/*
9828 			 * someone is explicitly grabbing this page...
9829 			 * update clustered and speculative state
9830 			 *
9831 			 */
9832 			if (dst_page->vmp_clustered) {
9833 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
9834 			}
9835 		}
9836 skip_page:
9837 		entry++;
9838 		dst_offset += PAGE_SIZE_64;
9839 		xfer_size -= PAGE_SIZE;
9840 
9841 		if (dwp->dw_mask) {
9842 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
9843 
9844 			if (dw_count >= dw_limit) {
9845 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9846 
9847 				dwp = dwp_start;
9848 				dw_count = 0;
9849 			}
9850 		}
9851 	}
9852 	assert(entry == size_in_pages);
9853 
9854 	if (dw_count) {
9855 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9856 		dwp = dwp_start;
9857 		dw_count = 0;
9858 	}
9859 finish:
9860 	if (user_page_list && set_cache_attr_needed == TRUE) {
9861 		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
9862 	}
9863 
9864 	if (page_list_count != NULL) {
9865 		if (upl->flags & UPL_INTERNAL) {
9866 			*page_list_count = 0;
9867 		} else if (*page_list_count > size_in_pages) {
9868 			*page_list_count = size_in_pages;
9869 		}
9870 	}
9871 	vm_object_unlock(object);
9872 
9873 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
9874 		/*
9875 		 * We've marked all the pages "busy" so that future
9876 		 * page faults will block.
9877 		 * Now remove the mapping for these pages, so that they
9878 		 * can't be accessed without causing a page fault.
9879 		 */
9880 		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
9881 		    PMAP_NULL,
9882 		    PAGE_SIZE,
9883 		    0, VM_PROT_NONE);
9884 		assert(!object->blocked_access);
9885 		object->blocked_access = TRUE;
9886 	}
9887 
9888 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9889 #if DEVELOPMENT || DEBUG
9890 	if (task != NULL) {
9891 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9892 	}
9893 #endif /* DEVELOPMENT || DEBUG */
9894 
9895 	if (dwp_start && dwp_finish_ctx) {
9896 		vm_page_delayed_work_finish_ctx(dwp_start);
9897 		dwp_start = dwp = NULL;
9898 	}
9899 
9900 	return KERN_SUCCESS;
9901 
9902 return_err:
9903 	dw_index = 0;
9904 
	/* undo the pages handled so far, i.e. the range [offset, dst_offset) */
9905 	for (; offset < dst_offset; offset += PAGE_SIZE) {
9906 		boolean_t need_unwire;
9907 
9908 		dst_page = vm_page_lookup(object, offset);
9909 
9910 		if (dst_page == VM_PAGE_NULL) {
9911 			panic("vm_object_iopl_request: Wired page missing.");
9912 		}
9913 
9914 		/*
9915 		 * if we've already processed this page in an earlier
9916 		 * dw_do_work, we need to undo the wiring... we will
9917 		 * leave the dirty and reference bits on if they
9918 		 * were set, since we don't have a good way of knowing
9919 		 * what the previous state was and we won't get here
9920 		 * under any normal circumstances...  we will always
9921 		 * clear BUSY and wakeup any waiters via vm_page_free
9922 		 * or PAGE_WAKEUP_DONE
9923 		 */
9924 		need_unwire = TRUE;
9925 
9926 		if (dw_count) {
9927 			if ((dwp_start)[dw_index].dw_m == dst_page) {
9928 				/*
9929 				 * still in the deferred work list
9930 				 * which means we haven't yet called
9931 				 * vm_page_wire on this page
9932 				 */
9933 				need_unwire = FALSE;
9934 
9935 				dw_index++;
9936 				dw_count--;
9937 			}
9938 		}
9939 		vm_page_lock_queues();
9940 
9941 		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
9942 			vm_page_free(dst_page);
9943 
9944 			need_unwire = FALSE;
9945 		} else {
9946 			if (need_unwire == TRUE) {
9947 				vm_page_unwire(dst_page, TRUE);
9948 			}
9949 
9950 			PAGE_WAKEUP_DONE(dst_page);
9951 		}
9952 		vm_page_unlock_queues();
9953 
9954 		if (need_unwire == TRUE) {
9955 			counter_inc(&vm_statistics_reactivations);
9956 		}
9957 	}
9958 #if UPL_DEBUG
9959 	upl->upl_state = 2;
9960 #endif
9961 	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
9962 		vm_object_activity_end(object);
9963 		vm_object_collapse(object, 0, TRUE);
9964 	}
9965 	vm_object_unlock(object);
9966 	upl_destroy(upl);
9967 
9968 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
9969 #if DEVELOPMENT || DEBUG
9970 	if (task != NULL) {
9971 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9972 	}
9973 #endif /* DEVELOPMENT || DEBUG */
9974 
9975 	if (dwp_start && dwp_finish_ctx) {
9976 		vm_page_delayed_work_finish_ctx(dwp_start);
9977 		dwp_start = dwp = NULL;
9978 	}
9979 	return ret;
9980 }
9981 
9982 kern_return_t
upl_transpose(upl_t upl1,upl_t upl2)9983 upl_transpose(
9984 	upl_t           upl1,
9985 	upl_t           upl2)
9986 {
9987 	kern_return_t           retval;
9988 	boolean_t               upls_locked;
9989 	vm_object_t             object1, object2;
9990 
9991 	/* LD: Should mapped UPLs be eligible for a transpose? */
9992 	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
9993 		return KERN_INVALID_ARGUMENT;
9994 	}
9995 
9996 	upls_locked = FALSE;
9997 
9998 	/*
9999 	 * Since we need to lock both UPLs at the same time,
10000 	 * avoid deadlocks by always taking locks in the same order.
10001 	 */
10002 	if (upl1 < upl2) {
10003 		upl_lock(upl1);
10004 		upl_lock(upl2);
10005 	} else {
10006 		upl_lock(upl2);
10007 		upl_lock(upl1);
10008 	}
10009 	upls_locked = TRUE;     /* the UPLs will need to be unlocked */
10010 
10011 	object1 = upl1->map_object;
10012 	object2 = upl2->map_object;
10013 
10014 	if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
10015 	    upl1->u_size != upl2->u_size) {
10016 		/*
10017 		 * We deal only with full objects, not subsets.
10018 		 * That's because we exchange the entire backing store info
10019 		 * for the objects: pager, resident pages, etc...  We can't do
10020 		 * only part of it.
10021 		 */
10022 		retval = KERN_INVALID_VALUE;
10023 		goto done;
10024 	}
10025 
10026 	/*
10027 	 * Tranpose the VM objects' backing store.
10028 	 */
10029 	retval = vm_object_transpose(object1, object2,
10030 	    upl_adjusted_size(upl1, PAGE_MASK));
10031 
10032 	if (retval == KERN_SUCCESS) {
10033 		/*
10034 		 * Make each UPL point to the correct VM object, i.e. the
10035 		 * object holding the pages that the UPL refers to...
10036 		 */
10037 #if CONFIG_IOSCHED || UPL_DEBUG
10038 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10039 			vm_object_lock(object1);
10040 			vm_object_lock(object2);
10041 		}
10042 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10043 			queue_remove(&object1->uplq, upl1, upl_t, uplq);
10044 		}
10045 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10046 			queue_remove(&object2->uplq, upl2, upl_t, uplq);
10047 		}
10048 #endif
10049 		upl1->map_object = object2;
10050 		upl2->map_object = object1;
10051 
10052 #if CONFIG_IOSCHED || UPL_DEBUG
10053 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10054 			queue_enter(&object2->uplq, upl1, upl_t, uplq);
10055 		}
10056 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10057 			queue_enter(&object1->uplq, upl2, upl_t, uplq);
10058 		}
10059 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10060 			vm_object_unlock(object2);
10061 			vm_object_unlock(object1);
10062 		}
10063 #endif
10064 	}
10065 
10066 done:
10067 	/*
10068 	 * Cleanup.
10069 	 */
10070 	if (upls_locked) {
10071 		upl_unlock(upl1);
10072 		upl_unlock(upl2);
10073 		upls_locked = FALSE;
10074 	}
10075 
10076 	return retval;
10077 }
10078 
10079 void
upl_range_needed(upl_t upl,int index,int count)10080 upl_range_needed(
10081 	upl_t           upl,
10082 	int             index,
10083 	int             count)
10084 {
10085 	int             size_in_pages;
10086 
10087 	if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
10088 		return;
10089 	}
10090 
10091 	size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
10092 
10093 	while (count-- && index < size_in_pages) {
10094 		upl->page_list[index++].needed = TRUE;
10095 	}
10096 }
10097 
10098 
10099 /*
10100  * Reserve of virtual addresses in the kernel address space.
10101  * We need to map the physical pages in the kernel, so that we
10102  * can call the code-signing or slide routines with a kernel
10103  * virtual address.  We keep this pool of pre-allocated kernel
10104  * virtual addresses so that we don't have to scan the kernel's
10105  * virtual address space each time we need to work with
10106  * a physical page.
10107  */
10108 SIMPLE_LOCK_DECLARE(vm_paging_lock, 0); /* held while vm_paging_page_inuse[] and the waiter count are examined or changed */
10109 #define VM_PAGING_NUM_PAGES     64 /* number of page-sized slots in the pre-allocated pool */
10110 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0; /* base of the pool; filled in by vm_paging_map_init() */
10111 bool            vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, }; /* per-slot "in use" flag */
10112 int             vm_paging_max_index = 0; /* high-water mark of the slot indices handed out */
10113 int             vm_paging_page_waiter = 0; /* threads currently waiting for a slot; its address is the wakeup event */
10114 int             vm_paging_page_waiter_total = 0; /* cumulative count of waits for a slot */
10115 
10116 unsigned long   vm_paging_no_kernel_page = 0; /* statistics: no free pool slot was available */
10117 unsigned long   vm_paging_objects_mapped = 0; /* statistics: mappings made through a pool slot */
10118 unsigned long   vm_paging_pages_mapped = 0; /* statistics: pages mapped through a pool slot */
10119 unsigned long   vm_paging_objects_mapped_slow = 0; /* statistics: mappings made through kernel_map */
10120 unsigned long   vm_paging_pages_mapped_slow = 0; /* statistics: pages mapped through kernel_map */
10121 
10122 __startup_func
10123 static void
vm_paging_map_init(void)10124 vm_paging_map_init(void)
10125 {
10126 	kmem_alloc(kernel_map, &vm_paging_base_address,
10127 	    ptoa(VM_PAGING_NUM_PAGES),
10128 	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
10129 	    VM_KERN_MEMORY_NONE);
10130 }
10131 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
10132 
10133 /*
10134  * vm_paging_map_object:
10135  *	Maps part of a VM object's pages in the kernel
10136  *      virtual address space, using the pre-allocated
10137  *	kernel virtual addresses, if possible.
10138  * Context:
10139  *      The VM object is locked.  This lock will get
10140  *      dropped and re-acquired though, so the caller
10141  *      must make sure the VM object is kept alive
10142  *	(by holding a VM map that has a reference
10143  *      on it, for example, or taking an extra reference).
10144  *      The page should also be kept busy to prevent
10145  *	it from being reclaimed.
10146  */
10147 kern_return_t
vm_paging_map_object(vm_page_t page,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection,boolean_t can_unlock_object,vm_map_size_t * size,vm_map_offset_t * address,boolean_t * need_unmap)10148 vm_paging_map_object(
10149 	vm_page_t               page,
10150 	vm_object_t             object,
10151 	vm_object_offset_t      offset,
10152 	vm_prot_t               protection,
10153 	boolean_t               can_unlock_object,
10154 	vm_map_size_t           *size,          /* IN/OUT */
10155 	vm_map_offset_t         *address,       /* OUT */
10156 	boolean_t               *need_unmap)    /* OUT */
10157 {
10158 	kern_return_t           kr;
10159 	vm_map_offset_t         page_map_offset;
10160 	vm_map_size_t           map_size;
10161 	vm_object_offset_t      object_offset;
10162 	int                     i;
10163 
10164 	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
10165 		/* use permanent 1-to-1 kernel mapping of physical memory ? */
10166 		*address = (vm_map_offset_t)
10167 		    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
10168 		*need_unmap = FALSE;
10169 		return KERN_SUCCESS;
10170 
10171 		assert(page->vmp_busy);
10172 		/*
10173 		 * Use one of the pre-allocated kernel virtual addresses
10174 		 * and just enter the VM page in the kernel address space
10175 		 * at that virtual address.
10176 		 */
10177 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10178 
10179 		/*
10180 		 * Try and find an available kernel virtual address
10181 		 * from our pre-allocated pool.
10182 		 */
10183 		page_map_offset = 0;
10184 		for (;;) {
10185 			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
10186 				if (vm_paging_page_inuse[i] == FALSE) {
10187 					page_map_offset =
10188 					    vm_paging_base_address +
10189 					    (i * PAGE_SIZE);
10190 					break;
10191 				}
10192 			}
10193 			if (page_map_offset != 0) {
10194 				/* found a space to map our page ! */
10195 				break;
10196 			}
10197 
10198 			if (can_unlock_object) {
10199 				/*
10200 				 * If we can afford to unlock the VM object,
10201 				 * let's take the slow path now...
10202 				 */
10203 				break;
10204 			}
10205 			/*
10206 			 * We can't afford to unlock the VM object, so
10207 			 * let's wait for a space to become available...
10208 			 */
10209 			vm_paging_page_waiter_total++;
10210 			vm_paging_page_waiter++;
10211 			kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
10212 			if (kr == THREAD_WAITING) {
10213 				simple_unlock(&vm_paging_lock);
10214 				kr = thread_block(THREAD_CONTINUE_NULL);
10215 				simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10216 			}
10217 			vm_paging_page_waiter--;
10218 			/* ... and try again */
10219 		}
10220 
10221 		if (page_map_offset != 0) {
10222 			/*
10223 			 * We found a kernel virtual address;
10224 			 * map the physical page to that virtual address.
10225 			 */
10226 			if (i > vm_paging_max_index) {
10227 				vm_paging_max_index = i;
10228 			}
10229 			vm_paging_page_inuse[i] = TRUE;
10230 			simple_unlock(&vm_paging_lock);
10231 
10232 			page->vmp_pmapped = TRUE;
10233 
10234 			/*
10235 			 * Keep the VM object locked over the PMAP_ENTER
10236 			 * and the actual use of the page by the kernel,
10237 			 * or this pmap mapping might get undone by a
10238 			 * vm_object_pmap_protect() call...
10239 			 */
10240 			kr = pmap_enter_check(kernel_pmap,
10241 			    page_map_offset,
10242 			    page,
10243 			    protection,
10244 			    VM_PROT_NONE,
10245 			    0,
10246 			    TRUE);
10247 			assert(kr == KERN_SUCCESS);
10248 			vm_paging_objects_mapped++;
10249 			vm_paging_pages_mapped++;
10250 			*address = page_map_offset;
10251 			*need_unmap = TRUE;
10252 
10253 #if KASAN
10254 			kasan_notify_address(page_map_offset, PAGE_SIZE);
10255 #endif
10256 
10257 			/* all done and mapped, ready to use ! */
10258 			return KERN_SUCCESS;
10259 		}
10260 
10261 		/*
10262 		 * We ran out of pre-allocated kernel virtual
10263 		 * addresses.  Just map the page in the kernel
10264 		 * the slow and regular way.
10265 		 */
10266 		vm_paging_no_kernel_page++;
10267 		simple_unlock(&vm_paging_lock);
10268 	}
10269 
10270 	if (!can_unlock_object) {
10271 		*address = 0;
10272 		*size = 0;
10273 		*need_unmap = FALSE;
10274 		return KERN_NOT_SUPPORTED;
10275 	}
10276 
10277 	object_offset = vm_object_trunc_page(offset);
10278 	map_size = vm_map_round_page(*size,
10279 	    VM_MAP_PAGE_MASK(kernel_map));
10280 
10281 	/*
10282 	 * Try and map the required range of the object
10283 	 * in the kernel_map. Given that allocation is
10284 	 * for pageable memory, it shouldn't contain
10285 	 * pointers and is mapped into the data range.
10286 	 */
10287 
10288 	vm_object_reference_locked(object);     /* for the map entry */
10289 	vm_object_unlock(object);
10290 
10291 	kr = vm_map_enter(kernel_map,
10292 	    address,
10293 	    map_size,
10294 	    0,
10295 	    VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
10296 	    object,
10297 	    object_offset,
10298 	    FALSE,
10299 	    protection,
10300 	    VM_PROT_ALL,
10301 	    VM_INHERIT_NONE);
10302 	if (kr != KERN_SUCCESS) {
10303 		*address = 0;
10304 		*size = 0;
10305 		*need_unmap = FALSE;
10306 		vm_object_deallocate(object);   /* for the map entry */
10307 		vm_object_lock(object);
10308 		return kr;
10309 	}
10310 
10311 	*size = map_size;
10312 
10313 	/*
10314 	 * Enter the mapped pages in the page table now.
10315 	 */
10316 	vm_object_lock(object);
10317 	/*
10318 	 * VM object must be kept locked from before PMAP_ENTER()
10319 	 * until after the kernel is done accessing the page(s).
10320 	 * Otherwise, the pmap mappings in the kernel could be
10321 	 * undone by a call to vm_object_pmap_protect().
10322 	 */
10323 
10324 	for (page_map_offset = 0;
10325 	    map_size != 0;
10326 	    map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
10327 		page = vm_page_lookup(object, offset + page_map_offset);
10328 		if (page == VM_PAGE_NULL) {
10329 			printf("vm_paging_map_object: no page !?");
10330 			vm_object_unlock(object);
10331 			vm_map_remove(kernel_map, *address, *size);
10332 			*address = 0;
10333 			*size = 0;
10334 			*need_unmap = FALSE;
10335 			vm_object_lock(object);
10336 			return KERN_MEMORY_ERROR;
10337 		}
10338 		page->vmp_pmapped = TRUE;
10339 
10340 		kr = pmap_enter_check(kernel_pmap,
10341 		    *address + page_map_offset,
10342 		    page,
10343 		    protection,
10344 		    VM_PROT_NONE,
10345 		    0,
10346 		    TRUE);
10347 		assert(kr == KERN_SUCCESS);
10348 #if KASAN
10349 		kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
10350 #endif
10351 	}
10352 
10353 	vm_paging_objects_mapped_slow++;
10354 	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
10355 
10356 	*need_unmap = TRUE;
10357 
10358 	return KERN_SUCCESS;
10359 }
10360 
10361 /*
10362  * vm_paging_unmap_object:
10363  *	Unmaps part of a VM object's pages from the kernel
10364  *      virtual address space.
10365  * Context:
10366  *      The VM object is locked.  This lock will get
10367  *      dropped and re-acquired though.
10368  */
10369 void
vm_paging_unmap_object(vm_object_t object,vm_map_offset_t start,vm_map_offset_t end)10370 vm_paging_unmap_object(
10371 	vm_object_t     object,
10372 	vm_map_offset_t start,
10373 	vm_map_offset_t end)
10374 {
10375 	int             i;
10376 
10377 	if ((vm_paging_base_address == 0) ||
10378 	    (start < vm_paging_base_address) ||
10379 	    (end > (vm_paging_base_address
10380 	    + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
10381 		/*
10382 		 * We didn't use our pre-allocated pool of
10383 		 * kernel virtual address.  Deallocate the
10384 		 * virtual memory.
10385 		 */
10386 		if (object != VM_OBJECT_NULL) {
10387 			vm_object_unlock(object);
10388 		}
10389 		vm_map_remove(kernel_map, start, end);
10390 		if (object != VM_OBJECT_NULL) {
10391 			vm_object_lock(object);
10392 		}
10393 	} else {
10394 		/*
10395 		 * We used a kernel virtual address from our
10396 		 * pre-allocated pool.  Put it back in the pool
10397 		 * for next time.
10398 		 */
10399 		assert(end - start == PAGE_SIZE);
10400 		i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
10401 		assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
10402 
10403 		/* undo the pmap mapping */
10404 		pmap_remove(kernel_pmap, start, end);
10405 
10406 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10407 		vm_paging_page_inuse[i] = FALSE;
10408 		if (vm_paging_page_waiter) {
10409 			thread_wakeup(&vm_paging_page_waiter);
10410 		}
10411 		simple_unlock(&vm_paging_lock);
10412 	}
10413 }
10414 
10415 
10416 /*
10417  * page->vmp_object must be locked
10418  */
10419 void
vm_pageout_steal_laundry(vm_page_t page,boolean_t queues_locked)10420 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10421 {
10422 	if (!queues_locked) {
10423 		vm_page_lockspin_queues();
10424 	}
10425 
10426 	page->vmp_free_when_done = FALSE;
10427 	/*
10428 	 * need to drop the laundry count...
10429 	 * we may also need to remove it
10430 	 * from the I/O paging queue...
10431 	 * vm_pageout_throttle_up handles both cases
10432 	 *
10433 	 * the laundry and pageout_queue flags are cleared...
10434 	 */
10435 	vm_pageout_throttle_up(page);
10436 
10437 	if (!queues_locked) {
10438 		vm_page_unlock_queues();
10439 	}
10440 }
10441 
10442 #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
10443 
10444 upl_t
vector_upl_create(vm_offset_t upl_offset,uint32_t max_upls)10445 vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
10446 {
10447 	int i = 0;
10448 	upl_t   upl;
10449 
10450 	assert(max_upls > 0);
10451 	if (max_upls == 0) {
10452 		return NULL;
10453 	}
10454 
10455 	if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
10456 		max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
10457 	}
10458 	vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
10459 
10460 	upl = upl_create(0, UPL_VECTOR, 0);
10461 	upl->vector_upl = vector_upl;
10462 	upl->u_offset = upl_offset;
10463 	vector_upl->size = 0;
10464 	vector_upl->offset = upl_offset;
10465 	vector_upl->invalid_upls = 0;
10466 	vector_upl->num_upls = 0;
10467 	vector_upl->pagelist = NULL;
10468 	vector_upl->max_upls = max_upls;
10469 
10470 	for (i = 0; i < max_upls; i++) {
10471 		vector_upl->upls[i].iostate.size = 0;
10472 		vector_upl->upls[i].iostate.offset = 0;
10473 	}
10474 	return upl;
10475 }
10476 
10477 uint32_t
vector_upl_max_upls(const upl_t upl)10478 vector_upl_max_upls(const upl_t upl)
10479 {
10480 	if (!vector_upl_is_valid(upl)) {
10481 		return 0;
10482 	}
10483 	return ((vector_upl_t)(upl->vector_upl))->max_upls;
10484 }
10485 
10486 void
vector_upl_deallocate(upl_t upl)10487 vector_upl_deallocate(upl_t upl)
10488 {
10489 	vector_upl_t vector_upl = upl->vector_upl;
10490 
10491 	assert(vector_upl_is_valid(upl));
10492 
10493 	if (vector_upl->invalid_upls != vector_upl->num_upls) {
10494 		panic("Deallocating non-empty Vectored UPL");
10495 	}
10496 	uint32_t max_upls = vector_upl->max_upls;
10497 	kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
10498 	kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
10499 	upl->vector_upl = NULL;
10500 }
10501 
10502 boolean_t
vector_upl_is_valid(upl_t upl)10503 vector_upl_is_valid(upl_t upl)
10504 {
10505 	return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
10506 }
10507 
10508 boolean_t
vector_upl_set_subupl(upl_t upl,upl_t subupl,uint32_t io_size)10509 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
10510 {
10511 	if (vector_upl_is_valid(upl)) {
10512 		vector_upl_t vector_upl = upl->vector_upl;
10513 
10514 		if (vector_upl) {
10515 			if (subupl) {
10516 				if (io_size) {
10517 					if (io_size < PAGE_SIZE) {
10518 						io_size = PAGE_SIZE;
10519 					}
10520 					subupl->vector_upl = (void*)vector_upl;
10521 					vector_upl->upls[vector_upl->num_upls++].elem = subupl;
10522 					vector_upl->size += io_size;
10523 					upl->u_size += io_size;
10524 				} else {
10525 					uint32_t i = 0, invalid_upls = 0;
10526 					for (i = 0; i < vector_upl->num_upls; i++) {
10527 						if (vector_upl->upls[i].elem == subupl) {
10528 							break;
10529 						}
10530 					}
10531 					if (i == vector_upl->num_upls) {
10532 						panic("Trying to remove sub-upl when none exists");
10533 					}
10534 
10535 					vector_upl->upls[i].elem = NULL;
10536 					invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
10537 					    relaxed);
10538 					if (invalid_upls == vector_upl->num_upls) {
10539 						return TRUE;
10540 					} else {
10541 						return FALSE;
10542 					}
10543 				}
10544 			} else {
10545 				panic("vector_upl_set_subupl was passed a NULL upl element");
10546 			}
10547 		} else {
10548 			panic("vector_upl_set_subupl was passed a non-vectored upl");
10549 		}
10550 	} else {
10551 		panic("vector_upl_set_subupl was passed a NULL upl");
10552 	}
10553 
10554 	return FALSE;
10555 }
10556 
10557 void
vector_upl_set_pagelist(upl_t upl)10558 vector_upl_set_pagelist(upl_t upl)
10559 {
10560 	if (vector_upl_is_valid(upl)) {
10561 		uint32_t i = 0;
10562 		vector_upl_t vector_upl = upl->vector_upl;
10563 
10564 		if (vector_upl) {
10565 			vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
10566 
10567 			vector_upl->pagelist = kalloc_type(struct upl_page_info,
10568 			    atop(vector_upl->size), Z_WAITOK);
10569 
10570 			for (i = 0; i < vector_upl->num_upls; i++) {
10571 				cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
10572 				bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10573 				pagelist_size += cur_upl_pagelist_size;
10574 				if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
10575 					upl->highest_page = vector_upl->upls[i].elem->highest_page;
10576 				}
10577 			}
10578 			assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10579 		} else {
10580 			panic("vector_upl_set_pagelist was passed a non-vectored upl");
10581 		}
10582 	} else {
10583 		panic("vector_upl_set_pagelist was passed a NULL upl");
10584 	}
10585 }
10586 
10587 upl_t
vector_upl_subupl_byindex(upl_t upl,uint32_t index)10588 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10589 {
10590 	if (vector_upl_is_valid(upl)) {
10591 		vector_upl_t vector_upl = upl->vector_upl;
10592 		if (vector_upl) {
10593 			if (index < vector_upl->num_upls) {
10594 				return vector_upl->upls[index].elem;
10595 			}
10596 		} else {
10597 			panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10598 		}
10599 	}
10600 	return NULL;
10601 }
10602 
10603 upl_t
vector_upl_subupl_byoffset(upl_t upl,upl_offset_t * upl_offset,upl_size_t * upl_size)10604 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
10605 {
10606 	if (vector_upl_is_valid(upl)) {
10607 		uint32_t i = 0;
10608 		vector_upl_t vector_upl = upl->vector_upl;
10609 
10610 		if (vector_upl) {
10611 			upl_t subupl = NULL;
10612 			vector_upl_iostates_t subupl_state;
10613 
10614 			for (i = 0; i < vector_upl->num_upls; i++) {
10615 				subupl = vector_upl->upls[i].elem;
10616 				subupl_state = vector_upl->upls[i].iostate;
10617 				if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
10618 					/* We could have been passed an offset/size pair that belongs
10619 					 * to an UPL element that has already been committed/aborted.
10620 					 * If so, return NULL.
10621 					 */
10622 					if (subupl == NULL) {
10623 						return NULL;
10624 					}
10625 					if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
10626 						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
10627 						if (*upl_size > subupl_state.size) {
10628 							*upl_size = subupl_state.size;
10629 						}
10630 					}
10631 					if (*upl_offset >= subupl_state.offset) {
10632 						*upl_offset -= subupl_state.offset;
10633 					} else if (i) {
10634 						panic("Vector UPL offset miscalculation");
10635 					}
10636 					return subupl;
10637 				}
10638 			}
10639 		} else {
10640 			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
10641 		}
10642 	}
10643 	return NULL;
10644 }
10645 
10646 void
vector_upl_get_submap(upl_t upl,vm_map_t * v_upl_submap,vm_offset_t * submap_dst_addr)10647 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10648 {
10649 	*v_upl_submap = NULL;
10650 
10651 	if (vector_upl_is_valid(upl)) {
10652 		vector_upl_t vector_upl = upl->vector_upl;
10653 		if (vector_upl) {
10654 			*v_upl_submap = vector_upl->submap;
10655 			*submap_dst_addr = vector_upl->submap_dst_addr;
10656 		} else {
10657 			panic("vector_upl_get_submap was passed a non-vectored UPL");
10658 		}
10659 	} else {
10660 		panic("vector_upl_get_submap was passed a null UPL");
10661 	}
10662 }
10663 
10664 void
vector_upl_set_submap(upl_t upl,vm_map_t submap,vm_offset_t submap_dst_addr)10665 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10666 {
10667 	if (vector_upl_is_valid(upl)) {
10668 		vector_upl_t vector_upl = upl->vector_upl;
10669 		if (vector_upl) {
10670 			vector_upl->submap = submap;
10671 			vector_upl->submap_dst_addr = submap_dst_addr;
10672 		} else {
10673 			panic("vector_upl_get_submap was passed a non-vectored UPL");
10674 		}
10675 	} else {
10676 		panic("vector_upl_get_submap was passed a NULL UPL");
10677 	}
10678 }
10679 
10680 void
vector_upl_set_iostate(upl_t upl,upl_t subupl,upl_offset_t offset,upl_size_t size)10681 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10682 {
10683 	if (vector_upl_is_valid(upl)) {
10684 		uint32_t i = 0;
10685 		vector_upl_t vector_upl = upl->vector_upl;
10686 
10687 		if (vector_upl) {
10688 			for (i = 0; i < vector_upl->num_upls; i++) {
10689 				if (vector_upl->upls[i].elem == subupl) {
10690 					break;
10691 				}
10692 			}
10693 
10694 			if (i == vector_upl->num_upls) {
10695 				panic("setting sub-upl iostate when none exists");
10696 			}
10697 
10698 			vector_upl->upls[i].iostate.offset = offset;
10699 			if (size < PAGE_SIZE) {
10700 				size = PAGE_SIZE;
10701 			}
10702 			vector_upl->upls[i].iostate.size = size;
10703 		} else {
10704 			panic("vector_upl_set_iostate was passed a non-vectored UPL");
10705 		}
10706 	} else {
10707 		panic("vector_upl_set_iostate was passed a NULL UPL");
10708 	}
10709 }
10710 
10711 void
vector_upl_get_iostate(upl_t upl,upl_t subupl,upl_offset_t * offset,upl_size_t * size)10712 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10713 {
10714 	if (vector_upl_is_valid(upl)) {
10715 		uint32_t i = 0;
10716 		vector_upl_t vector_upl = upl->vector_upl;
10717 
10718 		if (vector_upl) {
10719 			for (i = 0; i < vector_upl->num_upls; i++) {
10720 				if (vector_upl->upls[i].elem == subupl) {
10721 					break;
10722 				}
10723 			}
10724 
10725 			if (i == vector_upl->num_upls) {
10726 				panic("getting sub-upl iostate when none exists");
10727 			}
10728 
10729 			*offset = vector_upl->upls[i].iostate.offset;
10730 			*size = vector_upl->upls[i].iostate.size;
10731 		} else {
10732 			panic("vector_upl_get_iostate was passed a non-vectored UPL");
10733 		}
10734 	} else {
10735 		panic("vector_upl_get_iostate was passed a NULL UPL");
10736 	}
10737 }
10738 
10739 void
vector_upl_get_iostate_byindex(upl_t upl,uint32_t index,upl_offset_t * offset,upl_size_t * size)10740 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10741 {
10742 	if (vector_upl_is_valid(upl)) {
10743 		vector_upl_t vector_upl = upl->vector_upl;
10744 		if (vector_upl) {
10745 			if (index < vector_upl->num_upls) {
10746 				*offset = vector_upl->upls[index].iostate.offset;
10747 				*size = vector_upl->upls[index].iostate.size;
10748 			} else {
10749 				*offset = *size = 0;
10750 			}
10751 		} else {
10752 			panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
10753 		}
10754 	} else {
10755 		panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10756 	}
10757 }
10758 
10759 void *
upl_get_internal_vectorupl(upl_t upl)10760 upl_get_internal_vectorupl(upl_t upl)
10761 {
10762 	return upl->vector_upl;
10763 }
10764 
10765 upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)10766 upl_get_internal_vectorupl_pagelist(upl_t upl)
10767 {
10768 	return upl->vector_upl->pagelist;
10769 }
10770 
10771 upl_page_info_t *
upl_get_internal_page_list(upl_t upl)10772 upl_get_internal_page_list(upl_t upl)
10773 {
10774 	return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
10775 }
10776 
10777 void
upl_clear_dirty(upl_t upl,boolean_t value)10778 upl_clear_dirty(
10779 	upl_t           upl,
10780 	boolean_t       value)
10781 {
10782 	if (value) {
10783 		upl->flags |= UPL_CLEAR_DIRTY;
10784 	} else {
10785 		upl->flags &= ~UPL_CLEAR_DIRTY;
10786 	}
10787 }
10788 
10789 void
upl_set_referenced(upl_t upl,boolean_t value)10790 upl_set_referenced(
10791 	upl_t           upl,
10792 	boolean_t       value)
10793 {
10794 	upl_lock(upl);
10795 	if (value) {
10796 		upl->ext_ref_count++;
10797 	} else {
10798 		if (!upl->ext_ref_count) {
10799 			panic("upl_set_referenced not %p", upl);
10800 		}
10801 		upl->ext_ref_count--;
10802 	}
10803 	upl_unlock(upl);
10804 }
10805 
#if CONFIG_IOSCHED
/*
 * upl_set_blkno:
 *	For each page of the "io_size"-byte range starting at
 *	"upl_offset", store "blkno" and "io_size" in the UPL's
 *	reprioritization info.  Does nothing unless the UPL has
 *	UPL_EXPEDITE_SUPPORTED set.
 */
void
upl_set_blkno(
	upl_t           upl,
	vm_offset_t     upl_offset,
	int             io_size,
	int64_t         blkno)
{
	int page_index;
	int bytes_done;

	if (!(upl->flags & UPL_EXPEDITE_SUPPORTED)) {
		return;
	}

	assert(upl->upl_reprio_info != 0);

	page_index = (int)(upl_offset / PAGE_SIZE);
	for (bytes_done = 0; bytes_done < io_size; bytes_done += PAGE_SIZE) {
		UPL_SET_REPRIO_INFO(upl, page_index, blkno, io_size);
		page_index++;
	}
}
#endif
10825 
10826 void inline
memoryshot(unsigned int event,unsigned int control)10827 memoryshot(unsigned int event, unsigned int control)
10828 {
10829 	if (vm_debug_events) {
10830 		KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10831 		    vm_page_active_count, vm_page_inactive_count,
10832 		    vm_page_free_count, vm_page_speculative_count,
10833 		    vm_page_throttled_count);
10834 	} else {
10835 		(void) event;
10836 		(void) control;
10837 	}
10838 }
10839 
10840 #ifdef MACH_BSD
10841 
10842 boolean_t
upl_device_page(upl_page_info_t * upl)10843 upl_device_page(upl_page_info_t *upl)
10844 {
10845 	return UPL_DEVICE_PAGE(upl);
10846 }
10847 boolean_t
upl_page_present(upl_page_info_t * upl,int index)10848 upl_page_present(upl_page_info_t *upl, int index)
10849 {
10850 	return UPL_PAGE_PRESENT(upl, index);
10851 }
10852 boolean_t
upl_speculative_page(upl_page_info_t * upl,int index)10853 upl_speculative_page(upl_page_info_t *upl, int index)
10854 {
10855 	return UPL_SPECULATIVE_PAGE(upl, index);
10856 }
10857 boolean_t
upl_dirty_page(upl_page_info_t * upl,int index)10858 upl_dirty_page(upl_page_info_t *upl, int index)
10859 {
10860 	return UPL_DIRTY_PAGE(upl, index);
10861 }
10862 boolean_t
upl_valid_page(upl_page_info_t * upl,int index)10863 upl_valid_page(upl_page_info_t *upl, int index)
10864 {
10865 	return UPL_VALID_PAGE(upl, index);
10866 }
10867 ppnum_t
upl_phys_page(upl_page_info_t * upl,int index)10868 upl_phys_page(upl_page_info_t *upl, int index)
10869 {
10870 	return UPL_PHYS_PAGE(upl, index);
10871 }
10872 
10873 void
upl_page_set_mark(upl_page_info_t * upl,int index,boolean_t v)10874 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
10875 {
10876 	upl[index].mark = v;
10877 }
10878 
10879 boolean_t
upl_page_get_mark(upl_page_info_t * upl,int index)10880 upl_page_get_mark(upl_page_info_t *upl, int index)
10881 {
10882 	return upl[index].mark;
10883 }
10884 
10885 void
vm_countdirtypages(void)10886 vm_countdirtypages(void)
10887 {
10888 	vm_page_t m;
10889 	int dpages;
10890 	int pgopages;
10891 	int precpages;
10892 
10893 
10894 	dpages = 0;
10895 	pgopages = 0;
10896 	precpages = 0;
10897 
10898 	vm_page_lock_queues();
10899 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10900 	do {
10901 		if (m == (vm_page_t)0) {
10902 			break;
10903 		}
10904 
10905 		if (m->vmp_dirty) {
10906 			dpages++;
10907 		}
10908 		if (m->vmp_free_when_done) {
10909 			pgopages++;
10910 		}
10911 		if (m->vmp_precious) {
10912 			precpages++;
10913 		}
10914 
10915 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10916 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10917 		if (m == (vm_page_t)0) {
10918 			break;
10919 		}
10920 	} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
10921 	vm_page_unlock_queues();
10922 
10923 	vm_page_lock_queues();
10924 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
10925 	do {
10926 		if (m == (vm_page_t)0) {
10927 			break;
10928 		}
10929 
10930 		dpages++;
10931 		assert(m->vmp_dirty);
10932 		assert(!m->vmp_free_when_done);
10933 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10934 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10935 		if (m == (vm_page_t)0) {
10936 			break;
10937 		}
10938 	} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
10939 	vm_page_unlock_queues();
10940 
10941 	vm_page_lock_queues();
10942 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10943 	do {
10944 		if (m == (vm_page_t)0) {
10945 			break;
10946 		}
10947 
10948 		if (m->vmp_dirty) {
10949 			dpages++;
10950 		}
10951 		if (m->vmp_free_when_done) {
10952 			pgopages++;
10953 		}
10954 		if (m->vmp_precious) {
10955 			precpages++;
10956 		}
10957 
10958 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10959 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10960 		if (m == (vm_page_t)0) {
10961 			break;
10962 		}
10963 	} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
10964 	vm_page_unlock_queues();
10965 
10966 	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10967 
10968 	dpages = 0;
10969 	pgopages = 0;
10970 	precpages = 0;
10971 
10972 	vm_page_lock_queues();
10973 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10974 
10975 	do {
10976 		if (m == (vm_page_t)0) {
10977 			break;
10978 		}
10979 		if (m->vmp_dirty) {
10980 			dpages++;
10981 		}
10982 		if (m->vmp_free_when_done) {
10983 			pgopages++;
10984 		}
10985 		if (m->vmp_precious) {
10986 			precpages++;
10987 		}
10988 
10989 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10990 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10991 		if (m == (vm_page_t)0) {
10992 			break;
10993 		}
10994 	} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
10995 	vm_page_unlock_queues();
10996 
10997 	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
10998 }
10999 #endif /* MACH_BSD */
11000 
11001 
#if CONFIG_IOSCHED
/*
 * upl_get_cached_tier:
 *	Return the UPL's upl_priority when the UPL has
 *	UPL_TRACKED_BY_OBJECT set, -1 otherwise.
 */
int
upl_get_cached_tier(upl_t  upl)
{
	assert(upl);
	if (!(upl->flags & UPL_TRACKED_BY_OBJECT)) {
		return -1;
	}
	return upl->upl_priority;
}
#endif /* CONFIG_IOSCHED */
11013 
11014 
11015 void
upl_callout_iodone(upl_t upl)11016 upl_callout_iodone(upl_t upl)
11017 {
11018 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
11019 
11020 	if (upl_ctx) {
11021 		void    (*iodone_func)(void *, int) = upl_ctx->io_done;
11022 
11023 		assert(upl_ctx->io_done);
11024 
11025 		(*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
11026 	}
11027 }
11028 
11029 void
upl_set_iodone(upl_t upl,void * upl_iodone)11030 upl_set_iodone(upl_t upl, void *upl_iodone)
11031 {
11032 	upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
11033 }
11034 
11035 void
upl_set_iodone_error(upl_t upl,int error)11036 upl_set_iodone_error(upl_t upl, int error)
11037 {
11038 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
11039 
11040 	if (upl_ctx) {
11041 		upl_ctx->io_error = error;
11042 	}
11043 }
11044 
11045 
11046 ppnum_t
upl_get_highest_page(upl_t upl)11047 upl_get_highest_page(
11048 	upl_t                      upl)
11049 {
11050 	return upl->highest_page;
11051 }
11052 
11053 upl_size_t
upl_get_size(upl_t upl)11054 upl_get_size(
11055 	upl_t                      upl)
11056 {
11057 	return upl_adjusted_size(upl, PAGE_MASK);
11058 }
11059 
11060 upl_size_t
upl_adjusted_size(upl_t upl,vm_map_offset_t pgmask)11061 upl_adjusted_size(
11062 	upl_t upl,
11063 	vm_map_offset_t pgmask)
11064 {
11065 	vm_object_offset_t start_offset, end_offset;
11066 
11067 	start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
11068 	end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
11069 
11070 	return (upl_size_t)(end_offset - start_offset);
11071 }
11072 
11073 vm_object_offset_t
upl_adjusted_offset(upl_t upl,vm_map_offset_t pgmask)11074 upl_adjusted_offset(
11075 	upl_t upl,
11076 	vm_map_offset_t pgmask)
11077 {
11078 	return trunc_page_mask_64(upl->u_offset, pgmask);
11079 }
11080 
11081 vm_object_offset_t
upl_get_data_offset(upl_t upl)11082 upl_get_data_offset(
11083 	upl_t upl)
11084 {
11085 	return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
11086 }
11087 
11088 upl_t
upl_associated_upl(upl_t upl)11089 upl_associated_upl(upl_t upl)
11090 {
11091 	return upl->associated_upl;
11092 }
11093 
11094 void
upl_set_associated_upl(upl_t upl,upl_t associated_upl)11095 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
11096 {
11097 	upl->associated_upl = associated_upl;
11098 }
11099 
11100 struct vnode *
upl_lookup_vnode(upl_t upl)11101 upl_lookup_vnode(upl_t upl)
11102 {
11103 	if (!upl->map_object->internal) {
11104 		return vnode_pager_lookup_vnode(upl->map_object->pager);
11105 	} else {
11106 		return NULL;
11107 	}
11108 }
11109 
11110 #if UPL_DEBUG
11111 kern_return_t
upl_ubc_alias_set(upl_t upl,uintptr_t alias1,uintptr_t alias2)11112 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
11113 {
11114 	upl->ubc_alias1 = alias1;
11115 	upl->ubc_alias2 = alias2;
11116 	return KERN_SUCCESS;
11117 }
11118 int
upl_ubc_alias_get(upl_t upl,uintptr_t * al,uintptr_t * al2)11119 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
11120 {
11121 	if (al) {
11122 		*al = upl->ubc_alias1;
11123 	}
11124 	if (al2) {
11125 		*al2 = upl->ubc_alias2;
11126 	}
11127 	return KERN_SUCCESS;
11128 }
11129 #endif /* UPL_DEBUG */
11130 
11131 #if VM_PRESSURE_EVENTS
11132 /*
11133  * Upward trajectory.
11134  */
11135 extern boolean_t vm_compressor_low_on_space(void);
11136 
11137 boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)11138 VM_PRESSURE_NORMAL_TO_WARNING(void)
11139 {
11140 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11141 		/* Available pages below our threshold */
11142 		if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11143 			/* No frozen processes to kill */
11144 			if (memorystatus_frozen_count == 0) {
11145 				/* Not enough suspended processes available. */
11146 				if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11147 					return TRUE;
11148 				}
11149 			}
11150 		}
11151 		return FALSE;
11152 	} else {
11153 		return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
11154 	}
11155 }
11156 
11157 boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)11158 VM_PRESSURE_WARNING_TO_CRITICAL(void)
11159 {
11160 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11161 		/* Available pages below our threshold */
11162 		if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11163 			return TRUE;
11164 		}
11165 		return FALSE;
11166 	} else {
11167 		return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11168 	}
11169 }
11170 
11171 /*
11172  * Downward trajectory.
11173  */
11174 boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)11175 VM_PRESSURE_WARNING_TO_NORMAL(void)
11176 {
11177 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11178 		/* Available pages above our threshold */
11179 		unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
11180 		if (memorystatus_available_pages > target_threshold) {
11181 			return TRUE;
11182 		}
11183 		return FALSE;
11184 	} else {
11185 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
11186 	}
11187 }
11188 
11189 boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)11190 VM_PRESSURE_CRITICAL_TO_WARNING(void)
11191 {
11192 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11193 		/* Available pages above our threshold */
11194 		unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
11195 		if (memorystatus_available_pages > target_threshold) {
11196 			return TRUE;
11197 		}
11198 		return FALSE;
11199 	} else {
11200 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11201 	}
11202 }
11203 #endif /* VM_PRESSURE_EVENTS */
11204 
11205 #if DEVELOPMENT || DEBUG
11206 bool compressor_running_perf_test;
11207 uint64_t compressor_perf_test_pages_processed;
11208 
11209 kern_return_t
11210 run_compressor_perf_test(
11211 	user_addr_t buf,
11212 	size_t buffer_size,
11213 	uint64_t *time,
11214 	uint64_t *bytes_compressed,
11215 	uint64_t *compressor_growth);
11216 
11217 static kern_return_t
move_pages_to_queue(vm_map_t map,user_addr_t start_addr,size_t buffer_size,vm_page_queue_head_t * queue,size_t * pages_moved)11218 move_pages_to_queue(
11219 	vm_map_t map,
11220 	user_addr_t start_addr,
11221 	size_t buffer_size,
11222 	vm_page_queue_head_t *queue,
11223 	size_t *pages_moved)
11224 {
11225 	kern_return_t err = KERN_SUCCESS;
11226 	vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
11227 	boolean_t addr_in_map = FALSE;
11228 	user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
11229 	vm_object_t curr_object = VM_OBJECT_NULL;
11230 	*pages_moved = 0;
11231 
11232 
11233 	if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
11234 		/*
11235 		 * We don't currently support benchmarking maps with a different page size
11236 		 * than the kernel.
11237 		 */
11238 		return KERN_INVALID_ARGUMENT;
11239 	}
11240 
11241 	if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
11242 		return KERN_INVALID_ARGUMENT;
11243 	}
11244 
11245 	vm_map_lock_read(map);
11246 	curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
11247 	end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
11248 
11249 
11250 	while (curr_addr < end_addr) {
11251 		addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
11252 		if (!addr_in_map) {
11253 			err = KERN_INVALID_ARGUMENT;
11254 			break;
11255 		}
11256 		curr_object = VME_OBJECT(curr_entry);
11257 		if (curr_object) {
11258 			vm_object_lock(curr_object);
11259 			/* We really only want anonymous memory that's in the top level map and object here. */
11260 			if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
11261 			    curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
11262 				err = KERN_INVALID_ARGUMENT;
11263 				vm_object_unlock(curr_object);
11264 				break;
11265 			}
11266 			vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
11267 			vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
11268 			    (curr_entry->vme_start + VME_OFFSET(curr_entry));
11269 			vm_map_offset_t curr_offset = start_offset;
11270 			vm_page_t curr_page;
11271 			while (curr_offset < end_offset) {
11272 				curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
11273 				if (curr_page != VM_PAGE_NULL) {
11274 					vm_page_lock_queues();
11275 					if (curr_page->vmp_laundry) {
11276 						vm_pageout_steal_laundry(curr_page, TRUE);
11277 					}
11278 					/*
11279 					 * we've already factored out pages in the laundry which
11280 					 * means this page can't be on the pageout queue so it's
11281 					 * safe to do the vm_page_queues_remove
11282 					 */
11283 					bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
11284 					vm_page_queues_remove(curr_page, TRUE);
11285 					if (donate) {
11286 						/*
11287 						 * The compressor needs to see this bit to know
11288 						 * where this page needs to land. Also if stolen,
11289 						 * this bit helps put the page back in the right
11290 						 * special queue where it belongs.
11291 						 */
11292 						curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
11293 					}
11294 					// Clear the referenced bit so we ensure this gets paged out
11295 					curr_page->vmp_reference = false;
11296 					if (curr_page->vmp_pmapped) {
11297 						pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
11298 						    VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
11299 					}
11300 					vm_page_queue_enter(queue, curr_page, vmp_pageq);
11301 					vm_page_unlock_queues();
11302 					*pages_moved += 1;
11303 				}
11304 				curr_offset += PAGE_SIZE_64;
11305 				curr_addr += PAGE_SIZE_64;
11306 			}
11307 		}
11308 		vm_object_unlock(curr_object);
11309 	}
11310 	vm_map_unlock_read(map);
11311 	return err;
11312 }
11313 
11314 /*
11315  * Local queue for processing benchmark pages.
11316  * Can't be allocated on the stack because the pointer has to
11317  * be packable.
11318  */
11319 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
11320 kern_return_t
run_compressor_perf_test(user_addr_t buf,size_t buffer_size,uint64_t * time,uint64_t * bytes_compressed,uint64_t * compressor_growth)11321 run_compressor_perf_test(
11322 	user_addr_t buf,
11323 	size_t buffer_size,
11324 	uint64_t *time,
11325 	uint64_t *bytes_compressed,
11326 	uint64_t *compressor_growth)
11327 {
11328 	kern_return_t err = KERN_SUCCESS;
11329 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11330 		return KERN_NOT_SUPPORTED;
11331 	}
11332 	if (current_task() == kernel_task) {
11333 		return KERN_INVALID_ARGUMENT;
11334 	}
11335 	vm_page_lock_queues();
11336 	if (compressor_running_perf_test) {
11337 		/* Only run one instance of the benchmark at a time. */
11338 		vm_page_unlock_queues();
11339 		return KERN_RESOURCE_SHORTAGE;
11340 	}
11341 	vm_page_unlock_queues();
11342 	size_t page_count = 0;
11343 	vm_map_t map;
11344 	vm_page_t p, next;
11345 	uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
11346 	uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
11347 	*bytes_compressed = *compressor_growth = 0;
11348 
11349 	vm_page_queue_init(&compressor_perf_test_queue);
11350 	map = current_task()->map;
11351 	err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
11352 	if (err != KERN_SUCCESS) {
11353 		goto out;
11354 	}
11355 
11356 	vm_page_lock_queues();
11357 	compressor_running_perf_test = true;
11358 	compressor_perf_test_pages_processed = 0;
11359 	/*
11360 	 * At this point the compressor threads should only process the benchmark queue
11361 	 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
11362 	 * to determine how many compressed bytes we ended up using.
11363 	 */
11364 	compressed_bytes_start = c_segment_compressed_bytes;
11365 	vm_page_unlock_queues();
11366 
11367 	page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);
11368 
11369 	vm_page_lock_queues();
11370 	compressor_perf_test_start = mach_absolute_time();
11371 
11372 	// Wake up the compressor thread(s)
11373 	sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
11374 	    pgo_iothread_internal_state[0].pgo_iothread);
11375 
11376 	/*
11377 	 * Depending on when this test is run we could overshoot or be right on the mark
11378 	 * with our page_count. So the comparison is of the _less than_ variety.
11379 	 */
11380 	while (compressor_perf_test_pages_processed < page_count) {
11381 		assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
11382 		vm_page_unlock_queues();
11383 		thread_block(THREAD_CONTINUE_NULL);
11384 		vm_page_lock_queues();
11385 	}
11386 	compressor_perf_test_end = mach_absolute_time();
11387 	compressed_bytes_end = c_segment_compressed_bytes;
11388 	vm_page_unlock_queues();
11389 
11390 
11391 out:
11392 	/*
11393 	 * If we errored out above, then we could still have some pages
11394 	 * on the local queue. Make sure to put them back on the active queue before
11395 	 * returning so they're not orphaned.
11396 	 */
11397 	vm_page_lock_queues();
11398 	absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
11399 	p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
11400 	while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
11401 		next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);
11402 
11403 		vm_page_enqueue_active(p, FALSE);
11404 		p = next;
11405 	}
11406 
11407 	compressor_running_perf_test = false;
11408 	vm_page_unlock_queues();
11409 	if (err == KERN_SUCCESS) {
11410 		*bytes_compressed = page_count * PAGE_SIZE_64;
11411 		*compressor_growth = compressed_bytes_end - compressed_bytes_start;
11412 	}
11413 
11414 	/*
11415 	 * pageout_scan will consider waking the compactor swapper
11416 	 * before it blocks. Do the same thing here before we return
11417 	 * to ensure that back to back benchmark runs can't overly fragment the
11418 	 * compressor pool.
11419 	 */
11420 	vm_consider_waking_compactor_swapper();
11421 	return err;
11422 }
11423 #endif /* DEVELOPMENT || DEBUG */
11424