xref: /xnu-8796.141.3/osfmk/vm/vm_pageout.c (revision 1b191cb58250d0705d8a51287127505aa4bc0789)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_pageout.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	The proverbial page-out daemon.
64  */
65 
66 #include <stdint.h>
67 #include <ptrauth.h>
68 
69 #include <debug.h>
70 
71 #include <mach/mach_types.h>
72 #include <mach/memory_object.h>
73 #include <mach/mach_host_server.h>
74 #include <mach/upl.h>
75 #include <mach/vm_map.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/sdt.h>
79 
80 #include <kern/kern_types.h>
81 #include <kern/counter.h>
82 #include <kern/host_statistics.h>
83 #include <kern/machine.h>
84 #include <kern/misc_protos.h>
85 #include <kern/sched.h>
86 #include <kern/thread.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/policy_internal.h>
90 #include <kern/thread_group.h>
91 
92 #include <os/log.h>
93 
94 #include <sys/kdebug_triage.h>
95 
96 #include <machine/vm_tuning.h>
97 #include <machine/commpage.h>
98 
99 #include <vm/pmap.h>
100 #include <vm/vm_compressor_pager.h>
101 #include <vm/vm_fault.h>
102 #include <vm/vm_map_internal.h>
103 #include <vm/vm_object.h>
104 #include <vm/vm_page.h>
105 #include <vm/vm_pageout.h>
106 #include <vm/vm_protos.h> /* must be last */
107 #include <vm/memory_object.h>
108 #include <vm/vm_purgeable_internal.h>
109 #include <vm/vm_shared_region.h>
110 #include <vm/vm_compressor.h>
111 
112 #include <san/kasan.h>
113 
114 #if CONFIG_PHANTOM_CACHE
115 #include <vm/vm_phantom_cache.h>
116 #endif
117 
118 #if UPL_DEBUG
119 #include <libkern/OSDebug.h>
120 #endif
121 
122 extern int cs_debug;
123 
124 extern void mbuf_drain(boolean_t);
125 
126 #if VM_PRESSURE_EVENTS
127 #if CONFIG_JETSAM
128 extern unsigned int memorystatus_available_pages;
129 extern unsigned int memorystatus_available_pages_pressure;
130 extern unsigned int memorystatus_available_pages_critical;
131 #else /* CONFIG_JETSAM */
132 extern uint64_t memorystatus_available_pages;
133 extern uint64_t memorystatus_available_pages_pressure;
134 extern uint64_t memorystatus_available_pages_critical;
135 #endif /* CONFIG_JETSAM */
136 
137 extern unsigned int memorystatus_frozen_count;
138 extern unsigned int memorystatus_suspended_count;
139 extern vm_pressure_level_t memorystatus_vm_pressure_level;
140 
141 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
142 extern uint32_t memorystatus_jetsam_fg_band_waiters;
143 
144 void vm_pressure_response(void);
145 extern void consider_vm_pressure_events(void);
146 
147 #define MEMORYSTATUS_SUSPENDED_THRESHOLD  4
148 #endif /* VM_PRESSURE_EVENTS */
149 
150 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
151 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
152 boolean_t vps_dynamic_priority_enabled = FALSE;
153 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
154 
/*
 * Pageout tuning knobs.  Each one is wrapped in #ifndef so that a
 * definition supplied before this point takes precedence.
 */

/* maximum iterations of the inactive queue w/o stealing/cleaning a page */
#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
#if !XNU_TARGET_OS_OSX
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
#endif /* !XNU_TARGET_OS_OSX */
#endif /* VM_PAGEOUT_BURST_INACTIVE_THROTTLE */

/* number of pages to move to break deadlock */
#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
#define VM_PAGEOUT_DEADLOCK_RELIEF 100
#endif /* VM_PAGEOUT_DEADLOCK_RELIEF */

/* maximum pageouts on a given pageout queue */
#ifndef VM_PAGE_LAUNDRY_MAX
#define VM_PAGE_LAUNDRY_MAX     128UL
#endif  /* VM_PAGE_LAUNDRY_MAX */

/*
 * Wait intervals, all in milliseconds.
 */
#ifndef VM_PAGEOUT_BURST_WAIT
#define VM_PAGEOUT_BURST_WAIT   1
#endif  /* VM_PAGEOUT_BURST_WAIT */

#ifndef VM_PAGEOUT_EMPTY_WAIT
#define VM_PAGEOUT_EMPTY_WAIT   50
#endif  /* VM_PAGEOUT_EMPTY_WAIT */

#ifndef VM_PAGEOUT_DEADLOCK_WAIT
#define VM_PAGEOUT_DEADLOCK_WAIT 100
#endif  /* VM_PAGEOUT_DEADLOCK_WAIT */

#ifndef VM_PAGEOUT_IDLE_WAIT
#define VM_PAGEOUT_IDLE_WAIT    10
#endif  /* VM_PAGEOUT_IDLE_WAIT */

#ifndef VM_PAGEOUT_SWAP_WAIT
#define VM_PAGEOUT_SWAP_WAIT    10
#endif  /* VM_PAGEOUT_SWAP_WAIT */


/*
 * NOTE(review): "100 / vm_page_speculative_percentage" is an integer
 * division that is itself used as a divisor.  A percentage of 0 divides
 * by zero directly, and a percentage above 100 makes the quotient 0.
 * Whatever writes vm_page_speculative_percentage has to keep it within
 * 1..100 -- confirm at the places that set it.
 */
#ifndef VM_PAGE_SPECULATIVE_TARGET
#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
#endif /* VM_PAGE_SPECULATIVE_TARGET */


/*
 *	To obtain a reasonable LRU approximation, the inactive queue
 *	needs to be large enough to give pages on it a chance to be
 *	referenced a second time.  This macro defines the fraction
 *	of active+inactive pages that should be inactive.
 *	The pageout daemon uses it to update vm_page_inactive_target.
 *
 *	If vm_page_free_count falls below vm_page_free_target and
 *	vm_page_inactive_count is below vm_page_inactive_target,
 *	then the pageout daemon starts running.
 */

#ifndef VM_PAGE_INACTIVE_TARGET
#define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
#endif  /* VM_PAGE_INACTIVE_TARGET */

/*
 *	Once the pageout daemon starts running, it keeps going
 *	until vm_page_free_count meets or exceeds vm_page_free_target.
 */

#ifndef VM_PAGE_FREE_TARGET
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
#endif /* !XNU_TARGET_OS_OSX */
#endif  /* VM_PAGE_FREE_TARGET */


/*
 *	The pageout daemon always starts running once vm_page_free_count
 *	falls below vm_page_free_min.
 */

#ifndef VM_PAGE_FREE_MIN
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
#endif /* !XNU_TARGET_OS_OSX */
#endif  /* VM_PAGE_FREE_MIN */

#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_RESERVED_LIMIT     100
#define VM_PAGE_FREE_MIN_LIMIT          1500
#define VM_PAGE_FREE_TARGET_LIMIT       2000
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_RESERVED_LIMIT     1700
#define VM_PAGE_FREE_MIN_LIMIT          3500
#define VM_PAGE_FREE_TARGET_LIMIT       4000
#endif /* !XNU_TARGET_OS_OSX */

/*
 *	When vm_page_free_count falls below vm_page_free_reserved,
 *	only vm-privileged threads can allocate pages.  vm-privilege
 *	allows the pageout daemon and default pager (and any other
 *	associated threads needed for default pageout) to continue
 *	operation by dipping into the reserved pool of pages.
 */

#ifndef VM_PAGE_FREE_RESERVED
#define VM_PAGE_FREE_RESERVED(n)        \
	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
#endif  /* VM_PAGE_FREE_RESERVED */

/*
 *	When we dequeue pages from the inactive list, they are
 *	reactivated (ie, put back on the active queue) if referenced.
 *	However, it is possible to starve the free list if other
 *	processors are referencing pages faster than we can turn off
 *	the referenced bit.  So we limit the number of reactivations
 *	we will make per call of vm_pageout_scan().
 *
 *	NOTE(review): in the XNU_TARGET_OS_OSX variant below the constant
 *	named ..._LIMIT_MAX is combined with MAX(), so (assuming MAX is the
 *	usual "larger of two" macro) it acts as a lower bound on the limit
 *	rather than an upper one.  Confirm that this is intended.
 */
#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000

#ifndef VM_PAGE_REACTIVATE_LIMIT
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
#endif /* !XNU_TARGET_OS_OSX */
#endif  /* VM_PAGE_REACTIVATE_LIMIT */
#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
281 
282 int vm_pageout_protect_realtime = true;
283 
284 extern boolean_t hibernate_cleaning_in_progress;
285 
286 struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
287 struct pgo_iothread_state pgo_iothread_external_state;
288 
289 #if VM_PRESSURE_EVENTS
290 void vm_pressure_thread(void);
291 
292 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
293 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
294 
295 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
296 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
297 #endif
298 
299 static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
300 static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
301 static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);
302 
303 extern void vm_pageout_continue(void);
304 extern void vm_pageout_scan(void);
305 
306 boolean_t vm_pageout_running = FALSE;
307 
308 uint32_t vm_page_upl_tainted = 0;
309 uint32_t vm_page_iopl_tainted = 0;
310 
311 #if XNU_TARGET_OS_OSX
312 static boolean_t vm_pageout_waiter  = FALSE;
313 #endif /* XNU_TARGET_OS_OSX */
314 
315 
316 #if DEVELOPMENT || DEBUG
317 struct vm_pageout_debug vm_pageout_debug;
318 #endif
319 struct vm_pageout_vminfo vm_pageout_vminfo;
320 struct vm_pageout_state  vm_pageout_state;
321 struct vm_config         vm_config;
322 
323 struct  vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
324 struct  vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
325 #if DEVELOPMENT || DEBUG
326 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
327 #endif /* DEVELOPMENT || DEBUG */
328 
329 int         vm_upl_wait_for_pages = 0;
330 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
331 
332 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
333 
334 int     vm_debug_events = 0;
335 
336 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
337 
338 #if CONFIG_MEMORYSTATUS
339 extern boolean_t memorystatus_kill_on_VM_page_shortage(void);
340 
341 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
342 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
343 
344 #endif
345 
346 #if __AMP__
347 
348 // bind compressor threads e-cores
349 #define VM_COMPRESSOR_EBOUND_DEFAULT 1
350 
351 TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
352 int vm_pgo_pbound = 0;
353 extern void thread_bind_cluster_type(thread_t, char, bool);
354 
355 #endif /* __AMP__ */
356 
357 
358 /*
359  *	Routine:	vm_pageout_object_terminate
360  *	Purpose:
361  *		Destroy the pageout_object, and perform all of the
362  *		required cleanup actions.
363  *
364  *	In/Out conditions:
365  *		The object must be locked, and will be returned locked.
366  */
367 void
vm_pageout_object_terminate(vm_object_t object)368 vm_pageout_object_terminate(
369 	vm_object_t     object)
370 {
371 	vm_object_t     shadow_object;
372 
373 	/*
374 	 * Deal with the deallocation (last reference) of a pageout object
375 	 * (used for cleaning-in-place) by dropping the paging references/
376 	 * freeing pages in the original object.
377 	 */
378 
379 	assert(object->pageout);
380 	shadow_object = object->shadow;
381 	vm_object_lock(shadow_object);
382 
383 	while (!vm_page_queue_empty(&object->memq)) {
384 		vm_page_t               p, m;
385 		vm_object_offset_t      offset;
386 
387 		p = (vm_page_t) vm_page_queue_first(&object->memq);
388 
389 		assert(p->vmp_private);
390 		assert(p->vmp_free_when_done);
391 		p->vmp_free_when_done = FALSE;
392 		assert(!p->vmp_cleaning);
393 		assert(!p->vmp_laundry);
394 
395 		offset = p->vmp_offset;
396 		VM_PAGE_FREE(p);
397 		p = VM_PAGE_NULL;
398 
399 		m = vm_page_lookup(shadow_object,
400 		    offset + object->vo_shadow_offset);
401 
402 		if (m == VM_PAGE_NULL) {
403 			continue;
404 		}
405 
406 		assert((m->vmp_dirty) || (m->vmp_precious) ||
407 		    (m->vmp_busy && m->vmp_cleaning));
408 
409 		/*
410 		 * Handle the trusted pager throttle.
411 		 * Also decrement the burst throttle (if external).
412 		 */
413 		vm_page_lock_queues();
414 		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
415 			vm_pageout_throttle_up(m);
416 		}
417 
418 		/*
419 		 * Handle the "target" page(s). These pages are to be freed if
420 		 * successfully cleaned. Target pages are always busy, and are
421 		 * wired exactly once. The initial target pages are not mapped,
422 		 * (so cannot be referenced or modified) but converted target
423 		 * pages may have been modified between the selection as an
424 		 * adjacent page and conversion to a target.
425 		 */
426 		if (m->vmp_free_when_done) {
427 			assert(m->vmp_busy);
428 			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
429 			assert(m->vmp_wire_count == 1);
430 			m->vmp_cleaning = FALSE;
431 			m->vmp_free_when_done = FALSE;
432 			/*
433 			 * Revoke all access to the page. Since the object is
434 			 * locked, and the page is busy, this prevents the page
435 			 * from being dirtied after the pmap_disconnect() call
436 			 * returns.
437 			 *
438 			 * Since the page is left "dirty" but "not modifed", we
439 			 * can detect whether the page was redirtied during
440 			 * pageout by checking the modify state.
441 			 */
442 			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
443 				SET_PAGE_DIRTY(m, FALSE);
444 			} else {
445 				m->vmp_dirty = FALSE;
446 			}
447 
448 			if (m->vmp_dirty) {
449 				vm_page_unwire(m, TRUE);        /* reactivates */
450 				counter_inc(&vm_statistics_reactivations);
451 				PAGE_WAKEUP_DONE(m);
452 			} else {
453 				vm_page_free(m);  /* clears busy, etc. */
454 			}
455 			vm_page_unlock_queues();
456 			continue;
457 		}
458 		/*
459 		 * Handle the "adjacent" pages. These pages were cleaned in
460 		 * place, and should be left alone.
461 		 * If prep_pin_count is nonzero, then someone is using the
462 		 * page, so make it active.
463 		 */
464 		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
465 			if (m->vmp_reference) {
466 				vm_page_activate(m);
467 			} else {
468 				vm_page_deactivate(m);
469 			}
470 		}
471 		if (m->vmp_overwriting) {
472 			/*
473 			 * the (COPY_OUT_FROM == FALSE) request_page_list case
474 			 */
475 			if (m->vmp_busy) {
476 				/*
477 				 * We do not re-set m->vmp_dirty !
478 				 * The page was busy so no extraneous activity
479 				 * could have occurred. COPY_INTO is a read into the
480 				 * new pages. CLEAN_IN_PLACE does actually write
481 				 * out the pages but handling outside of this code
482 				 * will take care of resetting dirty. We clear the
483 				 * modify however for the Programmed I/O case.
484 				 */
485 				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
486 
487 				m->vmp_busy = FALSE;
488 				m->vmp_absent = FALSE;
489 			} else {
490 				/*
491 				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
492 				 * Occurs when the original page was wired
493 				 * at the time of the list request
494 				 */
495 				assert(VM_PAGE_WIRED(m));
496 				vm_page_unwire(m, TRUE);        /* reactivates */
497 			}
498 			m->vmp_overwriting = FALSE;
499 		} else {
500 			m->vmp_dirty = FALSE;
501 		}
502 		m->vmp_cleaning = FALSE;
503 
504 		/*
505 		 * Wakeup any thread waiting for the page to be un-cleaning.
506 		 */
507 		PAGE_WAKEUP(m);
508 		vm_page_unlock_queues();
509 	}
510 	/*
511 	 * Account for the paging reference taken in vm_paging_object_allocate.
512 	 */
513 	vm_object_activity_end(shadow_object);
514 	vm_object_unlock(shadow_object);
515 
516 	assert(object->ref_count == 0);
517 	assert(object->paging_in_progress == 0);
518 	assert(object->activity_in_progress == 0);
519 	assert(object->resident_page_count == 0);
520 	return;
521 }
522 
523 /*
524  * Routine:	vm_pageclean_setup
525  *
526  * Purpose:	setup a page to be cleaned (made non-dirty), but not
527  *		necessarily flushed from the VM page cache.
528  *		This is accomplished by cleaning in place.
529  *
530  *		The page must not be busy, and new_object
531  *		must be locked.
532  *
533  */
534 static void
vm_pageclean_setup(vm_page_t m,vm_page_t new_m,vm_object_t new_object,vm_object_offset_t new_offset)535 vm_pageclean_setup(
536 	vm_page_t               m,
537 	vm_page_t               new_m,
538 	vm_object_t             new_object,
539 	vm_object_offset_t      new_offset)
540 {
541 	assert(!m->vmp_busy);
542 #if 0
543 	assert(!m->vmp_cleaning);
544 #endif
545 
546 	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
547 
548 	/*
549 	 * Mark original page as cleaning in place.
550 	 */
551 	m->vmp_cleaning = TRUE;
552 	SET_PAGE_DIRTY(m, FALSE);
553 	m->vmp_precious = FALSE;
554 
555 	/*
556 	 * Convert the fictitious page to a private shadow of
557 	 * the real page.
558 	 */
559 	assert(new_m->vmp_fictitious);
560 	assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
561 	new_m->vmp_fictitious = FALSE;
562 	new_m->vmp_private = TRUE;
563 	new_m->vmp_free_when_done = TRUE;
564 	VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
565 
566 	vm_page_lockspin_queues();
567 	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
568 	vm_page_unlock_queues();
569 
570 	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
571 	assert(!new_m->vmp_wanted);
572 	new_m->vmp_busy = FALSE;
573 }
574 
575 /*
576  *	Routine:	vm_pageout_initialize_page
577  *	Purpose:
578  *		Causes the specified page to be initialized in
579  *		the appropriate memory object. This routine is used to push
580  *		pages into a copy-object when they are modified in the
581  *		permanent object.
582  *
583  *		The page is moved to a temporary object and paged out.
584  *
585  *	In/out conditions:
586  *		The page in question must not be on any pageout queues.
587  *		The object to which it belongs must be locked.
588  *		The page must be busy, but not hold a paging reference.
589  *
590  *	Implementation:
591  *		Move this page to a completely new object.
592  */
593 void
vm_pageout_initialize_page(vm_page_t m)594 vm_pageout_initialize_page(
595 	vm_page_t       m)
596 {
597 	vm_object_t             object;
598 	vm_object_offset_t      paging_offset;
599 	memory_object_t         pager;
600 
601 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
602 
603 	object = VM_PAGE_OBJECT(m);
604 
605 	assert(m->vmp_busy);
606 	assert(object->internal);
607 
608 	/*
609 	 *	Verify that we really want to clean this page
610 	 */
611 	assert(!m->vmp_absent);
612 	assert(m->vmp_dirty);
613 
614 	/*
615 	 *	Create a paging reference to let us play with the object.
616 	 */
617 	paging_offset = m->vmp_offset + object->paging_offset;
618 
619 	if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
620 		panic("reservation without pageout?"); /* alan */
621 
622 		VM_PAGE_FREE(m);
623 		vm_object_unlock(object);
624 
625 		return;
626 	}
627 
628 	/*
629 	 * If there's no pager, then we can't clean the page.  This should
630 	 * never happen since this should be a copy object and therefore not
631 	 * an external object, so the pager should always be there.
632 	 */
633 
634 	pager = object->pager;
635 
636 	if (pager == MEMORY_OBJECT_NULL) {
637 		panic("missing pager for copy object");
638 
639 		VM_PAGE_FREE(m);
640 		return;
641 	}
642 
643 	/*
644 	 * set the page for future call to vm_fault_list_request
645 	 */
646 	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
647 	SET_PAGE_DIRTY(m, FALSE);
648 
649 	/*
650 	 * keep the object from collapsing or terminating
651 	 */
652 	vm_object_paging_begin(object);
653 	vm_object_unlock(object);
654 
655 	/*
656 	 *	Write the data to its pager.
657 	 *	Note that the data is passed by naming the new object,
658 	 *	not a virtual address; the pager interface has been
659 	 *	manipulated to use the "internal memory" data type.
660 	 *	[The object reference from its allocation is donated
661 	 *	to the eventual recipient.]
662 	 */
663 	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
664 
665 	vm_object_lock(object);
666 	vm_object_paging_end(object);
667 }
668 
669 
670 /*
671  * vm_pageout_cluster:
672  *
673  * Given a page, queue it to the appropriate I/O thread,
674  * which will page it out and attempt to clean adjacent pages
675  * in the same operation.
676  *
677  * The object and queues must be locked. We will take a
678  * paging reference to prevent deallocation or collapse when we
679  * release the object lock back at the call site.  The I/O thread
680  * is responsible for consuming this reference
681  *
682  * The page must not be on any pageout queue.
683  */
#if DEVELOPMENT || DEBUG
/*
 * Compressor-thread ("vmct") bookkeeping, present only on
 * DEVELOPMENT/DEBUG kernels.
 */
vmct_stats_t vmct_stats;

int32_t vmct_active = 0;
uint64_t vm_compressor_epoch_start = 0;
uint64_t vm_compressor_epoch_stop = 0;

/* One state slot per possible compressor thread. */
typedef enum vmct_state_t {
	VMCT_IDLE,
	VMCT_AWAKENED,
	VMCT_ACTIVE,
} vmct_state_t;
vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
#endif /* DEVELOPMENT || DEBUG */
698 
699 
700 
701 static void
vm_pageout_cluster_to_queue(vm_page_t m,struct vm_pageout_queue * q)702 vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
703 {
704 	vm_object_t object = VM_PAGE_OBJECT(m);
705 
706 	VM_PAGE_CHECK(m);
707 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
708 	vm_object_lock_assert_exclusive(object);
709 
710 	/*
711 	 * Make sure it's OK to page this out.
712 	 */
713 	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
714 	assert(!m->vmp_cleaning && !m->vmp_laundry);
715 	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
716 
717 	/*
718 	 * protect the object from collapse or termination
719 	 */
720 	vm_object_activity_begin(object);
721 
722 
723 	/*
724 	 * pgo_laundry count is tied to the laundry bit
725 	 */
726 	m->vmp_laundry = TRUE;
727 	q->pgo_laundry++;
728 
729 	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
730 	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
731 
732 	// the benchmark queue will be woken up independently by the benchmark itself
733 	if (
734 		object->internal == TRUE
735 #if DEVELOPMENT || DEBUG
736 		&& q != &vm_pageout_queue_benchmark
737 #endif
738 		) {
739 		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
740 		m->vmp_busy = TRUE;
741 		// Wake up the first compressor thread. It will wake subsequent threads if necessary.
742 		sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup, pgo_iothread_internal_state[0].pgo_iothread);
743 	} else {
744 		sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
745 	}
746 	VM_PAGE_CHECK(m);
747 }
748 
749 void
vm_pageout_cluster(vm_page_t m)750 vm_pageout_cluster(vm_page_t m)
751 {
752 	struct          vm_pageout_queue *q;
753 	vm_object_t     object = VM_PAGE_OBJECT(m);
754 	if (object->internal) {
755 		q = &vm_pageout_queue_internal;
756 	} else {
757 		q = &vm_pageout_queue_external;
758 	}
759 	vm_pageout_cluster_to_queue(m, q);
760 }
761 
762 
763 /*
764  * A page is back from laundry or we are stealing it back from
765  * the laundering state.  See if there are some pages waiting to
766  * go to laundry and if we can let some of them go now.
767  *
768  * Object and page queues must be locked.
769  */
770 void
vm_pageout_throttle_up(vm_page_t m)771 vm_pageout_throttle_up(
772 	vm_page_t       m)
773 {
774 	struct vm_pageout_queue *q;
775 	vm_object_t      m_object;
776 
777 	m_object = VM_PAGE_OBJECT(m);
778 
779 	assert(m_object != VM_OBJECT_NULL);
780 	assert(m_object != kernel_object);
781 
782 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
783 	vm_object_lock_assert_exclusive(m_object);
784 
785 	if (m_object->internal == TRUE) {
786 		q = &vm_pageout_queue_internal;
787 	} else {
788 		q = &vm_pageout_queue_external;
789 	}
790 
791 	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
792 		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
793 		m->vmp_q_state = VM_PAGE_NOT_ON_Q;
794 
795 		VM_PAGE_ZERO_PAGEQ_ENTRY(m);
796 
797 		vm_object_activity_end(m_object);
798 
799 		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
800 	}
801 	if (m->vmp_laundry == TRUE) {
802 		m->vmp_laundry = FALSE;
803 		q->pgo_laundry--;
804 
805 		if (q->pgo_throttled == TRUE) {
806 			q->pgo_throttled = FALSE;
807 			thread_wakeup((event_t) &q->pgo_laundry);
808 		}
809 		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
810 			q->pgo_draining = FALSE;
811 			thread_wakeup((event_t) (&q->pgo_laundry + 1));
812 		}
813 		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
814 	}
815 }
816 
817 
818 static void
vm_pageout_throttle_up_batch(struct vm_pageout_queue * q,int batch_cnt)819 vm_pageout_throttle_up_batch(
820 	struct vm_pageout_queue *q,
821 	int             batch_cnt)
822 {
823 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
824 
825 	VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
826 
827 	q->pgo_laundry -= batch_cnt;
828 
829 	if (q->pgo_throttled == TRUE) {
830 		q->pgo_throttled = FALSE;
831 		thread_wakeup((event_t) &q->pgo_laundry);
832 	}
833 	if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
834 		q->pgo_draining = FALSE;
835 		thread_wakeup((event_t) (&q->pgo_laundry + 1));
836 	}
837 }
838 
839 
840 
841 /*
842  * VM memory pressure monitoring.
843  *
844  * vm_pageout_scan() keeps track of the number of pages it considers and
845  * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
846  *
847  * compute_memory_pressure() is called every second from compute_averages()
848  * and moves "vm_pageout_stat_now" forward, to start accumulating the number
849  * of reclaimed pages in a new vm_pageout_stat[] bucket.
850  *
851  * mach_vm_pressure_monitor() collects past statistics about memory pressure.
852  * The caller provides the number of seconds ("nsecs") worth of statistics
853  * it wants, up to 30 seconds.
854  * It computes the number of pages reclaimed in the past "nsecs" seconds and
855  * also returns the number of pages the system still needs to reclaim at this
856  * moment in time.
857  */
/*
 * Number of buckets in the statistics ring.
 * Presumably eight buckets per second of history (30 seconds on
 * DEVELOPMENT/DEBUG kernels, 1 second otherwise) plus one for the bucket
 * currently being filled -- TODO confirm against the code that advances
 * the ring.
 *
 * The replacement list is fully parenthesized so that the macro behaves
 * as a single value in any expression (e.g. "x % VM_PAGEOUT_STAT_SIZE").
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif
/*
 * One bucket of pageout statistics.  vm_pageout_stats[] is used as a ring
 * indexed by vm_pageout_stat_now; record_memory_pressure() zeroes the next
 * bucket before making it current and derives the memory-pressure value
 * from the four freed_* counters of the previous bucket.
 */
struct vm_pageout_stat {
	/* page counts (unsigned long) */
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	/* event counters (unsigned int) */
	unsigned int pages_grabbed;
	unsigned int pages_freed;

	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	/* summed by record_memory_pressure() */
	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;

	unsigned int vm_page_realtime_count;
	unsigned int forcereclaimed_sharedcache;
	unsigned int forcereclaimed_realtime;
	unsigned int protected_sharedcache;
	unsigned int protected_realtime;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];
921 
/* Index of the vm_pageout_stats[] bucket currently being filled. */
unsigned int vm_pageout_stat_now = 0;

/*
 * Ring arithmetic on bucket indices: the index before / after "i",
 * wrapping around at the ends of vm_pageout_stats[].
 */
#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)

#if VM_PAGE_BUCKETS_CHECK
/* in eighths of a second */
int vm_page_buckets_check_interval = 80;
#endif /* VM_PAGE_BUCKETS_CHECK */
932 
933 
934 void
935 record_memory_pressure(void);
936 void
record_memory_pressure(void)937 record_memory_pressure(void)
938 {
939 	unsigned int vm_pageout_next;
940 
941 #if VM_PAGE_BUCKETS_CHECK
942 	/* check the consistency of VM page buckets at regular interval */
943 	static int counter = 0;
944 	if ((++counter % vm_page_buckets_check_interval) == 0) {
945 		vm_page_buckets_check();
946 	}
947 #endif /* VM_PAGE_BUCKETS_CHECK */
948 
949 	vm_pageout_state.vm_memory_pressure =
950 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
951 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
952 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
953 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
954 
955 	commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
956 
957 	/* move "now" forward */
958 	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
959 
960 	bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
961 
962 	vm_pageout_stat_now = vm_pageout_next;
963 }
964 
965 
966 /*
967  * IMPORTANT
968  * mach_vm_ctl_page_free_wanted() is called indirectly, via
969  * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
970  * it must be safe in the restricted stackshot context. Locks and/or
971  * blocking are not allowable.
972  */
973 unsigned int
mach_vm_ctl_page_free_wanted(void)974 mach_vm_ctl_page_free_wanted(void)
975 {
976 	unsigned int page_free_target, page_free_count, page_free_wanted;
977 
978 	page_free_target = vm_page_free_target;
979 	page_free_count = vm_page_free_count;
980 	if (page_free_target > page_free_count) {
981 		page_free_wanted = page_free_target - page_free_count;
982 	} else {
983 		page_free_wanted = 0;
984 	}
985 
986 	return page_free_wanted;
987 }
988 
989 
990 /*
991  * IMPORTANT:
992  * mach_vm_pressure_monitor() is called when taking a stackshot, with
993  * wait_for_pressure FALSE, so that code path must remain safe in the
994  * restricted stackshot context. No blocking or locks are allowable.
995  * on that code path.
996  */
997 
998 kern_return_t
mach_vm_pressure_monitor(boolean_t wait_for_pressure,unsigned int nsecs_monitored,unsigned int * pages_reclaimed_p,unsigned int * pages_wanted_p)999 mach_vm_pressure_monitor(
1000 	boolean_t       wait_for_pressure,
1001 	unsigned int    nsecs_monitored,
1002 	unsigned int    *pages_reclaimed_p,
1003 	unsigned int    *pages_wanted_p)
1004 {
1005 	wait_result_t   wr;
1006 	unsigned int    vm_pageout_then, vm_pageout_now;
1007 	unsigned int    pages_reclaimed;
1008 	unsigned int    units_of_monitor;
1009 
1010 	units_of_monitor = 8 * nsecs_monitored;
1011 	/*
1012 	 * We don't take the vm_page_queue_lock here because we don't want
1013 	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1014 	 * thread when it's trying to reclaim memory.  We don't need fully
1015 	 * accurate monitoring anyway...
1016 	 */
1017 
1018 	if (wait_for_pressure) {
1019 		/* wait until there's memory pressure */
1020 		while (vm_page_free_count >= vm_page_free_target) {
1021 			wr = assert_wait((event_t) &vm_page_free_wanted,
1022 			    THREAD_INTERRUPTIBLE);
1023 			if (wr == THREAD_WAITING) {
1024 				wr = thread_block(THREAD_CONTINUE_NULL);
1025 			}
1026 			if (wr == THREAD_INTERRUPTED) {
1027 				return KERN_ABORTED;
1028 			}
1029 			if (wr == THREAD_AWAKENED) {
1030 				/*
1031 				 * The memory pressure might have already
1032 				 * been relieved but let's not block again
1033 				 * and let's report that there was memory
1034 				 * pressure at some point.
1035 				 */
1036 				break;
1037 			}
1038 		}
1039 	}
1040 
1041 	/* provide the number of pages the system wants to reclaim */
1042 	if (pages_wanted_p != NULL) {
1043 		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
1044 	}
1045 
1046 	if (pages_reclaimed_p == NULL) {
1047 		return KERN_SUCCESS;
1048 	}
1049 
1050 	/* provide number of pages reclaimed in the last "nsecs_monitored" */
1051 	vm_pageout_now = vm_pageout_stat_now;
1052 	pages_reclaimed = 0;
1053 	for (vm_pageout_then =
1054 	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1055 	    vm_pageout_then != vm_pageout_now &&
1056 	    units_of_monitor-- != 0;
1057 	    vm_pageout_then =
1058 	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1059 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1060 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1061 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1062 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1063 	}
1064 	*pages_reclaimed_p = pages_reclaimed;
1065 
1066 	return KERN_SUCCESS;
1067 }
1068 
1069 
1070 
1071 #if DEVELOPMENT || DEBUG
1072 
1073 static void
1074 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1075 
1076 /*
1077  * condition variable used to make sure there is
1078  * only a single sweep going on at a time
1079  */
1080 boolean_t       vm_pageout_disconnect_all_pages_active = FALSE;
1081 
1082 
1083 void
vm_pageout_disconnect_all_pages()1084 vm_pageout_disconnect_all_pages()
1085 {
1086 	vm_page_lock_queues();
1087 
1088 	if (vm_pageout_disconnect_all_pages_active == TRUE) {
1089 		vm_page_unlock_queues();
1090 		return;
1091 	}
1092 	vm_pageout_disconnect_all_pages_active = TRUE;
1093 	vm_page_unlock_queues();
1094 
1095 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1096 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1097 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);
1098 
1099 	vm_pageout_disconnect_all_pages_active = FALSE;
1100 }
1101 
1102 
1103 void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t * q,int qcount)1104 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1105 {
1106 	vm_page_t       m;
1107 	vm_object_t     t_object = NULL;
1108 	vm_object_t     l_object = NULL;
1109 	vm_object_t     m_object = NULL;
1110 	int             delayed_unlock = 0;
1111 	int             try_failed_count = 0;
1112 	int             disconnected_count = 0;
1113 	int             paused_count = 0;
1114 	int             object_locked_count = 0;
1115 
1116 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
1117 	    q, qcount, 0, 0, 0);
1118 
1119 	vm_page_lock_queues();
1120 
1121 	while (qcount && !vm_page_queue_empty(q)) {
1122 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1123 
1124 		m = (vm_page_t) vm_page_queue_first(q);
1125 		m_object = VM_PAGE_OBJECT(m);
1126 
1127 		/*
1128 		 * check to see if we currently are working
1129 		 * with the same object... if so, we've
1130 		 * already got the lock
1131 		 */
1132 		if (m_object != l_object) {
1133 			/*
1134 			 * the object associated with candidate page is
1135 			 * different from the one we were just working
1136 			 * with... dump the lock if we still own it
1137 			 */
1138 			if (l_object != NULL) {
1139 				vm_object_unlock(l_object);
1140 				l_object = NULL;
1141 			}
1142 			if (m_object != t_object) {
1143 				try_failed_count = 0;
1144 			}
1145 
1146 			/*
1147 			 * Try to lock object; since we've alread got the
1148 			 * page queues lock, we can only 'try' for this one.
1149 			 * if the 'try' fails, we need to do a mutex_pause
1150 			 * to allow the owner of the object lock a chance to
1151 			 * run...
1152 			 */
1153 			if (!vm_object_lock_try_scan(m_object)) {
1154 				if (try_failed_count > 20) {
1155 					goto reenter_pg_on_q;
1156 				}
1157 				vm_page_unlock_queues();
1158 				mutex_pause(try_failed_count++);
1159 				vm_page_lock_queues();
1160 				delayed_unlock = 0;
1161 
1162 				paused_count++;
1163 
1164 				t_object = m_object;
1165 				continue;
1166 			}
1167 			object_locked_count++;
1168 
1169 			l_object = m_object;
1170 		}
1171 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
1172 			/*
1173 			 * put it back on the head of its queue
1174 			 */
1175 			goto reenter_pg_on_q;
1176 		}
1177 		if (m->vmp_pmapped == TRUE) {
1178 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1179 
1180 			disconnected_count++;
1181 		}
1182 reenter_pg_on_q:
1183 		vm_page_queue_remove(q, m, vmp_pageq);
1184 		vm_page_queue_enter(q, m, vmp_pageq);
1185 
1186 		qcount--;
1187 		try_failed_count = 0;
1188 
1189 		if (delayed_unlock++ > 128) {
1190 			if (l_object != NULL) {
1191 				vm_object_unlock(l_object);
1192 				l_object = NULL;
1193 			}
1194 			lck_mtx_yield(&vm_page_queue_lock);
1195 			delayed_unlock = 0;
1196 		}
1197 	}
1198 	if (l_object != NULL) {
1199 		vm_object_unlock(l_object);
1200 		l_object = NULL;
1201 	}
1202 	vm_page_unlock_queues();
1203 
1204 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
1205 	    q, disconnected_count, object_locked_count, paused_count, 0);
1206 }
1207 
1208 extern char* proc_best_name(struct proc* proc);
1209 
1210 int
vm_toggle_task_selfdonate_pages(task_t task)1211 vm_toggle_task_selfdonate_pages(task_t task)
1212 {
1213 	int state = 0;
1214 	if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1215 		printf("VM Donation mode is OFF on the system\n");
1216 		return state;
1217 	}
1218 	if (task != kernel_task) {
1219 		task_lock(task);
1220 		if (!task->donates_own_pages) {
1221 			printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1222 			task->donates_own_pages = true;
1223 			state = 1;
1224 		} else if (task->donates_own_pages) {
1225 			printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1226 			task->donates_own_pages = false;
1227 			state = 0;
1228 		}
1229 		task_unlock(task);
1230 	}
1231 	return state;
1232 }
1233 #endif /* DEVELOPMENT || DEBUG */
1234 
1235 void
vm_task_set_selfdonate_pages(task_t task,bool donate)1236 vm_task_set_selfdonate_pages(task_t task, bool donate)
1237 {
1238 	assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
1239 	assert(task != kernel_task);
1240 
1241 	task_lock(task);
1242 	task->donates_own_pages = donate;
1243 	task_unlock(task);
1244 }
1245 
1246 
1247 
1248 static size_t
1249 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1250 
1251 /*
1252  * condition variable used to make sure there is
1253  * only a single sweep going on at a time
1254  */
1255 boolean_t       vm_pageout_anonymous_pages_active = FALSE;
1256 
1257 
1258 void
vm_pageout_anonymous_pages()1259 vm_pageout_anonymous_pages()
1260 {
1261 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1262 		vm_page_lock_queues();
1263 
1264 		if (vm_pageout_anonymous_pages_active == TRUE) {
1265 			vm_page_unlock_queues();
1266 			return;
1267 		}
1268 		vm_pageout_anonymous_pages_active = TRUE;
1269 		vm_page_unlock_queues();
1270 
1271 		vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1272 		vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1273 		vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1274 
1275 		if (VM_CONFIG_SWAP_IS_PRESENT) {
1276 			vm_consider_swapping();
1277 		}
1278 
1279 		vm_page_lock_queues();
1280 		vm_pageout_anonymous_pages_active = FALSE;
1281 		vm_page_unlock_queues();
1282 	}
1283 }
1284 
1285 
1286 size_t
vm_pageout_page_queue(vm_page_queue_head_t * q,size_t qcount,bool perf_test)1287 vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
1288 {
1289 	vm_page_t       m;
1290 	vm_object_t     t_object = NULL;
1291 	vm_object_t     l_object = NULL;
1292 	vm_object_t     m_object = NULL;
1293 	int             delayed_unlock = 0;
1294 	int             try_failed_count = 0;
1295 	int             refmod_state;
1296 	int             pmap_options;
1297 	struct          vm_pageout_queue *iq;
1298 	ppnum_t         phys_page;
1299 	size_t          pages_moved = 0;
1300 
1301 
1302 	iq = &vm_pageout_queue_internal;
1303 
1304 	vm_page_lock_queues();
1305 
1306 #if DEVELOPMENT || DEBUG
1307 	if (perf_test) {
1308 		iq = &vm_pageout_queue_benchmark;
1309 		// ensure the benchmark queue isn't throttled
1310 		iq->pgo_maxlaundry = (unsigned int) qcount;
1311 	}
1312 #endif /* DEVELOPMENT ||DEBUG */
1313 
1314 	while (qcount && !vm_page_queue_empty(q)) {
1315 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1316 
1317 		if (VM_PAGE_Q_THROTTLED(iq)) {
1318 			if (l_object != NULL) {
1319 				vm_object_unlock(l_object);
1320 				l_object = NULL;
1321 			}
1322 			iq->pgo_draining = TRUE;
1323 
1324 			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1325 			vm_page_unlock_queues();
1326 
1327 			thread_block(THREAD_CONTINUE_NULL);
1328 
1329 			vm_page_lock_queues();
1330 			delayed_unlock = 0;
1331 			continue;
1332 		}
1333 		m = (vm_page_t) vm_page_queue_first(q);
1334 		m_object = VM_PAGE_OBJECT(m);
1335 
1336 		/*
1337 		 * check to see if we currently are working
1338 		 * with the same object... if so, we've
1339 		 * already got the lock
1340 		 */
1341 		if (m_object != l_object) {
1342 			if (!m_object->internal) {
1343 				goto reenter_pg_on_q;
1344 			}
1345 
1346 			/*
1347 			 * the object associated with candidate page is
1348 			 * different from the one we were just working
1349 			 * with... dump the lock if we still own it
1350 			 */
1351 			if (l_object != NULL) {
1352 				vm_object_unlock(l_object);
1353 				l_object = NULL;
1354 			}
1355 			if (m_object != t_object) {
1356 				try_failed_count = 0;
1357 			}
1358 
1359 			/*
1360 			 * Try to lock object; since we've alread got the
1361 			 * page queues lock, we can only 'try' for this one.
1362 			 * if the 'try' fails, we need to do a mutex_pause
1363 			 * to allow the owner of the object lock a chance to
1364 			 * run...
1365 			 */
1366 			if (!vm_object_lock_try_scan(m_object)) {
1367 				if (try_failed_count > 20) {
1368 					goto reenter_pg_on_q;
1369 				}
1370 				vm_page_unlock_queues();
1371 				mutex_pause(try_failed_count++);
1372 				vm_page_lock_queues();
1373 				delayed_unlock = 0;
1374 
1375 				t_object = m_object;
1376 				continue;
1377 			}
1378 			l_object = m_object;
1379 		}
1380 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
1381 			/*
1382 			 * page is not to be cleaned
1383 			 * put it back on the head of its queue
1384 			 */
1385 			goto reenter_pg_on_q;
1386 		}
1387 		phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1388 
1389 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
1390 			refmod_state = pmap_get_refmod(phys_page);
1391 
1392 			if (refmod_state & VM_MEM_REFERENCED) {
1393 				m->vmp_reference = TRUE;
1394 			}
1395 			if (refmod_state & VM_MEM_MODIFIED) {
1396 				SET_PAGE_DIRTY(m, FALSE);
1397 			}
1398 		}
1399 		if (m->vmp_reference == TRUE) {
1400 			m->vmp_reference = FALSE;
1401 			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1402 			goto reenter_pg_on_q;
1403 		}
1404 		if (m->vmp_pmapped == TRUE) {
1405 			if (m->vmp_dirty || m->vmp_precious) {
1406 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
1407 			} else {
1408 				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1409 			}
1410 			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1411 			if (refmod_state & VM_MEM_MODIFIED) {
1412 				SET_PAGE_DIRTY(m, FALSE);
1413 			}
1414 		}
1415 
1416 		if (!m->vmp_dirty && !m->vmp_precious) {
1417 			vm_page_unlock_queues();
1418 			VM_PAGE_FREE(m);
1419 			vm_page_lock_queues();
1420 			delayed_unlock = 0;
1421 
1422 			goto next_pg;
1423 		}
1424 		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1425 			if (!m_object->pager_initialized) {
1426 				vm_page_unlock_queues();
1427 
1428 				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1429 
1430 				if (!m_object->pager_initialized) {
1431 					vm_object_compressor_pager_create(m_object);
1432 				}
1433 
1434 				vm_page_lock_queues();
1435 				delayed_unlock = 0;
1436 			}
1437 			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1438 				goto reenter_pg_on_q;
1439 			}
1440 			/*
1441 			 * vm_object_compressor_pager_create will drop the object lock
1442 			 * which means 'm' may no longer be valid to use
1443 			 */
1444 			continue;
1445 		}
1446 
1447 		if (!perf_test) {
1448 			/*
1449 			 * we've already factored out pages in the laundry which
1450 			 * means this page can't be on the pageout queue so it's
1451 			 * safe to do the vm_page_queues_remove
1452 			 */
1453 			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
1454 			vm_page_queues_remove(m, TRUE);
1455 			if (donate) {
1456 				/*
1457 				 * The compressor needs to see this bit to know
1458 				 * where this page needs to land. Also if stolen,
1459 				 * this bit helps put the page back in the right
1460 				 * special queue where it belongs.
1461 				 */
1462 				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
1463 			}
1464 		} else {
1465 			vm_page_queue_remove(q, m, vmp_pageq);
1466 		}
1467 
1468 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1469 
1470 		vm_pageout_cluster_to_queue(m, iq);
1471 
1472 		pages_moved++;
1473 		goto next_pg;
1474 
1475 reenter_pg_on_q:
1476 		vm_page_queue_remove(q, m, vmp_pageq);
1477 		vm_page_queue_enter(q, m, vmp_pageq);
1478 next_pg:
1479 		qcount--;
1480 		try_failed_count = 0;
1481 
1482 		if (delayed_unlock++ > 128) {
1483 			if (l_object != NULL) {
1484 				vm_object_unlock(l_object);
1485 				l_object = NULL;
1486 			}
1487 			lck_mtx_yield(&vm_page_queue_lock);
1488 			delayed_unlock = 0;
1489 		}
1490 	}
1491 	if (l_object != NULL) {
1492 		vm_object_unlock(l_object);
1493 		l_object = NULL;
1494 	}
1495 	vm_page_unlock_queues();
1496 	return pages_moved;
1497 }
1498 
1499 
1500 
1501 /*
1502  * function in BSD to apply I/O throttle to the pageout thread
1503  */
1504 extern void vm_pageout_io_throttle(void);
1505 
/*
 * VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)
 *
 * If a "reusable" page somehow made it back into the active queue,
 * it's been re-used and is not quite re-usable.
 * If the VM object was "all_reusable", consider it as "all re-used"
 * instead of converting it to "partially re-used", which could be
 * expensive.
 *
 * "obj" must be the object owning "m" (asserted).  Both arguments
 * are evaluated more than once.
 */
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
	MACRO_BEGIN                                                     \
	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
	if ((m)->vmp_reusable || (obj)->all_reusable) {                 \
	        vm_object_reuse_pages((obj),                            \
	            (m)->vmp_offset,                                    \
	            (m)->vmp_offset + PAGE_SIZE_64,                     \
	            FALSE);                                             \
	}                                                               \
	MACRO_END
1525 
1526 
1527 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT         64
1528 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024
1529 
1530 #define FCS_IDLE                0
1531 #define FCS_DELAYED             1
1532 #define FCS_DEADLOCK_DETECTED   2
1533 
1534 struct flow_control {
1535 	int             state;
1536 	mach_timespec_t ts;
1537 };
1538 
1539 
1540 uint64_t vm_pageout_rejected_bq_internal = 0;
1541 uint64_t vm_pageout_rejected_bq_external = 0;
1542 uint64_t vm_pageout_skipped_bq_internal = 0;
1543 uint64_t vm_pageout_skipped_bq_external = 0;
1544 
1545 #define ANONS_GRABBED_LIMIT     2
1546 
1547 
1548 #if 0
1549 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1550 #endif
1551 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1552 
1553 #define VM_PAGEOUT_PB_NO_ACTION                         0
1554 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1555 #define VM_PAGEOUT_PB_THREAD_YIELD                      2
1556 
1557 
1558 #if 0
1559 static void
1560 vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
1561 {
1562 	if (*local_freeq) {
1563 		vm_page_unlock_queues();
1564 
1565 		VM_DEBUG_CONSTANT_EVENT(
1566 			vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1567 			vm_page_free_count, 0, 0, 1);
1568 
1569 		vm_page_free_list(*local_freeq, TRUE);
1570 
1571 		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1572 		    vm_page_free_count, *local_freed, 0, 1);
1573 
1574 		*local_freeq = NULL;
1575 		*local_freed = 0;
1576 
1577 		vm_page_lock_queues();
1578 	} else {
1579 		lck_mtx_yield(&vm_page_queue_lock);
1580 	}
1581 	*delayed_unlock = 1;
1582 }
1583 #endif
1584 
1585 
1586 static void
vm_pageout_prepare_to_block(vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int action)1587 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1588     vm_page_t *local_freeq, int *local_freed, int action)
1589 {
1590 	vm_page_unlock_queues();
1591 
1592 	if (*object != NULL) {
1593 		vm_object_unlock(*object);
1594 		*object = NULL;
1595 	}
1596 	if (*local_freeq) {
1597 		vm_page_free_list(*local_freeq, TRUE);
1598 
1599 		*local_freeq = NULL;
1600 		*local_freed = 0;
1601 	}
1602 	*delayed_unlock = 1;
1603 
1604 	switch (action) {
1605 	case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1606 		vm_consider_waking_compactor_swapper();
1607 		break;
1608 	case VM_PAGEOUT_PB_THREAD_YIELD:
1609 		thread_yield_internal(1);
1610 		break;
1611 	case VM_PAGEOUT_PB_NO_ACTION:
1612 	default:
1613 		break;
1614 	}
1615 	vm_page_lock_queues();
1616 }
1617 
1618 
1619 static struct vm_pageout_vminfo last;
1620 
1621 uint64_t last_vm_page_pages_grabbed = 0;
1622 
1623 extern  uint32_t c_segment_pages_compressed;
1624 
1625 extern uint64_t shared_region_pager_reclaimed;
1626 extern struct memory_object_pager_ops shared_region_pager_ops;
1627 
1628 void
update_vm_info(void)1629 update_vm_info(void)
1630 {
1631 	unsigned long tmp;
1632 	uint64_t tmp64;
1633 
1634 	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1635 	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1636 	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1637 	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1638 
1639 	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1640 	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1641 	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1642 
1643 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1644 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1645 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1646 	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1647 	vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1648 
1649 	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1650 	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1651 	last.vm_pageout_considered_page = tmp;
1652 
1653 	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1654 	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1655 	last.vm_pageout_compressions = tmp64;
1656 
1657 	tmp = vm_pageout_vminfo.vm_compressor_failed;
1658 	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1659 	last.vm_compressor_failed = tmp;
1660 
1661 	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1662 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1663 	last.vm_compressor_pages_grabbed = tmp64;
1664 
1665 	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1666 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1667 	last.vm_phantom_cache_found_ghost = tmp;
1668 
1669 	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1670 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1671 	last.vm_phantom_cache_added_ghost = tmp;
1672 
1673 	tmp64 = counter_load(&vm_page_grab_count);
1674 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1675 	last_vm_page_pages_grabbed = tmp64;
1676 
1677 	tmp = vm_pageout_vminfo.vm_page_pages_freed;
1678 	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1679 	last.vm_page_pages_freed = tmp;
1680 
1681 	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1682 		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1683 		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1684 		last.vm_pageout_pages_evicted = tmp;
1685 
1686 		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1687 		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1688 		last.vm_pageout_pages_purged = tmp;
1689 
1690 		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1691 		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1692 		last.vm_pageout_freed_speculative = tmp;
1693 
1694 		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1695 		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1696 		last.vm_pageout_freed_external = tmp;
1697 
1698 		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1699 		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1700 		last.vm_pageout_inactive_referenced = tmp;
1701 
1702 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1703 		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1704 		last.vm_pageout_scan_inactive_throttled_external = tmp;
1705 
1706 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1707 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1708 		last.vm_pageout_inactive_dirty_external = tmp;
1709 
1710 		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1711 		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1712 		last.vm_pageout_freed_cleaned = tmp;
1713 
1714 		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1715 		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1716 		last.vm_pageout_inactive_nolock = tmp;
1717 
1718 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1719 		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1720 		last.vm_pageout_scan_inactive_throttled_internal = tmp;
1721 
1722 		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1723 		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1724 		last.vm_pageout_skipped_external = tmp;
1725 
1726 		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1727 		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1728 		last.vm_pageout_skipped_internal = tmp;
1729 
1730 		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1731 		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1732 		last.vm_pageout_reactivation_limit_exceeded = tmp;
1733 
1734 		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1735 		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1736 		last.vm_pageout_inactive_force_reclaim = tmp;
1737 
1738 		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1739 		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1740 		last.vm_pageout_freed_internal = tmp;
1741 
1742 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1743 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1744 		last.vm_pageout_considered_bq_internal = tmp;
1745 
1746 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1747 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1748 		last.vm_pageout_considered_bq_external = tmp;
1749 
1750 		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1751 		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1752 		last.vm_pageout_filecache_min_reactivated = tmp;
1753 
1754 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1755 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1756 		last.vm_pageout_inactive_dirty_internal = tmp;
1757 
1758 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1759 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1760 		last.vm_pageout_forcereclaimed_sharedcache = tmp;
1761 
1762 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1763 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1764 		last.vm_pageout_forcereclaimed_realtime = tmp;
1765 
1766 		tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1767 		vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1768 		last.vm_pageout_protected_sharedcache = tmp;
1769 
1770 		tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1771 		vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1772 		last.vm_pageout_protected_realtime = tmp;
1773 	}
1774 
1775 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
1776 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1777 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1778 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1779 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
1780 	    0);
1781 
1782 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
1783 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1784 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1785 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
1786 	    0,
1787 	    0);
1788 
1789 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
1790 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1791 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1792 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1793 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
1794 	    0);
1795 
1796 	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1797 	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1798 	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1799 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
1800 		    vm_pageout_stats[vm_pageout_stat_now].considered,
1801 		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1802 		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
1803 		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
1804 		    0);
1805 
1806 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
1807 		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1808 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1809 		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1810 		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
1811 		    0);
1812 
1813 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
1814 		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1815 		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1816 		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1817 		    vm_pageout_stats[vm_pageout_stat_now].skipped_external,
1818 		    0);
1819 
1820 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
1821 		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1822 		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1823 		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1824 		    vm_pageout_stats[vm_pageout_stat_now].freed_internal,
1825 		    0);
1826 
1827 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
1828 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1829 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1830 		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1831 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
1832 		    0);
1833 
1834 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO10)) | DBG_FUNC_NONE,
1835 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1836 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1837 		    vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1838 		    vm_pageout_stats[vm_pageout_stat_now].protected_realtime,
1839 		    0);
1840 	}
1841 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
1842 	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1843 	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1844 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1845 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
1846 	    0);
1847 
1848 	record_memory_pressure();
1849 }
1850 
1851 extern boolean_t hibernation_vmqueues_inspection;
1852 
/*
 * Flow-control codes that the vps_* helpers hand back to vm_pageout_scan.
 *
 * VM_PAGEOUT_SCAN_PROCEED
 *	Nothing special happened; vm_pageout_scan keeps making forward
 *	progress through the current pass.
 * VM_PAGEOUT_SCAN_DONE_RETURN
 *	Page demand is satisfied and the work is done; vm_pageout_scan
 *	returns.
 * VM_PAGEOUT_SCAN_NEXT_ITERATION
 *	Restart the 'for' loop in vm_pageout_scan (i.e. 'continue').
 */

#define VM_PAGEOUT_SCAN_PROCEED         (0)
#define VM_PAGEOUT_SCAN_DONE_RETURN     (1)
#define VM_PAGEOUT_SCAN_NEXT_ITERATION  (2)
1865 
1866 /*
1867  * This function is called only from vm_pageout_scan and
1868  * it moves overflow secluded pages (one-at-a-time) to the
1869  * batched 'local' free Q or active Q.
1870  */
1871 static void
vps_deal_with_secluded_page_overflow(vm_page_t * local_freeq,int * local_freed)1872 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1873 {
1874 #if CONFIG_SECLUDED_MEMORY
1875 	/*
1876 	 * Deal with secluded_q overflow.
1877 	 */
1878 	if (vm_page_secluded_count > vm_page_secluded_target) {
1879 		vm_page_t secluded_page;
1880 
1881 		/*
1882 		 * SECLUDED_AGING_BEFORE_ACTIVE:
1883 		 * Excess secluded pages go to the active queue and
1884 		 * will later go to the inactive queue.
1885 		 */
1886 		assert((vm_page_secluded_count_free +
1887 		    vm_page_secluded_count_inuse) ==
1888 		    vm_page_secluded_count);
1889 		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1890 		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1891 
1892 		vm_page_queues_remove(secluded_page, FALSE);
1893 		assert(!secluded_page->vmp_fictitious);
1894 		assert(!VM_PAGE_WIRED(secluded_page));
1895 
1896 		if (secluded_page->vmp_object == 0) {
1897 			/* transfer to free queue */
1898 			assert(secluded_page->vmp_busy);
1899 			secluded_page->vmp_snext = *local_freeq;
1900 			*local_freeq = secluded_page;
1901 			*local_freed += 1;
1902 		} else {
1903 			/* transfer to head of active queue */
1904 			vm_page_enqueue_active(secluded_page, FALSE);
1905 			secluded_page = VM_PAGE_NULL;
1906 		}
1907 	}
1908 #else /* CONFIG_SECLUDED_MEMORY */
1909 
1910 #pragma unused(local_freeq)
1911 #pragma unused(local_freed)
1912 
1913 	return;
1914 
1915 #endif /* CONFIG_SECLUDED_MEMORY */
1916 }
1917 
1918 /*
1919  * This function is called only from vm_pageout_scan and
1920  * it initializes the loop targets for vm_pageout_scan().
1921  */
1922 static void
vps_init_page_targets(void)1923 vps_init_page_targets(void)
1924 {
1925 	/*
1926 	 * LD TODO: Other page targets should be calculated here too.
1927 	 */
1928 	vm_page_anonymous_min = vm_page_inactive_target / 20;
1929 
1930 	if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1931 		vm_pageout_state.vm_page_speculative_percentage = 50;
1932 	} else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1933 		vm_pageout_state.vm_page_speculative_percentage = 1;
1934 	}
1935 
1936 	vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1937 	    vm_page_inactive_count);
1938 }
1939 
1940 /*
1941  * This function is called only from vm_pageout_scan and
1942  * it purges a single VM object at-a-time and will either
1943  * make vm_pageout_scan() restart the loop or keeping moving forward.
1944  */
1945 static int
vps_purge_object()1946 vps_purge_object()
1947 {
1948 	int             force_purge;
1949 
1950 	assert(available_for_purge >= 0);
1951 	force_purge = 0; /* no force-purging */
1952 
1953 #if VM_PRESSURE_EVENTS
1954 	vm_pressure_level_t pressure_level;
1955 
1956 	pressure_level = memorystatus_vm_pressure_level;
1957 
1958 	if (pressure_level > kVMPressureNormal) {
1959 		if (pressure_level >= kVMPressureCritical) {
1960 			force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1961 		} else if (pressure_level >= kVMPressureUrgent) {
1962 			force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
1963 		} else if (pressure_level >= kVMPressureWarning) {
1964 			force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1965 		}
1966 	}
1967 #endif /* VM_PRESSURE_EVENTS */
1968 
1969 	if (available_for_purge || force_purge) {
1970 		memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
1971 
1972 		VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1973 		if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
1974 			VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
1975 			VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1976 			memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1977 
1978 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1979 		}
1980 		VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
1981 		memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1982 	}
1983 
1984 	return VM_PAGEOUT_SCAN_PROCEED;
1985 }
1986 
1987 /*
1988  * This function is called only from vm_pageout_scan and
1989  * it will try to age the next speculative Q if the oldest
1990  * one is empty.
1991  */
1992 static int
vps_age_speculative_queue(boolean_t force_speculative_aging)1993 vps_age_speculative_queue(boolean_t force_speculative_aging)
1994 {
1995 #define DELAY_SPECULATIVE_AGE   1000
1996 
1997 	/*
1998 	 * try to pull pages from the aging bins...
1999 	 * see vm_page.h for an explanation of how
2000 	 * this mechanism works
2001 	 */
2002 	boolean_t                       can_steal = FALSE;
2003 	int                             num_scanned_queues;
2004 	static int                      delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/
2005 	mach_timespec_t                 ts;
2006 	struct vm_speculative_age_q     *aq;
2007 	struct vm_speculative_age_q     *sq;
2008 
2009 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2010 
2011 	aq = &vm_page_queue_speculative[speculative_steal_index];
2012 
2013 	num_scanned_queues = 0;
2014 	while (vm_page_queue_empty(&aq->age_q) &&
2015 	    num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2016 		speculative_steal_index++;
2017 
2018 		if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2019 			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2020 		}
2021 
2022 		aq = &vm_page_queue_speculative[speculative_steal_index];
2023 	}
2024 
2025 	if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
2026 		/*
2027 		 * XXX We've scanned all the speculative
2028 		 * queues but still haven't found one
2029 		 * that is not empty, even though
2030 		 * vm_page_speculative_count is not 0.
2031 		 */
2032 		if (!vm_page_queue_empty(&sq->age_q)) {
2033 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2034 		}
2035 #if DEVELOPMENT || DEBUG
2036 		panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2037 #endif
2038 		/* readjust... */
2039 		vm_page_speculative_count = 0;
2040 		/* ... and continue */
2041 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2042 	}
2043 
2044 	if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
2045 		can_steal = TRUE;
2046 	} else {
2047 		if (!delay_speculative_age) {
2048 			mach_timespec_t ts_fully_aged;
2049 
2050 			ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
2051 			ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
2052 			    * 1000 * NSEC_PER_USEC;
2053 
2054 			ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2055 
2056 			clock_sec_t sec;
2057 			clock_nsec_t nsec;
2058 			clock_get_system_nanotime(&sec, &nsec);
2059 			ts.tv_sec = (unsigned int) sec;
2060 			ts.tv_nsec = nsec;
2061 
2062 			if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
2063 				can_steal = TRUE;
2064 			} else {
2065 				delay_speculative_age++;
2066 			}
2067 		} else {
2068 			delay_speculative_age++;
2069 			if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
2070 				delay_speculative_age = 0;
2071 			}
2072 		}
2073 	}
2074 	if (can_steal == TRUE) {
2075 		vm_page_speculate_ageit(aq);
2076 	}
2077 
2078 	return VM_PAGEOUT_SCAN_PROCEED;
2079 }
2080 
2081 /*
2082  * This function is called only from vm_pageout_scan and
2083  * it evicts a single VM object from the cache.
2084  */
2085 static int inline
vps_object_cache_evict(vm_object_t * object_to_unlock)2086 vps_object_cache_evict(vm_object_t *object_to_unlock)
2087 {
2088 	static int                      cache_evict_throttle = 0;
2089 	struct vm_speculative_age_q     *sq;
2090 
2091 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2092 
2093 	if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2094 		int     pages_evicted;
2095 
2096 		if (*object_to_unlock != NULL) {
2097 			vm_object_unlock(*object_to_unlock);
2098 			*object_to_unlock = NULL;
2099 		}
2100 		KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
2101 
2102 		pages_evicted = vm_object_cache_evict(100, 10);
2103 
2104 		KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
2105 
2106 		if (pages_evicted) {
2107 			vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2108 
2109 			VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2110 			    vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2111 			memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2112 
2113 			/*
2114 			 * we just freed up to 100 pages,
2115 			 * so go back to the top of the main loop
2116 			 * and re-evaulate the memory situation
2117 			 */
2118 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2119 		} else {
2120 			cache_evict_throttle = 1000;
2121 		}
2122 	}
2123 	if (cache_evict_throttle) {
2124 		cache_evict_throttle--;
2125 	}
2126 
2127 	return VM_PAGEOUT_SCAN_PROCEED;
2128 }
2129 
2130 
2131 /*
2132  * This function is called only from vm_pageout_scan and
2133  * it calculates the filecache min. that needs to be maintained
2134  * as we start to steal pages.
2135  */
2136 static void
vps_calculate_filecache_min(void)2137 vps_calculate_filecache_min(void)
2138 {
2139 	int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2140 
2141 #if CONFIG_JETSAM
2142 	/*
2143 	 * don't let the filecache_min fall below 15% of available memory
2144 	 * on systems with an active compressor that isn't nearing its
2145 	 * limits w/r to accepting new data
2146 	 *
2147 	 * on systems w/o the compressor/swapper, the filecache is always
2148 	 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2149 	 * since most (if not all) of the anonymous pages are in the
2150 	 * throttled queue (which isn't counted as available) which
2151 	 * effectively disables this filter
2152 	 */
2153 	if (vm_compressor_low_on_space() || divisor == 0) {
2154 		vm_pageout_state.vm_page_filecache_min = 0;
2155 	} else {
2156 		vm_pageout_state.vm_page_filecache_min =
2157 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2158 	}
2159 #else
2160 	if (vm_compressor_out_of_space() || divisor == 0) {
2161 		vm_pageout_state.vm_page_filecache_min = 0;
2162 	} else {
2163 		/*
2164 		 * don't let the filecache_min fall below the specified critical level
2165 		 */
2166 		vm_pageout_state.vm_page_filecache_min =
2167 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2168 	}
2169 #endif
2170 	if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2171 		vm_pageout_state.vm_page_filecache_min = 0;
2172 	}
2173 }
2174 
2175 /*
2176  * This function is called only from vm_pageout_scan and
2177  * it updates the flow control time to detect if VM pageoutscan
2178  * isn't making progress.
2179  */
2180 static void
vps_flow_control_reset_deadlock_timer(struct flow_control * flow_control)2181 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2182 {
2183 	mach_timespec_t ts;
2184 	clock_sec_t sec;
2185 	clock_nsec_t nsec;
2186 
2187 	ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2188 	ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2189 	clock_get_system_nanotime(&sec, &nsec);
2190 	flow_control->ts.tv_sec = (unsigned int) sec;
2191 	flow_control->ts.tv_nsec = nsec;
2192 	ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2193 
2194 	flow_control->state = FCS_DELAYED;
2195 
2196 	vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2197 }
2198 
2199 /*
2200  * This function is called only from vm_pageout_scan and
2201  * it is the flow control logic of VM pageout scan which
2202  * controls if it should block and for how long.
2203  * Any blocking of vm_pageout_scan happens ONLY in this function.
2204  */
2205 static int
vps_flow_control(struct flow_control * flow_control,int * anons_grabbed,vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int * vm_pageout_deadlock_target,unsigned int inactive_burst_count)2206 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2207     vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2208 {
2209 	boolean_t       exceeded_burst_throttle = FALSE;
2210 	unsigned int    msecs = 0;
2211 	uint32_t        inactive_external_count;
2212 	mach_timespec_t ts;
2213 	struct  vm_pageout_queue *iq;
2214 	struct  vm_pageout_queue *eq;
2215 	struct  vm_speculative_age_q *sq;
2216 
2217 	iq = &vm_pageout_queue_internal;
2218 	eq = &vm_pageout_queue_external;
2219 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2220 
2221 	/*
2222 	 * Sometimes we have to pause:
2223 	 *	1) No inactive pages - nothing to do.
2224 	 *	2) Loop control - no acceptable pages found on the inactive queue
2225 	 *         within the last vm_pageout_burst_inactive_throttle iterations
2226 	 *	3) Flow control - default pageout queue is full
2227 	 */
2228 	if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2229 	    vm_page_queue_empty(&vm_page_queue_anonymous) &&
2230 	    vm_page_queue_empty(&vm_page_queue_cleaned) &&
2231 	    vm_page_queue_empty(&sq->age_q)) {
2232 		VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2233 		msecs = vm_pageout_state.vm_pageout_empty_wait;
2234 	} else if (inactive_burst_count >=
2235 	    MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2236 	    (vm_page_inactive_count +
2237 	    vm_page_speculative_count))) {
2238 		VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2239 		msecs = vm_pageout_state.vm_pageout_burst_wait;
2240 
2241 		exceeded_burst_throttle = TRUE;
2242 	} else if (VM_PAGE_Q_THROTTLED(iq) &&
2243 	    VM_DYNAMIC_PAGING_ENABLED()) {
2244 		clock_sec_t sec;
2245 		clock_nsec_t nsec;
2246 
2247 		switch (flow_control->state) {
2248 		case FCS_IDLE:
2249 			if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2250 			    vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2251 				/*
2252 				 * since the compressor is running independently of vm_pageout_scan
2253 				 * let's not wait for it just yet... as long as we have a healthy supply
2254 				 * of filecache pages to work with, let's keep stealing those.
2255 				 */
2256 				inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2257 
2258 				if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2259 				    (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2260 					*anons_grabbed = ANONS_GRABBED_LIMIT;
2261 					VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2262 					return VM_PAGEOUT_SCAN_PROCEED;
2263 				}
2264 			}
2265 
2266 			vps_flow_control_reset_deadlock_timer(flow_control);
2267 			msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2268 
2269 			break;
2270 
2271 		case FCS_DELAYED:
2272 			clock_get_system_nanotime(&sec, &nsec);
2273 			ts.tv_sec = (unsigned int) sec;
2274 			ts.tv_nsec = nsec;
2275 
2276 			if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2277 				/*
2278 				 * the pageout thread for the default pager is potentially
2279 				 * deadlocked since the
2280 				 * default pager queue has been throttled for more than the
2281 				 * allowable time... we need to move some clean pages or dirty
2282 				 * pages belonging to the external pagers if they aren't throttled
2283 				 * vm_page_free_wanted represents the number of threads currently
2284 				 * blocked waiting for pages... we'll move one page for each of
2285 				 * these plus a fixed amount to break the logjam... once we're done
2286 				 * moving this number of pages, we'll re-enter the FSC_DELAYED state
2287 				 * with a new timeout target since we have no way of knowing
2288 				 * whether we've broken the deadlock except through observation
2289 				 * of the queue associated with the default pager... we need to
2290 				 * stop moving pages and allow the system to run to see what
2291 				 * state it settles into.
2292 				 */
2293 
2294 				*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2295 				    vm_page_free_wanted + vm_page_free_wanted_privileged;
2296 				VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2297 				flow_control->state = FCS_DEADLOCK_DETECTED;
2298 				thread_wakeup(VM_PAGEOUT_GC_EVENT);
2299 				return VM_PAGEOUT_SCAN_PROCEED;
2300 			}
2301 			/*
2302 			 * just resniff instead of trying
2303 			 * to compute a new delay time... we're going to be
2304 			 * awakened immediately upon a laundry completion,
2305 			 * so we won't wait any longer than necessary
2306 			 */
2307 			msecs = vm_pageout_state.vm_pageout_idle_wait;
2308 			break;
2309 
2310 		case FCS_DEADLOCK_DETECTED:
2311 			if (*vm_pageout_deadlock_target) {
2312 				return VM_PAGEOUT_SCAN_PROCEED;
2313 			}
2314 
2315 			vps_flow_control_reset_deadlock_timer(flow_control);
2316 			msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2317 
2318 			break;
2319 		}
2320 	} else {
2321 		/*
2322 		 * No need to pause...
2323 		 */
2324 		return VM_PAGEOUT_SCAN_PROCEED;
2325 	}
2326 
2327 	vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2328 
2329 	vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2330 	    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2331 
2332 	if (vm_page_free_count >= vm_page_free_target) {
2333 		/*
2334 		 * we're here because
2335 		 *  1) someone else freed up some pages while we had
2336 		 *     the queues unlocked above
2337 		 * and we've hit one of the 3 conditions that
2338 		 * cause us to pause the pageout scan thread
2339 		 *
2340 		 * since we already have enough free pages,
2341 		 * let's avoid stalling and return normally
2342 		 *
2343 		 * before we return, make sure the pageout I/O threads
2344 		 * are running throttled in case there are still requests
2345 		 * in the laundry... since we have enough free pages
2346 		 * we don't need the laundry to be cleaned in a timely
2347 		 * fashion... so let's avoid interfering with foreground
2348 		 * activity
2349 		 *
2350 		 * we don't want to hold vm_page_queue_free_lock when
2351 		 * calling vm_pageout_adjust_eq_iothrottle (since it
2352 		 * may cause other locks to be taken), we do the intitial
2353 		 * check outside of the lock.  Once we take the lock,
2354 		 * we recheck the condition since it may have changed.
2355 		 * if it has, no problem, we will make the threads
2356 		 * non-throttled before actually blocking
2357 		 */
2358 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
2359 	}
2360 	vm_free_page_lock();
2361 
2362 	if (vm_page_free_count >= vm_page_free_target &&
2363 	    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2364 		return VM_PAGEOUT_SCAN_DONE_RETURN;
2365 	}
2366 	vm_free_page_unlock();
2367 
2368 	if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2369 		/*
2370 		 * we're most likely about to block due to one of
2371 		 * the 3 conditions that cause vm_pageout_scan to
2372 		 * not be able to make forward progress w/r
2373 		 * to providing new pages to the free queue,
2374 		 * so unthrottle the I/O threads in case we
2375 		 * have laundry to be cleaned... it needs
2376 		 * to be completed ASAP.
2377 		 *
2378 		 * even if we don't block, we want the io threads
2379 		 * running unthrottled since the sum of free +
2380 		 * clean pages is still under our free target
2381 		 */
2382 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2383 	}
2384 	if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2385 		/*
2386 		 * if we get here we're below our free target and
2387 		 * we're stalling due to a full laundry queue or
2388 		 * we don't have any inactive pages other then
2389 		 * those in the clean queue...
2390 		 * however, we have pages on the clean queue that
2391 		 * can be moved to the free queue, so let's not
2392 		 * stall the pageout scan
2393 		 */
2394 		flow_control->state = FCS_IDLE;
2395 		return VM_PAGEOUT_SCAN_PROCEED;
2396 	}
2397 	if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2398 		flow_control->state = FCS_IDLE;
2399 		return VM_PAGEOUT_SCAN_PROCEED;
2400 	}
2401 
2402 	VM_CHECK_MEMORYSTATUS;
2403 
2404 	if (flow_control->state != FCS_IDLE) {
2405 		VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2406 	}
2407 
2408 	iq->pgo_throttled = TRUE;
2409 	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2410 
2411 	vm_page_unlock_queues();
2412 
2413 	assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2414 
2415 	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2416 	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2417 	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2418 
2419 	thread_block(THREAD_CONTINUE_NULL);
2420 
2421 	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2422 	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2423 	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2424 
2425 	vm_page_lock_queues();
2426 
2427 	iq->pgo_throttled = FALSE;
2428 
2429 	vps_init_page_targets();
2430 
2431 	return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2432 }
2433 
2434 extern boolean_t vm_darkwake_mode;
2435 /*
2436  * This function is called only from vm_pageout_scan and
2437  * it will find and return the most appropriate page to be
2438  * reclaimed.
2439  */
2440 static int
vps_choose_victim_page(vm_page_t * victim_page,int * anons_grabbed,boolean_t * grab_anonymous,boolean_t force_anonymous,boolean_t * is_page_from_bg_q,unsigned int * reactivated_this_call)2441 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2442     boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2443 {
2444 	vm_page_t                       m = NULL;
2445 	vm_object_t                     m_object = VM_OBJECT_NULL;
2446 	uint32_t                        inactive_external_count;
2447 	struct vm_speculative_age_q     *sq;
2448 	struct vm_pageout_queue         *iq;
2449 	int                             retval = VM_PAGEOUT_SCAN_PROCEED;
2450 
2451 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2452 	iq = &vm_pageout_queue_internal;
2453 
2454 	*is_page_from_bg_q = FALSE;
2455 
2456 	m = NULL;
2457 	m_object = VM_OBJECT_NULL;
2458 
2459 	if (VM_DYNAMIC_PAGING_ENABLED()) {
2460 		assert(vm_page_throttled_count == 0);
2461 		assert(vm_page_queue_empty(&vm_page_queue_throttled));
2462 	}
2463 
2464 	/*
2465 	 * Try for a clean-queue inactive page.
2466 	 * These are pages that vm_pageout_scan tried to steal earlier, but
2467 	 * were dirty and had to be cleaned.  Pick them up now that they are clean.
2468 	 */
2469 	if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2470 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2471 
2472 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2473 
2474 		goto found_page;
2475 	}
2476 
2477 	/*
2478 	 * The next most eligible pages are ones we paged in speculatively,
2479 	 * but which have not yet been touched and have been aged out.
2480 	 */
2481 	if (!vm_page_queue_empty(&sq->age_q)) {
2482 		m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2483 
2484 		assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2485 
2486 		if (!m->vmp_dirty || force_anonymous == FALSE) {
2487 			goto found_page;
2488 		} else {
2489 			m = NULL;
2490 		}
2491 	}
2492 
2493 #if !CONFIG_JETSAM
2494 	if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2495 		if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2496 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2497 			assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2498 			goto found_page;
2499 		}
2500 	}
2501 #endif /* !CONFIG_JETSAM */
2502 
2503 	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2504 		vm_object_t     bg_m_object = NULL;
2505 
2506 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2507 
2508 		bg_m_object = VM_PAGE_OBJECT(m);
2509 
2510 		if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2511 			/*
2512 			 * This page is on the background queue
2513 			 * but not on a pageable queue OR is busy during
2514 			 * darkwake mode when the target is artificially lowered.
2515 			 * If it is busy during darkwake mode, and we don't skip it,
2516 			 * we will just swing back around and try again with the same
2517 			 * queue and might hit the same page or its neighbor in a
2518 			 * similar state. Both of these are transient states and will
2519 			 * get resolved, but, at this point let's ignore this page.
2520 			 */
2521 			if (vm_darkwake_mode && m->vmp_busy) {
2522 				if (bg_m_object->internal) {
2523 					vm_pageout_skipped_bq_internal++;
2524 				} else {
2525 					vm_pageout_skipped_bq_external++;
2526 				}
2527 			}
2528 		} else if (force_anonymous == FALSE || bg_m_object->internal) {
2529 			if (bg_m_object->internal &&
2530 			    (VM_PAGE_Q_THROTTLED(iq) ||
2531 			    vm_compressor_out_of_space() == TRUE ||
2532 			    vm_page_free_count < (vm_page_free_reserved / 4))) {
2533 				vm_pageout_skipped_bq_internal++;
2534 			} else {
2535 				*is_page_from_bg_q = TRUE;
2536 
2537 				if (bg_m_object->internal) {
2538 					vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2539 				} else {
2540 					vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2541 				}
2542 				goto found_page;
2543 			}
2544 		}
2545 	}
2546 
2547 	inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2548 
2549 	if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2550 	    (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2551 		*grab_anonymous = TRUE;
2552 		*anons_grabbed = 0;
2553 
2554 		if (VM_CONFIG_SWAP_IS_ACTIVE) {
2555 			vm_pageout_vminfo.vm_pageout_skipped_external++;
2556 		} else {
2557 			if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2558 				/*
2559 				 * No swap and we are in dangerously low levels of free memory.
2560 				 * If we keep going ahead with anonymous pages, we are going to run into a situation
2561 				 * where the compressor will be stuck waiting for free pages (if it isn't already).
2562 				 *
2563 				 * So, pick a file backed page...
2564 				 */
2565 				*grab_anonymous = FALSE;
2566 				*anons_grabbed = ANONS_GRABBED_LIMIT;
2567 				vm_pageout_vminfo.vm_pageout_skipped_internal++;
2568 			}
2569 		}
2570 		goto want_anonymous;
2571 	}
2572 	*grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2573 
2574 #if CONFIG_JETSAM
2575 	/* If the file-backed pool has accumulated
2576 	 * significantly more pages than the jetsam
2577 	 * threshold, prefer to reclaim those
2578 	 * inline to minimise compute overhead of reclaiming
2579 	 * anonymous pages.
2580 	 * This calculation does not account for the CPU local
2581 	 * external page queues, as those are expected to be
2582 	 * much smaller relative to the global pools.
2583 	 */
2584 
2585 	struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2586 
2587 	if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2588 		if (vm_page_pageable_external_count >
2589 		    vm_pageout_state.vm_page_filecache_min) {
2590 			if ((vm_page_pageable_external_count *
2591 			    vm_pageout_memorystatus_fb_factor_dr) >
2592 			    (memorystatus_available_pages_critical *
2593 			    vm_pageout_memorystatus_fb_factor_nr)) {
2594 				*grab_anonymous = FALSE;
2595 
2596 				VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2597 			}
2598 		}
2599 		if (*grab_anonymous) {
2600 			VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2601 		}
2602 	}
2603 #endif /* CONFIG_JETSAM */
2604 
2605 want_anonymous:
2606 	if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2607 		if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2608 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2609 
2610 			assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2611 			*anons_grabbed = 0;
2612 
2613 			if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2614 				if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2615 					if ((++(*reactivated_this_call) % 100)) {
2616 						vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2617 
2618 						vm_page_activate(m);
2619 						counter_inc(&vm_statistics_reactivations);
2620 #if DEVELOPMENT || DEBUG
2621 						if (*is_page_from_bg_q == TRUE) {
2622 							if (m_object->internal) {
2623 								vm_pageout_rejected_bq_internal++;
2624 							} else {
2625 								vm_pageout_rejected_bq_external++;
2626 							}
2627 						}
2628 #endif /* DEVELOPMENT || DEBUG */
2629 						vm_pageout_state.vm_pageout_inactive_used++;
2630 
2631 						m = NULL;
2632 						retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2633 
2634 						goto found_page;
2635 					}
2636 
2637 					/*
2638 					 * steal 1 of the file backed pages even if
2639 					 * we are under the limit that has been set
2640 					 * for a healthy filecache
2641 					 */
2642 				}
2643 			}
2644 			goto found_page;
2645 		}
2646 	}
2647 	if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2648 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2649 
2650 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2651 		*anons_grabbed += 1;
2652 
2653 		goto found_page;
2654 	}
2655 
2656 	m = NULL;
2657 
2658 found_page:
2659 	*victim_page = m;
2660 
2661 	return retval;
2662 }
2663 
2664 /*
2665  * This function is called only from vm_pageout_scan and
2666  * it will put a page back on the active/inactive queue
2667  * if we can't reclaim it for some reason.
2668  */
2669 static void
vps_requeue_page(vm_page_t m,int page_prev_q_state,__unused boolean_t page_from_bg_q)2670 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2671 {
2672 	if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2673 		vm_page_enqueue_inactive(m, FALSE);
2674 	} else {
2675 		vm_page_activate(m);
2676 	}
2677 
2678 #if DEVELOPMENT || DEBUG
2679 	vm_object_t m_object = VM_PAGE_OBJECT(m);
2680 
2681 	if (page_from_bg_q == TRUE) {
2682 		if (m_object->internal) {
2683 			vm_pageout_rejected_bq_internal++;
2684 		} else {
2685 			vm_pageout_rejected_bq_external++;
2686 		}
2687 	}
2688 #endif /* DEVELOPMENT || DEBUG */
2689 }
2690 
2691 /*
2692  * This function is called only from vm_pageout_scan and
2693  * it will try to grab the victim page's VM object (m_object)
2694  * which differs from the previous victim page's object (object).
2695  */
2696 static int
vps_switch_object(vm_page_t m,vm_object_t m_object,vm_object_t * object,int page_prev_q_state,boolean_t avoid_anon_pages,boolean_t page_from_bg_q)2697 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2698 {
2699 	struct vm_speculative_age_q *sq;
2700 
2701 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2702 
2703 	/*
2704 	 * the object associated with candidate page is
2705 	 * different from the one we were just working
2706 	 * with... dump the lock if we still own it
2707 	 */
2708 	if (*object != NULL) {
2709 		vm_object_unlock(*object);
2710 		*object = NULL;
2711 	}
2712 	/*
2713 	 * Try to lock object; since we've alread got the
2714 	 * page queues lock, we can only 'try' for this one.
2715 	 * if the 'try' fails, we need to do a mutex_pause
2716 	 * to allow the owner of the object lock a chance to
2717 	 * run... otherwise, we're likely to trip over this
2718 	 * object in the same state as we work our way through
2719 	 * the queue... clumps of pages associated with the same
2720 	 * object are fairly typical on the inactive and active queues
2721 	 */
2722 	if (!vm_object_lock_try_scan(m_object)) {
2723 		vm_page_t m_want = NULL;
2724 
2725 		vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2726 
2727 		if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2728 			VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2729 		}
2730 
2731 		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2732 
2733 		m->vmp_reference = FALSE;
2734 
2735 		if (!m_object->object_is_shared_cache) {
2736 			/*
2737 			 * don't apply this optimization if this is the shared cache
2738 			 * object, it's too easy to get rid of very hot and important
2739 			 * pages...
2740 			 * m->vmp_object must be stable since we hold the page queues lock...
2741 			 * we can update the scan_collisions field sans the object lock
2742 			 * since it is a separate field and this is the only spot that does
2743 			 * a read-modify-write operation and it is never executed concurrently...
2744 			 * we can asynchronously set this field to 0 when creating a UPL, so it
2745 			 * is possible for the value to be a bit non-determistic, but that's ok
2746 			 * since it's only used as a hint
2747 			 */
2748 			m_object->scan_collisions = 1;
2749 		}
2750 		if (page_from_bg_q) {
2751 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2752 		} else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2753 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2754 		} else if (!vm_page_queue_empty(&sq->age_q)) {
2755 			m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2756 		} else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2757 		    !vm_page_queue_empty(&vm_page_queue_inactive)) {
2758 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2759 		} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2760 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2761 		}
2762 
2763 		/*
2764 		 * this is the next object we're going to be interested in
2765 		 * try to make sure its available after the mutex_pause
2766 		 * returns control
2767 		 */
2768 		if (m_want) {
2769 			vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2770 		}
2771 
2772 		vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2773 
2774 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2775 	} else {
2776 		*object = m_object;
2777 		vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2778 	}
2779 
2780 	return VM_PAGEOUT_SCAN_PROCEED;
2781 }
2782 
2783 /*
2784  * This function is called only from vm_pageout_scan and
2785  * it notices that pageout scan may be rendered ineffective
2786  * due to a FS deadlock and will jetsam a process if possible.
2787  * If jetsam isn't supported, it'll move the page to the active
2788  * queue to try and get some different pages pushed onwards so
2789  * we can try to get out of this scenario.
2790  */
2791 static void
vps_deal_with_throttled_queues(vm_page_t m,vm_object_t * object,uint32_t * vm_pageout_inactive_external_forced_reactivate_limit,int * delayed_unlock,boolean_t * force_anonymous,__unused boolean_t is_page_from_bg_q)2792 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2793     int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2794 {
2795 	struct  vm_pageout_queue *eq;
2796 	vm_object_t cur_object = VM_OBJECT_NULL;
2797 
2798 	cur_object = *object;
2799 
2800 	eq = &vm_pageout_queue_external;
2801 
2802 	if (cur_object->internal == FALSE) {
2803 		/*
2804 		 * we need to break up the following potential deadlock case...
2805 		 *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2806 		 *  b) The thread doing the writing is waiting for pages while holding the truncate lock
2807 		 *  c) Most of the pages in the inactive queue belong to this file.
2808 		 *
2809 		 * we are potentially in this deadlock because...
2810 		 *  a) the external pageout queue is throttled
2811 		 *  b) we're done with the active queue and moved on to the inactive queue
2812 		 *  c) we've got a dirty external page
2813 		 *
2814 		 * since we don't know the reason for the external pageout queue being throttled we
2815 		 * must suspect that we are deadlocked, so move the current page onto the active queue
2816 		 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2817 		 *
2818 		 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2819 		 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2820 		 * pool the next time we select a victim page... if we can make enough new free pages,
2821 		 * the deadlock will break, the external pageout queue will empty and it will no longer
2822 		 * be throttled
2823 		 *
2824 		 * if we have jetsam configured, keep a count of the pages reactivated this way so
2825 		 * that we can try to find clean pages in the active/inactive queues before
2826 		 * deciding to jetsam a process
2827 		 */
2828 		vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2829 
2830 		vm_page_check_pageable_safe(m);
2831 		assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2832 		vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2833 		m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2834 		vm_page_active_count++;
2835 		vm_page_pageable_external_count++;
2836 
2837 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2838 
2839 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2840 
2841 #pragma unused(force_anonymous)
2842 
2843 		*vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2844 
2845 		if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2846 			*vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2847 			/*
2848 			 * Possible deadlock scenario so request jetsam action
2849 			 */
2850 
2851 			assert(cur_object);
2852 			vm_object_unlock(cur_object);
2853 
2854 			cur_object = VM_OBJECT_NULL;
2855 
2856 			/*
2857 			 * VM pageout scan needs to know we have dropped this lock and so set the
2858 			 * object variable we got passed in to NULL.
2859 			 */
2860 			*object = VM_OBJECT_NULL;
2861 
2862 			vm_page_unlock_queues();
2863 
2864 			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
2865 			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2866 
2867 			/* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
2868 			if (memorystatus_kill_on_VM_page_shortage() == TRUE) {
2869 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
2870 			}
2871 
2872 			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
2873 			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2874 
2875 			vm_page_lock_queues();
2876 			*delayed_unlock = 1;
2877 		}
2878 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2879 
2880 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2881 #pragma unused(delayed_unlock)
2882 
2883 		*force_anonymous = TRUE;
2884 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2885 	} else {
2886 		vm_page_activate(m);
2887 		counter_inc(&vm_statistics_reactivations);
2888 
2889 #if DEVELOPMENT || DEBUG
2890 		if (is_page_from_bg_q == TRUE) {
2891 			if (cur_object->internal) {
2892 				vm_pageout_rejected_bq_internal++;
2893 			} else {
2894 				vm_pageout_rejected_bq_external++;
2895 			}
2896 		}
2897 #endif /* DEVELOPMENT || DEBUG */
2898 
2899 		vm_pageout_state.vm_pageout_inactive_used++;
2900 	}
2901 }
2902 
2903 
2904 void
vm_page_balance_inactive(int max_to_move)2905 vm_page_balance_inactive(int max_to_move)
2906 {
2907 	vm_page_t m;
2908 
2909 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2910 
2911 	if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2912 		/*
2913 		 * It is likely that the hibernation code path is
2914 		 * dealing with these very queues as we are about
2915 		 * to move pages around in/from them and completely
2916 		 * change the linkage of the pages.
2917 		 *
2918 		 * And so we skip the rebalancing of these queues.
2919 		 */
2920 		return;
2921 	}
2922 	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2923 	    vm_page_inactive_count +
2924 	    vm_page_speculative_count);
2925 
2926 	while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2927 		VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2928 
2929 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2930 
2931 		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2932 		assert(!m->vmp_laundry);
2933 		assert(VM_PAGE_OBJECT(m) != kernel_object);
2934 		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2935 
2936 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2937 
2938 		/*
2939 		 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2940 		 *
2941 		 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2942 		 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2943 		 * new reference happens. If no futher references happen on the page after that remote TLB flushes
2944 		 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2945 		 * by pageout_scan, which is just fine since the last reference would have happened quite far
2946 		 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2947 		 * have happened before we moved the page
2948 		 */
2949 		if (m->vmp_pmapped == TRUE) {
2950 			/*
2951 			 * We might be holding the page queue lock as a
2952 			 * spin lock and clearing the "referenced" bit could
2953 			 * take a while if there are lots of mappings of
2954 			 * that page, so make sure we acquire the lock as
2955 			 * as mutex to avoid a spinlock timeout.
2956 			 */
2957 			vm_page_lockconvert_queues();
2958 			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2959 		}
2960 
2961 		/*
2962 		 * The page might be absent or busy,
2963 		 * but vm_page_deactivate can handle that.
2964 		 * FALSE indicates that we don't want a H/W clear reference
2965 		 */
2966 		vm_page_deactivate_internal(m, FALSE);
2967 	}
2968 }
2969 
2970 /*
2971  *	vm_pageout_scan does the dirty work for the pageout daemon.
2972  *	It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2973  *	held and vm_page_free_wanted == 0.
2974  */
2975 void
vm_pageout_scan(void)2976 vm_pageout_scan(void)
2977 {
2978 	unsigned int loop_count = 0;
2979 	unsigned int inactive_burst_count = 0;
2980 	unsigned int reactivated_this_call;
2981 	unsigned int reactivate_limit;
2982 	vm_page_t   local_freeq = NULL;
2983 	int         local_freed = 0;
2984 	int         delayed_unlock;
2985 	int         delayed_unlock_limit = 0;
2986 	int         refmod_state = 0;
2987 	int     vm_pageout_deadlock_target = 0;
2988 	struct  vm_pageout_queue *iq;
2989 	struct  vm_pageout_queue *eq;
2990 	struct  vm_speculative_age_q *sq;
2991 	struct  flow_control    flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
2992 	boolean_t inactive_throttled = FALSE;
2993 	vm_object_t     object = NULL;
2994 	uint32_t        inactive_reclaim_run;
2995 	boolean_t       grab_anonymous = FALSE;
2996 	boolean_t       force_anonymous = FALSE;
2997 	boolean_t       force_speculative_aging = FALSE;
2998 	int             anons_grabbed = 0;
2999 	int             page_prev_q_state = 0;
3000 	boolean_t       page_from_bg_q = FALSE;
3001 	uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
3002 	vm_object_t     m_object = VM_OBJECT_NULL;
3003 	int             retval = 0;
3004 	boolean_t       lock_yield_check = FALSE;
3005 
3006 
3007 	VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
3008 	    vm_pageout_vminfo.vm_pageout_freed_speculative,
3009 	    vm_pageout_state.vm_pageout_inactive_clean,
3010 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3011 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3012 
3013 	flow_control.state = FCS_IDLE;
3014 	iq = &vm_pageout_queue_internal;
3015 	eq = &vm_pageout_queue_external;
3016 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3017 
3018 	/* Ask the pmap layer to return any pages it no longer needs. */
3019 	pmap_release_pages_fast();
3020 
3021 	vm_page_lock_queues();
3022 
3023 	delayed_unlock = 1;
3024 
3025 	/*
3026 	 *	Calculate the max number of referenced pages on the inactive
3027 	 *	queue that we will reactivate.
3028 	 */
3029 	reactivated_this_call = 0;
3030 	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3031 	    vm_page_inactive_count);
3032 	inactive_reclaim_run = 0;
3033 
3034 	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3035 
3036 	/*
3037 	 *	We must limit the rate at which we send pages to the pagers
3038 	 *	so that we don't tie up too many pages in the I/O queues.
3039 	 *	We implement a throttling mechanism using the laundry count
3040 	 *      to limit the number of pages outstanding to the default
3041 	 *	and external pagers.  We can bypass the throttles and look
3042 	 *	for clean pages if the pageout queues don't drain in a timely
3043 	 *	fashion since this may indicate that the pageout paths are
3044 	 *	stalled waiting for memory, which only we can provide.
3045 	 */
3046 
3047 	vps_init_page_targets();
3048 	assert(object == NULL);
3049 	assert(delayed_unlock != 0);
3050 
3051 	for (;;) {
3052 		vm_page_t m;
3053 
3054 		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3055 
3056 		if (lock_yield_check) {
3057 			lock_yield_check = FALSE;
3058 
3059 			if (delayed_unlock++ > delayed_unlock_limit) {
3060 				vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3061 				    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3062 			} else if (vm_pageout_scan_wants_object) {
3063 				vm_page_unlock_queues();
3064 				mutex_pause(0);
3065 				vm_page_lock_queues();
3066 			} else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3067 				VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3068 			}
3069 		}
3070 
3071 		if (vm_upl_wait_for_pages < 0) {
3072 			vm_upl_wait_for_pages = 0;
3073 		}
3074 
3075 		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3076 
3077 		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3078 			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3079 		}
3080 
3081 		vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3082 
3083 		assert(delayed_unlock);
3084 
3085 		/*
3086 		 * maintain our balance
3087 		 */
3088 		vm_page_balance_inactive(1);
3089 
3090 
3091 		/**********************************************************************
3092 		* above this point we're playing with the active and secluded queues
3093 		* below this point we're playing with the throttling mechanisms
3094 		* and the inactive queue
3095 		**********************************************************************/
3096 
3097 		if (vm_page_free_count + local_freed >= vm_page_free_target) {
3098 			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3099 
3100 			vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3101 			    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3102 			/*
3103 			 * make sure the pageout I/O threads are running
3104 			 * throttled in case there are still requests
3105 			 * in the laundry... since we have met our targets
3106 			 * we don't need the laundry to be cleaned in a timely
3107 			 * fashion... so let's avoid interfering with foreground
3108 			 * activity
3109 			 */
3110 			vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3111 
3112 			vm_free_page_lock();
3113 
3114 			if ((vm_page_free_count >= vm_page_free_target) &&
3115 			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3116 				/*
3117 				 * done - we have met our target *and*
3118 				 * there is no one waiting for a page.
3119 				 */
3120 return_from_scan:
3121 				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3122 
3123 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3124 				    vm_pageout_state.vm_pageout_inactive,
3125 				    vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3126 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
3127 				    vm_pageout_vminfo.vm_pageout_freed_speculative,
3128 				    vm_pageout_state.vm_pageout_inactive_clean,
3129 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3130 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3131 
3132 				return;
3133 			}
3134 			vm_free_page_unlock();
3135 		}
3136 
3137 		/*
3138 		 * Before anything, we check if we have any ripe volatile
3139 		 * objects around. If so, try to purge the first object.
3140 		 * If the purge fails, fall through to reclaim a page instead.
3141 		 * If the purge succeeds, go back to the top and reevalute
3142 		 * If the purge succeeds, go back to the top and reevaluate
3143 		 */
3144 		retval = vps_purge_object();
3145 
3146 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3147 			/*
3148 			 * Success
3149 			 */
3150 			if (object != NULL) {
3151 				vm_object_unlock(object);
3152 				object = NULL;
3153 			}
3154 
3155 			lock_yield_check = FALSE;
3156 			continue;
3157 		}
3158 
3159 		/*
3160 		 * If our 'aged' queue is empty and we have some speculative pages
3161 		 * in the other queues, let's go through and see if we need to age
3162 		 * them.
3163 		 *
3164 		 * If we succeeded in aging a speculative Q or just that everything
3165 		 * looks normal w.r.t queue age and queue counts, we keep going onward.
3166 		 *
3167 		 * If, for some reason, we seem to have a mismatch between the spec.
3168 		 * page count and the page queues, we reset those variables and
3169 		 * restart the loop (LD TODO: Track this better?).
3170 		 */
3171 		if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3172 			retval = vps_age_speculative_queue(force_speculative_aging);
3173 
3174 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3175 				lock_yield_check = FALSE;
3176 				continue;
3177 			}
3178 		}
3179 		force_speculative_aging = FALSE;
3180 
3181 		/*
3182 		 * Check to see if we need to evict objects from the cache.
3183 		 *
3184 		 * Note: 'object' here doesn't have anything to do with
3185 		 * the eviction part. We just need to make sure we have dropped
3186 		 * any object lock we might be holding if we need to go down
3187 		 * into the eviction logic.
3188 		 */
3189 		retval = vps_object_cache_evict(&object);
3190 
3191 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3192 			lock_yield_check = FALSE;
3193 			continue;
3194 		}
3195 
3196 
3197 		/*
3198 		 * Calculate our filecache_min that will affect the loop
3199 		 * going forward.
3200 		 */
3201 		vps_calculate_filecache_min();
3202 
3203 		/*
3204 		 * LD TODO: Use a structure to hold all state variables for a single
3205 		 * vm_pageout_scan iteration and pass that structure to this function instead.
3206 		 */
3207 		retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3208 		    &delayed_unlock, &local_freeq, &local_freed,
3209 		    &vm_pageout_deadlock_target, inactive_burst_count);
3210 
3211 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3212 			if (loop_count >= vm_page_inactive_count) {
3213 				loop_count = 0;
3214 			}
3215 
3216 			inactive_burst_count = 0;
3217 
3218 			assert(object == NULL);
3219 			assert(delayed_unlock != 0);
3220 
3221 			lock_yield_check = FALSE;
3222 			continue;
3223 		} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3224 			goto return_from_scan;
3225 		}
3226 
3227 		flow_control.state = FCS_IDLE;
3228 
3229 		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3230 		    vm_pageout_inactive_external_forced_reactivate_limit);
3231 		loop_count++;
3232 		inactive_burst_count++;
3233 		vm_pageout_state.vm_pageout_inactive++;
3234 
3235 		/*
3236 		 * Choose a victim.
3237 		 */
3238 
3239 		m = NULL;
3240 		retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3241 
3242 		if (m == NULL) {
3243 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3244 				inactive_burst_count = 0;
3245 
3246 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3247 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3248 				}
3249 
3250 				lock_yield_check = TRUE;
3251 				continue;
3252 			}
3253 
3254 			/*
3255 			 * if we've gotten here, we have no victim page.
3256 			 * check to see if we've not finished balancing the queues
3257 			 * or we have a page on the aged speculative queue that we
3258 			 * skipped due to force_anonymous == TRUE.. or we have
3259 			 * speculative  pages that we can prematurely age... if
3260 			 * one of these cases we'll keep going, else panic
3261 			 */
3262 			force_anonymous = FALSE;
3263 			VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3264 
3265 			if (!vm_page_queue_empty(&sq->age_q)) {
3266 				lock_yield_check = TRUE;
3267 				continue;
3268 			}
3269 
3270 			if (vm_page_speculative_count) {
3271 				force_speculative_aging = TRUE;
3272 				lock_yield_check = TRUE;
3273 				continue;
3274 			}
3275 			panic("vm_pageout: no victim");
3276 
3277 			/* NOTREACHED */
3278 		}
3279 
3280 		assert(VM_PAGE_PAGEABLE(m));
3281 		m_object = VM_PAGE_OBJECT(m);
3282 		force_anonymous = FALSE;
3283 
3284 		page_prev_q_state = m->vmp_q_state;
3285 		/*
3286 		 * we just found this page on one of our queues...
3287 		 * it can't also be on the pageout queue, so safe
3288 		 * to call vm_page_queues_remove
3289 		 */
3290 		bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3291 		vm_page_queues_remove(m, TRUE);
3292 		if (donate) {
3293 			/*
3294 			 * The compressor needs to see this bit to know
3295 			 * where this page needs to land. Also if stolen,
3296 			 * this bit helps put the page back in the right
3297 			 * special queue where it belongs.
3298 			 */
3299 			m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3300 		}
3301 
3302 		assert(!m->vmp_laundry);
3303 		assert(!m->vmp_private);
3304 		assert(!m->vmp_fictitious);
3305 		assert(m_object != kernel_object);
3306 		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
3307 
3308 		vm_pageout_vminfo.vm_pageout_considered_page++;
3309 
3310 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3311 
3312 		/*
3313 		 * check to see if we currently are working
3314 		 * with the same object... if so, we've
3315 		 * already got the lock
3316 		 */
3317 		if (m_object != object) {
3318 			boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3319 
3320 			/*
3321 			 * vps_switch_object() will always drop the 'object' lock first
3322 			 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3323 			 * either 'm_object' or NULL.
3324 			 */
3325 			retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3326 
3327 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3328 				lock_yield_check = TRUE;
3329 				continue;
3330 			}
3331 		}
3332 		assert(m_object == object);
3333 		assert(VM_PAGE_OBJECT(m) == m_object);
3334 
3335 		if (m->vmp_busy) {
3336 			/*
3337 			 *	Somebody is already playing with this page.
3338 			 *	Put it back on the appropriate queue
3339 			 *
3340 			 */
3341 			VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3342 
3343 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3344 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3345 			}
3346 
3347 			vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3348 
3349 			lock_yield_check = TRUE;
3350 			continue;
3351 		}
3352 
3353 		/*
3354 		 *   if (m->vmp_cleaning && !m->vmp_free_when_done)
3355 		 *	If already cleaning this page in place
3356 		 *	just leave it off the paging queues.
3357 		 *	We can leave the page mapped, and upl_commit_range
3358 		 *	will put it on the clean queue.
3359 		 *
3360 		 *   if (m->vmp_free_when_done && !m->vmp_cleaning)
3361 		 *	an msync INVALIDATE is in progress...
3362 		 *	this page has been marked for destruction
3363 		 *      after it has been cleaned,
3364 		 *      but not yet gathered into a UPL
3365 		 *	where 'cleaning' will be set...
3366 		 *	just leave it off the paging queues
3367 		 *
3368 		 *   if (m->vmp_free_when_done && m->vmp_cleaning)
3369 		 *	an msync INVALIDATE is in progress
3370 		 *	and the UPL has already gathered this page...
3371 		 *	just leave it off the paging queues
3372 		 */
3373 		if (m->vmp_free_when_done || m->vmp_cleaning) {
3374 			lock_yield_check = TRUE;
3375 			continue;
3376 		}
3377 
3378 
3379 		/*
3380 		 *	If it's absent, in error or the object is no longer alive,
3381 		 *	we can reclaim the page... in the no longer alive case,
3382 		 *	there are 2 states the page can be in that preclude us
3383 		 *	from reclaiming it - busy or cleaning - that we've already
3384 		 *	dealt with
3385 		 */
3386 		if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3387 		    (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3388 			if (m->vmp_absent) {
3389 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3390 			} else if (!object->alive ||
3391 			    (!object->internal &&
3392 			    object->pager == MEMORY_OBJECT_NULL)) {
3393 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3394 			} else {
3395 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3396 			}
3397 reclaim_page:
3398 			if (vm_pageout_deadlock_target) {
3399 				VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3400 				vm_pageout_deadlock_target--;
3401 			}
3402 
3403 			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3404 
3405 			if (object->internal) {
3406 				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3407 			} else {
3408 				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3409 			}
3410 			assert(!m->vmp_cleaning);
3411 			assert(!m->vmp_laundry);
3412 
3413 			if (!object->internal &&
3414 			    object->pager != NULL &&
3415 			    object->pager->mo_pager_ops == &shared_region_pager_ops) {
3416 				shared_region_pager_reclaimed++;
3417 			}
3418 
3419 			m->vmp_busy = TRUE;
3420 
3421 			/*
3422 			 * remove page from object here since we're already
3423 			 * behind the object lock... defer the rest of the work
3424 			 * we'd normally do in vm_page_free_prepare_object
3425 			 * until 'vm_page_free_list' is called
3426 			 */
3427 			if (m->vmp_tabled) {
3428 				vm_page_remove(m, TRUE);
3429 			}
3430 
3431 			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3432 			m->vmp_snext = local_freeq;
3433 			local_freeq = m;
3434 			local_freed++;
3435 
3436 			if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3437 				vm_pageout_vminfo.vm_pageout_freed_speculative++;
3438 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3439 				vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3440 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3441 				vm_pageout_vminfo.vm_pageout_freed_internal++;
3442 			} else {
3443 				vm_pageout_vminfo.vm_pageout_freed_external++;
3444 			}
3445 
3446 			inactive_burst_count = 0;
3447 
3448 			lock_yield_check = TRUE;
3449 			continue;
3450 		}
3451 		if (object->copy == VM_OBJECT_NULL) {
3452 			/*
3453 			 * No one else can have any interest in this page.
3454 			 * If this is an empty purgable object, the page can be
3455 			 * reclaimed even if dirty.
3456 			 * If the page belongs to a volatile purgable object, we
3457 			 * reactivate it if the compressor isn't active.
3458 			 */
3459 			if (object->purgable == VM_PURGABLE_EMPTY) {
3460 				if (m->vmp_pmapped == TRUE) {
3461 					/* unmap the page */
3462 					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3463 					if (refmod_state & VM_MEM_MODIFIED) {
3464 						SET_PAGE_DIRTY(m, FALSE);
3465 					}
3466 				}
3467 				if (m->vmp_dirty || m->vmp_precious) {
3468 					/* we saved the cost of cleaning this page ! */
3469 					vm_page_purged_count++;
3470 				}
3471 				goto reclaim_page;
3472 			}
3473 
3474 			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3475 				/*
3476 				 * With the VM compressor, the cost of
3477 				 * reclaiming a page is much lower (no I/O),
3478 				 * so if we find a "volatile" page, it's better
3479 				 * to let it get compressed rather than letting
3480 				 * it occupy a full page until it gets purged.
3481 				 * So no need to check for "volatile" here.
3482 				 */
3483 			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
3484 				/*
3485 				 * Avoid cleaning a "volatile" page which might
3486 				 * be purged soon.
3487 				 */
3488 
3489 				/* if it's wired, we can't put it on our queue */
3490 				assert(!VM_PAGE_WIRED(m));
3491 
3492 				/* just stick it back on! */
3493 				reactivated_this_call++;
3494 
3495 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3496 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3497 				}
3498 
3499 				goto reactivate_page;
3500 			}
3501 		}
3502 		/*
3503 		 *	If it's being used, reactivate.
3504 		 *	(Fictitious pages are either busy or absent.)
3505 		 *	First, update the reference and dirty bits
3506 		 *	to make sure the page is unreferenced.
3507 		 */
3508 		refmod_state = -1;
3509 
3510 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3511 			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3512 
3513 			if (refmod_state & VM_MEM_REFERENCED) {
3514 				m->vmp_reference = TRUE;
3515 			}
3516 			if (refmod_state & VM_MEM_MODIFIED) {
3517 				SET_PAGE_DIRTY(m, FALSE);
3518 			}
3519 		}
3520 
3521 		if (m->vmp_reference || m->vmp_dirty) {
3522 			/* deal with a rogue "reusable" page */
3523 			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3524 		}
3525 
3526 		if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3527 			vm_pageout_state.vm_page_xpmapped_min = 0;
3528 		} else {
3529 			vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
3530 		}
3531 
3532 		if (!m->vmp_no_cache &&
3533 		    page_from_bg_q == FALSE &&
3534 		    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3535 		    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3536 			/*
3537 			 * The page we pulled off the inactive list has
3538 			 * been referenced.  It is possible for other
3539 			 * processors to be touching pages faster than we
3540 			 * can clear the referenced bit and traverse the
3541 			 * inactive queue, so we limit the number of
3542 			 * reactivations.
3543 			 */
3544 			if (++reactivated_this_call >= reactivate_limit &&
3545 			    !object->object_is_shared_cache &&
3546 			    !((m->vmp_realtime ||
3547 			    object->for_realtime) &&
3548 			    vm_pageout_protect_realtime)) {
3549 				vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3550 			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3551 				vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3552 				if (object->object_is_shared_cache) {
3553 					vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3554 				} else if (m->vmp_realtime ||
3555 				    object->for_realtime) {
3556 					vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3557 				}
3558 			} else {
3559 				uint32_t isinuse;
3560 
3561 				if (reactivated_this_call >= reactivate_limit) {
3562 					if (object->object_is_shared_cache) {
3563 						vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3564 					} else if ((m->vmp_realtime ||
3565 					    object->for_realtime) &&
3566 					    vm_pageout_protect_realtime) {
3567 						vm_pageout_vminfo.vm_pageout_protected_realtime++;
3568 					}
3569 				}
3570 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3571 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3572 				}
3573 
3574 				vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3575 reactivate_page:
3576 				if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3577 				    vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3578 					/*
3579 					 * no explicit mappings of this object exist
3580 					 * and it's not open via the filesystem
3581 					 */
3582 					vm_page_deactivate(m);
3583 					VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3584 				} else {
3585 					/*
3586 					 * The page was/is being used, so put back on active list.
3587 					 */
3588 					vm_page_activate(m);
3589 					counter_inc(&vm_statistics_reactivations);
3590 					inactive_burst_count = 0;
3591 				}
3592 #if DEVELOPMENT || DEBUG
3593 				if (page_from_bg_q == TRUE) {
3594 					if (m_object->internal) {
3595 						vm_pageout_rejected_bq_internal++;
3596 					} else {
3597 						vm_pageout_rejected_bq_external++;
3598 					}
3599 				}
3600 #endif /* DEVELOPMENT || DEBUG */
3601 
3602 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3603 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3604 				}
3605 				vm_pageout_state.vm_pageout_inactive_used++;
3606 
3607 				lock_yield_check = TRUE;
3608 				continue;
3609 			}
3610 			/*
3611 			 * Make sure we call pmap_get_refmod() if it
3612 			 * wasn't already called just above, to update
3613 			 * the dirty bit.
3614 			 */
3615 			if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3616 				refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3617 				if (refmod_state & VM_MEM_MODIFIED) {
3618 					SET_PAGE_DIRTY(m, FALSE);
3619 				}
3620 			}
3621 		}
3622 
3623 		/*
3624 		 * we've got a candidate page to steal...
3625 		 *
3626 		 * m->vmp_dirty is up to date courtesy of the
3627 		 * preceding check for m->vmp_reference... if
3628 		 * we get here, then m->vmp_reference had to be
3629 		 * FALSE (or possibly "reactivate_limit" was
3630 		 * exceeded), but in either case we called
3631 		 * pmap_get_refmod() and updated both
3632 		 * m->vmp_reference and m->vmp_dirty
3633 		 *
3634 		 * if it's dirty or precious we need to
3635 		 * see if the target queue is throttled
3636 		 * if it is, we need to skip over it by moving it back
3637 		 * to the end of the inactive queue
3638 		 */
3639 
3640 		inactive_throttled = FALSE;
3641 
3642 		if (m->vmp_dirty || m->vmp_precious) {
3643 			if (object->internal) {
3644 				if (VM_PAGE_Q_THROTTLED(iq)) {
3645 					inactive_throttled = TRUE;
3646 				}
3647 			} else if (VM_PAGE_Q_THROTTLED(eq)) {
3648 				inactive_throttled = TRUE;
3649 			}
3650 		}
3651 throttle_inactive:
3652 		if (!VM_DYNAMIC_PAGING_ENABLED() &&
3653 		    object->internal && m->vmp_dirty &&
3654 		    (object->purgable == VM_PURGABLE_DENY ||
3655 		    object->purgable == VM_PURGABLE_NONVOLATILE ||
3656 		    object->purgable == VM_PURGABLE_VOLATILE)) {
3657 			vm_page_check_pageable_safe(m);
3658 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3659 			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3660 			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3661 			vm_page_throttled_count++;
3662 
3663 			VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3664 
3665 			inactive_burst_count = 0;
3666 
3667 			lock_yield_check = TRUE;
3668 			continue;
3669 		}
3670 		if (inactive_throttled == TRUE) {
3671 			vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3672 			    &delayed_unlock, &force_anonymous, page_from_bg_q);
3673 
3674 			inactive_burst_count = 0;
3675 
3676 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3677 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3678 			}
3679 
3680 			lock_yield_check = TRUE;
3681 			continue;
3682 		}
3683 
3684 		/*
3685 		 * we've got a page that we can steal...
3686 		 * eliminate all mappings and make sure
3687 		 * we have the up-to-date modified state
3688 		 *
3689 		 * if we need to do a pmap_disconnect then we
3690 		 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3691 		 * provides the true state atomically... the
3692 		 * page was still mapped up to the pmap_disconnect
3693 		 * and may have been dirtied at the last microsecond
3694 		 *
3695 		 * Note that if 'pmapped' is FALSE then the page is not
3696 		 * and has not been in any map, so there is no point calling
3697 		 * pmap_disconnect().  m->vmp_dirty could have been set in anticipation
3698 		 * of likely usage of the page.
3699 		 */
3700 		if (m->vmp_pmapped == TRUE) {
3701 			int pmap_options;
3702 
3703 			/*
3704 			 * Don't count this page as going into the compressor
3705 			 * if any of these are true:
3706 			 * 1) compressed pager isn't enabled
3707 			 * 2) Freezer enabled device with compressed pager
3708 			 *    backend (exclusive use) i.e. most of the VM system
3709 			 *    (including vm_pageout_scan) has no knowledge of
3710 			 *    the compressor
3711 			 * 3) This page belongs to a file and hence will not be
3712 			 *    sent into the compressor
3713 			 */
3714 			if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3715 			    object->internal == FALSE) {
3716 				pmap_options = 0;
3717 			} else if (m->vmp_dirty || m->vmp_precious) {
3718 				/*
3719 				 * VM knows that this page is dirty (or
3720 				 * precious) and needs to be compressed
3721 				 * rather than freed.
3722 				 * Tell the pmap layer to count this page
3723 				 * as "compressed".
3724 				 */
3725 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
3726 			} else {
3727 				/*
3728 				 * VM does not know if the page needs to
3729 				 * be preserved but the pmap layer might tell
3730 				 * us if any mapping has "modified" it.
3731 				 * Let the pmap layer count this page
3732 				 * as compressed if and only if it has been
3733 				 * modified.
3734 				 */
3735 				pmap_options =
3736 				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3737 			}
3738 			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3739 			    pmap_options,
3740 			    NULL);
3741 			if (refmod_state & VM_MEM_MODIFIED) {
3742 				SET_PAGE_DIRTY(m, FALSE);
3743 			}
3744 		}
3745 
3746 		/*
3747 		 * reset our count of pages that have been reclaimed
3748 		 * since the last page was 'stolen'
3749 		 */
3750 		inactive_reclaim_run = 0;
3751 
3752 		/*
3753 		 *	If it's clean and not precious, we can free the page.
3754 		 */
3755 		if (!m->vmp_dirty && !m->vmp_precious) {
3756 			vm_pageout_state.vm_pageout_inactive_clean++;
3757 
3758 			/*
3759 			 * OK, at this point we have found a page we are going to free.
3760 			 */
3761 #if CONFIG_PHANTOM_CACHE
3762 			if (!object->internal) {
3763 				vm_phantom_cache_add_ghost(m);
3764 			}
3765 #endif
3766 			goto reclaim_page;
3767 		}
3768 
3769 		/*
3770 		 * The page may have been dirtied since the last check
3771 		 * for a throttled target queue (which may have been skipped
3772 		 * if the page was clean then).  With the dirty page
3773 		 * disconnected here, we can make one final check.
3774 		 */
3775 		if (object->internal) {
3776 			if (VM_PAGE_Q_THROTTLED(iq)) {
3777 				inactive_throttled = TRUE;
3778 			}
3779 		} else if (VM_PAGE_Q_THROTTLED(eq)) {
3780 			inactive_throttled = TRUE;
3781 		}
3782 
3783 		if (inactive_throttled == TRUE) {
3784 			goto throttle_inactive;
3785 		}
3786 
3787 #if VM_PRESSURE_EVENTS
3788 #if CONFIG_JETSAM
3789 
3790 		/*
3791 		 * If Jetsam is enabled, then the sending
3792 		 * of memory pressure notifications is handled
3793 		 * from the same thread that takes care of high-water
3794 		 * and other jetsams i.e. the memorystatus_thread.
3795 		 */
3796 
3797 #else /* CONFIG_JETSAM */
3798 
3799 		vm_pressure_response();
3800 
3801 #endif /* CONFIG_JETSAM */
3802 #endif /* VM_PRESSURE_EVENTS */
3803 
3804 		if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3805 			VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3806 		}
3807 
3808 		if (object->internal) {
3809 			vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3810 		} else {
3811 			vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3812 		}
3813 
3814 		/*
3815 		 * internal pages will go to the compressor...
3816 		 * external pages will go to the appropriate pager to be cleaned
3817 		 * and upon completion will end up on 'vm_page_queue_cleaned' which
3818 		 * is a preferred queue to steal from
3819 		 */
3820 		vm_pageout_cluster(m);
3821 		inactive_burst_count = 0;
3822 
3823 		/*
3824 		 * back to top of pageout scan loop
3825 		 */
3826 	}
3827 }
3828 
3829 
3830 void
vm_page_free_reserve(int pages)3831 vm_page_free_reserve(
3832 	int pages)
3833 {
3834 	int             free_after_reserve;
3835 
3836 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3837 		if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3838 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3839 		} else {
3840 			vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3841 		}
3842 	} else {
3843 		if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3844 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3845 		} else {
3846 			vm_page_free_reserved += pages;
3847 		}
3848 	}
3849 	free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3850 
3851 	vm_page_free_min = vm_page_free_reserved +
3852 	    VM_PAGE_FREE_MIN(free_after_reserve);
3853 
3854 	if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3855 		vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3856 	}
3857 
3858 	vm_page_free_target = vm_page_free_reserved +
3859 	    VM_PAGE_FREE_TARGET(free_after_reserve);
3860 
3861 	if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3862 		vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3863 	}
3864 
3865 	if (vm_page_free_target < vm_page_free_min + 5) {
3866 		vm_page_free_target = vm_page_free_min + 5;
3867 	}
3868 
3869 	vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3870 }
3871 
3872 /*
3873  *	vm_pageout is the high level pageout daemon.
3874  */
3875 
/*
 *	vm_pageout_continue:
 *
 *	One pass of the pageout daemon, written as a continuation.
 *	It flags the daemon as running, calls vm_pageout_scan(),
 *	registers a wait on vm_page_free_wanted, clears the running
 *	flag (waking any vm_pageout_wait() sleeper on macOS), drops
 *	the locks and blocks with itself as the continuation.
 *	Control does not return to the caller (see NOTREACHED).
 */
3876 void
vm_pageout_continue(void)3877 vm_pageout_continue(void)
3878 {
3879 	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3880 	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3881 
	/*
	 * vm_pageout_running is written between vm_free_page_lock() and
	 * vm_free_page_unlock(); vm_pageout_wait() tests it after
	 * calling vm_free_page_lock() as well.
	 */
3882 	vm_free_page_lock();
3883 	vm_pageout_running = TRUE;
3884 	vm_free_page_unlock();
3885 
3886 	vm_pageout_scan();
3887 	/*
3888 	 * we hold both the vm_page_queue_free_lock
3889 	 * and the vm_page_queues_lock at this point
3890 	 */
3891 	assert(vm_page_free_wanted == 0);
3892 	assert(vm_page_free_wanted_privileged == 0);
	/*
	 * register the wait before the locks are dropped below
	 */
3893 	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3894 
3895 	vm_pageout_running = FALSE;
3896 #if XNU_TARGET_OS_OSX
	/*
	 * vm_pageout_wait() sets vm_pageout_waiter and sleeps on its
	 * address; clear the flag and wake any such sleeper.
	 */
3897 	if (vm_pageout_waiter) {
3898 		vm_pageout_waiter = FALSE;
3899 		thread_wakeup((event_t)&vm_pageout_waiter);
3900 	}
3901 #endif /* XNU_TARGET_OS_OSX */
3902 
3903 	vm_free_page_unlock();
3904 	vm_page_unlock_queues();
3905 
	/*
	 * block, naming this function as the continuation
	 */
3906 	thread_block((thread_continue_t)vm_pageout_continue);
3907 	/*NOTREACHED*/
3908 }
3909 
3910 #if XNU_TARGET_OS_OSX
3911 kern_return_t
vm_pageout_wait(uint64_t deadline)3912 vm_pageout_wait(uint64_t deadline)
3913 {
3914 	kern_return_t kr;
3915 
3916 	vm_free_page_lock();
3917 	for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
3918 		vm_pageout_waiter = TRUE;
3919 		if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3920 			    &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3921 			    (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3922 			kr = KERN_OPERATION_TIMED_OUT;
3923 		}
3924 	}
3925 	vm_free_page_unlock();
3926 
3927 	return kr;
3928 }
3929 #endif /* XNU_TARGET_OS_OSX */
3930 
/*
 *	vm_pageout_iothread_external_continue:
 *
 *	Body of the external pageout I/O thread.  Drains the pending
 *	list of ethr->q: each page that is still eligible after being
 *	re-looked-up under its object's lock is handed to the object's
 *	pager with memory_object_data_return().  Once the pending list
 *	is empty, the thread waits on ethr->pgo_wakeup, naming this
 *	function as the continuation.
 *
 *	ethr	per-thread state; supplies the queue (q) and the
 *		wakeup condition (pgo_wakeup).
 *	w	unused wait result.
 */
3931 OS_NORETURN
3932 static void
vm_pageout_iothread_external_continue(struct pgo_iothread_state * ethr,__unused wait_result_t w)3933 vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
3934 {
3935 	vm_page_t       m = NULL;
3936 	vm_object_t     object;
3937 	vm_object_offset_t offset;
3938 	memory_object_t pager;
3939 	struct vm_pageout_queue *q = ethr->q;
3940 
3941 	/* On systems with a compressor, the external IO thread clears its
3942 	 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3943 	 * creation)
3944 	 */
3945 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3946 		current_thread()->options &= ~TH_OPT_VMPRIV;
3947 	}
3948 
3949 	sched_cond_ack(&(ethr->pgo_wakeup));
3950 
3951 	while (true) {
3952 		vm_page_lockspin_queues();
3953 
		/*
		 * The page queues lock is held each time this condition
		 * is evaluated; every path through the body re-takes it
		 * before looping.
		 */
3954 		while (!vm_page_queue_empty(&q->pgo_pending)) {
3955 			q->pgo_busy = TRUE;
3956 			vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3957 
3958 			assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3959 			VM_PAGE_CHECK(m);
3960 			/*
3961 			 * grab a snapshot of the object and offset this
3962 			 * page is tabled in so that we can relookup this
3963 			 * page after we've taken the object lock - these
3964 			 * fields are stable while we hold the page queues lock
3965 			 * but as soon as we drop it, there is nothing to keep
3966 			 * this page in this object... we hold an activity_in_progress
3967 			 * on this object which will keep it from terminating
3968 			 */
3969 			object = VM_PAGE_OBJECT(m);
3970 			offset = m->vmp_offset;
3971 
3972 			m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3973 			VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3974 
3975 			vm_page_unlock_queues();
3976 
3977 			vm_object_lock(object);
3978 
			/*
			 * 'm' is replaced by whatever is tabled at the
			 * snapshotted (object, offset) now; it may be NULL
			 * or a different page.
			 */
3979 			m = vm_page_lookup(object, offset);
3980 
3981 			if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
3982 			    !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
3983 				/*
3984 				 * it's either the same page that someone else has
3985 				 * started cleaning (or it's finished cleaning or
3986 				 * been put back on the pageout queue), or
3987 				 * the page has been freed or we have found a
3988 				 * new page at this offset... in all of these cases
3989 				 * we merely need to release the activity_in_progress
3990 				 * we took when we put the page on the pageout queue
3991 				 */
3992 				vm_object_activity_end(object);
3993 				vm_object_unlock(object);
3994 
3995 				vm_page_lockspin_queues();
3996 				continue;
3997 			}
3998 			pager = object->pager;
3999 
4000 			if (pager == MEMORY_OBJECT_NULL) {
4001 				/*
4002 				 * This pager has been destroyed by either
4003 				 * memory_object_destroy or vm_object_destroy, and
4004 				 * so there is nowhere for the page to go.
4005 				 */
4006 				if (m->vmp_free_when_done) {
4007 					/*
4008 					 * Just free the page... VM_PAGE_FREE takes
4009 					 * care of cleaning up all the state...
4010 					 * including doing the vm_pageout_throttle_up
4011 					 */
4012 					VM_PAGE_FREE(m);
4013 				} else {
					/*
					 * keep the page: undo the laundry
					 * throttle accounting and put it back
					 * on the active queue, both under the
					 * page queues lock
					 */
4014 					vm_page_lockspin_queues();
4015 
4016 					vm_pageout_throttle_up(m);
4017 					vm_page_activate(m);
4018 
4019 					vm_page_unlock_queues();
4020 
4021 					/*
4022 					 *	And we are done with it.
4023 					 */
4024 				}
4025 				vm_object_activity_end(object);
4026 				vm_object_unlock(object);
4027 
4028 				vm_page_lockspin_queues();
4029 				continue;
4030 			}
4031 	#if 0
4032 			/*
4033 			 * we don't hold the page queue lock
4034 			 * so this check isn't safe to make
4035 			 */
4036 			VM_PAGE_CHECK(m);
4037 	#endif
4038 			/*
4039 			 * give back the activity_in_progress reference we
4040 			 * took when we queued up this page and replace it
4041 			 * with a paging_in_progress reference that will
4042 			 * also hold the paging offset from changing and
4043 			 * prevent the object from terminating
4044 			 */
4045 			vm_object_activity_end(object);
4046 			vm_object_paging_begin(object);
4047 			vm_object_unlock(object);
4048 
4049 			/*
4050 			 * Send the data to the pager.
4051 			 * any pageout clustering happens there
4052 			 *
			 * The range passed is one page (PAGE_SIZE) at
			 * m->vmp_offset + object->paging_offset.
			 * NOTE(review): m->vmp_offset is read here after the
			 * object lock was dropped -- confirm what keeps the
			 * page's offset stable at this point.
			 */
4053 			memory_object_data_return(pager,
4054 			    m->vmp_offset + object->paging_offset,
4055 			    PAGE_SIZE,
4056 			    NULL,
4057 			    NULL,
4058 			    FALSE,
4059 			    FALSE,
4060 			    0);
4061 
			/*
			 * drop the paging_in_progress reference taken above
			 */
4062 			vm_object_lock(object);
4063 			vm_object_paging_end(object);
4064 			vm_object_unlock(object);
4065 
4066 			vm_pageout_io_throttle();
4067 
4068 			vm_page_lockspin_queues();
4069 		}
		/*
		 * pending list is empty: clear the busy flag while the page
		 * queues lock is still held, then wait for the next wakeup
		 */
4070 		q->pgo_busy = FALSE;
4071 
4072 		vm_page_unlock_queues();
4073 		sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
4074 	}
4075 	/*NOTREACHED*/
4076 }
4077 
4078 
/*
 * Number of successfully compressed pages that
 * vm_pageout_iothread_internal_continue() lets accumulate on its
 * local free list before passing the list to vm_page_free_list().
 */
4079 #define         MAX_FREE_BATCH          32
/*
 * When non-zero, vm_pageout_iothread_internal_continue() updates the
 * per-thread vmct_stats entries (DEVELOPMENT || DEBUG builds only).
 */
4080 uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
4081                                      * this thread.
4082                                      */
4083 
4084 
/*
 *	vm_pageout_iothread_internal_continue:
 *
 *	Body of a compressor (internal pageout) thread.  Repeatedly
 *	pulls a batch of pages off the pending list of cq->q (or of
 *	cq->benchmark_q on DEVELOPMENT/DEBUG kernels while
 *	compressor_running_perf_test is set and that list is non-empty),
 *	passes each page to vm_pageout_compress_page(), and frees the
 *	pages that compressed successfully, at most MAX_FREE_BATCH at a
 *	time.  When a pull finds nothing pending, the thread waits on
 *	cq->pgo_wakeup, naming this function as the continuation.
 *
 *	cq	per-thread state: queue(s), id, scratch buffer, the
 *		swapout cheads and the wakeup condition.
 *	w	unused wait result.
 *
 *	NOTE(review): ncomps, marked_active and num_pages_processed are
 *	initialized only at function entry and nothing in this function
 *	resets them, so their values carry over whenever the outer loop
 *	iterates again (e.g. via the benchmark 'continue' near the end).
 *	Confirm that is intended.
 */
4085 OS_NORETURN
4086 static void
vm_pageout_iothread_internal_continue(struct pgo_iothread_state * cq,__unused wait_result_t w)4087 vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
4088 {
4089 	struct vm_pageout_queue *q;
4090 	vm_page_t       m = NULL;
4091 	boolean_t       pgo_draining;
4092 	vm_page_t   local_q;
4093 	int         local_cnt;
4094 	vm_page_t   local_freeq = NULL;
4095 	int         local_freed = 0;
4096 	int         local_batch_size;
4097 #if DEVELOPMENT || DEBUG
4098 	int       ncomps = 0;
4099 	boolean_t marked_active = FALSE;
4100 	int       num_pages_processed = 0;
4101 #endif
4102 	void *chead = NULL;
4103 
4104 	KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
4105 
4106 	sched_cond_ack(&(cq->pgo_wakeup));
4107 
4108 	q = cq->q;
4109 
4110 	while (true) {
4111 #if DEVELOPMENT || DEBUG
4112 		bool benchmark_accounting = false;
4113 		/*
4114 		 * If we're running the compressor perf test, only process the benchmark pages.
4115 		 * We'll get back to our regular queue once the benchmark is done
4116 		 */
4117 		if (compressor_running_perf_test) {
4118 			q = cq->benchmark_q;
4119 			if (!vm_page_queue_empty(&q->pgo_pending)) {
4120 				benchmark_accounting = true;
4121 			} else {
4122 				q = cq->q;
4123 				benchmark_accounting = false;
4124 			}
4125 		}
4126 #endif /* DEVELOPMENT || DEBUG */
4127 
		/*
		 * Size of one pull from the pending list.  The default is
		 * pgo_maxlaundry divided by twice the compressor thread
		 * count; the first __AMP__ branch uses pgo_maxlaundry / 8
		 * with a floor of 16 instead.
		 * NOTE(review): if this evaluates to 0 the pull loop below
		 * takes no pages and the thread goes straight back to
		 * waiting -- confirm pgo_maxlaundry is always large enough.
		 */
4128 #if __AMP__
4129 		if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
4130 			local_batch_size = (q->pgo_maxlaundry >> 3);
4131 			local_batch_size = MAX(local_batch_size, 16);
4132 		} else {
4133 			local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4134 		}
4135 #else
4136 		local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4137 #endif
4138 
4139 #if RECORD_THE_COMPRESSED_DATA
4140 		if (q->pgo_laundry) {
4141 			c_compressed_record_init();
4142 		}
4143 #endif
		/*
		 * One iteration per batch; exits (with the page queues lock
		 * held) when a pull finds the pending list empty.
		 */
4144 		while (true) {
4145 			int     pages_left_on_q = 0;
4146 
4147 			local_cnt = 0;
4148 			local_q = NULL;
4149 
4150 			KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
4151 
4152 			vm_page_lock_queues();
4153 #if DEVELOPMENT || DEBUG
			/*
			 * first batch since function entry: count this thread
			 * as active and, if it is the only active one, stamp
			 * the start of the epoch
			 */
4154 			if (marked_active == FALSE) {
4155 				vmct_active++;
4156 				vmct_state[cq->id] = VMCT_ACTIVE;
4157 				marked_active = TRUE;
4158 				if (vmct_active == 1) {
4159 					vm_compressor_epoch_start = mach_absolute_time();
4160 				}
4161 			}
4162 #endif
4163 			KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4164 
4165 			KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
4166 
			/*
			 * Move up to local_batch_size pages from the pending
			 * list onto a private list chained through vmp_snext
			 * (each page is pushed at the head), clearing their
			 * queue state and laundry flag on the way.
			 */
4167 			while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4168 				vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4169 				assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4170 				VM_PAGE_CHECK(m);
4171 
4172 				m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4173 				VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4174 				m->vmp_laundry = FALSE;
4175 
4176 				m->vmp_snext = local_q;
4177 				local_q = m;
4178 				local_cnt++;
4179 			}
4180 			if (local_q == NULL) {
				/* nothing pulled: leave with the page queues lock held */
4181 				break;
4182 			}
4183 
4184 			q->pgo_busy = TRUE;
4185 
			/*
			 * When the queue is not draining, the throttle
			 * accounting for this batch is released right away;
			 * when it is draining, that is deferred until the
			 * whole batch has been processed (see the end of
			 * this loop body).
			 */
4186 			if ((pgo_draining = q->pgo_draining) == FALSE) {
4187 				vm_pageout_throttle_up_batch(q, local_cnt);
4188 				pages_left_on_q = q->pgo_laundry;
4189 			} else {
4190 				pages_left_on_q = q->pgo_laundry - local_cnt;
4191 			}
4192 
4193 			vm_page_unlock_queues();
4194 
4195 #if !RECORD_THE_COMPRESSED_DATA
			/*
			 * at least another full batch remains and this is
			 * not the highest-numbered compressor thread
			 */
4196 			if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4197 				// wake up the next compressor thread
4198 				sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
4199 				    pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4200 			}
4201 #endif
4202 			KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
4203 
			/*
			 * Compress the private list one page at a time.
			 */
4204 			while (local_q) {
4205 				KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
4206 
4207 				m = local_q;
4208 				local_q = m->vmp_snext;
4209 				m->vmp_snext = NULL;
4210 
4211 				/*
4212 				 * Technically we need the pageq locks to manipulate this field.
4213 				 * However, this page has been removed from all queues and is only
4214 				 * known to this compressor thread dealing with this local queue.
4215 				 *
4216 				 * TODO LIONEL: Add a second localq that is the early localq and
4217 				 * put special pages like this one on that queue in the block above
4218 				 * under the pageq lock to avoid this 'works but not clean' logic.
4219 				 */
4220 				void *donate_queue_head;
4221 #if XNU_TARGET_OS_OSX
4222 				donate_queue_head = &cq->current_early_swapout_chead;
4223 #else /* XNU_TARGET_OS_OSX */
4224 				donate_queue_head = &cq->current_late_swapout_chead;
4225 #endif /* XNU_TARGET_OS_OSX */
				/*
				 * pages tagged VM_PAGE_SPECIAL_Q_DONATE are
				 * compressed against the donate chead chosen
				 * above (and lose the tag); all others use
				 * the regular swapout chead
				 */
4226 				if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4227 					m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4228 					chead = donate_queue_head;
4229 				} else {
4230 					chead = &cq->current_regular_swapout_chead;
4231 				}
4232 
4233 				if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4234 #if DEVELOPMENT || DEBUG
4235 					ncomps++;
4236 #endif
4237 					KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);
4238 
					/*
					 * queue the page for freeing; flush
					 * the list once it holds
					 * MAX_FREE_BATCH pages
					 */
4239 					m->vmp_snext = local_freeq;
4240 					local_freeq = m;
4241 					local_freed++;
4242 
4243 					if (local_freed >= MAX_FREE_BATCH) {
4244 						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4245 
4246 						vm_page_free_list(local_freeq, TRUE);
4247 
4248 						local_freeq = NULL;
4249 						local_freed = 0;
4250 					}
4251 				}
4252 #if DEVELOPMENT || DEBUG
4253 				num_pages_processed++;
4254 #endif /* DEVELOPMENT || DEBUG */
4255 #if !CONFIG_JETSAM
				/*
				 * While the free count is below
				 * COMPRESSOR_FREE_RESERVED_LIMIT: first give
				 * back any pages still held on local_freeq
				 * and re-test; with nothing left to give
				 * back, wait on vm_page_free_wanted_privileged.
				 */
4256 				while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4257 					kern_return_t   wait_result;
4258 					int             need_wakeup = 0;
4259 
4260 					if (local_freeq) {
4261 						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4262 
4263 						vm_page_free_list(local_freeq, TRUE);
4264 						local_freeq = NULL;
4265 						local_freed = 0;
4266 
4267 						continue;
4268 					}
4269 					vm_free_page_lock_spin();
4270 
					/* re-check now that the lock is held */
4271 					if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
						/*
						 * only the waiter that takes the
						 * count from 0 issues the wakeup
						 * on vm_page_free_wanted below
						 */
4272 						if (vm_page_free_wanted_privileged++ == 0) {
4273 							need_wakeup = 1;
4274 						}
4275 						wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4276 
4277 						vm_free_page_unlock();
4278 
4279 						if (need_wakeup) {
4280 							thread_wakeup((event_t)&vm_page_free_wanted);
4281 						}
4282 
4283 						if (wait_result == THREAD_WAITING) {
4284 							thread_block(THREAD_CONTINUE_NULL);
4285 						}
4286 					} else {
4287 						vm_free_page_unlock();
4288 					}
4289 				}
4290 #endif
4291 			}
			/*
			 * batch done: free whatever is left on the local
			 * free list
			 */
4292 			if (local_freeq) {
4293 				OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4294 
4295 				vm_page_free_list(local_freeq, TRUE);
4296 				local_freeq = NULL;
4297 				local_freed = 0;
4298 			}
			/*
			 * deferred throttle release for the draining case
			 */
4299 			if (pgo_draining == TRUE) {
4300 				vm_page_lockspin_queues();
4301 				vm_pageout_throttle_up_batch(q, local_cnt);
4302 				vm_page_unlock_queues();
4303 			}
4304 		}
4305 		KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
4306 
4307 		/*
4308 		 * queue lock is held and our q is empty
4309 		 */
4310 		q->pgo_busy = FALSE;
4311 #if DEVELOPMENT || DEBUG
4312 		if (marked_active == TRUE) {
4313 			vmct_active--;
4314 			vmct_state[cq->id] = VMCT_IDLE;
4315 
4316 			if (vmct_active == 0) {
4317 				vm_compressor_epoch_stop = mach_absolute_time();
4318 				assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4319 				    "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4320 				    vm_compressor_epoch_start, vm_compressor_epoch_stop);
4321 				/* This interval includes intervals where one or more
4322 				 * compressor threads were pre-empted
4323 				 */
4324 				vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4325 			}
4326 		}
4327 		if (compressor_running_perf_test && benchmark_accounting) {
4328 			/*
4329 			 * We could turn ON compressor_running_perf_test while still processing
4330 			 * regular non-benchmark pages. We shouldn't count them here else we
4331 			 * could overshoot. We might also still be populating that benchmark Q
4332 			 * and be under pressure. So we will go back to the regular queues. And
4333 			 * benchmark accounting will be off for that case too.
4334 			 */
4335 			compressor_perf_test_pages_processed += num_pages_processed;
4336 			thread_wakeup(&compressor_perf_test_pages_processed);
4337 		}
4338 #endif
4339 		vm_page_unlock_queues();
4340 #if DEVELOPMENT || DEBUG
		/*
		 * optional per-thread statistics, indexed by cq->id
		 */
4341 		if (__improbable(vm_compressor_time_thread)) {
4342 			vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
4343 			vmct_stats.vmct_pages[cq->id] += ncomps;
4344 			vmct_stats.vmct_iterations[cq->id]++;
4345 			if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
4346 				vmct_stats.vmct_maxpages[cq->id] = ncomps;
4347 			}
4348 			if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
4349 				vmct_stats.vmct_minpages[cq->id] = ncomps;
4350 			}
4351 		}
4352 #endif
4353 
4354 		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4355 #if DEVELOPMENT || DEBUG
4356 		if (compressor_running_perf_test && benchmark_accounting) {
4357 			/*
4358 			 * We've been exclusively compressing pages from the benchmark queue,
4359 			 * do 1 pass over the internal queue before blocking.
4360 			 */
4361 			continue;
4362 		}
4363 #endif
4364 
4365 		sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4366 	}
4367 	/*NOTREACHED*/
4368 }
4369 
4370 
4371 kern_return_t
vm_pageout_compress_page(void ** current_chead,char * scratch_buf,vm_page_t m)4372 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4373 {
4374 	vm_object_t     object;
4375 	memory_object_t pager;
4376 	int             compressed_count_delta;
4377 	kern_return_t   retval;
4378 
4379 	object = VM_PAGE_OBJECT(m);
4380 
4381 	assert(!m->vmp_free_when_done);
4382 	assert(!m->vmp_laundry);
4383 
4384 	pager = object->pager;
4385 
4386 	if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4387 		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
4388 
4389 		vm_object_lock(object);
4390 
4391 		/*
4392 		 * If there is no memory object for the page, create
4393 		 * one and hand it to the compression pager.
4394 		 */
4395 
4396 		if (!object->pager_initialized) {
4397 			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4398 		}
4399 		if (!object->pager_initialized) {
4400 			vm_object_compressor_pager_create(object);
4401 		}
4402 
4403 		pager = object->pager;
4404 
4405 		if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4406 			/*
4407 			 * Still no pager for the object,
4408 			 * or the pager has been destroyed.
4409 			 * Reactivate the page.
4410 			 *
4411 			 * Should only happen if there is no
4412 			 * compression pager
4413 			 */
4414 			PAGE_WAKEUP_DONE(m);
4415 
4416 			vm_page_lockspin_queues();
4417 			vm_page_activate(m);
4418 			VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4419 			vm_page_unlock_queues();
4420 
4421 			/*
4422 			 *	And we are done with it.
4423 			 */
4424 			vm_object_activity_end(object);
4425 			vm_object_unlock(object);
4426 
4427 			return KERN_FAILURE;
4428 		}
4429 		vm_object_unlock(object);
4430 
4431 		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
4432 	}
4433 	assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4434 	assert(object->activity_in_progress > 0);
4435 
4436 	retval = vm_compressor_pager_put(
4437 		pager,
4438 		m->vmp_offset + object->paging_offset,
4439 		VM_PAGE_GET_PHYS_PAGE(m),
4440 		current_chead,
4441 		scratch_buf,
4442 		&compressed_count_delta);
4443 
4444 	vm_object_lock(object);
4445 
4446 	assert(object->activity_in_progress > 0);
4447 	assert(VM_PAGE_OBJECT(m) == object);
4448 	assert( !VM_PAGE_WIRED(m));
4449 
4450 	vm_compressor_pager_count(pager,
4451 	    compressed_count_delta,
4452 	    FALSE,                       /* shared_lock */
4453 	    object);
4454 
4455 	if (retval == KERN_SUCCESS) {
4456 		/*
4457 		 * If the object is purgeable, its owner's
4458 		 * purgeable ledgers will be updated in
4459 		 * vm_page_remove() but the page still
4460 		 * contributes to the owner's memory footprint,
4461 		 * so account for it as such.
4462 		 */
4463 		if ((object->purgable != VM_PURGABLE_DENY ||
4464 		    object->vo_ledger_tag) &&
4465 		    object->vo_owner != NULL) {
4466 			/* one more compressed purgeable/tagged page */
4467 			vm_object_owner_compressed_update(object,
4468 			    +1);
4469 		}
4470 		counter_inc(&vm_statistics_compressions);
4471 
4472 		if (m->vmp_tabled) {
4473 			vm_page_remove(m, TRUE);
4474 		}
4475 	} else {
4476 		PAGE_WAKEUP_DONE(m);
4477 
4478 		vm_page_lockspin_queues();
4479 
4480 		vm_page_activate(m);
4481 		vm_pageout_vminfo.vm_compressor_failed++;
4482 
4483 		vm_page_unlock_queues();
4484 	}
4485 	vm_object_activity_end(object);
4486 	vm_object_unlock(object);
4487 
4488 	return retval;
4489 }
4490 
4491 
4492 static void
vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state * ethr,boolean_t req_lowpriority)4493 vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4494 {
4495 	uint32_t        policy;
4496 
4497 	if (hibernate_cleaning_in_progress == TRUE) {
4498 		req_lowpriority = FALSE;
4499 	}
4500 
4501 	if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4502 		vm_page_unlock_queues();
4503 
4504 		if (req_lowpriority == TRUE) {
4505 			policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4506 			DTRACE_VM(laundrythrottle);
4507 		} else {
4508 			policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4509 			DTRACE_VM(laundryunthrottle);
4510 		}
4511 		proc_set_thread_policy(ethr->pgo_iothread,
4512 		    TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4513 
4514 		vm_page_lock_queues();
4515 		ethr->q->pgo_lowpriority = req_lowpriority;
4516 	}
4517 }
4518 
4519 OS_NORETURN
4520 static void
vm_pageout_iothread_external(struct pgo_iothread_state * ethr,__unused wait_result_t w)4521 vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
4522 {
4523 	thread_t        self = current_thread();
4524 
4525 	self->options |= TH_OPT_VMPRIV;
4526 
4527 	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4528 
4529 	proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4530 	    TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4531 
4532 	vm_page_lock_queues();
4533 
4534 	vm_pageout_queue_external.pgo_lowpriority = TRUE;
4535 	vm_pageout_queue_external.pgo_inited = TRUE;
4536 
4537 	vm_page_unlock_queues();
4538 
4539 #if CONFIG_THREAD_GROUPS
4540 	thread_group_vm_add();
4541 #endif /* CONFIG_THREAD_GROUPS */
4542 
4543 	vm_pageout_iothread_external_continue(ethr, 0);
4544 	/*NOTREACHED*/
4545 }
4546 
4547 
4548 OS_NORETURN
4549 static void
vm_pageout_iothread_internal(struct pgo_iothread_state * cthr,__unused wait_result_t w)4550 vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
4551 {
4552 	thread_t        self = current_thread();
4553 
4554 	self->options |= TH_OPT_VMPRIV;
4555 
4556 	vm_page_lock_queues();
4557 
4558 	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4559 	vm_pageout_queue_internal.pgo_inited = TRUE;
4560 
4561 #if DEVELOPMENT || DEBUG
4562 	vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
4563 	vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
4564 	vm_pageout_queue_benchmark.pgo_busy = FALSE;
4565 #endif /* DEVELOPMENT || DEBUG */
4566 
4567 	vm_page_unlock_queues();
4568 
4569 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4570 		thread_vm_bind_group_add();
4571 	}
4572 
4573 #if CONFIG_THREAD_GROUPS
4574 	thread_group_vm_add();
4575 #endif /* CONFIG_THREAD_GROUPS */
4576 
4577 #if __AMP__
4578 	if (vm_compressor_ebound) {
4579 		/*
4580 		 * Use the soft bound option for vm_compressor to allow it to run on
4581 		 * P-cores if E-cluster is unavailable.
4582 		 */
4583 		thread_bind_cluster_type(self, 'E', true);
4584 	}
4585 #endif /* __AMP__ */
4586 
4587 	thread_set_thread_name(current_thread(), "VM_compressor");
4588 #if DEVELOPMENT || DEBUG
4589 	vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
4590 #endif
4591 	vm_pageout_iothread_internal_continue(cthr, 0);
4592 
4593 	/*NOTREACHED*/
4594 }
4595 
4596 kern_return_t
vm_set_buffer_cleanup_callout(boolean_t (* func)(int))4597 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4598 {
4599 	if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4600 		return KERN_SUCCESS;
4601 	} else {
4602 		return KERN_FAILURE; /* Already set */
4603 	}
4604 }
4605 
4606 extern boolean_t        memorystatus_manual_testing_on;
4607 extern unsigned int     memorystatus_level;
4608 
4609 
4610 #if VM_PRESSURE_EVENTS
4611 
4612 boolean_t vm_pressure_events_enabled = FALSE;
4613 
4614 extern uint64_t next_warning_notification_sent_at_ts;
4615 extern uint64_t next_critical_notification_sent_at_ts;
4616 
4617 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS    (30)    /* 30 minutes. */
4618 
/*
 * The last time there was a change in pressure level OR we forced a check
 * because the system is stuck in a non-normal pressure level.
 */
4623 uint64_t  vm_pressure_last_level_transition_abs = 0;
4624 
/*
 * This is how long the system waits 'stuck' in an unchanged non-normal
 * pressure level before resending notifications for that level.
 */
4629 int  vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4630 
4631 void
vm_pressure_response(void)4632 vm_pressure_response(void)
4633 {
4634 	vm_pressure_level_t     old_level = kVMPressureNormal;
4635 	int                     new_level = -1;
4636 	unsigned int            total_pages;
4637 	uint64_t                available_memory = 0;
4638 	uint64_t                curr_ts, abs_time_since_level_transition, time_in_ns;
4639 	bool                    force_check = false;
4640 	int                     time_in_mins;
4641 
4642 
4643 	if (vm_pressure_events_enabled == FALSE) {
4644 		return;
4645 	}
4646 
4647 #if !XNU_TARGET_OS_OSX
4648 
4649 	available_memory = (uint64_t) memorystatus_available_pages;
4650 
4651 #else /* !XNU_TARGET_OS_OSX */
4652 
4653 	available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4654 	memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4655 
4656 #endif /* !XNU_TARGET_OS_OSX */
4657 
4658 	total_pages = (unsigned int) atop_64(max_mem);
4659 #if CONFIG_SECLUDED_MEMORY
4660 	total_pages -= vm_page_secluded_count;
4661 #endif /* CONFIG_SECLUDED_MEMORY */
4662 	memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4663 
4664 	if (memorystatus_manual_testing_on) {
4665 		return;
4666 	}
4667 
4668 	curr_ts = mach_absolute_time();
4669 	abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4670 
4671 	absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4672 	time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4673 	force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
4674 
4675 	old_level = memorystatus_vm_pressure_level;
4676 
4677 	switch (memorystatus_vm_pressure_level) {
4678 	case kVMPressureNormal:
4679 	{
4680 		if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4681 			new_level = kVMPressureCritical;
4682 		} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4683 			new_level = kVMPressureWarning;
4684 		}
4685 		break;
4686 	}
4687 
4688 	case kVMPressureWarning:
4689 	case kVMPressureUrgent:
4690 	{
4691 		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4692 			new_level = kVMPressureNormal;
4693 		} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4694 			new_level = kVMPressureCritical;
4695 		} else if (force_check) {
4696 			new_level = kVMPressureWarning;
4697 			next_warning_notification_sent_at_ts = curr_ts;
4698 		}
4699 		break;
4700 	}
4701 
4702 	case kVMPressureCritical:
4703 	{
4704 		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4705 			new_level = kVMPressureNormal;
4706 		} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4707 			new_level = kVMPressureWarning;
4708 		} else if (force_check) {
4709 			new_level = kVMPressureCritical;
4710 			next_critical_notification_sent_at_ts = curr_ts;
4711 		}
4712 		break;
4713 	}
4714 
4715 	default:
4716 		return;
4717 	}
4718 
4719 	if (new_level != -1 || force_check) {
4720 		if (new_level != -1) {
4721 			memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4722 
4723 			if (new_level != (int) old_level) {
4724 				VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4725 				    new_level, old_level, 0, 0);
4726 			}
4727 		} else {
4728 			VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4729 			    new_level, old_level, force_check, 0);
4730 		}
4731 
4732 		if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
4733 			/*
4734 			 * We don't want to schedule a wakeup while hibernation is in progress
4735 			 * because that could collide with checks for non-monotonicity in the scheduler.
4736 			 * We do however do all the updates to memorystatus_vm_pressure_level because
4737 			 * we _might_ want to use that for decisions regarding which pages or how
4738 			 * many pages we want to dump in hibernation.
4739 			 */
4740 			return;
4741 		}
4742 
4743 		if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4744 			if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4745 				thread_wakeup(&vm_pressure_thread);
4746 			}
4747 
4748 			if (old_level != memorystatus_vm_pressure_level) {
4749 				thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4750 			}
4751 			vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4752 		}
4753 	}
4754 }
4755 #endif /* VM_PRESSURE_EVENTS */
4756 
4757 
4758 /**
4759  * Called by a kernel thread to ask if a number of pages may be wired.
4760  */
4761 kern_return_t
mach_vm_wire_level_monitor(int64_t requested_pages)4762 mach_vm_wire_level_monitor(int64_t requested_pages)
4763 {
4764 	if (requested_pages <= 0) {
4765 		return KERN_INVALID_ARGUMENT;
4766 	}
4767 
4768 	const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4769 	/**
4770 	 * Available pages can be negative in the case where more system memory is
4771 	 * wired than the threshold, so we must use a signed integer.
4772 	 */
4773 	const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4774 
4775 	if (requested_pages > available_pages) {
4776 		return KERN_RESOURCE_SHORTAGE;
4777 	}
4778 	return KERN_SUCCESS;
4779 }
4780 
4781 /*
4782  * Function called by a kernel thread to either get the current pressure level or
4783  * wait until memory pressure changes from a given level.
4784  */
4785 kern_return_t
mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure,__unused unsigned int * pressure_level)4786 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
4787 {
4788 #if !VM_PRESSURE_EVENTS
4789 
4790 	return KERN_FAILURE;
4791 
4792 #else /* VM_PRESSURE_EVENTS */
4793 
4794 	wait_result_t       wr = 0;
4795 	vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4796 
4797 	if (pressure_level == NULL) {
4798 		return KERN_INVALID_ARGUMENT;
4799 	}
4800 
4801 	if (*pressure_level == kVMPressureJetsam) {
4802 		if (!wait_for_pressure) {
4803 			return KERN_INVALID_ARGUMENT;
4804 		}
4805 
4806 		lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
4807 		wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
4808 		    THREAD_INTERRUPTIBLE);
4809 		if (wr == THREAD_WAITING) {
4810 			++memorystatus_jetsam_fg_band_waiters;
4811 			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4812 			wr = thread_block(THREAD_CONTINUE_NULL);
4813 		} else {
4814 			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4815 		}
4816 		if (wr != THREAD_AWAKENED) {
4817 			return KERN_ABORTED;
4818 		}
4819 		*pressure_level = kVMPressureJetsam;
4820 		return KERN_SUCCESS;
4821 	}
4822 
4823 	if (wait_for_pressure == TRUE) {
4824 		while (old_level == *pressure_level) {
4825 			wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4826 			    THREAD_INTERRUPTIBLE);
4827 			if (wr == THREAD_WAITING) {
4828 				wr = thread_block(THREAD_CONTINUE_NULL);
4829 			}
4830 			if (wr == THREAD_INTERRUPTED) {
4831 				return KERN_ABORTED;
4832 			}
4833 
4834 			if (wr == THREAD_AWAKENED) {
4835 				old_level = memorystatus_vm_pressure_level;
4836 			}
4837 		}
4838 	}
4839 
4840 	*pressure_level = old_level;
4841 	return KERN_SUCCESS;
4842 #endif /* VM_PRESSURE_EVENTS */
4843 }
4844 
4845 #if VM_PRESSURE_EVENTS
4846 void
vm_pressure_thread(void)4847 vm_pressure_thread(void)
4848 {
4849 	static boolean_t thread_initialized = FALSE;
4850 
4851 	if (thread_initialized == TRUE) {
4852 		vm_pageout_state.vm_pressure_thread_running = TRUE;
4853 		consider_vm_pressure_events();
4854 		vm_pageout_state.vm_pressure_thread_running = FALSE;
4855 	}
4856 
4857 #if CONFIG_THREAD_GROUPS
4858 	thread_group_vm_add();
4859 #endif /* CONFIG_THREAD_GROUPS */
4860 
4861 	thread_set_thread_name(current_thread(), "VM_pressure");
4862 	thread_initialized = TRUE;
4863 	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4864 	thread_block((thread_continue_t)vm_pressure_thread);
4865 }
4866 #endif /* VM_PRESSURE_EVENTS */
4867 
4868 
4869 /*
4870  * called once per-second via "compute_averages"
4871  */
4872 void
compute_pageout_gc_throttle(__unused void * arg)4873 compute_pageout_gc_throttle(__unused void *arg)
4874 {
4875 	if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4876 		vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4877 
4878 		thread_wakeup(VM_PAGEOUT_GC_EVENT);
4879 	}
4880 }
4881 
4882 /*
4883  * vm_pageout_garbage_collect can also be called when the zone allocator needs
4884  * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4885  * jetsams. We need to check if the zone map size is above its jetsam limit to
4886  * decide if this was indeed the case.
4887  *
4888  * We need to do this on a different thread because of the following reasons:
4889  *
4890  * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4891  * itself causing the system to hang. We perform synchronous jetsams if we're
4892  * leaking in the VM map entries zone, so the leaking process could be doing a
4893  * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4894  * jetsam itself. We also need the vm_map lock on the process termination path,
4895  * which would now lead the dying process to deadlock against itself.
4896  *
4897  * 2. The jetsam path might need to allocate zone memory itself. We could try
4898  * using the non-blocking variant of zalloc for this path, but we can still
4899  * end up trying to do a kmem_alloc when the zone maps are almost full.
4900  */
4901 __dead2
4902 void
vm_pageout_garbage_collect(void * step,wait_result_t wr __unused)4903 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4904 {
4905 	assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4906 
4907 	if (step == VM_PAGEOUT_GC_INIT) {
4908 		/* first time being called is not about GC */
4909 #if CONFIG_THREAD_GROUPS
4910 		thread_group_vm_add();
4911 #endif /* CONFIG_THREAD_GROUPS */
4912 	} else if (zone_map_nearing_exhaustion()) {
4913 		/*
4914 		 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4915 		 *
4916 		 * Bail out after calling zone_gc (which triggers the
4917 		 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4918 		 * operations that clear out a bunch of caches might allocate zone
4919 		 * memory themselves (for eg. vm_map operations would need VM map
4920 		 * entries). Since the zone map is almost full at this point, we
4921 		 * could end up with a panic. We just need to quickly jetsam a
4922 		 * process and exit here.
4923 		 *
4924 		 * It could so happen that we were woken up to relieve memory
4925 		 * pressure and the zone map also happened to be near its limit at
4926 		 * the time, in which case we'll skip out early. But that should be
4927 		 * ok; if memory pressure persists, the thread will simply be woken
4928 		 * up again.
4929 		 */
4930 		zone_gc(ZONE_GC_JETSAM);
4931 	} else {
4932 		/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4933 		boolean_t buf_large_zfree = FALSE;
4934 		boolean_t first_try = TRUE;
4935 
4936 		stack_collect();
4937 
4938 		consider_machine_collect();
4939 		mbuf_drain(FALSE);
4940 
4941 		do {
4942 			if (consider_buffer_cache_collect != NULL) {
4943 				buf_large_zfree = (*consider_buffer_cache_collect)(0);
4944 			}
4945 			if (first_try == TRUE || buf_large_zfree == TRUE) {
4946 				/*
4947 				 * zone_gc should be last, because the other operations
4948 				 * might return memory to zones.
4949 				 */
4950 				zone_gc(ZONE_GC_TRIM);
4951 			}
4952 			first_try = FALSE;
4953 		} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
4954 
4955 		consider_machine_adjust();
4956 	}
4957 
4958 	assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);
4959 
4960 	thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
4961 	__builtin_unreachable();
4962 }
4963 
4964 
4965 #if VM_PAGE_BUCKETS_CHECK
4966 #if VM_PAGE_FAKE_BUCKETS
4967 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4968 #endif /* VM_PAGE_FAKE_BUCKETS */
4969 #endif /* VM_PAGE_BUCKETS_CHECK */
4970 
4971 
4972 
4973 void
vm_set_restrictions(unsigned int num_cpus)4974 vm_set_restrictions(unsigned int num_cpus)
4975 {
4976 	int vm_restricted_to_single_processor = 0;
4977 
4978 	if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
4979 		kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
4980 		vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
4981 	} else {
4982 		assert(num_cpus > 0);
4983 
4984 		if (num_cpus <= 3) {
4985 			/*
4986 			 * on systems with a limited number of CPUS, bind the
4987 			 * 4 major threads that can free memory and that tend to use
4988 			 * a fair bit of CPU under pressured conditions to a single processor.
4989 			 * This insures that these threads don't hog all of the available CPUs
4990 			 * (important for camera launch), while allowing them to run independently
4991 			 * w/r to locks... the 4 threads are
4992 			 * vm_pageout_scan,  vm_pageout_iothread_internal (compressor),
4993 			 * vm_compressor_swap_trigger_thread (minor and major compactions),
4994 			 * memorystatus_thread (jetsams).
4995 			 *
4996 			 * the first time the thread is run, it is responsible for checking the
4997 			 * state of vm_restricted_to_single_processor, and if TRUE it calls
4998 			 * thread_bind_master...  someday this should be replaced with a group
4999 			 * scheduling mechanism and KPI.
5000 			 */
5001 			vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5002 		} else {
5003 			vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5004 		}
5005 	}
5006 }
5007 
5008 /*
5009  * Set up vm_config based on the vm_compressor_mode.
5010  * Must run BEFORE the pageout thread starts up.
5011  */
5012 __startup_func
5013 void
vm_config_init(void)5014 vm_config_init(void)
5015 {
5016 	bzero(&vm_config, sizeof(vm_config));
5017 
5018 	switch (vm_compressor_mode) {
5019 	case VM_PAGER_DEFAULT:
5020 		printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
5021 		OS_FALLTHROUGH;
5022 
5023 	case VM_PAGER_COMPRESSOR_WITH_SWAP:
5024 		vm_config.compressor_is_present = TRUE;
5025 		vm_config.swap_is_present = TRUE;
5026 		vm_config.compressor_is_active = TRUE;
5027 		vm_config.swap_is_active = TRUE;
5028 		break;
5029 
5030 	case VM_PAGER_COMPRESSOR_NO_SWAP:
5031 		vm_config.compressor_is_present = TRUE;
5032 		vm_config.swap_is_present = TRUE;
5033 		vm_config.compressor_is_active = TRUE;
5034 		break;
5035 
5036 	case VM_PAGER_FREEZER_DEFAULT:
5037 		printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5038 		OS_FALLTHROUGH;
5039 
5040 	case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5041 		vm_config.compressor_is_present = TRUE;
5042 		vm_config.swap_is_present = TRUE;
5043 		break;
5044 
5045 	case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5046 		vm_config.compressor_is_present = TRUE;
5047 		vm_config.swap_is_present = TRUE;
5048 		vm_config.compressor_is_active = TRUE;
5049 		vm_config.freezer_swap_is_active = TRUE;
5050 		break;
5051 
5052 	case VM_PAGER_NOT_CONFIGURED:
5053 		break;
5054 
5055 	default:
5056 		printf("unknown compressor mode - %x\n", vm_compressor_mode);
5057 		break;
5058 	}
5059 }
5060 
5061 __startup_func
5062 static void
vm_pageout_create_gc_thread(void)5063 vm_pageout_create_gc_thread(void)
5064 {
5065 	thread_t thread;
5066 
5067 	if (kernel_thread_create(vm_pageout_garbage_collect,
5068 	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
5069 		panic("vm_pageout_garbage_collect: create failed");
5070 	}
5071 	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
5072 	if (thread->reserved_stack == 0) {
5073 		assert(thread->kernel_stack);
5074 		thread->reserved_stack = thread->kernel_stack;
5075 	}
5076 
5077 	/* thread is started in vm_pageout() */
5078 	vm_pageout_gc_thread = thread;
5079 }
5080 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5081 
5082 void
vm_pageout(void)5083 vm_pageout(void)
5084 {
5085 	thread_t        self = current_thread();
5086 	thread_t        thread;
5087 	kern_return_t   result;
5088 	spl_t           s;
5089 
5090 	/*
5091 	 * Set thread privileges.
5092 	 */
5093 	s = splsched();
5094 
5095 #if CONFIG_VPS_DYNAMIC_PRIO
5096 
5097 	int             vps_dynprio_bootarg = 0;
5098 
5099 	if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) {
5100 		vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? TRUE : FALSE);
5101 		kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled);
5102 	} else {
5103 		if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
5104 			vps_dynamic_priority_enabled = TRUE;
5105 		} else {
5106 			vps_dynamic_priority_enabled = FALSE;
5107 		}
5108 	}
5109 
5110 	if (vps_dynamic_priority_enabled) {
5111 		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
5112 		thread_set_eager_preempt(self);
5113 	} else {
5114 		sched_set_kernel_thread_priority(self, BASEPRI_VM);
5115 	}
5116 
5117 #else /* CONFIG_VPS_DYNAMIC_PRIO */
5118 
5119 	vps_dynamic_priority_enabled = FALSE;
5120 	sched_set_kernel_thread_priority(self, BASEPRI_VM);
5121 
5122 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
5123 
5124 	thread_lock(self);
5125 	self->options |= TH_OPT_VMPRIV;
5126 	thread_unlock(self);
5127 
5128 	if (!self->reserved_stack) {
5129 		self->reserved_stack = self->kernel_stack;
5130 	}
5131 
5132 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
5133 	    vps_dynamic_priority_enabled == FALSE) {
5134 		thread_vm_bind_group_add();
5135 	}
5136 
5137 
5138 #if CONFIG_THREAD_GROUPS
5139 	thread_group_vm_add();
5140 #endif /* CONFIG_THREAD_GROUPS */
5141 
5142 #if __AMP__
5143 	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
5144 	if (vm_pgo_pbound) {
5145 		/*
5146 		 * Use the soft bound option for vm pageout to allow it to run on
5147 		 * E-cores if P-cluster is unavailable.
5148 		 */
5149 		thread_bind_cluster_type(self, 'P', true);
5150 	}
5151 #endif /* __AMP__ */
5152 
5153 	PE_parse_boot_argn("vmpgo_protect_realtime",
5154 	    &vm_pageout_protect_realtime,
5155 	    sizeof(vm_pageout_protect_realtime));
5156 	splx(s);
5157 
5158 	thread_set_thread_name(current_thread(), "VM_pageout_scan");
5159 
5160 	/*
5161 	 *	Initialize some paging parameters.
5162 	 */
5163 
5164 	vm_pageout_state.vm_pressure_thread_running = FALSE;
5165 	vm_pageout_state.vm_pressure_changed = FALSE;
5166 	vm_pageout_state.memorystatus_purge_on_warning = 2;
5167 	vm_pageout_state.memorystatus_purge_on_urgent = 5;
5168 	vm_pageout_state.memorystatus_purge_on_critical = 8;
5169 	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
5170 	vm_pageout_state.vm_page_speculative_percentage = 5;
5171 	vm_pageout_state.vm_page_speculative_target = 0;
5172 
5173 	vm_pageout_state.vm_pageout_swap_wait = 0;
5174 	vm_pageout_state.vm_pageout_idle_wait = 0;
5175 	vm_pageout_state.vm_pageout_empty_wait = 0;
5176 	vm_pageout_state.vm_pageout_burst_wait = 0;
5177 	vm_pageout_state.vm_pageout_deadlock_wait = 0;
5178 	vm_pageout_state.vm_pageout_deadlock_relief = 0;
5179 	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
5180 
5181 	vm_pageout_state.vm_pageout_inactive = 0;
5182 	vm_pageout_state.vm_pageout_inactive_used = 0;
5183 	vm_pageout_state.vm_pageout_inactive_clean = 0;
5184 
5185 	vm_pageout_state.vm_memory_pressure = 0;
5186 	vm_pageout_state.vm_page_filecache_min = 0;
5187 #if CONFIG_JETSAM
5188 	vm_pageout_state.vm_page_filecache_min_divisor = 70;
5189 	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
5190 #else
5191 	vm_pageout_state.vm_page_filecache_min_divisor = 27;
5192 	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
5193 #endif
5194 	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
5195 
5196 	vm_pageout_state.vm_pageout_considered_page_last = 0;
5197 
5198 	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
5199 		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
5200 	}
5201 
5202 	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
5203 		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
5204 	}
5205 
5206 	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
5207 		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
5208 	}
5209 
5210 	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
5211 		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
5212 	}
5213 
5214 	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
5215 		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
5216 	}
5217 
5218 	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
5219 		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
5220 	}
5221 
5222 	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
5223 		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
5224 	}
5225 	/*
5226 	 * even if we've already called vm_page_free_reserve
5227 	 * call it again here to insure that the targets are
5228 	 * accurately calculated (it uses vm_page_free_count_init)
5229 	 * calling it with an arg of 0 will not change the reserve
5230 	 * but will re-calculate free_min and free_target
5231 	 */
5232 	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
5233 		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
5234 	} else {
5235 		vm_page_free_reserve(0);
5236 	}
5237 
5238 	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
5239 	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5240 
5241 	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5242 	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5243 
5244 	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5245 
5246 #if DEVELOPMENT || DEBUG
5247 	bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
5248 	vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
5249 #endif /* DEVELOPMENT || DEBUG */
5250 
5251 
5252 	/* internal pageout thread started when default pager registered first time */
5253 	/* external pageout and garbage collection threads started here */
5254 	struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
5255 	ethr->id = 0;
5256 	ethr->q = &vm_pageout_queue_external;
5257 	ethr->current_early_swapout_chead = NULL;
5258 	ethr->current_regular_swapout_chead = NULL;
5259 	ethr->current_late_swapout_chead = NULL;
5260 	ethr->scratch_buf = NULL;
5261 #if DEVELOPMENT || DEBUG
5262 	ethr->benchmark_q = NULL;
5263 #endif /* DEVELOPMENT || DEBUG */
5264 	sched_cond_init(&(ethr->pgo_wakeup));
5265 
5266 	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
5267 	    (void *)ethr, BASEPRI_VM,
5268 	    &(ethr->pgo_iothread));
5269 	if (result != KERN_SUCCESS) {
5270 		panic("vm_pageout: Unable to create external thread (%d)\n", result);
5271 	}
5272 	thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");
5273 
5274 	thread_mtx_lock(vm_pageout_gc_thread );
5275 	thread_start(vm_pageout_gc_thread );
5276 	thread_mtx_unlock(vm_pageout_gc_thread);
5277 
5278 #if VM_PRESSURE_EVENTS
5279 	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5280 	    BASEPRI_DEFAULT,
5281 	    &thread);
5282 
5283 	if (result != KERN_SUCCESS) {
5284 		panic("vm_pressure_thread: create failed");
5285 	}
5286 
5287 	thread_deallocate(thread);
5288 #endif
5289 
5290 	vm_object_reaper_init();
5291 
5292 
5293 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5294 		vm_compressor_init();
5295 	}
5296 
5297 #if VM_PRESSURE_EVENTS
5298 	vm_pressure_events_enabled = TRUE;
5299 #endif /* VM_PRESSURE_EVENTS */
5300 
5301 #if CONFIG_PHANTOM_CACHE
5302 	vm_phantom_cache_init();
5303 #endif
5304 #if VM_PAGE_BUCKETS_CHECK
5305 #if VM_PAGE_FAKE_BUCKETS
5306 	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5307 	    (uint64_t) vm_page_fake_buckets_start,
5308 	    (uint64_t) vm_page_fake_buckets_end);
5309 	pmap_protect(kernel_pmap,
5310 	    vm_page_fake_buckets_start,
5311 	    vm_page_fake_buckets_end,
5312 	    VM_PROT_READ);
5313 //	*(char *) vm_page_fake_buckets_start = 'x';	/* panic! */
5314 #endif /* VM_PAGE_FAKE_BUCKETS */
5315 #endif /* VM_PAGE_BUCKETS_CHECK */
5316 
5317 #if VM_OBJECT_TRACKING
5318 	vm_object_tracking_init();
5319 #endif /* VM_OBJECT_TRACKING */
5320 
5321 #if __arm64__
5322 //	vm_tests();
5323 #endif /* __arm64__ */
5324 
5325 	vm_pageout_continue();
5326 
5327 	/*
5328 	 * Unreached code!
5329 	 *
5330 	 * The vm_pageout_continue() call above never returns, so the code below is never
5331 	 * executed.  We take advantage of this to declare several DTrace VM related probe
5332 	 * points that our kernel doesn't have an analog for.  These are probe points that
5333 	 * exist in Solaris and are in the DTrace documentation, so people may have written
5334 	 * scripts that use them.  Declaring the probe points here means their scripts will
5335 	 * compile and execute which we want for portability of the scripts, but since this
5336 	 * section of code is never reached, the probe points will simply never fire.  Yes,
5337 	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
5338 	 * Solaris specific VM events in mind, not portability to different VM implementations.
5339 	 */
5340 
5341 	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5342 	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5343 	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5344 	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5345 	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5346 	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5347 	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5348 	/*NOTREACHED*/
5349 }
5350 
5351 
5352 
5353 kern_return_t
vm_pageout_internal_start(void)5354 vm_pageout_internal_start(void)
5355 {
5356 	kern_return_t   result = KERN_SUCCESS;
5357 	host_basic_info_data_t hinfo;
5358 	vm_offset_t     buf, bufsize;
5359 
5360 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5361 
5362 	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5363 #define BSD_HOST 1
5364 	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5365 
5366 	assert(hinfo.max_cpus > 0);
5367 
5368 #if !XNU_TARGET_OS_OSX
5369 	vm_pageout_state.vm_compressor_thread_count = 1;
5370 #else /* !XNU_TARGET_OS_OSX */
5371 	if (hinfo.max_cpus > 4) {
5372 		vm_pageout_state.vm_compressor_thread_count = 2;
5373 	} else {
5374 		vm_pageout_state.vm_compressor_thread_count = 1;
5375 	}
5376 #endif /* !XNU_TARGET_OS_OSX */
5377 #if     __AMP__
5378 	if (vm_compressor_ebound) {
5379 		vm_pageout_state.vm_compressor_thread_count = 2;
5380 	}
5381 #endif
5382 	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5383 	    sizeof(vm_pageout_state.vm_compressor_thread_count));
5384 
5385 	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5386 		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5387 	}
5388 	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5389 		vm_pageout_state.vm_compressor_thread_count = 1;
5390 	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5391 		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5392 	}
5393 
5394 	vm_pageout_queue_internal.pgo_maxlaundry =
5395 	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5396 
5397 	PE_parse_boot_argn("vmpgoi_maxlaundry",
5398 	    &vm_pageout_queue_internal.pgo_maxlaundry,
5399 	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5400 
5401 #if DEVELOPMENT || DEBUG
5402 	// Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
5403 	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
5404 #endif /* DEVELOPMENT || DEBUG */
5405 
5406 	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5407 
5408 	kmem_alloc(kernel_map, &buf,
5409 	    bufsize * vm_pageout_state.vm_compressor_thread_count,
5410 	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
5411 	    VM_KERN_MEMORY_COMPRESSOR);
5412 
5413 	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5414 		struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
5415 		iq->id = i;
5416 		iq->q = &vm_pageout_queue_internal;
5417 		iq->current_early_swapout_chead = NULL;
5418 		iq->current_regular_swapout_chead = NULL;
5419 		iq->current_late_swapout_chead = NULL;
5420 		iq->scratch_buf = (char *)(buf + i * bufsize);
5421 #if DEVELOPMENT || DEBUG
5422 		iq->benchmark_q = &vm_pageout_queue_benchmark;
5423 #endif /* DEVELOPMENT || DEBUG */
5424 		sched_cond_init(&(iq->pgo_wakeup));
5425 		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5426 		    (void *)iq, BASEPRI_VM,
5427 		    &(iq->pgo_iothread));
5428 
5429 		if (result != KERN_SUCCESS) {
5430 			panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
5431 		}
5432 	}
5433 	return result;
5434 }
5435 
5436 #if CONFIG_IOSCHED
5437 /*
5438  * To support I/O Expedite for compressed files we mark the upls with special flags.
5439  * The way decmpfs works is that we create a big upl which marks all the pages needed to
5440  * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5441  * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5442  * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5443  * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5444  * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
 * by the req upl lock (the reverse link doesn't need synch. since we never inspect this link
5446  * unless the real I/O upl is being destroyed).
5447  */
5448 
5449 
5450 static void
upl_set_decmp_info(upl_t upl,upl_t src_upl)5451 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5452 {
5453 	assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5454 
5455 	upl_lock(src_upl);
5456 	if (src_upl->decmp_io_upl) {
5457 		/*
5458 		 * If there is already an alive real I/O UPL, ignore this new UPL.
5459 		 * This case should rarely happen and even if it does, it just means
5460 		 * that we might issue a spurious expedite which the driver is expected
5461 		 * to handle.
5462 		 */
5463 		upl_unlock(src_upl);
5464 		return;
5465 	}
5466 	src_upl->decmp_io_upl = (void *)upl;
5467 	src_upl->ref_count++;
5468 
5469 	upl->flags |= UPL_DECMP_REAL_IO;
5470 	upl->decmp_io_upl = (void *)src_upl;
5471 	upl_unlock(src_upl);
5472 }
5473 #endif /* CONFIG_IOSCHED */
5474 
/*
 * Non-zero when every UPL should be tracked, regardless of whether its
 * object asked for I/O tracking.  Defaults to 1 only in UPL_DEBUG builds.
 * upl_create(), upl_destroy() and vm_object_upl_request() consult it
 * (in CONFIG_IOSCHED || UPL_DEBUG builds) when deciding whether a UPL is
 * recorded on its object's uplq.
 */
#if UPL_DEBUG
int     upl_debug_enabled = 1;
#else
int     upl_debug_enabled = 0;
#endif
5480 
5481 static upl_t
upl_create(int type,int flags,upl_size_t size)5482 upl_create(int type, int flags, upl_size_t size)
5483 {
5484 	uint32_t pages = (uint32_t)atop(round_page_32(size));
5485 	upl_t    upl;
5486 
5487 	assert(page_aligned(size));
5488 
5489 	/*
5490 	 * FIXME: this code assumes the allocation always succeeds,
5491 	 *        however `pages` can be up to MAX_UPL_SIZE.
5492 	 *
5493 	 *        The allocation size is above 32k (resp. 128k)
5494 	 *        on 16k pages (resp. 4k), which kalloc might fail
5495 	 *        to allocate.
5496 	 */
5497 	upl = kalloc_type(struct upl, struct upl_page_info,
5498 	    (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
5499 	if (type & UPL_CREATE_INTERNAL) {
5500 		flags |= UPL_INTERNAL;
5501 	}
5502 
5503 	if (type & UPL_CREATE_LITE) {
5504 		flags |= UPL_LITE;
5505 		if (pages) {
5506 			upl->lite_list = bitmap_alloc(pages);
5507 		}
5508 	}
5509 
5510 	upl->flags = flags;
5511 	upl->ref_count = 1;
5512 	upl_lock_init(upl);
5513 #if CONFIG_IOSCHED
5514 	if (type & UPL_CREATE_IO_TRACKING) {
5515 		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5516 	}
5517 
5518 	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5519 		/* Only support expedite on internal UPLs */
5520 		thread_t        curthread = current_thread();
5521 		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
5522 		    Z_WAITOK | Z_ZERO);
5523 		upl->flags |= UPL_EXPEDITE_SUPPORTED;
5524 		if (curthread->decmp_upl != NULL) {
5525 			upl_set_decmp_info(upl, curthread->decmp_upl);
5526 		}
5527 	}
5528 #endif
5529 #if CONFIG_IOSCHED || UPL_DEBUG
5530 	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5531 		upl->upl_creator = current_thread();
5532 		upl->flags |= UPL_TRACKED_BY_OBJECT;
5533 	}
5534 #endif
5535 
5536 #if UPL_DEBUG
5537 	upl->uple_create_btref = btref_get(__builtin_frame_address(0), 0);
5538 #endif /* UPL_DEBUG */
5539 
5540 	return upl;
5541 }
5542 
5543 static void
upl_destroy(upl_t upl)5544 upl_destroy(upl_t upl)
5545 {
5546 	uint32_t pages;
5547 
5548 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5549 
5550 	if (upl->ext_ref_count) {
5551 		panic("upl(%p) ext_ref_count", upl);
5552 	}
5553 
5554 #if CONFIG_IOSCHED
5555 	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5556 		upl_t src_upl;
5557 		src_upl = upl->decmp_io_upl;
5558 		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5559 		upl_lock(src_upl);
5560 		src_upl->decmp_io_upl = NULL;
5561 		upl_unlock(src_upl);
5562 		upl_deallocate(src_upl);
5563 	}
5564 #endif /* CONFIG_IOSCHED */
5565 
5566 #if CONFIG_IOSCHED || UPL_DEBUG
5567 	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5568 	    !(upl->flags & UPL_VECTOR)) {
5569 		vm_object_t     object;
5570 
5571 		if (upl->flags & UPL_SHADOWED) {
5572 			object = upl->map_object->shadow;
5573 		} else {
5574 			object = upl->map_object;
5575 		}
5576 
5577 		vm_object_lock(object);
5578 		queue_remove(&object->uplq, upl, upl_t, uplq);
5579 		vm_object_activity_end(object);
5580 		vm_object_collapse(object, 0, TRUE);
5581 		vm_object_unlock(object);
5582 	}
5583 #endif
5584 	/*
5585 	 * drop a reference on the map_object whether or
5586 	 * not a pageout object is inserted
5587 	 */
5588 	if (upl->flags & UPL_SHADOWED) {
5589 		vm_object_deallocate(upl->map_object);
5590 	}
5591 
5592 	if (upl->flags & UPL_DEVICE_MEMORY) {
5593 		pages = 1;
5594 	} else {
5595 		pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
5596 	}
5597 
5598 	upl_lock_destroy(upl);
5599 
5600 #if CONFIG_IOSCHED
5601 	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5602 		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
5603 	}
5604 #endif
5605 
5606 #if UPL_DEBUG
5607 	for (int i = 0; i < upl->upl_commit_index; i++) {
5608 		btref_put(upl->upl_commit_records[i].c_btref);
5609 	}
5610 	btref_put(upl->uple_create_btref);
5611 #endif /* UPL_DEBUG */
5612 
5613 	if ((upl->flags & UPL_LITE) && pages) {
5614 		bitmap_free(upl->lite_list, pages);
5615 	}
5616 	kfree_type(struct upl, struct upl_page_info,
5617 	    (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
5618 }
5619 
5620 void
upl_deallocate(upl_t upl)5621 upl_deallocate(upl_t upl)
5622 {
5623 	upl_lock(upl);
5624 
5625 	if (--upl->ref_count == 0) {
5626 		if (vector_upl_is_valid(upl)) {
5627 			vector_upl_deallocate(upl);
5628 		}
5629 		upl_unlock(upl);
5630 
5631 		if (upl->upl_iodone) {
5632 			upl_callout_iodone(upl);
5633 		}
5634 
5635 		upl_destroy(upl);
5636 	} else {
5637 		upl_unlock(upl);
5638 	}
5639 }
5640 
5641 #if CONFIG_IOSCHED
5642 void
upl_mark_decmp(upl_t upl)5643 upl_mark_decmp(upl_t upl)
5644 {
5645 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5646 		upl->flags |= UPL_DECMP_REQ;
5647 		upl->upl_creator->decmp_upl = (void *)upl;
5648 	}
5649 }
5650 
5651 void
upl_unmark_decmp(upl_t upl)5652 upl_unmark_decmp(upl_t upl)
5653 {
5654 	if (upl && (upl->flags & UPL_DECMP_REQ)) {
5655 		upl->upl_creator->decmp_upl = NULL;
5656 	}
5657 }
5658 
5659 #endif /* CONFIG_IOSCHED */
5660 
5661 #define VM_PAGE_Q_BACKING_UP(q)         \
5662 	((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5663 
5664 boolean_t must_throttle_writes(void);
5665 
5666 boolean_t
must_throttle_writes()5667 must_throttle_writes()
5668 {
5669 	if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5670 	    vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5671 		return TRUE;
5672 	}
5673 
5674 	return FALSE;
5675 }
5676 
/*
 * Count of vm_page_delayed_work_get_ctx() calls that could not get a
 * context from dw_ctx_zone.  Incremented without locking or atomics.
 */
int vm_page_delayed_work_ctx_needed = 0;
/* Zone backing the delayed-work contexts handed out by vm_page_delayed_work_get_ctx(). */
KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5679 
5680 __startup_func
5681 static void
vm_page_delayed_work_init_ctx(void)5682 vm_page_delayed_work_init_ctx(void)
5683 {
5684 	uint16_t min_delayed_work_ctx_allocated = 16;
5685 
5686 	/*
5687 	 * try really hard to always keep NCPU elements around in the zone
5688 	 * in order for the UPL code to almost always get an element.
5689 	 */
5690 	if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5691 		min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5692 	}
5693 
5694 	zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5695 }
5696 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5697 
5698 struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)5699 vm_page_delayed_work_get_ctx(void)
5700 {
5701 	struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5702 
5703 	dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5704 
5705 	if (__probable(dw_ctx)) {
5706 		dw_ctx->delayed_owner = current_thread();
5707 	} else {
5708 		vm_page_delayed_work_ctx_needed++;
5709 	}
5710 	return dw_ctx ? dw_ctx->dwp : NULL;
5711 }
5712 
5713 void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work * dwp)5714 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5715 {
5716 	struct  vm_page_delayed_work_ctx *ldw_ctx;
5717 
5718 	ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5719 	ldw_ctx->delayed_owner = NULL;
5720 
5721 	zfree(dw_ctx_zone, ldw_ctx);
5722 }
5723 
5724 /*
5725  *	Routine:	vm_object_upl_request
5726  *	Purpose:
5727  *		Cause the population of a portion of a vm_object.
5728  *		Depending on the nature of the request, the pages
 *		returned may contain valid data or be uninitialized.
5730  *		A page list structure, listing the physical pages
5731  *		will be returned upon request.
5732  *		This function is called by the file system or any other
5733  *		supplier of backing store to a pager.
5734  *		IMPORTANT NOTE: The caller must still respect the relationship
5735  *		between the vm_object and its backing memory object.  The
5736  *		caller MUST NOT substitute changes in the backing file
5737  *		without first doing a memory_object_lock_request on the
 *		target range unless it is known that the pages are not
5739  *		shared with another entity at the pager level.
5740  *		Copy_in_to:
5741  *			if a page list structure is present
5742  *			return the mapped physical pages, where a
5743  *			page is not present, return a non-initialized
5744  *			one.  If the no_sync bit is turned on, don't
5745  *			call the pager unlock to synchronize with other
5746  *			possible copies of the page. Leave pages busy
5747  *			in the original object, if a page list structure
5748  *			was specified.  When a commit of the page list
5749  *			pages is done, the dirty bit will be set for each one.
5750  *		Copy_out_from:
5751  *			If a page list structure is present, return
5752  *			all mapped pages.  Where a page does not exist
5753  *			map a zero filled one. Leave pages busy in
5754  *			the original object.  If a page list structure
5755  *			is not specified, this call is a no-op.
5756  *
5757  *		Note:  access of default pager objects has a rather interesting
5758  *		twist.  The caller of this routine, presumably the file system
5759  *		page cache handling code, will never actually make a request
5760  *		against a default pager backed object.  Only the default
5761  *		pager will make requests on backing store related vm_objects
5762  *		In this way the default pager can maintain the relationship
5763  *		between backing store files (abstract memory objects) and
5764  *		the vm_objects (cache objects), they support.
5765  *
5766  */
5767 
5768 __private_extern__ kern_return_t
vm_object_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)5769 vm_object_upl_request(
5770 	vm_object_t             object,
5771 	vm_object_offset_t      offset,
5772 	upl_size_t              size,
5773 	upl_t                   *upl_ptr,
5774 	upl_page_info_array_t   user_page_list,
5775 	unsigned int            *page_list_count,
5776 	upl_control_flags_t     cntrl_flags,
5777 	vm_tag_t                tag)
5778 {
5779 	vm_page_t               dst_page = VM_PAGE_NULL;
5780 	vm_object_offset_t      dst_offset;
5781 	upl_size_t              xfer_size;
5782 	unsigned int            size_in_pages;
5783 	boolean_t               dirty;
5784 	boolean_t               hw_dirty;
5785 	upl_t                   upl = NULL;
5786 	unsigned int            entry;
5787 	vm_page_t               alias_page = NULL;
5788 	int                     refmod_state = 0;
5789 	vm_object_t             last_copy_object;
5790 	struct  vm_page_delayed_work    dw_array;
5791 	struct  vm_page_delayed_work    *dwp, *dwp_start;
5792 	bool                    dwp_finish_ctx = TRUE;
5793 	int                     dw_count;
5794 	int                     dw_limit;
5795 	int                     io_tracking_flag = 0;
5796 	int                     grab_options;
5797 	int                     page_grab_count = 0;
5798 	ppnum_t                 phys_page;
5799 	pmap_flush_context      pmap_flush_context_storage;
5800 	boolean_t               pmap_flushes_delayed = FALSE;
5801 #if DEVELOPMENT || DEBUG
5802 	task_t                  task = current_task();
5803 #endif /* DEVELOPMENT || DEBUG */
5804 
5805 	dwp_start = dwp = NULL;
5806 
5807 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
5808 		/*
5809 		 * For forward compatibility's sake,
5810 		 * reject any unknown flag.
5811 		 */
5812 		return KERN_INVALID_VALUE;
5813 	}
5814 	if ((!object->internal) && (object->paging_offset != 0)) {
5815 		panic("vm_object_upl_request: external object with non-zero paging offset");
5816 	}
5817 	if (object->phys_contiguous) {
5818 		panic("vm_object_upl_request: contiguous object specified");
5819 	}
5820 
5821 	assertf(page_aligned(offset) && page_aligned(size),
5822 	    "offset 0x%llx size 0x%x",
5823 	    offset, size);
5824 
5825 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5826 
5827 	dw_count = 0;
5828 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5829 	dwp_start = vm_page_delayed_work_get_ctx();
5830 	if (dwp_start == NULL) {
5831 		dwp_start = &dw_array;
5832 		dw_limit = 1;
5833 		dwp_finish_ctx = FALSE;
5834 	}
5835 
5836 	dwp = dwp_start;
5837 
5838 	if (size > MAX_UPL_SIZE_BYTES) {
5839 		size = MAX_UPL_SIZE_BYTES;
5840 	}
5841 
5842 	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5843 		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5844 	}
5845 
5846 #if CONFIG_IOSCHED || UPL_DEBUG
5847 	if (object->io_tracking || upl_debug_enabled) {
5848 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5849 	}
5850 #endif
5851 #if CONFIG_IOSCHED
5852 	if (object->io_tracking) {
5853 		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5854 	}
5855 #endif
5856 
5857 	if (cntrl_flags & UPL_SET_INTERNAL) {
5858 		if (cntrl_flags & UPL_SET_LITE) {
5859 			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5860 		} else {
5861 			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5862 		}
5863 		user_page_list = size ? upl->page_list : NULL;
5864 	} else {
5865 		if (cntrl_flags & UPL_SET_LITE) {
5866 			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5867 		} else {
5868 			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5869 		}
5870 	}
5871 	*upl_ptr = upl;
5872 
5873 	if (user_page_list) {
5874 		user_page_list[0].device = FALSE;
5875 	}
5876 
5877 	if (cntrl_flags & UPL_SET_LITE) {
5878 		upl->map_object = object;
5879 	} else {
5880 		upl->map_object = vm_object_allocate(size);
5881 		/*
		 * No need to lock the new object: nobody else knows
5883 		 * about it yet, so it's all ours so far.
5884 		 */
5885 		upl->map_object->shadow = object;
5886 		upl->map_object->pageout = TRUE;
5887 		upl->map_object->can_persist = FALSE;
5888 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5889 		upl->map_object->vo_shadow_offset = offset;
5890 		upl->map_object->wimg_bits = object->wimg_bits;
5891 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
5892 		    "object %p shadow_offset 0x%llx",
5893 		    upl->map_object, upl->map_object->vo_shadow_offset);
5894 
5895 		alias_page = vm_page_grab_fictitious(TRUE);
5896 
5897 		upl->flags |= UPL_SHADOWED;
5898 	}
5899 	if (cntrl_flags & UPL_FOR_PAGEOUT) {
5900 		upl->flags |= UPL_PAGEOUT;
5901 	}
5902 
5903 	vm_object_lock(object);
5904 	vm_object_activity_begin(object);
5905 
5906 	grab_options = 0;
5907 #if CONFIG_SECLUDED_MEMORY
5908 	if (object->can_grab_secluded) {
5909 		grab_options |= VM_PAGE_GRAB_SECLUDED;
5910 	}
5911 #endif /* CONFIG_SECLUDED_MEMORY */
5912 
5913 	/*
5914 	 * we can lock in the paging_offset once paging_in_progress is set
5915 	 */
5916 	upl->u_size = size;
5917 	upl->u_offset = offset + object->paging_offset;
5918 
5919 #if CONFIG_IOSCHED || UPL_DEBUG
5920 	if (object->io_tracking || upl_debug_enabled) {
5921 		vm_object_activity_begin(object);
5922 		queue_enter(&object->uplq, upl, upl_t, uplq);
5923 	}
5924 #endif
5925 	if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
5926 		/*
5927 		 * Honor copy-on-write obligations
5928 		 *
5929 		 * The caller is gathering these pages and
5930 		 * might modify their contents.  We need to
5931 		 * make sure that the copy object has its own
5932 		 * private copies of these pages before we let
5933 		 * the caller modify them.
5934 		 */
5935 		vm_object_update(object,
5936 		    offset,
5937 		    size,
5938 		    NULL,
5939 		    NULL,
5940 		    FALSE,              /* should_return */
5941 		    MEMORY_OBJECT_COPY_SYNC,
5942 		    VM_PROT_NO_CHANGE);
5943 
5944 		VM_PAGEOUT_DEBUG(upl_cow, 1);
5945 		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
5946 	}
5947 	/*
5948 	 * remember which copy object we synchronized with
5949 	 */
5950 	last_copy_object = object->copy;
5951 	entry = 0;
5952 
5953 	xfer_size = size;
5954 	dst_offset = offset;
5955 	size_in_pages = size / PAGE_SIZE;
5956 
5957 	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5958 	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
5959 		object->scan_collisions = 0;
5960 	}
5961 
5962 	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5963 		boolean_t       isSSD = FALSE;
5964 
5965 #if !XNU_TARGET_OS_OSX
5966 		isSSD = TRUE;
5967 #else /* !XNU_TARGET_OS_OSX */
5968 		vnode_pager_get_isSSD(object->pager, &isSSD);
5969 #endif /* !XNU_TARGET_OS_OSX */
5970 		vm_object_unlock(object);
5971 
5972 		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5973 
5974 		if (isSSD == TRUE) {
5975 			delay(1000 * size_in_pages);
5976 		} else {
5977 			delay(5000 * size_in_pages);
5978 		}
5979 		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5980 
5981 		vm_object_lock(object);
5982 	}
5983 
5984 	while (xfer_size) {
5985 		dwp->dw_mask = 0;
5986 
5987 		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
5988 			vm_object_unlock(object);
5989 			alias_page = vm_page_grab_fictitious(TRUE);
5990 			vm_object_lock(object);
5991 		}
5992 		if (cntrl_flags & UPL_COPYOUT_FROM) {
5993 			upl->flags |= UPL_PAGE_SYNC_DONE;
5994 
5995 			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
5996 			    dst_page->vmp_fictitious ||
5997 			    dst_page->vmp_absent ||
5998 			    VMP_ERROR_GET(dst_page) ||
5999 			    dst_page->vmp_cleaning ||
6000 			    (VM_PAGE_WIRED(dst_page))) {
6001 				if (user_page_list) {
6002 					user_page_list[entry].phys_addr = 0;
6003 				}
6004 
6005 				goto try_next_page;
6006 			}
6007 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6008 
6009 			/*
6010 			 * grab this up front...
			 * a high percentage of the time we're going to
6012 			 * need the hardware modification state a bit later
6013 			 * anyway... so we can eliminate an extra call into
6014 			 * the pmap layer by grabbing it here and recording it
6015 			 */
6016 			if (dst_page->vmp_pmapped) {
6017 				refmod_state = pmap_get_refmod(phys_page);
6018 			} else {
6019 				refmod_state = 0;
6020 			}
6021 
6022 			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
6023 				/*
6024 				 * page is on inactive list and referenced...
6025 				 * reactivate it now... this gets it out of the
6026 				 * way of vm_pageout_scan which would have to
6027 				 * reactivate it upon tripping over it
6028 				 */
6029 				dwp->dw_mask |= DW_vm_page_activate;
6030 			}
6031 			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
6032 				/*
6033 				 * we're only asking for DIRTY pages to be returned
6034 				 */
6035 				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
6036 					/*
6037 					 * if we were the page stolen by vm_pageout_scan to be
6038 					 * cleaned (as opposed to a buddy being clustered in
6039 					 * or this request is not being driven by a PAGEOUT cluster
6040 					 * then we only need to check for the page being dirty or
6041 					 * precious to decide whether to return it
6042 					 */
6043 					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
6044 						goto check_busy;
6045 					}
6046 					goto dont_return;
6047 				}
6048 				/*
6049 				 * this is a request for a PAGEOUT cluster and this page
6050 				 * is merely along for the ride as a 'buddy'... not only
6051 				 * does it have to be dirty to be returned, but it also
6052 				 * can't have been referenced recently...
6053 				 */
6054 				if ((hibernate_cleaning_in_progress == TRUE ||
6055 				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
6056 				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
6057 				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
6058 					goto check_busy;
6059 				}
6060 dont_return:
6061 				/*
6062 				 * if we reach here, we're not to return
6063 				 * the page... go on to the next one
6064 				 */
6065 				if (dst_page->vmp_laundry == TRUE) {
6066 					/*
6067 					 * if we get here, the page is not 'cleaning' (filtered out above).
6068 					 * since it has been referenced, remove it from the laundry
6069 					 * so we don't pay the cost of an I/O to clean a page
6070 					 * we're just going to take back
6071 					 */
6072 					vm_page_lockspin_queues();
6073 
6074 					vm_pageout_steal_laundry(dst_page, TRUE);
6075 					vm_page_activate(dst_page);
6076 
6077 					vm_page_unlock_queues();
6078 				}
6079 				if (user_page_list) {
6080 					user_page_list[entry].phys_addr = 0;
6081 				}
6082 
6083 				goto try_next_page;
6084 			}
6085 check_busy:
6086 			if (dst_page->vmp_busy) {
6087 				if (cntrl_flags & UPL_NOBLOCK) {
6088 					if (user_page_list) {
6089 						user_page_list[entry].phys_addr = 0;
6090 					}
6091 					dwp->dw_mask = 0;
6092 
6093 					goto try_next_page;
6094 				}
6095 				/*
6096 				 * someone else is playing with the
6097 				 * page.  We will have to wait.
6098 				 */
6099 				PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6100 
6101 				continue;
6102 			}
6103 			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6104 				vm_page_lockspin_queues();
6105 
6106 				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6107 					/*
6108 					 * we've buddied up a page for a clustered pageout
6109 					 * that has already been moved to the pageout
6110 					 * queue by pageout_scan... we need to remove
6111 					 * it from the queue and drop the laundry count
6112 					 * on that queue
6113 					 */
6114 					vm_pageout_throttle_up(dst_page);
6115 				}
6116 				vm_page_unlock_queues();
6117 			}
6118 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6119 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6120 
6121 			if (phys_page > upl->highest_page) {
6122 				upl->highest_page = phys_page;
6123 			}
6124 
6125 			assert(!pmap_is_noencrypt(phys_page));
6126 
6127 			if (cntrl_flags & UPL_SET_LITE) {
6128 				unsigned int    pg_num;
6129 
6130 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6131 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6132 				bitmap_set(upl->lite_list, pg_num);
6133 
6134 				if (hw_dirty) {
6135 					if (pmap_flushes_delayed == FALSE) {
6136 						pmap_flush_context_init(&pmap_flush_context_storage);
6137 						pmap_flushes_delayed = TRUE;
6138 					}
6139 					pmap_clear_refmod_options(phys_page,
6140 					    VM_MEM_MODIFIED,
6141 					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6142 					    &pmap_flush_context_storage);
6143 				}
6144 
6145 				/*
6146 				 * Mark original page as cleaning
6147 				 * in place.
6148 				 */
6149 				dst_page->vmp_cleaning = TRUE;
6150 				dst_page->vmp_precious = FALSE;
6151 			} else {
6152 				/*
6153 				 * use pageclean setup, it is more
6154 				 * convenient even for the pageout
6155 				 * cases here
6156 				 */
6157 				vm_object_lock(upl->map_object);
6158 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6159 				vm_object_unlock(upl->map_object);
6160 
6161 				alias_page->vmp_absent = FALSE;
6162 				alias_page = NULL;
6163 			}
6164 			if (dirty) {
6165 				SET_PAGE_DIRTY(dst_page, FALSE);
6166 			} else {
6167 				dst_page->vmp_dirty = FALSE;
6168 			}
6169 
6170 			if (!dirty) {
6171 				dst_page->vmp_precious = TRUE;
6172 			}
6173 
6174 			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6175 				if (!VM_PAGE_WIRED(dst_page)) {
6176 					dst_page->vmp_free_when_done = TRUE;
6177 				}
6178 			}
6179 		} else {
6180 			if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
6181 				/*
6182 				 * Honor copy-on-write obligations
6183 				 *
6184 				 * The copy object has changed since we
6185 				 * last synchronized for copy-on-write.
6186 				 * Another copy object might have been
6187 				 * inserted while we released the object's
6188 				 * lock.  Since someone could have seen the
6189 				 * original contents of the remaining pages
6190 				 * through that new object, we have to
6191 				 * synchronize with it again for the remaining
6192 				 * pages only.  The previous pages are "busy"
6193 				 * so they can not be seen through the new
6194 				 * mapping.  The new mapping will see our
6195 				 * upcoming changes for those previous pages,
6196 				 * but that's OK since they couldn't see what
6197 				 * was there before.  It's just a race anyway
6198 				 * and there's no guarantee of consistency or
6199 				 * atomicity.  We just don't want new mappings
6200 				 * to see both the *before* and *after* pages.
6201 				 */
6202 				if (object->copy != VM_OBJECT_NULL) {
6203 					vm_object_update(
6204 						object,
6205 						dst_offset,/* current offset */
6206 						xfer_size, /* remaining size */
6207 						NULL,
6208 						NULL,
6209 						FALSE,     /* should_return */
6210 						MEMORY_OBJECT_COPY_SYNC,
6211 						VM_PROT_NO_CHANGE);
6212 
6213 					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6214 					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6215 				}
6216 				/*
6217 				 * remember the copy object we synced with
6218 				 */
6219 				last_copy_object = object->copy;
6220 			}
6221 			dst_page = vm_page_lookup(object, dst_offset);
6222 
6223 			if (dst_page != VM_PAGE_NULL) {
6224 				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6225 					/*
6226 					 * skip over pages already present in the cache
6227 					 */
6228 					if (user_page_list) {
6229 						user_page_list[entry].phys_addr = 0;
6230 					}
6231 
6232 					goto try_next_page;
6233 				}
6234 				if (dst_page->vmp_fictitious) {
6235 					panic("need corner case for fictitious page");
6236 				}
6237 
6238 				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6239 					/*
6240 					 * someone else is playing with the
6241 					 * page.  We will have to wait.
6242 					 */
6243 					PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6244 
6245 					continue;
6246 				}
6247 				if (dst_page->vmp_laundry) {
6248 					vm_pageout_steal_laundry(dst_page, FALSE);
6249 				}
6250 			} else {
6251 				if (object->private) {
6252 					/*
6253 					 * This is a nasty wrinkle for users
6254 					 * of upl who encounter device or
6255 					 * private memory however, it is
6256 					 * unavoidable, only a fault can
6257 					 * resolve the actual backing
6258 					 * physical page by asking the
6259 					 * backing device.
6260 					 */
6261 					if (user_page_list) {
6262 						user_page_list[entry].phys_addr = 0;
6263 					}
6264 
6265 					goto try_next_page;
6266 				}
6267 				if (object->scan_collisions) {
6268 					/*
6269 					 * the pageout_scan thread is trying to steal
6270 					 * pages from this object, but has run into our
6271 					 * lock... grab 2 pages from the head of the object...
6272 					 * the first is freed on behalf of pageout_scan, the
6273 					 * 2nd is for our own use... we use vm_object_page_grab
6274 					 * in both cases to avoid taking pages from the free
6275 					 * list since we are under memory pressure and our
6276 					 * lock on this object is getting in the way of
6277 					 * relieving it
6278 					 */
6279 					dst_page = vm_object_page_grab(object);
6280 
6281 					if (dst_page != VM_PAGE_NULL) {
6282 						vm_page_release(dst_page,
6283 						    FALSE);
6284 					}
6285 
6286 					dst_page = vm_object_page_grab(object);
6287 				}
6288 				if (dst_page == VM_PAGE_NULL) {
6289 					/*
6290 					 * need to allocate a page
6291 					 */
6292 					dst_page = vm_page_grab_options(grab_options);
6293 					if (dst_page != VM_PAGE_NULL) {
6294 						page_grab_count++;
6295 					}
6296 				}
6297 				if (dst_page == VM_PAGE_NULL) {
6298 					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6299 						/*
6300 						 * we don't want to stall waiting for pages to come onto the free list
6301 						 * while we're already holding absent pages in this UPL
6302 						 * the caller will deal with the empty slots
6303 						 */
6304 						if (user_page_list) {
6305 							user_page_list[entry].phys_addr = 0;
6306 						}
6307 
6308 						goto try_next_page;
6309 					}
6310 					/*
6311 					 * no pages available... wait
6312 					 * then try again for the same
6313 					 * offset...
6314 					 */
6315 					vm_object_unlock(object);
6316 
6317 					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6318 
6319 					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6320 
6321 					VM_PAGE_WAIT();
6322 					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6323 
6324 					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6325 
6326 					vm_object_lock(object);
6327 
6328 					continue;
6329 				}
6330 				vm_page_insert(dst_page, object, dst_offset);
6331 
6332 				dst_page->vmp_absent = TRUE;
6333 				dst_page->vmp_busy = FALSE;
6334 
6335 				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6336 					/*
6337 					 * if UPL_RET_ONLY_ABSENT was specified,
6338 					 * than we're definitely setting up a
6339 					 * upl for a clustered read/pagein
6340 					 * operation... mark the pages as clustered
6341 					 * so upl_commit_range can put them on the
6342 					 * speculative list
6343 					 */
6344 					dst_page->vmp_clustered = TRUE;
6345 
6346 					if (!(cntrl_flags & UPL_FILE_IO)) {
6347 						counter_inc(&vm_statistics_pageins);
6348 					}
6349 				}
6350 			}
6351 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6352 
6353 			dst_page->vmp_overwriting = TRUE;
6354 
6355 			if (dst_page->vmp_pmapped) {
6356 				if (!(cntrl_flags & UPL_FILE_IO)) {
6357 					/*
6358 					 * eliminate all mappings from the
6359 					 * original object and its prodigy
6360 					 */
6361 					refmod_state = pmap_disconnect(phys_page);
6362 				} else {
6363 					refmod_state = pmap_get_refmod(phys_page);
6364 				}
6365 			} else {
6366 				refmod_state = 0;
6367 			}
6368 
6369 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6370 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6371 
6372 			if (cntrl_flags & UPL_SET_LITE) {
6373 				unsigned int    pg_num;
6374 
6375 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6376 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6377 				bitmap_set(upl->lite_list, pg_num);
6378 
6379 				if (hw_dirty) {
6380 					pmap_clear_modify(phys_page);
6381 				}
6382 
6383 				/*
6384 				 * Mark original page as cleaning
6385 				 * in place.
6386 				 */
6387 				dst_page->vmp_cleaning = TRUE;
6388 				dst_page->vmp_precious = FALSE;
6389 			} else {
6390 				/*
6391 				 * use pageclean setup, it is more
6392 				 * convenient even for the pageout
6393 				 * cases here
6394 				 */
6395 				vm_object_lock(upl->map_object);
6396 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6397 				vm_object_unlock(upl->map_object);
6398 
6399 				alias_page->vmp_absent = FALSE;
6400 				alias_page = NULL;
6401 			}
6402 
6403 			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6404 				upl->flags &= ~UPL_CLEAR_DIRTY;
6405 				upl->flags |= UPL_SET_DIRTY;
6406 				dirty = TRUE;
6407 				/*
6408 				 * Page belonging to a code-signed object is about to
6409 				 * be written. Mark it tainted and disconnect it from
6410 				 * all pmaps so processes have to fault it back in and
6411 				 * deal with the tainted bit.
6412 				 */
6413 				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6414 					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6415 					vm_page_upl_tainted++;
6416 					if (dst_page->vmp_pmapped) {
6417 						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6418 						if (refmod_state & VM_MEM_REFERENCED) {
6419 							dst_page->vmp_reference = TRUE;
6420 						}
6421 					}
6422 				}
6423 			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6424 				/*
6425 				 * clean in place for read implies
6426 				 * that a write will be done on all
6427 				 * the pages that are dirty before
6428 				 * a upl commit is done.  The caller
6429 				 * is obligated to preserve the
6430 				 * contents of all pages marked dirty
6431 				 */
6432 				upl->flags |= UPL_CLEAR_DIRTY;
6433 			}
6434 			dst_page->vmp_dirty = dirty;
6435 
6436 			if (!dirty) {
6437 				dst_page->vmp_precious = TRUE;
6438 			}
6439 
6440 			if (!VM_PAGE_WIRED(dst_page)) {
6441 				/*
6442 				 * deny access to the target page while
6443 				 * it is being worked on
6444 				 */
6445 				dst_page->vmp_busy = TRUE;
6446 			} else {
6447 				dwp->dw_mask |= DW_vm_page_wire;
6448 			}
6449 
6450 			/*
6451 			 * We might be about to satisfy a fault which has been
6452 			 * requested. So no need for the "restart" bit.
6453 			 */
6454 			dst_page->vmp_restart = FALSE;
6455 			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6456 				/*
6457 				 * expect the page to be used
6458 				 */
6459 				dwp->dw_mask |= DW_set_reference;
6460 			}
6461 			if (cntrl_flags & UPL_PRECIOUS) {
6462 				if (object->internal) {
6463 					SET_PAGE_DIRTY(dst_page, FALSE);
6464 					dst_page->vmp_precious = FALSE;
6465 				} else {
6466 					dst_page->vmp_precious = TRUE;
6467 				}
6468 			} else {
6469 				dst_page->vmp_precious = FALSE;
6470 			}
6471 		}
6472 		if (dst_page->vmp_busy) {
6473 			upl->flags |= UPL_HAS_BUSY;
6474 		}
6475 
6476 		if (phys_page > upl->highest_page) {
6477 			upl->highest_page = phys_page;
6478 		}
6479 		assert(!pmap_is_noencrypt(phys_page));
6480 		if (user_page_list) {
6481 			user_page_list[entry].phys_addr = phys_page;
6482 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
6483 			user_page_list[entry].absent    = dst_page->vmp_absent;
6484 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
6485 			user_page_list[entry].precious  = dst_page->vmp_precious;
6486 			user_page_list[entry].device    = FALSE;
6487 			user_page_list[entry].needed    = FALSE;
6488 			if (dst_page->vmp_clustered == TRUE) {
6489 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6490 			} else {
6491 				user_page_list[entry].speculative = FALSE;
6492 			}
6493 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6494 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6495 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6496 			user_page_list[entry].mark      = FALSE;
6497 		}
6498 		/*
6499 		 * if UPL_RET_ONLY_ABSENT is set, then
6500 		 * we are working with a fresh page and we've
6501 		 * just set the clustered flag on it to
6502 		 * indicate that it was drug in as part of a
6503 		 * speculative cluster... so leave it alone
6504 		 */
6505 		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6506 			/*
6507 			 * someone is explicitly grabbing this page...
6508 			 * update clustered and speculative state
6509 			 *
6510 			 */
6511 			if (dst_page->vmp_clustered) {
6512 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
6513 			}
6514 		}
6515 try_next_page:
6516 		if (dwp->dw_mask) {
6517 			if (dwp->dw_mask & DW_vm_page_activate) {
6518 				counter_inc(&vm_statistics_reactivations);
6519 			}
6520 
6521 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6522 
6523 			if (dw_count >= dw_limit) {
6524 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6525 
6526 				dwp = dwp_start;
6527 				dw_count = 0;
6528 			}
6529 		}
6530 		entry++;
6531 		dst_offset += PAGE_SIZE_64;
6532 		xfer_size -= PAGE_SIZE;
6533 	}
6534 	if (dw_count) {
6535 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6536 		dwp = dwp_start;
6537 		dw_count = 0;
6538 	}
6539 
6540 	if (alias_page != NULL) {
6541 		VM_PAGE_FREE(alias_page);
6542 	}
6543 	if (pmap_flushes_delayed == TRUE) {
6544 		pmap_flush(&pmap_flush_context_storage);
6545 	}
6546 
6547 	if (page_list_count != NULL) {
6548 		if (upl->flags & UPL_INTERNAL) {
6549 			*page_list_count = 0;
6550 		} else if (*page_list_count > entry) {
6551 			*page_list_count = entry;
6552 		}
6553 	}
6554 #if UPL_DEBUG
6555 	upl->upl_state = 1;
6556 #endif
6557 	vm_object_unlock(object);
6558 
6559 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6560 #if DEVELOPMENT || DEBUG
6561 	if (task != NULL) {
6562 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
6563 	}
6564 #endif /* DEVELOPMENT || DEBUG */
6565 
6566 	if (dwp_start && dwp_finish_ctx) {
6567 		vm_page_delayed_work_finish_ctx(dwp_start);
6568 		dwp_start = dwp = NULL;
6569 	}
6570 
6571 	return KERN_SUCCESS;
6572 }
6573 
6574 /*
6575  *	Routine:	vm_object_super_upl_request
6576  *	Purpose:
6577  *		Cause the population of a portion of a vm_object
6578  *		in much the same way as memory_object_upl_request.
6579  *		Depending on the nature of the request, the pages
6580  *		returned may be contain valid data or be uninitialized.
6581  *		However, the region may be expanded up to the super
6582  *		cluster size provided.
6583  */
6584 
6585 __private_extern__ kern_return_t
vm_object_super_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_size_t super_cluster,upl_t * upl,upl_page_info_t * user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)6586 vm_object_super_upl_request(
6587 	vm_object_t object,
6588 	vm_object_offset_t      offset,
6589 	upl_size_t              size,
6590 	upl_size_t              super_cluster,
6591 	upl_t                   *upl,
6592 	upl_page_info_t         *user_page_list,
6593 	unsigned int            *page_list_count,
6594 	upl_control_flags_t     cntrl_flags,
6595 	vm_tag_t                tag)
6596 {
6597 	if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
6598 		return KERN_FAILURE;
6599 	}
6600 
6601 	assert(object->paging_in_progress);
6602 	offset = offset - object->paging_offset;
6603 
6604 	if (super_cluster > size) {
6605 		vm_object_offset_t      base_offset;
6606 		upl_size_t              super_size;
6607 		vm_object_size_t        super_size_64;
6608 
6609 		base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6610 		super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
6611 		super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6612 		super_size = (upl_size_t) super_size_64;
6613 		assert(super_size == super_size_64);
6614 
6615 		if (offset > (base_offset + super_size)) {
6616 			panic("vm_object_super_upl_request: Missed target pageout"
6617 			    " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6618 			    offset, base_offset, super_size, super_cluster,
6619 			    size, object->paging_offset);
6620 		}
6621 		/*
6622 		 * apparently there is a case where the vm requests a
6623 		 * page to be written out who's offset is beyond the
6624 		 * object size
6625 		 */
6626 		if ((offset + size) > (base_offset + super_size)) {
6627 			super_size_64 = (offset + size) - base_offset;
6628 			super_size = (upl_size_t) super_size_64;
6629 			assert(super_size == super_size_64);
6630 		}
6631 
6632 		offset = base_offset;
6633 		size = super_size;
6634 	}
6635 	return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6636 }
6637 
/*
 * Number of times vm_map_create_upl() reached the point of building a UPL
 * over a map entry that has VM_PROT_EXECUTE set.
 */
6638 int cs_executable_create_upl = 0;
/*
 * Used by vm_map_create_upl() to identify the calling process in the
 * diagnostic it prints under MACH_ASSERT.
 */
6639 extern int proc_selfpid(void);
6640 extern char *proc_name_address(void *p);
6641 
/*
 *	Routine:	vm_map_create_upl
 *	Purpose:
 *		Find the map entry containing "offset" in "map" and create
 *		a UPL covering (at most) *upl_size bytes of the VM object
 *		backing that entry, by way of vm_object_iopl_request().
 *		Submap entries are followed until a regular entry is found.
 *
 *	Parameters:
 *		map		map in which "offset" is looked up
 *		offset		address, in "map", of the start of the range
 *		upl_size	in: requested size in bytes
 *				out: size actually covered; it can be
 *				reduced so as not to extend past the end
 *				of the map entry, capped at
 *				MAX_UPL_SIZE_BYTES (unless the object is
 *				physically contiguous) and adjusted when
 *				the map uses pages smaller than PAGE_SIZE
 *		upl		where the new UPL is returned; must not
 *				be NULL
 *		page_list	passed through to vm_object_iopl_request()
 *		count		passed through to vm_object_iopl_request()
 *		flags		in: the caller's UPL control flags
 *				out: UPL_DEV_MEMORY and/or UPL_PHYS_CONTIG
 *				describing the backing object, or 0
 *		tag		passed through to vm_object_iopl_request()
 *
 *	Returns:
 *		KERN_INVALID_VALUE	*flags has bits outside UPL_VALID_FLAGS
 *		KERN_INVALID_ARGUMENT	"upl" is NULL
 *		KERN_FAILURE		no map entry contains "offset", or the
 *					copy-on-write lookup failed
 *		KERN_PROTECTION_FAILURE	UPL_COPYOUT_FROM is not set and the
 *					entry is not writable
 *		KERN_SUCCESS		UPL_QUERY_OBJECT_TYPE was requested
 *					(no UPL is created in that case)
 *		Otherwise the result of vm_object_iopl_request() or, on the
 *		!XNU_TARGET_OS_OSX executable-mapping path, the result of the
 *		kernel-buffer allocation, copy-in or nested call.
 *
 *	Locking:
 *		The map is read-locked for the lookup and briefly upgraded
 *		to a write lock where the entry has to be modified.  Each
 *		time the map lock is given up before the UPL can be built,
 *		the code restarts from REDISCOVER_ENTRY and looks the
 *		entry up again.
 */
6642 kern_return_t
vm_map_create_upl(vm_map_t map,vm_map_address_t offset,upl_size_t * upl_size,upl_t * upl,upl_page_info_array_t page_list,unsigned int * count,upl_control_flags_t * flags,vm_tag_t tag)6643 vm_map_create_upl(
6644 	vm_map_t                map,
6645 	vm_map_address_t        offset,
6646 	upl_size_t              *upl_size,
6647 	upl_t                   *upl,
6648 	upl_page_info_array_t   page_list,
6649 	unsigned int            *count,
6650 	upl_control_flags_t     *flags,
6651 	vm_tag_t                tag)
6652 {
6653 	vm_map_entry_t          entry;
6654 	upl_control_flags_t     caller_flags;
6655 	int                     force_data_sync;
6656 	int                     sync_cow_data;
6657 	vm_object_t             local_object;
6658 	vm_map_offset_t         local_offset;
6659 	vm_map_offset_t         local_start;
6660 	kern_return_t           ret;
6661 	vm_map_address_t        original_offset;
6662 	vm_map_size_t           original_size, adjusted_size;
6663 	vm_map_offset_t         local_entry_start;
6664 	vm_object_offset_t      local_entry_offset;
6665 	vm_object_offset_t      offset_in_mapped_page;
	/* TRUE once "map" is a submap on which we hold our own reference */
6666 	boolean_t               release_map = FALSE;
6667 
	/*
	 * Entry point for each map level: reached on the initial call and
	 * again, with "map" and "offset" replaced, after descending into
	 * a submap.
	 */
6668 start_with_map:
6669 
6670 	original_offset = offset;
6671 	original_size = *upl_size;
6672 	adjusted_size = original_size;
6673 
6674 	caller_flags = *flags;
6675 
6676 	if (caller_flags & ~UPL_VALID_FLAGS) {
6677 		/*
6678 		 * For forward compatibility's sake,
6679 		 * reject any unknown flag.
6680 		 */
6681 		ret = KERN_INVALID_VALUE;
6682 		goto done;
6683 	}
	/*
	 * Both of these are one-shot: the corresponding sync is performed
	 * at most once and the flag cleared before restarting the lookup.
	 * sync_cow_data is set when UPL_COPYOUT_FROM is NOT requested.
	 */
6684 	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6685 	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6686 
6687 	if (upl == NULL) {
6688 		ret = KERN_INVALID_ARGUMENT;
6689 		goto done;
6690 	}
6691 
	/*
	 * Restart point used whenever the map lock was given up (or the
	 * entry may have changed): take the read lock and look the entry
	 * up from scratch.
	 */
6692 REDISCOVER_ENTRY:
6693 	vm_map_lock_read(map);
6694 
6695 	if (!vm_map_lookup_entry(map, offset, &entry)) {
6696 		vm_map_unlock_read(map);
6697 		ret = KERN_FAILURE;
6698 		goto done;
6699 	}
6700 
	/*
	 * Snapshot of the entry's start and object offset; the
	 * !XNU_TARGET_OS_OSX path below uses them after it has
	 * unlocked the map.
	 */
6701 	local_entry_start = entry->vme_start;
6702 	local_entry_offset = VME_OFFSET(entry);
6703 
6704 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6705 		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6706 	}
6707 
	/* do not let the UPL extend past the end of this map entry */
6708 	if (entry->vme_end - original_offset < adjusted_size) {
6709 		adjusted_size = entry->vme_end - original_offset;
6710 		assert(adjusted_size > 0);
6711 		*upl_size = (upl_size_t) adjusted_size;
6712 		assert(*upl_size == adjusted_size);
6713 	}
6714 
	/*
	 * Query only: describe the backing object through *flags and
	 * return without creating a UPL.
	 */
6715 	if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6716 		*flags = 0;
6717 
6718 		if (!entry->is_sub_map &&
6719 		    VME_OBJECT(entry) != VM_OBJECT_NULL) {
6720 			if (VME_OBJECT(entry)->private) {
6721 				*flags = UPL_DEV_MEMORY;
6722 			}
6723 
6724 			if (VME_OBJECT(entry)->phys_contiguous) {
6725 				*flags |= UPL_PHYS_CONTIG;
6726 			}
6727 		}
6728 		vm_map_unlock_read(map);
6729 		ret = KERN_SUCCESS;
6730 		goto done;
6731 	}
6732 
	/*
	 * If the map uses pages smaller than PAGE_SIZE, widen the range to
	 * map-page boundaries and remember how far into its map page the
	 * caller's offset was; that amount is added back to "offset" (and
	 * taken off *upl_size) before the range is handed on.
	 */
6733 	offset_in_mapped_page = 0;
6734 	if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6735 		offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6736 		*upl_size = (upl_size_t)
6737 		    (vm_map_round_page(original_offset + adjusted_size,
6738 		    VM_MAP_PAGE_MASK(map))
6739 		    - offset);
6740 
6741 		offset_in_mapped_page = original_offset - offset;
6742 		assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6743 
6744 		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6745 	}
6746 
6747 	if (!entry->is_sub_map) {
		/*
		 * Cap the size at MAX_UPL_SIZE_BYTES, except for
		 * physically contiguous objects.
		 */
6748 		if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6749 		    !VME_OBJECT(entry)->phys_contiguous) {
6750 			if (*upl_size > MAX_UPL_SIZE_BYTES) {
6751 				*upl_size = MAX_UPL_SIZE_BYTES;
6752 			}
6753 		}
6754 
6755 		/*
6756 		 *      Create an object if necessary.
6757 		 */
6758 		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			/*
			 * Modifying the entry requires the write lock.
			 * A non-zero return means the upgrade failed;
			 * presumably the read lock is no longer held in
			 * that case (REDISCOVER_ENTRY takes it again).
			 */
6759 			if (vm_map_lock_read_to_write(map)) {
6760 				goto REDISCOVER_ENTRY;
6761 			}
6762 
6763 			VME_OBJECT_SET(entry,
6764 			    vm_object_allocate((vm_size_t)
6765 			    vm_object_round_page((entry->vme_end - entry->vme_start))),
6766 			    false, 0);
6767 			VME_OFFSET_SET(entry, 0);
6768 			assert(entry->use_pmap);
6769 
6770 			vm_map_lock_write_to_read(map);
6771 		}
6772 
		/*
		 * Without UPL_COPYOUT_FROM the entry must be writable.
		 */
6773 		if (!(caller_flags & UPL_COPYOUT_FROM) &&
6774 		    !(entry->protection & VM_PROT_WRITE)) {
6775 			vm_map_unlock_read(map);
6776 			ret = KERN_PROTECTION_FAILURE;
6777 			goto done;
6778 		}
6779 	}
6780 
6781 #if !XNU_TARGET_OS_OSX
6782 	if (map->pmap != kernel_pmap &&
6783 	    (caller_flags & UPL_COPYOUT_FROM) &&
6784 	    (entry->protection & VM_PROT_EXECUTE) &&
6785 	    !(entry->protection & VM_PROT_WRITE)) {
6786 		vm_offset_t     kaddr;
6787 		vm_size_t       ksize;
6788 
6789 		/*
6790 		 * We're about to create a read-only UPL backed by
6791 		 * memory from an executable mapping.
6792 		 * Wiring the pages would result in the pages being copied
6793 		 * (due to the "MAP_PRIVATE" mapping) and no longer
6794 		 * code-signed, so no longer eligible for execution.
6795 		 * Instead, let's copy the data into a kernel buffer and
6796 		 * create the UPL from this kernel buffer.
6797 		 * The kernel buffer is then freed, leaving the UPL holding
6798 		 * the last reference on the VM object, so the memory will
6799 		 * be released when the UPL is committed.
6800 		 */
6801 
		/* "entry" must not be used once the map is unlocked */
6802 		vm_map_unlock_read(map);
6803 		entry = VM_MAP_ENTRY_NULL;
6804 		/* allocate kernel buffer */
6805 		ksize = round_page(*upl_size);
6806 		kaddr = 0;
6807 		ret = kmem_alloc(kernel_map, &kaddr, ksize,
6808 		    KMA_PAGEABLE | KMA_DATA, tag);
6809 		if (ret == KERN_SUCCESS) {
6810 			/* copyin the user data */
6811 			ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6812 		}
6813 		if (ret == KERN_SUCCESS) {
6814 			if (ksize > *upl_size) {
6815 				/* zero out the extra space in kernel buffer */
6816 				memset((void *)(kaddr + *upl_size),
6817 				    0,
6818 				    ksize - *upl_size);
6819 			}
6820 			/* create the UPL from the kernel buffer */
6821 			vm_object_offset_t      offset_in_object;
6822 			vm_object_offset_t      offset_in_object_page;
6823 
			/*
			 * Start the nested UPL at the same offset within
			 * a page that the original range had, and shrink
			 * the size by the same amount.
			 */
6824 			offset_in_object = offset - local_entry_start + local_entry_offset;
6825 			offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6826 			assert(offset_in_object_page < PAGE_SIZE);
6827 			assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6828 			*upl_size -= offset_in_object_page + offset_in_mapped_page;
6829 			ret = vm_map_create_upl(kernel_map,
6830 			    (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6831 			    upl_size, upl, page_list, count, flags, tag);
6832 		}
6833 		if (kaddr != 0) {
6834 			/* free the kernel buffer */
6835 			kmem_free(kernel_map, kaddr, ksize);
6836 			kaddr = 0;
6837 			ksize = 0;
6838 		}
6839 #if DEVELOPMENT || DEBUG
6840 		DTRACE_VM4(create_upl_from_executable,
6841 		    vm_map_t, map,
6842 		    vm_map_address_t, offset,
6843 		    upl_size_t, *upl_size,
6844 		    kern_return_t, ret);
6845 #endif /* DEVELOPMENT || DEBUG */
6846 		goto done;
6847 	}
6848 #endif /* !XNU_TARGET_OS_OSX */
6849 
6850 	if (!entry->is_sub_map) {
6851 		local_object = VME_OBJECT(entry);
6852 		assert(local_object != VM_OBJECT_NULL);
6853 	}
6854 
	/*
	 * "local_object" is only set for non-submap entries; the leading
	 * !entry->is_sub_map test keeps the rest of this condition from
	 * reading it otherwise.
	 */
6855 	if (!entry->is_sub_map &&
6856 	    !entry->needs_copy &&
6857 	    *upl_size != 0 &&
6858 	    local_object->vo_size > *upl_size && /* partial UPL */
6859 	    entry->wired_count == 0 && /* No COW for entries that are wired */
6860 	    (map->pmap != kernel_pmap) && /* alias checks */
6861 	    (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6862 	    ||
6863 	    ( /* case 2 */
6864 		    local_object->internal &&
6865 		    (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6866 		    local_object->ref_count > 1))) {
6867 		vm_prot_t       prot;
6868 
6869 		/*
6870 		 * Case 1:
6871 		 * Set up the targeted range for copy-on-write to avoid
6872 		 * applying true_share/copy_delay to the entire object.
6873 		 *
6874 		 * Case 2:
6875 		 * This map entry covers only part of an internal
6876 		 * object.  There could be other map entries covering
6877 		 * other areas of this object and some of these map
6878 		 * entries could be marked as "needs_copy", which
6879 		 * assumes that the object is COPY_SYMMETRIC.
6880 		 * To avoid marking this object as COPY_DELAY and
6881 		 * "true_share", let's shadow it and mark the new
6882 		 * (smaller) object as "true_share" and COPY_DELAY.
6883 		 */
6884 
6885 		if (vm_map_lock_read_to_write(map)) {
6886 			goto REDISCOVER_ENTRY;
6887 		}
6888 		vm_map_lock_assert_exclusive(map);
6889 		assert(VME_OBJECT(entry) == local_object);
6890 
		/*
		 * Clip the entry to the (map-page aligned) UPL range, then
		 * re-clamp the size against the clipped entry's end.
		 */
6891 		vm_map_clip_start(map,
6892 		    entry,
6893 		    vm_map_trunc_page(offset,
6894 		    VM_MAP_PAGE_MASK(map)));
6895 		vm_map_clip_end(map,
6896 		    entry,
6897 		    vm_map_round_page(offset + *upl_size,
6898 		    VM_MAP_PAGE_MASK(map)));
6899 		if ((entry->vme_end - offset) < *upl_size) {
6900 			*upl_size = (upl_size_t) (entry->vme_end - offset);
6901 			assert(*upl_size == entry->vme_end - offset);
6902 		}
6903 
		/* apply the entry's protection minus write access to the range */
6904 		prot = entry->protection & ~VM_PROT_WRITE;
6905 		if (override_nx(map, VME_ALIAS(entry)) && prot) {
6906 			prot |= VM_PROT_EXECUTE;
6907 		}
6908 		vm_object_pmap_protect(local_object,
6909 		    VME_OFFSET(entry),
6910 		    entry->vme_end - entry->vme_start,
6911 		    ((entry->is_shared ||
6912 		    map->mapped_in_other_pmaps)
6913 		    ? PMAP_NULL
6914 		    : map->pmap),
6915 		    VM_MAP_PAGE_SIZE(map),
6916 		    entry->vme_start,
6917 		    prot);
6918 
6919 		assert(entry->wired_count == 0);
6920 
6921 		/*
6922 		 * Lock the VM object and re-check its status: if it's mapped
6923 		 * in another address space, we could still be racing with
6924 		 * another thread holding that other VM map exclusively.
6925 		 */
6926 		vm_object_lock(local_object);
6927 		if (local_object->true_share) {
6928 			/* object is already in proper state: no COW needed */
6929 			assert(local_object->copy_strategy !=
6930 			    MEMORY_OBJECT_COPY_SYMMETRIC);
6931 		} else {
6932 			/* not true_share: ask for copy-on-write below */
6933 			assert(local_object->copy_strategy ==
6934 			    MEMORY_OBJECT_COPY_SYMMETRIC);
6935 			entry->needs_copy = TRUE;
6936 		}
6937 		vm_object_unlock(local_object);
6938 
6939 		vm_map_lock_write_to_read(map);
6940 	}
6941 
6942 	if (entry->needs_copy) {
6943 		/*
6944 		 * Honor copy-on-write for COPY_SYMMETRIC
6945 		 * strategy.
6946 		 */
6947 		vm_map_t                local_map;
6948 		vm_object_t             object;
6949 		vm_object_offset_t      new_offset;
6950 		vm_prot_t               prot;
6951 		boolean_t               wired;
6952 		vm_map_version_t        version;
6953 		vm_map_t                real_map;
6954 		vm_prot_t               fault_type;
6955 
6956 		local_map = map;
6957 
6958 		if (caller_flags & UPL_COPYOUT_FROM) {
6959 			fault_type = VM_PROT_READ | VM_PROT_COPY;
6960 			vm_counters.create_upl_extra_cow++;
6961 			vm_counters.create_upl_extra_cow_pages +=
6962 			    (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6963 		} else {
6964 			fault_type = VM_PROT_WRITE;
6965 		}
6966 		if (vm_map_lookup_and_lock_object(&local_map,
6967 		    offset, fault_type,
6968 		    OBJECT_LOCK_EXCLUSIVE,
6969 		    &version, &object,
6970 		    &new_offset, &prot, &wired,
6971 		    NULL,
6972 		    &real_map, NULL) != KERN_SUCCESS) {
6973 			if (fault_type == VM_PROT_WRITE) {
6974 				vm_counters.create_upl_lookup_failure_write++;
6975 			} else {
6976 				vm_counters.create_upl_lookup_failure_copy++;
6977 			}
6978 			vm_map_unlock_read(local_map);
6979 			ret = KERN_FAILURE;
6980 			goto done;
6981 		}
		/*
		 * None of the lookup's results are used here: release
		 * the map(s) and the object it left locked, then look
		 * the entry up again.
		 */
6982 		if (real_map != local_map) {
6983 			vm_map_unlock(real_map);
6984 		}
6985 		vm_map_unlock_read(local_map);
6986 
6987 		vm_object_unlock(object);
6988 
6989 		goto REDISCOVER_ENTRY;
6990 	}
6991 
6992 	if (entry->is_sub_map) {
6993 		vm_map_t        submap;
6994 
6995 		submap = VME_SUBMAP(entry);
6996 		local_start = entry->vme_start;
6997 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6998 
		/*
		 * Take our own reference on the submap before unlocking
		 * the parent; it is dropped at "done" or when descending
		 * one more level (see release_map).
		 */
6999 		vm_map_reference(submap);
7000 		vm_map_unlock_read(map);
7001 
7002 		DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
7003 		offset += offset_in_mapped_page;
7004 		*upl_size -= offset_in_mapped_page;
7005 
7006 		if (release_map) {
7007 			vm_map_deallocate(map);
7008 		}
7009 		map = submap;
7010 		release_map = TRUE;
		/* translate the parent-map address into the submap */
7011 		offset = local_offset + (offset - local_start);
7012 		goto start_with_map;
7013 	}
7014 
	/*
	 * One-shot sync for UPLs created without UPL_COPYOUT_FROM, done
	 * when the object has a shadow or a copy object.  The map is
	 * unlocked for the duration, hence the object reference and
	 * the restart afterwards.
	 */
7015 	if (sync_cow_data &&
7016 	    (VME_OBJECT(entry)->shadow ||
7017 	    VME_OBJECT(entry)->copy)) {
7018 		local_object = VME_OBJECT(entry);
7019 		local_start = entry->vme_start;
7020 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7021 
7022 		vm_object_reference(local_object);
7023 		vm_map_unlock_read(map);
7024 
7025 		if (local_object->shadow && local_object->copy) {
7026 			vm_object_lock_request(local_object->shadow,
7027 			    ((vm_object_offset_t)
7028 			    ((offset - local_start) +
7029 			    local_offset) +
7030 			    local_object->vo_shadow_offset),
7031 			    *upl_size, FALSE,
7032 			    MEMORY_OBJECT_DATA_SYNC,
7033 			    VM_PROT_NO_CHANGE);
7034 		}
7035 		sync_cow_data = FALSE;
7036 		vm_object_deallocate(local_object);
7037 
7038 		goto REDISCOVER_ENTRY;
7039 	}
	/*
	 * One-shot data sync requested by the caller with
	 * UPL_FORCE_DATA_SYNC; same unlock / restart pattern as above.
	 */
7040 	if (force_data_sync) {
7041 		local_object = VME_OBJECT(entry);
7042 		local_start = entry->vme_start;
7043 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7044 
7045 		vm_object_reference(local_object);
7046 		vm_map_unlock_read(map);
7047 
7048 		vm_object_lock_request(local_object,
7049 		    ((vm_object_offset_t)
7050 		    ((offset - local_start) +
7051 		    local_offset)),
7052 		    (vm_object_size_t)*upl_size,
7053 		    FALSE,
7054 		    MEMORY_OBJECT_DATA_SYNC,
7055 		    VM_PROT_NO_CHANGE);
7056 
7057 		force_data_sync = FALSE;
7058 		vm_object_deallocate(local_object);
7059 
7060 		goto REDISCOVER_ENTRY;
7061 	}
	/* tell the caller, through *flags, what kind of object backs the UPL */
7062 	if (VME_OBJECT(entry)->private) {
7063 		*flags = UPL_DEV_MEMORY;
7064 	} else {
7065 		*flags = 0;
7066 	}
7067 
7068 	if (VME_OBJECT(entry)->phys_contiguous) {
7069 		*flags |= UPL_PHYS_CONTIG;
7070 	}
7071 
7072 	local_object = VME_OBJECT(entry);
7073 	local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7074 	local_start = entry->vme_start;
7075 
7076 	/*
7077 	 * Wiring will copy the pages to the shadow object.
7078 	 * The shadow object will not be code-signed so
7079 	 * attempting to execute code from these copied pages
7080 	 * would trigger a code-signing violation.
7081 	 */
7082 	if (entry->protection & VM_PROT_EXECUTE) {
7083 #if MACH_ASSERT
7084 		printf("pid %d[%s] create_upl out of executable range from "
7085 		    "0x%llx to 0x%llx: side effects may include "
7086 		    "code-signing violations later on\n",
7087 		    proc_selfpid(),
7088 		    (get_bsdtask_info(current_task())
7089 		    ? proc_name_address(get_bsdtask_info(current_task()))
7090 		    : "?"),
7091 		    (uint64_t) entry->vme_start,
7092 		    (uint64_t) entry->vme_end);
7093 #endif /* MACH_ASSERT */
7094 		DTRACE_VM2(cs_executable_create_upl,
7095 		    uint64_t, (uint64_t)entry->vme_start,
7096 		    uint64_t, (uint64_t)entry->vme_end);
7097 		cs_executable_create_upl++;
7098 	}
7099 
7100 	vm_object_lock(local_object);
7101 
7102 	/*
7103 	 * Ensure that this object is "true_share" and "copy_delay" now,
7104 	 * while we're still holding the VM map lock.  After we unlock the map,
7105 	 * anything could happen to that mapping, including some copy-on-write
7106 	 * activity.  We need to make sure that the IOPL will point at the
7107 	 * same memory as the mapping.
7108 	 */
7109 	if (local_object->true_share) {
7110 		assert(local_object->copy_strategy !=
7111 		    MEMORY_OBJECT_COPY_SYMMETRIC);
7112 	} else if (local_object != kernel_object &&
7113 	    local_object != compressor_object &&
7114 	    !local_object->phys_contiguous) {
7115 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7116 		if (!local_object->true_share &&
7117 		    vm_object_tracking_btlog) {
7118 			btlog_record(vm_object_tracking_btlog, local_object,
7119 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
7120 			    btref_get(__builtin_frame_address(0), 0));
7121 		}
7122 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7123 		local_object->true_share = TRUE;
7124 		if (local_object->copy_strategy ==
7125 		    MEMORY_OBJECT_COPY_SYMMETRIC) {
7126 			local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7127 		}
7128 	}
7129 
	/*
	 * Take our own reference on the object while it is still locked;
	 * it is dropped once vm_object_iopl_request() has returned.
	 */
7130 	vm_object_reference_locked(local_object);
7131 	vm_object_unlock(local_object);
7132 
7133 	vm_map_unlock_read(map);
7134 
	/*
	 * Undo the map-page alignment applied earlier so that the UPL
	 * starts at the caller's original offset.
	 */
7135 	offset += offset_in_mapped_page;
7136 	assert(*upl_size > offset_in_mapped_page);
7137 	*upl_size -= offset_in_mapped_page;
7138 
	/*
	 * The map address is converted to an offset in the object using
	 * the entry's start address and object offset saved above.
	 */
7139 	ret = vm_object_iopl_request(local_object,
7140 	    ((vm_object_offset_t)
7141 	    ((offset - local_start) + local_offset)),
7142 	    *upl_size,
7143 	    upl,
7144 	    page_list,
7145 	    count,
7146 	    caller_flags,
7147 	    tag);
7148 	vm_object_deallocate(local_object);
7149 
	/*
	 * Common exit: no map lock is held here.  If we ended up in a
	 * submap, drop the reference taken when descending into it.
	 */
7150 done:
7151 	if (release_map) {
7152 		vm_map_deallocate(map);
7153 	}
7154 
7155 	return ret;
7156 }
7157 
7158 /*
7159  * Internal routine to enter a UPL into a VM map.
7160  *
7161  * JMM - This should just be doable through the standard
7162  * vm_map_enter() API.
7163  */
7164 kern_return_t
vm_map_enter_upl_range(vm_map_t map,upl_t upl,vm_object_offset_t offset_to_map,upl_size_t size_to_map,vm_prot_t prot_to_map,vm_map_offset_t * dst_addr)7165 vm_map_enter_upl_range(
7166 	vm_map_t                map,
7167 	upl_t                   upl,
7168 	vm_object_offset_t      offset_to_map,
7169 	upl_size_t              size_to_map,
7170 	vm_prot_t               prot_to_map,
7171 	vm_map_offset_t         *dst_addr)
7172 {
7173 	vm_map_size_t           size;
7174 	vm_object_offset_t      offset;
7175 	vm_map_offset_t         addr;
7176 	vm_page_t               m;
7177 	kern_return_t           kr;
7178 	int                     isVectorUPL = 0, curr_upl = 0;
7179 	upl_t                   vector_upl = NULL;
7180 	mach_vm_offset_t        vector_upl_dst_addr = 0;
7181 	vm_map_t                vector_upl_submap = NULL;
7182 	upl_offset_t            subupl_offset = 0;
7183 	upl_size_t              subupl_size = 0;
7184 
7185 	if (upl == UPL_NULL) {
7186 		return KERN_INVALID_ARGUMENT;
7187 	}
7188 
7189 	DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7190 	assert(map == kernel_map);
7191 
7192 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7193 		int mapped = 0, valid_upls = 0;
7194 		vector_upl = upl;
7195 
7196 		upl_lock(vector_upl);
7197 		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7198 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7199 			if (upl == NULL) {
7200 				continue;
7201 			}
7202 			valid_upls++;
7203 			if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7204 				mapped++;
7205 			}
7206 		}
7207 
7208 		if (mapped) {
7209 			if (mapped != valid_upls) {
7210 				panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7211 			} else {
7212 				upl_unlock(vector_upl);
7213 				return KERN_FAILURE;
7214 			}
7215 		}
7216 
7217 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7218 			panic("TODO4K: vector UPL not implemented");
7219 		}
7220 
7221 		vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7222 		    vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7223 		    VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7224 		    VM_KERN_MEMORY_NONE).kmr_submap;
7225 		map = vector_upl_submap;
7226 		vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7227 		curr_upl = 0;
7228 	} else {
7229 		upl_lock(upl);
7230 	}
7231 
7232 process_upl_to_enter:
7233 	if (isVectorUPL) {
7234 		if (curr_upl == vector_upl_max_upls(vector_upl)) {
7235 			*dst_addr = vector_upl_dst_addr;
7236 			upl_unlock(vector_upl);
7237 			return KERN_SUCCESS;
7238 		}
7239 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7240 		if (upl == NULL) {
7241 			goto process_upl_to_enter;
7242 		}
7243 
7244 		vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7245 		*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7246 	} else {
7247 		/*
7248 		 * check to see if already mapped
7249 		 */
7250 		if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7251 			upl_unlock(upl);
7252 			return KERN_FAILURE;
7253 		}
7254 	}
7255 
7256 	if ((!(upl->flags & UPL_SHADOWED)) &&
7257 	    ((upl->flags & UPL_HAS_BUSY) ||
7258 	    !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7259 		vm_object_t             object;
7260 		vm_page_t               alias_page;
7261 		vm_object_offset_t      new_offset;
7262 		unsigned int            pg_num;
7263 
7264 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7265 		object = upl->map_object;
7266 		upl->map_object = vm_object_allocate(vm_object_round_page(size));
7267 
7268 		vm_object_lock(upl->map_object);
7269 
7270 		upl->map_object->shadow = object;
7271 		upl->map_object->pageout = TRUE;
7272 		upl->map_object->can_persist = FALSE;
7273 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7274 		upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7275 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
7276 		    "object %p shadow_offset 0x%llx",
7277 		    upl->map_object,
7278 		    (uint64_t)upl->map_object->vo_shadow_offset);
7279 		upl->map_object->wimg_bits = object->wimg_bits;
7280 		offset = upl->map_object->vo_shadow_offset;
7281 		new_offset = 0;
7282 
7283 		upl->flags |= UPL_SHADOWED;
7284 
7285 		while (size) {
7286 			pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7287 			assert(pg_num == new_offset / PAGE_SIZE);
7288 
7289 			if (bitmap_test(upl->lite_list, pg_num)) {
7290 				alias_page = vm_page_grab_fictitious(TRUE);
7291 
7292 				vm_object_lock(object);
7293 
7294 				m = vm_page_lookup(object, offset);
7295 				if (m == VM_PAGE_NULL) {
7296 					panic("vm_upl_map: page missing");
7297 				}
7298 
7299 				/*
7300 				 * Convert the fictitious page to a private
7301 				 * shadow of the real page.
7302 				 */
7303 				assert(alias_page->vmp_fictitious);
7304 				alias_page->vmp_fictitious = FALSE;
7305 				alias_page->vmp_private = TRUE;
7306 				alias_page->vmp_free_when_done = TRUE;
7307 				/*
7308 				 * since m is a page in the upl it must
7309 				 * already be wired or BUSY, so it's
7310 				 * safe to assign the underlying physical
7311 				 * page to the alias
7312 				 */
7313 				VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7314 
7315 				vm_object_unlock(object);
7316 
7317 				vm_page_lockspin_queues();
7318 				vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7319 				vm_page_unlock_queues();
7320 
7321 				vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7322 
7323 				assert(!alias_page->vmp_wanted);
7324 				alias_page->vmp_busy = FALSE;
7325 				alias_page->vmp_absent = FALSE;
7326 			}
7327 			size -= PAGE_SIZE;
7328 			offset += PAGE_SIZE_64;
7329 			new_offset += PAGE_SIZE_64;
7330 		}
7331 		vm_object_unlock(upl->map_object);
7332 	}
7333 	if (upl->flags & UPL_SHADOWED) {
7334 		if (isVectorUPL) {
7335 			offset = 0;
7336 		} else {
7337 			offset = offset_to_map;
7338 		}
7339 	} else {
7340 		offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7341 		if (!isVectorUPL) {
7342 			offset += offset_to_map;
7343 		}
7344 	}
7345 
7346 	if (isVectorUPL) {
7347 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7348 	} else {
7349 		size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7350 	}
7351 
7352 	vm_object_reference(upl->map_object);
7353 
7354 	if (!isVectorUPL) {
7355 		*dst_addr = 0;
7356 		/*
7357 		 * NEED A UPL_MAP ALIAS
7358 		 */
7359 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7360 		    VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7361 		    upl->map_object, offset, FALSE,
7362 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7363 
7364 		if (kr != KERN_SUCCESS) {
7365 			vm_object_deallocate(upl->map_object);
7366 			upl_unlock(upl);
7367 			return kr;
7368 		}
7369 	} else {
7370 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7371 		    VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7372 		    upl->map_object, offset, FALSE,
7373 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7374 		if (kr) {
7375 			panic("vm_map_enter failed for a Vector UPL");
7376 		}
7377 	}
7378 	upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7379 	                                        /* this will have to be an increment rather than */
7380 	                                        /* an assignment. */
7381 	vm_object_lock(upl->map_object);
7382 
7383 	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7384 		m = vm_page_lookup(upl->map_object, offset);
7385 
7386 		if (m) {
7387 			m->vmp_pmapped = TRUE;
7388 
7389 			/* CODE SIGNING ENFORCEMENT: page has been wpmapped,
7390 			 * but only in kernel space. If this was on a user map,
7391 			 * we'd have to set the wpmapped bit. */
7392 			/* m->vmp_wpmapped = TRUE; */
7393 			assert(map->pmap == kernel_pmap);
7394 
7395 			PMAP_ENTER(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE, kr);
7396 
7397 			assert(kr == KERN_SUCCESS);
7398 #if KASAN
7399 			kasan_notify_address(addr, PAGE_SIZE_64);
7400 #endif
7401 		}
7402 		offset += PAGE_SIZE_64;
7403 	}
7404 	vm_object_unlock(upl->map_object);
7405 
7406 	/*
7407 	 * hold a reference for the mapping
7408 	 */
7409 	upl->ref_count++;
7410 	upl->flags |= UPL_PAGE_LIST_MAPPED;
7411 	upl->kaddr = (vm_offset_t) *dst_addr;
7412 	assert(upl->kaddr == *dst_addr);
7413 
7414 	if (isVectorUPL) {
7415 		goto process_upl_to_enter;
7416 	}
7417 
7418 	if (!isVectorUPL) {
7419 		vm_map_offset_t addr_adjustment;
7420 
7421 		addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7422 		if (addr_adjustment) {
7423 			assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7424 			DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7425 			*dst_addr += addr_adjustment;
7426 		}
7427 	}
7428 
7429 	upl_unlock(upl);
7430 
7431 	return KERN_SUCCESS;
7432 }
7433 
7434 kern_return_t
vm_map_enter_upl(vm_map_t map,upl_t upl,vm_map_offset_t * dst_addr)7435 vm_map_enter_upl(
7436 	vm_map_t                map,
7437 	upl_t                   upl,
7438 	vm_map_offset_t         *dst_addr)
7439 {
7440 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7441 	return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7442 }
7443 
7444 /*
7445  * Internal routine to remove a UPL mapping from a VM map.
7446  *
7447  * XXX - This should just be doable through a standard
7448  * vm_map_remove() operation.  Otherwise, implicit clean-up
7449  * of the target map won't be able to correctly remove
7450  * these (and release the reference on the UPL).  Having
7451  * to do this means we can't map these into user-space
7452  * maps yet.
7453  */
7454 kern_return_t
vm_map_remove_upl_range(vm_map_t map,upl_t upl,__unused vm_object_offset_t offset_to_unmap,__unused upl_size_t size_to_unmap)7455 vm_map_remove_upl_range(
7456 	vm_map_t        map,
7457 	upl_t           upl,
7458 	__unused vm_object_offset_t    offset_to_unmap,
7459 	__unused upl_size_t      size_to_unmap)
7460 {
7461 	vm_address_t    addr;
7462 	upl_size_t      size;
7463 	int             isVectorUPL = 0, curr_upl = 0;
7464 	upl_t           vector_upl = NULL;
7465 
7466 	if (upl == UPL_NULL) {
7467 		return KERN_INVALID_ARGUMENT;
7468 	}
7469 
7470 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7471 		int     unmapped = 0, valid_upls = 0;
7472 		vector_upl = upl;
7473 		upl_lock(vector_upl);
7474 		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7475 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7476 			if (upl == NULL) {
7477 				continue;
7478 			}
7479 			valid_upls++;
7480 			if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7481 				unmapped++;
7482 			}
7483 		}
7484 
7485 		if (unmapped) {
7486 			if (unmapped != valid_upls) {
7487 				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
7488 			} else {
7489 				upl_unlock(vector_upl);
7490 				return KERN_FAILURE;
7491 			}
7492 		}
7493 		curr_upl = 0;
7494 	} else {
7495 		upl_lock(upl);
7496 	}
7497 
7498 process_upl_to_remove:
7499 	if (isVectorUPL) {
7500 		if (curr_upl == vector_upl_max_upls(vector_upl)) {
7501 			vm_map_t v_upl_submap;
7502 			vm_offset_t v_upl_submap_dst_addr;
7503 			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7504 
7505 			kmem_free_guard(map, v_upl_submap_dst_addr,
7506 			    vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
7507 			vm_map_deallocate(v_upl_submap);
7508 			upl_unlock(vector_upl);
7509 			return KERN_SUCCESS;
7510 		}
7511 
7512 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7513 		if (upl == NULL) {
7514 			goto process_upl_to_remove;
7515 		}
7516 	}
7517 
7518 	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7519 		addr = upl->kaddr;
7520 		size = upl->u_mapped_size;
7521 
7522 		assert(upl->ref_count > 1);
7523 		upl->ref_count--;               /* removing mapping ref */
7524 
7525 		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7526 		upl->kaddr = (vm_offset_t) 0;
7527 		upl->u_mapped_size = 0;
7528 
7529 		if (isVectorUPL) {
7530 			/*
7531 			 * If it's a Vectored UPL, we'll be removing the entire
7532 			 * submap anyways, so no need to remove individual UPL
7533 			 * element mappings from within the submap
7534 			 */
7535 			goto process_upl_to_remove;
7536 		}
7537 
7538 		upl_unlock(upl);
7539 
7540 		vm_map_remove(map,
7541 		    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
7542 		    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
7543 		return KERN_SUCCESS;
7544 	}
7545 	upl_unlock(upl);
7546 
7547 	return KERN_FAILURE;
7548 }
7549 
7550 kern_return_t
vm_map_remove_upl(vm_map_t map,upl_t upl)7551 vm_map_remove_upl(
7552 	vm_map_t        map,
7553 	upl_t           upl)
7554 {
7555 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7556 	return vm_map_remove_upl_range(map, upl, 0, upl_size);
7557 }
7558 
7559 kern_return_t
upl_commit_range(upl_t upl,upl_offset_t offset,upl_size_t size,int flags,upl_page_info_t * page_list,mach_msg_type_number_t count,boolean_t * empty)7560 upl_commit_range(
7561 	upl_t                   upl,
7562 	upl_offset_t            offset,
7563 	upl_size_t              size,
7564 	int                     flags,
7565 	upl_page_info_t         *page_list,
7566 	mach_msg_type_number_t  count,
7567 	boolean_t               *empty)
7568 {
7569 	upl_size_t              xfer_size, subupl_size;
7570 	vm_object_t             shadow_object;
7571 	vm_object_t             object;
7572 	vm_object_t             m_object;
7573 	vm_object_offset_t      target_offset;
7574 	upl_offset_t            subupl_offset = offset;
7575 	int                     entry;
7576 	int                     occupied;
7577 	int                     clear_refmod = 0;
7578 	int                     pgpgout_count = 0;
7579 	struct  vm_page_delayed_work    dw_array;
7580 	struct  vm_page_delayed_work    *dwp, *dwp_start;
7581 	bool                    dwp_finish_ctx = TRUE;
7582 	int                     dw_count;
7583 	int                     dw_limit;
7584 	int                     isVectorUPL = 0;
7585 	upl_t                   vector_upl = NULL;
7586 	boolean_t               should_be_throttled = FALSE;
7587 
7588 	vm_page_t               nxt_page = VM_PAGE_NULL;
7589 	int                     fast_path_possible = 0;
7590 	int                     fast_path_full_commit = 0;
7591 	int                     throttle_page = 0;
7592 	int                     unwired_count = 0;
7593 	int                     local_queue_count = 0;
7594 	vm_page_t               first_local, last_local;
7595 	vm_object_offset_t      obj_start, obj_end, obj_offset;
7596 	kern_return_t           kr = KERN_SUCCESS;
7597 
7598 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7599 
7600 	dwp_start = dwp = NULL;
7601 
7602 	subupl_size = size;
7603 	*empty = FALSE;
7604 
7605 	if (upl == UPL_NULL) {
7606 		return KERN_INVALID_ARGUMENT;
7607 	}
7608 
7609 	dw_count = 0;
7610 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7611 	dwp_start = vm_page_delayed_work_get_ctx();
7612 	if (dwp_start == NULL) {
7613 		dwp_start = &dw_array;
7614 		dw_limit = 1;
7615 		dwp_finish_ctx = FALSE;
7616 	}
7617 
7618 	dwp = dwp_start;
7619 
7620 	if (count == 0) {
7621 		page_list = NULL;
7622 	}
7623 
7624 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7625 		vector_upl = upl;
7626 		upl_lock(vector_upl);
7627 	} else {
7628 		upl_lock(upl);
7629 	}
7630 
7631 process_upl_to_commit:
7632 
7633 	if (isVectorUPL) {
7634 		size = subupl_size;
7635 		offset = subupl_offset;
7636 		if (size == 0) {
7637 			upl_unlock(vector_upl);
7638 			kr = KERN_SUCCESS;
7639 			goto done;
7640 		}
7641 		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7642 		if (upl == NULL) {
7643 			upl_unlock(vector_upl);
7644 			kr = KERN_FAILURE;
7645 			goto done;
7646 		}
7647 		page_list = upl->page_list;
7648 		subupl_size -= size;
7649 		subupl_offset += size;
7650 	}
7651 
7652 #if UPL_DEBUG
7653 	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7654 		upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0);
7655 		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7656 		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7657 
7658 		upl->upl_commit_index++;
7659 	}
7660 #endif
7661 	if (upl->flags & UPL_DEVICE_MEMORY) {
7662 		xfer_size = 0;
7663 	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7664 		xfer_size = size;
7665 	} else {
7666 		if (!isVectorUPL) {
7667 			upl_unlock(upl);
7668 		} else {
7669 			upl_unlock(vector_upl);
7670 		}
7671 		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7672 		kr = KERN_FAILURE;
7673 		goto done;
7674 	}
7675 	if (upl->flags & UPL_SET_DIRTY) {
7676 		flags |= UPL_COMMIT_SET_DIRTY;
7677 	}
7678 	if (upl->flags & UPL_CLEAR_DIRTY) {
7679 		flags |= UPL_COMMIT_CLEAR_DIRTY;
7680 	}
7681 
7682 	object = upl->map_object;
7683 
7684 	if (upl->flags & UPL_SHADOWED) {
7685 		vm_object_lock(object);
7686 		shadow_object = object->shadow;
7687 	} else {
7688 		shadow_object = object;
7689 	}
7690 	entry = offset / PAGE_SIZE;
7691 	target_offset = (vm_object_offset_t)offset;
7692 
7693 	if (upl->flags & UPL_KERNEL_OBJECT) {
7694 		vm_object_lock_shared(shadow_object);
7695 	} else {
7696 		vm_object_lock(shadow_object);
7697 	}
7698 
7699 	VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7700 
7701 	if (upl->flags & UPL_ACCESS_BLOCKED) {
7702 		assert(shadow_object->blocked_access);
7703 		shadow_object->blocked_access = FALSE;
7704 		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7705 	}
7706 
7707 	if (shadow_object->code_signed) {
7708 		/*
7709 		 * CODE SIGNING:
7710 		 * If the object is code-signed, do not let this UPL tell
7711 		 * us if the pages are valid or not.  Let the pages be
7712 		 * validated by VM the normal way (when they get mapped or
7713 		 * copied).
7714 		 */
7715 		flags &= ~UPL_COMMIT_CS_VALIDATED;
7716 	}
7717 	if (!page_list) {
7718 		/*
7719 		 * No page list to get the code-signing info from !?
7720 		 */
7721 		flags &= ~UPL_COMMIT_CS_VALIDATED;
7722 	}
7723 	if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7724 		should_be_throttled = TRUE;
7725 	}
7726 
7727 	if ((upl->flags & UPL_IO_WIRE) &&
7728 	    !(flags & UPL_COMMIT_FREE_ABSENT) &&
7729 	    !isVectorUPL &&
7730 	    shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7731 	    shadow_object->purgable != VM_PURGABLE_EMPTY) {
7732 		if (!vm_page_queue_empty(&shadow_object->memq)) {
7733 			if (size == shadow_object->vo_size) {
7734 				nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7735 				fast_path_full_commit = 1;
7736 			}
7737 			fast_path_possible = 1;
7738 
7739 			if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7740 			    (shadow_object->purgable == VM_PURGABLE_DENY ||
7741 			    shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7742 			    shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7743 				throttle_page = 1;
7744 			}
7745 		}
7746 	}
7747 	first_local = VM_PAGE_NULL;
7748 	last_local = VM_PAGE_NULL;
7749 
7750 	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7751 	obj_end = obj_start + xfer_size;
7752 	obj_start = vm_object_trunc_page(obj_start);
7753 	obj_end = vm_object_round_page(obj_end);
7754 	for (obj_offset = obj_start;
7755 	    obj_offset < obj_end;
7756 	    obj_offset += PAGE_SIZE) {
7757 		vm_page_t       t, m;
7758 
7759 		dwp->dw_mask = 0;
7760 		clear_refmod = 0;
7761 
7762 		m = VM_PAGE_NULL;
7763 
7764 		if (upl->flags & UPL_LITE) {
7765 			unsigned int    pg_num;
7766 
7767 			if (nxt_page != VM_PAGE_NULL) {
7768 				m = nxt_page;
7769 				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7770 				target_offset = m->vmp_offset;
7771 			}
7772 			pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7773 			assert(pg_num == target_offset / PAGE_SIZE);
7774 
7775 			if (bitmap_test(upl->lite_list, pg_num)) {
7776 				bitmap_clear(upl->lite_list, pg_num);
7777 
7778 				if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7779 					m = vm_page_lookup(shadow_object, obj_offset);
7780 				}
7781 			} else {
7782 				m = NULL;
7783 			}
7784 		}
7785 		if (upl->flags & UPL_SHADOWED) {
7786 			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7787 				t->vmp_free_when_done = FALSE;
7788 
7789 				VM_PAGE_FREE(t);
7790 
7791 				if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7792 					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7793 				}
7794 			}
7795 		}
7796 		if (m == VM_PAGE_NULL) {
7797 			goto commit_next_page;
7798 		}
7799 
7800 		m_object = VM_PAGE_OBJECT(m);
7801 
7802 		if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7803 			assert(m->vmp_busy);
7804 
7805 			dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7806 			goto commit_next_page;
7807 		}
7808 
7809 		if (flags & UPL_COMMIT_CS_VALIDATED) {
7810 			/*
7811 			 * CODE SIGNING:
7812 			 * Set the code signing bits according to
7813 			 * what the UPL says they should be.
7814 			 */
7815 			m->vmp_cs_validated |= page_list[entry].cs_validated;
7816 			m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7817 			m->vmp_cs_nx |= page_list[entry].cs_nx;
7818 		}
7819 		if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7820 			m->vmp_written_by_kernel = TRUE;
7821 		}
7822 
7823 		if (upl->flags & UPL_IO_WIRE) {
7824 			if (page_list) {
7825 				page_list[entry].phys_addr = 0;
7826 			}
7827 
7828 			if (flags & UPL_COMMIT_SET_DIRTY) {
7829 				SET_PAGE_DIRTY(m, FALSE);
7830 			} else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7831 				m->vmp_dirty = FALSE;
7832 
7833 				if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7834 				    m->vmp_cs_validated &&
7835 				    m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7836 					/*
7837 					 * CODE SIGNING:
7838 					 * This page is no longer dirty
7839 					 * but could have been modified,
7840 					 * so it will need to be
7841 					 * re-validated.
7842 					 */
7843 					m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7844 
7845 					VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7846 
7847 					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7848 				}
7849 				clear_refmod |= VM_MEM_MODIFIED;
7850 			}
7851 			if (upl->flags & UPL_ACCESS_BLOCKED) {
7852 				/*
7853 				 * We blocked access to the pages in this UPL.
7854 				 * Clear the "busy" bit and wake up any waiter
7855 				 * for this page.
7856 				 */
7857 				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7858 			}
7859 			if (fast_path_possible) {
7860 				assert(m_object->purgable != VM_PURGABLE_EMPTY);
7861 				assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7862 				if (m->vmp_absent) {
7863 					assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7864 					assert(m->vmp_wire_count == 0);
7865 					assert(m->vmp_busy);
7866 
7867 					m->vmp_absent = FALSE;
7868 					dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7869 				} else {
7870 					if (m->vmp_wire_count == 0) {
7871 						panic("wire_count == 0, m = %p, obj = %p", m, shadow_object);
7872 					}
7873 					assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7874 
7875 					/*
7876 					 * XXX FBDP need to update some other
7877 					 * counters here (purgeable_wired_count)
7878 					 * (ledgers), ...
7879 					 */
7880 					assert(m->vmp_wire_count > 0);
7881 					m->vmp_wire_count--;
7882 
7883 					if (m->vmp_wire_count == 0) {
7884 						m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7885 						unwired_count++;
7886 					}
7887 				}
7888 				if (m->vmp_wire_count == 0) {
7889 					assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7890 
7891 					if (last_local == VM_PAGE_NULL) {
7892 						assert(first_local == VM_PAGE_NULL);
7893 
7894 						last_local = m;
7895 						first_local = m;
7896 					} else {
7897 						assert(first_local != VM_PAGE_NULL);
7898 
7899 						m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7900 						first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7901 						first_local = m;
7902 					}
7903 					local_queue_count++;
7904 
7905 					if (throttle_page) {
7906 						m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7907 					} else {
7908 						if (flags & UPL_COMMIT_INACTIVATE) {
7909 							if (shadow_object->internal) {
7910 								m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7911 							} else {
7912 								m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7913 							}
7914 						} else {
7915 							m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7916 						}
7917 					}
7918 				}
7919 			} else {
7920 				if (flags & UPL_COMMIT_INACTIVATE) {
7921 					dwp->dw_mask |= DW_vm_page_deactivate_internal;
7922 					clear_refmod |= VM_MEM_REFERENCED;
7923 				}
7924 				if (m->vmp_absent) {
7925 					if (flags & UPL_COMMIT_FREE_ABSENT) {
7926 						dwp->dw_mask |= DW_vm_page_free;
7927 					} else {
7928 						m->vmp_absent = FALSE;
7929 						dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7930 
7931 						if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7932 							dwp->dw_mask |= DW_vm_page_activate;
7933 						}
7934 					}
7935 				} else {
7936 					dwp->dw_mask |= DW_vm_page_unwire;
7937 				}
7938 			}
7939 			goto commit_next_page;
7940 		}
7941 		assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7942 
7943 		if (page_list) {
7944 			page_list[entry].phys_addr = 0;
7945 		}
7946 
7947 		/*
7948 		 * make sure to clear the hardware
7949 		 * modify or reference bits before
7950 		 * releasing the BUSY bit on this page
7951 		 * otherwise we risk losing a legitimate
7952 		 * change of state
7953 		 */
7954 		if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7955 			m->vmp_dirty = FALSE;
7956 
7957 			clear_refmod |= VM_MEM_MODIFIED;
7958 		}
7959 		if (m->vmp_laundry) {
7960 			dwp->dw_mask |= DW_vm_pageout_throttle_up;
7961 		}
7962 
7963 		if (VM_PAGE_WIRED(m)) {
7964 			m->vmp_free_when_done = FALSE;
7965 		}
7966 
7967 		if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7968 		    m->vmp_cs_validated &&
7969 		    m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7970 			/*
7971 			 * CODE SIGNING:
7972 			 * This page is no longer dirty
7973 			 * but could have been modified,
7974 			 * so it will need to be
7975 			 * re-validated.
7976 			 */
7977 			m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7978 
7979 			VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7980 
7981 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7982 		}
7983 		if (m->vmp_overwriting) {
7984 			/*
7985 			 * the (COPY_OUT_FROM == FALSE) request_page_list case
7986 			 */
7987 			if (m->vmp_busy) {
7988 #if CONFIG_PHANTOM_CACHE
7989 				if (m->vmp_absent && !m_object->internal) {
7990 					dwp->dw_mask |= DW_vm_phantom_cache_update;
7991 				}
7992 #endif
7993 				m->vmp_absent = FALSE;
7994 
7995 				dwp->dw_mask |= DW_clear_busy;
7996 			} else {
7997 				/*
7998 				 * alternate (COPY_OUT_FROM == FALSE) page_list case
7999 				 * Occurs when the original page was wired
8000 				 * at the time of the list request
8001 				 */
8002 				assert(VM_PAGE_WIRED(m));
8003 
8004 				dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
8005 			}
8006 			m->vmp_overwriting = FALSE;
8007 		}
8008 		m->vmp_cleaning = FALSE;
8009 
8010 		if (m->vmp_free_when_done) {
8011 			/*
8012 			 * With the clean queue enabled, UPL_PAGEOUT should
8013 			 * no longer set the pageout bit. Its pages now go
8014 			 * to the clean queue.
8015 			 *
8016 			 * We don't use the cleaned Q anymore and so this
8017 			 * assert isn't correct. The code for the clean Q
8018 			 * still exists and might be used in the future. If we
8019 			 * go back to the cleaned Q, we will re-enable this
8020 			 * assert.
8021 			 *
8022 			 * assert(!(upl->flags & UPL_PAGEOUT));
8023 			 */
8024 			assert(!m_object->internal);
8025 
8026 			m->vmp_free_when_done = FALSE;
8027 
8028 			if ((flags & UPL_COMMIT_SET_DIRTY) ||
8029 			    (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
8030 				/*
8031 				 * page was re-dirtied after we started
8032 				 * the pageout... reactivate it since
8033 				 * we don't know whether the on-disk
8034 				 * copy matches what is now in memory
8035 				 */
8036 				SET_PAGE_DIRTY(m, FALSE);
8037 
8038 				dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
8039 
8040 				if (upl->flags & UPL_PAGEOUT) {
8041 					counter_inc(&vm_statistics_reactivations);
8042 					DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
8043 				}
8044 			} else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) {
8045 				/*
8046 				 * Someone else might still be handling this
8047 				 * page (vm_fault() for example), so let's not
8048 				 * free it or "un-busy" it!
8049 				 * Put that page in the "speculative" queue
8050 				 * for now (since we would otherwise have freed
8051 				 * it) and let whoever is keeping the page
8052 				 * "busy" move it if needed when they're done
8053 				 * with it.
8054 				 */
8055 				dwp->dw_mask |= DW_vm_page_speculate;
8056 			} else {
8057 				/*
8058 				 * page has been successfully cleaned
8059 				 * go ahead and free it for other use
8060 				 */
8061 				if (m_object->internal) {
8062 					DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
8063 				} else {
8064 					DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
8065 				}
8066 				m->vmp_dirty = FALSE;
8067 				if (!(upl->flags & UPL_HAS_BUSY)) {
8068 					assert(!m->vmp_busy);
8069 				}
8070 				m->vmp_busy = TRUE;
8071 
8072 				dwp->dw_mask |= DW_vm_page_free;
8073 			}
8074 			goto commit_next_page;
8075 		}
8076 		/*
8077 		 * It is a part of the semantic of COPYOUT_FROM
8078 		 * UPLs that a commit implies cache sync
8079 		 * between the vm page and the backing store
8080 		 * this can be used to strip the precious bit
8081 		 * as well as clean
8082 		 */
8083 		if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
8084 			m->vmp_precious = FALSE;
8085 		}
8086 
8087 		if (flags & UPL_COMMIT_SET_DIRTY) {
8088 			SET_PAGE_DIRTY(m, FALSE);
8089 		} else {
8090 			m->vmp_dirty = FALSE;
8091 		}
8092 
8093 		/* with the clean queue on, move *all* cleaned pages to the clean queue */
8094 		if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
8095 			pgpgout_count++;
8096 
8097 			counter_inc(&vm_statistics_pageouts);
8098 			DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
8099 
8100 			dwp->dw_mask |= DW_enqueue_cleaned;
8101 		} else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
8102 			/*
8103 			 * page coming back in from being 'frozen'...
8104 			 * it was dirty before it was frozen, so keep it so
8105 			 * the vm_page_activate will notice that it really belongs
8106 			 * on the throttle queue and put it there
8107 			 */
8108 			SET_PAGE_DIRTY(m, FALSE);
8109 			dwp->dw_mask |= DW_vm_page_activate;
8110 		} else {
8111 			if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
8112 				dwp->dw_mask |= DW_vm_page_deactivate_internal;
8113 				clear_refmod |= VM_MEM_REFERENCED;
8114 			} else if (!VM_PAGE_PAGEABLE(m)) {
8115 				if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
8116 					dwp->dw_mask |= DW_vm_page_speculate;
8117 				} else if (m->vmp_reference) {
8118 					dwp->dw_mask |= DW_vm_page_activate;
8119 				} else {
8120 					dwp->dw_mask |= DW_vm_page_deactivate_internal;
8121 					clear_refmod |= VM_MEM_REFERENCED;
8122 				}
8123 			}
8124 		}
8125 		if (upl->flags & UPL_ACCESS_BLOCKED) {
8126 			/*
8127 			 * We blocked access to the pages in this UPL.
8128 			 * Clear the "busy" bit on this page before we
8129 			 * wake up any waiter.
8130 			 */
8131 			dwp->dw_mask |= DW_clear_busy;
8132 		}
8133 		/*
8134 		 * Wakeup any thread waiting for the page to be un-cleaning.
8135 		 */
8136 		dwp->dw_mask |= DW_PAGE_WAKEUP;
8137 
8138 commit_next_page:
8139 		if (clear_refmod) {
8140 			pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
8141 		}
8142 
8143 		target_offset += PAGE_SIZE_64;
8144 		xfer_size -= PAGE_SIZE;
8145 		entry++;
8146 
8147 		if (dwp->dw_mask) {
8148 			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8149 				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8150 
8151 				if (dw_count >= dw_limit) {
8152 					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8153 
8154 					dwp = dwp_start;
8155 					dw_count = 0;
8156 				}
8157 			} else {
8158 				if (dwp->dw_mask & DW_clear_busy) {
8159 					m->vmp_busy = FALSE;
8160 				}
8161 
8162 				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8163 					PAGE_WAKEUP(m);
8164 				}
8165 			}
8166 		}
8167 	}
8168 	if (dw_count) {
8169 		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8170 		dwp = dwp_start;
8171 		dw_count = 0;
8172 	}
8173 
8174 	if (fast_path_possible) {
8175 		assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
8176 		assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
8177 
8178 		if (local_queue_count || unwired_count) {
8179 			if (local_queue_count) {
8180 				vm_page_t       first_target;
8181 				vm_page_queue_head_t    *target_queue;
8182 
8183 				if (throttle_page) {
8184 					target_queue = &vm_page_queue_throttled;
8185 				} else {
8186 					if (flags & UPL_COMMIT_INACTIVATE) {
8187 						if (shadow_object->internal) {
8188 							target_queue = &vm_page_queue_anonymous;
8189 						} else {
8190 							target_queue = &vm_page_queue_inactive;
8191 						}
8192 					} else {
8193 						target_queue = &vm_page_queue_active;
8194 					}
8195 				}
8196 				/*
8197 				 * Transfer the entire local queue to a regular LRU page queues.
8198 				 */
8199 				vm_page_lockspin_queues();
8200 
8201 				first_target = (vm_page_t) vm_page_queue_first(target_queue);
8202 
8203 				if (vm_page_queue_empty(target_queue)) {
8204 					target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8205 				} else {
8206 					first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8207 				}
8208 
8209 				target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
8210 				first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
8211 				last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
8212 
8213 				/*
8214 				 * Adjust the global page counts.
8215 				 */
8216 				if (throttle_page) {
8217 					vm_page_throttled_count += local_queue_count;
8218 				} else {
8219 					if (flags & UPL_COMMIT_INACTIVATE) {
8220 						if (shadow_object->internal) {
8221 							vm_page_anonymous_count += local_queue_count;
8222 						}
8223 						vm_page_inactive_count += local_queue_count;
8224 
8225 						token_new_pagecount += local_queue_count;
8226 					} else {
8227 						vm_page_active_count += local_queue_count;
8228 					}
8229 
8230 					if (shadow_object->internal) {
8231 						vm_page_pageable_internal_count += local_queue_count;
8232 					} else {
8233 						vm_page_pageable_external_count += local_queue_count;
8234 					}
8235 				}
8236 			} else {
8237 				vm_page_lockspin_queues();
8238 			}
8239 			if (unwired_count) {
8240 				vm_page_wire_count -= unwired_count;
8241 				VM_CHECK_MEMORYSTATUS;
8242 			}
8243 			vm_page_unlock_queues();
8244 
8245 			VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
8246 		}
8247 	}
8248 
8249 	if (upl->flags & UPL_DEVICE_MEMORY) {
8250 		occupied = 0;
8251 	} else if (upl->flags & UPL_LITE) {
8252 		uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
8253 
8254 		occupied = !fast_path_full_commit &&
8255 		    !bitmap_is_empty(upl->lite_list, pages);
8256 	} else {
8257 		occupied = !vm_page_queue_empty(&upl->map_object->memq);
8258 	}
8259 	if (occupied == 0) {
8260 		/*
8261 		 * If this UPL element belongs to a Vector UPL and is
8262 		 * empty, then this is the right function to deallocate
8263 		 * it. So go ahead set the *empty variable. The flag
8264 		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8265 		 * should be considered relevant for the Vector UPL and not
8266 		 * the internal UPLs.
8267 		 */
8268 		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8269 			*empty = TRUE;
8270 		}
8271 
8272 		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8273 			/*
8274 			 * this is not a paging object
8275 			 * so we need to drop the paging reference
8276 			 * that was taken when we created the UPL
8277 			 * against this object
8278 			 */
8279 			vm_object_activity_end(shadow_object);
8280 			vm_object_collapse(shadow_object, 0, TRUE);
8281 		} else {
8282 			/*
8283 			 * we donated the paging reference to
8284 			 * the map object... vm_pageout_object_terminate
8285 			 * will drop this reference
8286 			 */
8287 		}
8288 	}
8289 	VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
8290 	vm_object_unlock(shadow_object);
8291 	if (object != shadow_object) {
8292 		vm_object_unlock(object);
8293 	}
8294 
8295 	if (!isVectorUPL) {
8296 		upl_unlock(upl);
8297 	} else {
8298 		/*
8299 		 * If we completed our operations on an UPL that is
8300 		 * part of a Vectored UPL and if empty is TRUE, then
8301 		 * we should go ahead and deallocate this UPL element.
8302 		 * Then we check if this was the last of the UPL elements
8303 		 * within that Vectored UPL. If so, set empty to TRUE
8304 		 * so that in ubc_upl_commit_range or ubc_upl_commit, we
8305 		 * can go ahead and deallocate the Vector UPL too.
8306 		 */
8307 		if (*empty == TRUE) {
8308 			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
8309 			upl_deallocate(upl);
8310 		}
8311 		goto process_upl_to_commit;
8312 	}
8313 	if (pgpgout_count) {
8314 		DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
8315 	}
8316 
8317 	kr = KERN_SUCCESS;
8318 done:
8319 	if (dwp_start && dwp_finish_ctx) {
8320 		vm_page_delayed_work_finish_ctx(dwp_start);
8321 		dwp_start = dwp = NULL;
8322 	}
8323 
8324 	return kr;
8325 }
8326 
8327 kern_return_t
upl_abort_range(upl_t upl,upl_offset_t offset,upl_size_t size,int error,boolean_t * empty)8328 upl_abort_range(
8329 	upl_t                   upl,
8330 	upl_offset_t            offset,
8331 	upl_size_t              size,
8332 	int                     error,
8333 	boolean_t               *empty)
8334 {
8335 	upl_size_t              xfer_size, subupl_size;
8336 	vm_object_t             shadow_object;
8337 	vm_object_t             object;
8338 	vm_object_offset_t      target_offset;
8339 	upl_offset_t            subupl_offset = offset;
8340 	int                     occupied;
8341 	struct  vm_page_delayed_work    dw_array;
8342 	struct  vm_page_delayed_work    *dwp, *dwp_start;
8343 	bool                    dwp_finish_ctx = TRUE;
8344 	int                     dw_count;
8345 	int                     dw_limit;
8346 	int                     isVectorUPL = 0;
8347 	upl_t                   vector_upl = NULL;
8348 	vm_object_offset_t      obj_start, obj_end, obj_offset;
8349 	kern_return_t           kr = KERN_SUCCESS;
8350 
8351 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);
8352 
8353 	dwp_start = dwp = NULL;
8354 
8355 	subupl_size = size;
8356 	*empty = FALSE;
8357 
8358 	if (upl == UPL_NULL) {
8359 		return KERN_INVALID_ARGUMENT;
8360 	}
8361 
8362 	if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
8363 		return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
8364 	}
8365 
8366 	dw_count = 0;
8367 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8368 	dwp_start = vm_page_delayed_work_get_ctx();
8369 	if (dwp_start == NULL) {
8370 		dwp_start = &dw_array;
8371 		dw_limit = 1;
8372 		dwp_finish_ctx = FALSE;
8373 	}
8374 
8375 	dwp = dwp_start;
8376 
8377 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
8378 		vector_upl = upl;
8379 		upl_lock(vector_upl);
8380 	} else {
8381 		upl_lock(upl);
8382 	}
8383 
8384 process_upl_to_abort:
8385 	if (isVectorUPL) {
8386 		size = subupl_size;
8387 		offset = subupl_offset;
8388 		if (size == 0) {
8389 			upl_unlock(vector_upl);
8390 			kr = KERN_SUCCESS;
8391 			goto done;
8392 		}
8393 		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
8394 		if (upl == NULL) {
8395 			upl_unlock(vector_upl);
8396 			kr = KERN_FAILURE;
8397 			goto done;
8398 		}
8399 		subupl_size -= size;
8400 		subupl_offset += size;
8401 	}
8402 
8403 	*empty = FALSE;
8404 
8405 #if UPL_DEBUG
8406 	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
8407 		upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0);
8408 		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
8409 		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
8410 		upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
8411 
8412 		upl->upl_commit_index++;
8413 	}
8414 #endif
8415 	if (upl->flags & UPL_DEVICE_MEMORY) {
8416 		xfer_size = 0;
8417 	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
8418 		xfer_size = size;
8419 	} else {
8420 		if (!isVectorUPL) {
8421 			upl_unlock(upl);
8422 		} else {
8423 			upl_unlock(vector_upl);
8424 		}
8425 		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
8426 		kr = KERN_FAILURE;
8427 		goto done;
8428 	}
8429 	object = upl->map_object;
8430 
8431 	if (upl->flags & UPL_SHADOWED) {
8432 		vm_object_lock(object);
8433 		shadow_object = object->shadow;
8434 	} else {
8435 		shadow_object = object;
8436 	}
8437 
8438 	target_offset = (vm_object_offset_t)offset;
8439 
8440 	if (upl->flags & UPL_KERNEL_OBJECT) {
8441 		vm_object_lock_shared(shadow_object);
8442 	} else {
8443 		vm_object_lock(shadow_object);
8444 	}
8445 
8446 	if (upl->flags & UPL_ACCESS_BLOCKED) {
8447 		assert(shadow_object->blocked_access);
8448 		shadow_object->blocked_access = FALSE;
8449 		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
8450 	}
8451 
8452 	if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
8453 		panic("upl_abort_range: kernel_object being DUMPED");
8454 	}
8455 
8456 	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
8457 	obj_end = obj_start + xfer_size;
8458 	obj_start = vm_object_trunc_page(obj_start);
8459 	obj_end = vm_object_round_page(obj_end);
8460 	for (obj_offset = obj_start;
8461 	    obj_offset < obj_end;
8462 	    obj_offset += PAGE_SIZE) {
8463 		vm_page_t       t, m;
8464 		unsigned int    pg_num;
8465 		boolean_t       needed;
8466 
8467 		pg_num = (unsigned int) (target_offset / PAGE_SIZE);
8468 		assert(pg_num == target_offset / PAGE_SIZE);
8469 
8470 		needed = FALSE;
8471 
8472 		if (upl->flags & UPL_INTERNAL) {
8473 			needed = upl->page_list[pg_num].needed;
8474 		}
8475 
8476 		dwp->dw_mask = 0;
8477 		m = VM_PAGE_NULL;
8478 
8479 		if (upl->flags & UPL_LITE) {
8480 			if (bitmap_test(upl->lite_list, pg_num)) {
8481 				bitmap_clear(upl->lite_list, pg_num);
8482 
8483 				if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8484 					m = vm_page_lookup(shadow_object, obj_offset);
8485 				}
8486 			}
8487 		}
8488 		if (upl->flags & UPL_SHADOWED) {
8489 			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
8490 				t->vmp_free_when_done = FALSE;
8491 
8492 				VM_PAGE_FREE(t);
8493 
8494 				if (m == VM_PAGE_NULL) {
8495 					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
8496 				}
8497 			}
8498 		}
8499 		if ((upl->flags & UPL_KERNEL_OBJECT)) {
8500 			goto abort_next_page;
8501 		}
8502 
8503 		if (m != VM_PAGE_NULL) {
8504 			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
8505 
8506 			if (m->vmp_absent) {
8507 				boolean_t must_free = TRUE;
8508 
8509 				/*
8510 				 * COPYOUT = FALSE case
8511 				 * check for error conditions which must
8512 				 * be passed back to the pages customer
8513 				 */
8514 				if (error & UPL_ABORT_RESTART) {
8515 					m->vmp_restart = TRUE;
8516 					m->vmp_absent = FALSE;
8517 					m->vmp_unusual = TRUE;
8518 					must_free = FALSE;
8519 				} else if (error & UPL_ABORT_UNAVAILABLE) {
8520 					m->vmp_restart = FALSE;
8521 					m->vmp_unusual = TRUE;
8522 					must_free = FALSE;
8523 				} else if (error & UPL_ABORT_ERROR) {
8524 					m->vmp_restart = FALSE;
8525 					m->vmp_absent = FALSE;
8526 					m->vmp_error = TRUE;
8527 					m->vmp_unusual = TRUE;
8528 					must_free = FALSE;
8529 				}
8530 				if (m->vmp_clustered && needed == FALSE) {
8531 					/*
8532 					 * This page was a part of a speculative
8533 					 * read-ahead initiated by the kernel
8534 					 * itself.  No one is expecting this
8535 					 * page and no one will clean up its
8536 					 * error state if it ever becomes valid
8537 					 * in the future.
8538 					 * We have to free it here.
8539 					 */
8540 					must_free = TRUE;
8541 				}
8542 				m->vmp_cleaning = FALSE;
8543 
8544 				if (m->vmp_overwriting && !m->vmp_busy) {
8545 					/*
8546 					 * this shouldn't happen since
8547 					 * this is an 'absent' page, but
8548 					 * it doesn't hurt to check for
8549 					 * the 'alternate' method of
8550 					 * stabilizing the page...
8551 					 * we will mark 'busy' to be cleared
8552 					 * in the following code which will
8553 					 * take care of the primary stabilzation
8554 					 * method (i.e. setting 'busy' to TRUE)
8555 					 */
8556 					dwp->dw_mask |= DW_vm_page_unwire;
8557 				}
8558 				m->vmp_overwriting = FALSE;
8559 
8560 				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
8561 
8562 				if (must_free == TRUE) {
8563 					dwp->dw_mask |= DW_vm_page_free;
8564 				} else {
8565 					dwp->dw_mask |= DW_vm_page_activate;
8566 				}
8567 			} else {
8568 				/*
8569 				 * Handle the trusted pager throttle.
8570 				 */
8571 				if (m->vmp_laundry) {
8572 					dwp->dw_mask |= DW_vm_pageout_throttle_up;
8573 				}
8574 
8575 				if (upl->flags & UPL_ACCESS_BLOCKED) {
8576 					/*
8577 					 * We blocked access to the pages in this UPL.
8578 					 * Clear the "busy" bit and wake up any waiter
8579 					 * for this page.
8580 					 */
8581 					dwp->dw_mask |= DW_clear_busy;
8582 				}
8583 				if (m->vmp_overwriting) {
8584 					if (m->vmp_busy) {
8585 						dwp->dw_mask |= DW_clear_busy;
8586 					} else {
8587 						/*
8588 						 * deal with the 'alternate' method
8589 						 * of stabilizing the page...
8590 						 * we will either free the page
8591 						 * or mark 'busy' to be cleared
8592 						 * in the following code which will
8593 						 * take care of the primary stabilzation
8594 						 * method (i.e. setting 'busy' to TRUE)
8595 						 */
8596 						dwp->dw_mask |= DW_vm_page_unwire;
8597 					}
8598 					m->vmp_overwriting = FALSE;
8599 				}
8600 				m->vmp_free_when_done = FALSE;
8601 				m->vmp_cleaning = FALSE;
8602 
8603 				if (error & UPL_ABORT_DUMP_PAGES) {
8604 					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8605 
8606 					dwp->dw_mask |= DW_vm_page_free;
8607 				} else {
8608 					if (!(dwp->dw_mask & DW_vm_page_unwire)) {
8609 						if (error & UPL_ABORT_REFERENCE) {
8610 							/*
8611 							 * we've been told to explictly
8612 							 * reference this page... for
8613 							 * file I/O, this is done by
8614 							 * implementing an LRU on the inactive q
8615 							 */
8616 							dwp->dw_mask |= DW_vm_page_lru;
8617 						} else if (!VM_PAGE_PAGEABLE(m)) {
8618 							dwp->dw_mask |= DW_vm_page_deactivate_internal;
8619 						}
8620 					}
8621 					dwp->dw_mask |= DW_PAGE_WAKEUP;
8622 				}
8623 			}
8624 		}
8625 abort_next_page:
8626 		target_offset += PAGE_SIZE_64;
8627 		xfer_size -= PAGE_SIZE;
8628 
8629 		if (dwp->dw_mask) {
8630 			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8631 				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8632 
8633 				if (dw_count >= dw_limit) {
8634 					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8635 
8636 					dwp = dwp_start;
8637 					dw_count = 0;
8638 				}
8639 			} else {
8640 				if (dwp->dw_mask & DW_clear_busy) {
8641 					m->vmp_busy = FALSE;
8642 				}
8643 
8644 				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8645 					PAGE_WAKEUP(m);
8646 				}
8647 			}
8648 		}
8649 	}
8650 	if (dw_count) {
8651 		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8652 		dwp = dwp_start;
8653 		dw_count = 0;
8654 	}
8655 
8656 	if (upl->flags & UPL_DEVICE_MEMORY) {
8657 		occupied = 0;
8658 	} else if (upl->flags & UPL_LITE) {
8659 		uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
8660 
8661 		occupied = !bitmap_is_empty(upl->lite_list, pages);
8662 	} else {
8663 		occupied = !vm_page_queue_empty(&upl->map_object->memq);
8664 	}
8665 	if (occupied == 0) {
8666 		/*
8667 		 * If this UPL element belongs to a Vector UPL and is
8668 		 * empty, then this is the right function to deallocate
8669 		 * it. So go ahead set the *empty variable. The flag
8670 		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8671 		 * should be considered relevant for the Vector UPL and
8672 		 * not the internal UPLs.
8673 		 */
8674 		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8675 			*empty = TRUE;
8676 		}
8677 
8678 		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8679 			/*
8680 			 * this is not a paging object
8681 			 * so we need to drop the paging reference
8682 			 * that was taken when we created the UPL
8683 			 * against this object
8684 			 */
8685 			vm_object_activity_end(shadow_object);
8686 			vm_object_collapse(shadow_object, 0, TRUE);
8687 		} else {
8688 			/*
8689 			 * we dontated the paging reference to
8690 			 * the map object... vm_pageout_object_terminate
8691 			 * will drop this reference
8692 			 */
8693 		}
8694 	}
8695 	vm_object_unlock(shadow_object);
8696 	if (object != shadow_object) {
8697 		vm_object_unlock(object);
8698 	}
8699 
8700 	if (!isVectorUPL) {
8701 		upl_unlock(upl);
8702 	} else {
8703 		/*
8704 		 * If we completed our operations on an UPL that is
8705 		 * part of a Vectored UPL and if empty is TRUE, then
8706 		 * we should go ahead and deallocate this UPL element.
8707 		 * Then we check if this was the last of the UPL elements
8708 		 * within that Vectored UPL. If so, set empty to TRUE
8709 		 * so that in ubc_upl_abort_range or ubc_upl_abort, we
8710 		 * can go ahead and deallocate the Vector UPL too.
8711 		 */
8712 		if (*empty == TRUE) {
8713 			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
8714 			upl_deallocate(upl);
8715 		}
8716 		goto process_upl_to_abort;
8717 	}
8718 
8719 	kr = KERN_SUCCESS;
8720 
8721 done:
8722 	if (dwp_start && dwp_finish_ctx) {
8723 		vm_page_delayed_work_finish_ctx(dwp_start);
8724 		dwp_start = dwp = NULL;
8725 	}
8726 
8727 	return kr;
8728 }
8729 
8730 
8731 kern_return_t
upl_abort(upl_t upl,int error)8732 upl_abort(
8733 	upl_t   upl,
8734 	int     error)
8735 {
8736 	boolean_t       empty;
8737 
8738 	if (upl == UPL_NULL) {
8739 		return KERN_INVALID_ARGUMENT;
8740 	}
8741 
8742 	return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8743 }
8744 
8745 
8746 /* an option on commit should be wire */
8747 kern_return_t
upl_commit(upl_t upl,upl_page_info_t * page_list,mach_msg_type_number_t count)8748 upl_commit(
8749 	upl_t                   upl,
8750 	upl_page_info_t         *page_list,
8751 	mach_msg_type_number_t  count)
8752 {
8753 	boolean_t       empty;
8754 
8755 	if (upl == UPL_NULL) {
8756 		return KERN_INVALID_ARGUMENT;
8757 	}
8758 
8759 	return upl_commit_range(upl, 0, upl->u_size, 0,
8760 	           page_list, count, &empty);
8761 }
8762 
8763 
8764 void
iopl_valid_data(upl_t upl,vm_tag_t tag)8765 iopl_valid_data(
8766 	upl_t    upl,
8767 	vm_tag_t tag)
8768 {
8769 	vm_object_t     object;
8770 	vm_offset_t     offset;
8771 	vm_page_t       m, nxt_page = VM_PAGE_NULL;
8772 	upl_size_t      size;
8773 	int             wired_count = 0;
8774 
8775 	if (upl == NULL) {
8776 		panic("iopl_valid_data: NULL upl");
8777 	}
8778 	if (vector_upl_is_valid(upl)) {
8779 		panic("iopl_valid_data: vector upl");
8780 	}
8781 	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
8782 		panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
8783 	}
8784 
8785 	object = upl->map_object;
8786 
8787 	if (object == kernel_object || object == compressor_object) {
8788 		panic("iopl_valid_data: object == kernel or compressor");
8789 	}
8790 
8791 	if (object->purgable == VM_PURGABLE_VOLATILE ||
8792 	    object->purgable == VM_PURGABLE_EMPTY) {
8793 		panic("iopl_valid_data: object %p purgable %d",
8794 		    object, object->purgable);
8795 	}
8796 
8797 	size = upl_adjusted_size(upl, PAGE_MASK);
8798 
8799 	vm_object_lock(object);
8800 	VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
8801 
8802 	bool whole_object;
8803 
8804 	if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
8805 		nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
8806 		whole_object = true;
8807 	} else {
8808 		offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
8809 		whole_object = false;
8810 	}
8811 
8812 	while (size) {
8813 		if (whole_object) {
8814 			if (nxt_page != VM_PAGE_NULL) {
8815 				m = nxt_page;
8816 				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
8817 			}
8818 		} else {
8819 			m = vm_page_lookup(object, offset);
8820 			offset += PAGE_SIZE;
8821 
8822 			if (m == VM_PAGE_NULL) {
8823 				panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
8824 			}
8825 		}
8826 		if (m->vmp_busy) {
8827 			if (!m->vmp_absent) {
8828 				panic("iopl_valid_data: busy page w/o absent");
8829 			}
8830 
8831 			if (m->vmp_pageq.next || m->vmp_pageq.prev) {
8832 				panic("iopl_valid_data: busy+absent page on page queue");
8833 			}
8834 			if (m->vmp_reusable) {
8835 				panic("iopl_valid_data: %p is reusable", m);
8836 			}
8837 
8838 			m->vmp_absent = FALSE;
8839 			m->vmp_dirty = TRUE;
8840 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
8841 			assert(m->vmp_wire_count == 0);
8842 			m->vmp_wire_count++;
8843 			assert(m->vmp_wire_count);
8844 			if (m->vmp_wire_count == 1) {
8845 				m->vmp_q_state = VM_PAGE_IS_WIRED;
8846 				wired_count++;
8847 			} else {
8848 				panic("iopl_valid_data: %p already wired", m);
8849 			}
8850 
8851 			PAGE_WAKEUP_DONE(m);
8852 		}
8853 		size -= PAGE_SIZE;
8854 	}
8855 	if (wired_count) {
8856 		VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
8857 		assert(object->resident_page_count >= object->wired_page_count);
8858 
8859 		/* no need to adjust purgeable accounting for this object: */
8860 		assert(object->purgable != VM_PURGABLE_VOLATILE);
8861 		assert(object->purgable != VM_PURGABLE_EMPTY);
8862 
8863 		vm_page_lockspin_queues();
8864 		vm_page_wire_count += wired_count;
8865 		vm_page_unlock_queues();
8866 	}
8867 	VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
8868 	vm_object_unlock(object);
8869 }
8870 
8871 
8872 void
vm_object_set_pmap_cache_attr(vm_object_t object,upl_page_info_array_t user_page_list,unsigned int num_pages,boolean_t batch_pmap_op)8873 vm_object_set_pmap_cache_attr(
8874 	vm_object_t             object,
8875 	upl_page_info_array_t   user_page_list,
8876 	unsigned int            num_pages,
8877 	boolean_t               batch_pmap_op)
8878 {
8879 	unsigned int    cache_attr = 0;
8880 
8881 	cache_attr = object->wimg_bits & VM_WIMG_MASK;
8882 	assert(user_page_list);
8883 	if (cache_attr != VM_WIMG_USE_DEFAULT) {
8884 		PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8885 	}
8886 }
8887 
8888 
8889 static bool
vm_object_iopl_wire_full(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,upl_control_flags_t cntrl_flags,vm_tag_t tag)8890 vm_object_iopl_wire_full(
8891 	vm_object_t             object,
8892 	upl_t                   upl,
8893 	upl_page_info_array_t   user_page_list,
8894 	upl_control_flags_t     cntrl_flags,
8895 	vm_tag_t                tag)
8896 {
8897 	vm_page_t       dst_page;
8898 	unsigned int    entry;
8899 	int             page_count;
8900 	int             delayed_unlock = 0;
8901 	boolean_t       retval = TRUE;
8902 	ppnum_t         phys_page;
8903 
8904 	vm_object_lock_assert_exclusive(object);
8905 	assert(object->purgable != VM_PURGABLE_VOLATILE);
8906 	assert(object->purgable != VM_PURGABLE_EMPTY);
8907 	assert(object->pager == NULL);
8908 	assert(object->copy == NULL);
8909 	assert(object->shadow == NULL);
8910 
8911 	page_count = object->resident_page_count;
8912 	dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
8913 
8914 	vm_page_lock_queues();
8915 
8916 	while (page_count--) {
8917 		if (dst_page->vmp_busy ||
8918 		    dst_page->vmp_fictitious ||
8919 		    dst_page->vmp_absent ||
8920 		    VMP_ERROR_GET(dst_page) ||
8921 		    dst_page->vmp_cleaning ||
8922 		    dst_page->vmp_restart ||
8923 		    dst_page->vmp_laundry) {
8924 			retval = FALSE;
8925 			goto done;
8926 		}
8927 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8928 			retval = FALSE;
8929 			goto done;
8930 		}
8931 		dst_page->vmp_reference = TRUE;
8932 
8933 		vm_page_wire(dst_page, tag, FALSE);
8934 
8935 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8936 			SET_PAGE_DIRTY(dst_page, FALSE);
8937 		}
8938 		entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
8939 		assert(entry >= 0 && entry < object->resident_page_count);
8940 		bitmap_set(upl->lite_list, entry);
8941 
8942 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8943 
8944 		if (phys_page > upl->highest_page) {
8945 			upl->highest_page = phys_page;
8946 		}
8947 
8948 		if (user_page_list) {
8949 			user_page_list[entry].phys_addr = phys_page;
8950 			user_page_list[entry].absent    = dst_page->vmp_absent;
8951 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
8952 			user_page_list[entry].free_when_done   = dst_page->vmp_free_when_done;
8953 			user_page_list[entry].precious  = dst_page->vmp_precious;
8954 			user_page_list[entry].device    = FALSE;
8955 			user_page_list[entry].speculative = FALSE;
8956 			user_page_list[entry].cs_validated = FALSE;
8957 			user_page_list[entry].cs_tainted = FALSE;
8958 			user_page_list[entry].cs_nx     = FALSE;
8959 			user_page_list[entry].needed    = FALSE;
8960 			user_page_list[entry].mark      = FALSE;
8961 		}
8962 		if (delayed_unlock++ > 256) {
8963 			delayed_unlock = 0;
8964 			lck_mtx_yield(&vm_page_queue_lock);
8965 
8966 			VM_CHECK_MEMORYSTATUS;
8967 		}
8968 		dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
8969 	}
8970 done:
8971 	vm_page_unlock_queues();
8972 
8973 	VM_CHECK_MEMORYSTATUS;
8974 
8975 	return retval;
8976 }
8977 
8978 
8979 static kern_return_t
vm_object_iopl_wire_empty(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,upl_control_flags_t cntrl_flags,vm_tag_t tag,vm_object_offset_t * dst_offset,int page_count,int * page_grab_count)8980 vm_object_iopl_wire_empty(
8981 	vm_object_t             object,
8982 	upl_t                   upl,
8983 	upl_page_info_array_t   user_page_list,
8984 	upl_control_flags_t     cntrl_flags,
8985 	vm_tag_t                tag,
8986 	vm_object_offset_t     *dst_offset,
8987 	int                     page_count,
8988 	int                    *page_grab_count)
8989 {
8990 	vm_page_t       dst_page;
8991 	boolean_t       no_zero_fill = FALSE;
8992 	int             interruptible;
8993 	int             pages_wired = 0;
8994 	int             pages_inserted = 0;
8995 	int             entry = 0;
8996 	uint64_t        delayed_ledger_update = 0;
8997 	kern_return_t   ret = KERN_SUCCESS;
8998 	int             grab_options;
8999 	ppnum_t         phys_page;
9000 
9001 	vm_object_lock_assert_exclusive(object);
9002 	assert(object->purgable != VM_PURGABLE_VOLATILE);
9003 	assert(object->purgable != VM_PURGABLE_EMPTY);
9004 	assert(object->pager == NULL);
9005 	assert(object->copy == NULL);
9006 	assert(object->shadow == NULL);
9007 
9008 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9009 		interruptible = THREAD_ABORTSAFE;
9010 	} else {
9011 		interruptible = THREAD_UNINT;
9012 	}
9013 
9014 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9015 		no_zero_fill = TRUE;
9016 	}
9017 
9018 	grab_options = 0;
9019 #if CONFIG_SECLUDED_MEMORY
9020 	if (object->can_grab_secluded) {
9021 		grab_options |= VM_PAGE_GRAB_SECLUDED;
9022 	}
9023 #endif /* CONFIG_SECLUDED_MEMORY */
9024 
9025 	while (page_count--) {
9026 		while ((dst_page = vm_page_grab_options(grab_options))
9027 		    == VM_PAGE_NULL) {
9028 			OSAddAtomic(page_count, &vm_upl_wait_for_pages);
9029 
9030 			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9031 
9032 			if (vm_page_wait(interruptible) == FALSE) {
9033 				/*
9034 				 * interrupted case
9035 				 */
9036 				OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9037 
9038 				VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9039 
9040 				ret = MACH_SEND_INTERRUPTED;
9041 				goto done;
9042 			}
9043 			OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9044 
9045 			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9046 		}
9047 		if (no_zero_fill == FALSE) {
9048 			vm_page_zero_fill(dst_page);
9049 		} else {
9050 			dst_page->vmp_absent = TRUE;
9051 		}
9052 
9053 		dst_page->vmp_reference = TRUE;
9054 
9055 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9056 			SET_PAGE_DIRTY(dst_page, FALSE);
9057 		}
9058 		if (dst_page->vmp_absent == FALSE) {
9059 			assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
9060 			assert(dst_page->vmp_wire_count == 0);
9061 			dst_page->vmp_wire_count++;
9062 			dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
9063 			assert(dst_page->vmp_wire_count);
9064 			pages_wired++;
9065 			PAGE_WAKEUP_DONE(dst_page);
9066 		}
9067 		pages_inserted++;
9068 
9069 		vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
9070 
9071 		bitmap_set(upl->lite_list, entry);
9072 
9073 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9074 
9075 		if (phys_page > upl->highest_page) {
9076 			upl->highest_page = phys_page;
9077 		}
9078 
9079 		if (user_page_list) {
9080 			user_page_list[entry].phys_addr = phys_page;
9081 			user_page_list[entry].absent    = dst_page->vmp_absent;
9082 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
9083 			user_page_list[entry].free_when_done    = FALSE;
9084 			user_page_list[entry].precious  = FALSE;
9085 			user_page_list[entry].device    = FALSE;
9086 			user_page_list[entry].speculative = FALSE;
9087 			user_page_list[entry].cs_validated = FALSE;
9088 			user_page_list[entry].cs_tainted = FALSE;
9089 			user_page_list[entry].cs_nx     = FALSE;
9090 			user_page_list[entry].needed    = FALSE;
9091 			user_page_list[entry].mark      = FALSE;
9092 		}
9093 		entry++;
9094 		*dst_offset += PAGE_SIZE_64;
9095 	}
9096 done:
9097 	if (pages_wired) {
9098 		vm_page_lockspin_queues();
9099 		vm_page_wire_count += pages_wired;
9100 		vm_page_unlock_queues();
9101 	}
9102 	if (pages_inserted) {
9103 		if (object->internal) {
9104 			OSAddAtomic(pages_inserted, &vm_page_internal_count);
9105 		} else {
9106 			OSAddAtomic(pages_inserted, &vm_page_external_count);
9107 		}
9108 	}
9109 	if (delayed_ledger_update) {
9110 		task_t          owner;
9111 		int             ledger_idx_volatile;
9112 		int             ledger_idx_nonvolatile;
9113 		int             ledger_idx_volatile_compressed;
9114 		int             ledger_idx_nonvolatile_compressed;
9115 		boolean_t       do_footprint;
9116 
9117 		owner = VM_OBJECT_OWNER(object);
9118 		assert(owner);
9119 
9120 		vm_object_ledger_tag_ledgers(object,
9121 		    &ledger_idx_volatile,
9122 		    &ledger_idx_nonvolatile,
9123 		    &ledger_idx_volatile_compressed,
9124 		    &ledger_idx_nonvolatile_compressed,
9125 		    &do_footprint);
9126 
9127 		/* more non-volatile bytes */
9128 		ledger_credit(owner->ledger,
9129 		    ledger_idx_nonvolatile,
9130 		    delayed_ledger_update);
9131 		if (do_footprint) {
9132 			/* more footprint */
9133 			ledger_credit(owner->ledger,
9134 			    task_ledgers.phys_footprint,
9135 			    delayed_ledger_update);
9136 		}
9137 	}
9138 
9139 	assert(page_grab_count);
9140 	*page_grab_count = pages_inserted;
9141 
9142 	return ret;
9143 }
9144 
9145 
9146 
/*
 * vm_object_iopl_request:
 *	Build an I/O-wire UPL (created with UPL_IO_WIRE) describing the
 *	page-rounded range [offset, offset + size) of "object" and hand it
 *	back through *upl_ptr.
 *
 *	object		VM object whose pages are gathered.
 *	offset, size	Byte range requested; rounded outward to page
 *			boundaries before any page is processed.  The caller's
 *			original values are what get stored in the UPL.
 *	upl_ptr		Out: receives the newly created UPL.
 *	user_page_list	Per-page info array to fill in.  Replaced by the UPL's
 *			own page list when UPL_SET_INTERNAL is set.
 *	page_list_count	Optional in/out.  Set to 0 for internal UPLs, set to 1
 *			for a physically contiguous object, otherwise clamped
 *			to the number of pages in the request.
 *	cntrl_flags	UPL_* control flags; any bit outside UPL_VALID_FLAGS
 *			is rejected.
 *	tag		VM tag handed to the wiring helpers.
 *
 *	Returns KERN_SUCCESS, or:
 *	  KERN_INVALID_VALUE	unknown flag, or UPL_NEED_32BIT_ADDR without
 *				both UPL_SET_IO_WIRE and UPL_SET_LITE
 *	  KERN_INVALID_ADDRESS	physically contiguous range reaches
 *				max_valid_dma_address under UPL_NEED_32BIT_ADDR
 *	  KERN_MEMORY_ERROR	page missing/unusable with UPL_REQUEST_NO_FAULT,
 *				or vm_fault_page() failed without an error code
 *	  KERN_PROTECTION_FAILURE / KERN_RESOURCE_SHORTAGE
 *				low-memory page substitution could not be done
 *	  other			error code reported by vm_fault_page() or by
 *				vm_object_iopl_wire_empty()
 *
 *	On any failure after the UPL has been created, control reaches
 *	"return_err", which undoes the per-page work done so far and
 *	destroys the UPL.
 */
9147 kern_return_t
vm_object_iopl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)9148 vm_object_iopl_request(
9149 	vm_object_t             object,
9150 	vm_object_offset_t      offset,
9151 	upl_size_t              size,
9152 	upl_t                   *upl_ptr,
9153 	upl_page_info_array_t   user_page_list,
9154 	unsigned int            *page_list_count,
9155 	upl_control_flags_t     cntrl_flags,
9156 	vm_tag_t                tag)
9157 {
9158 	vm_page_t               dst_page;
9159 	vm_object_offset_t      dst_offset;
9160 	upl_size_t              xfer_size;
9161 	upl_t                   upl = NULL;
9162 	unsigned int            entry;
9163 	int                     no_zero_fill = FALSE;
9164 	unsigned int            size_in_pages;
9165 	int                     page_grab_count = 0;
9166 	u_int32_t               psize;
9167 	kern_return_t           ret;
9168 	vm_prot_t               prot;
9169 	struct vm_object_fault_info fault_info = {};
9170 	struct  vm_page_delayed_work    dw_array;
9171 	struct  vm_page_delayed_work    *dwp, *dwp_start;
9172 	bool                    dwp_finish_ctx = TRUE;
9173 	int                     dw_count;
9174 	int                     dw_limit;
9175 	int                     dw_index;
9176 	boolean_t               caller_lookup;
9177 	int                     io_tracking_flag = 0;
9178 	int                     interruptible;
9179 	ppnum_t                 phys_page;
9180 
9181 	boolean_t               set_cache_attr_needed = FALSE;
9182 	boolean_t               free_wired_pages = FALSE;
9183 	boolean_t               fast_path_empty_req = FALSE;
9184 	boolean_t               fast_path_full_req = FALSE;
9185 
9186 #if DEVELOPMENT || DEBUG
9187 	task_t                  task = current_task();
9188 #endif /* DEVELOPMENT || DEBUG */
9189 
9190 	dwp_start = dwp = NULL;
9191 
9192 	vm_object_offset_t original_offset = offset;
9193 	upl_size_t original_size = size;
9194 
9195 //	DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
9196 
	/*
	 * Expand the request to whole pages.  The caller's original
	 * offset/size are kept aside and later stored in the UPL.
	 */
9197 	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
9198 	offset = vm_object_trunc_page(offset);
9199 	if (size != original_size || offset != original_offset) {
9200 		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
9201 	}
9202 
9203 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
9204 		/*
9205 		 * For forward compatibility's sake,
9206 		 * reject any unknown flag.
9207 		 */
9208 		return KERN_INVALID_VALUE;
9209 	}
9210 	if (vm_lopage_needed == FALSE) {
9211 		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
9212 	}
9213 
9214 	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
9215 		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
9216 			return KERN_INVALID_VALUE;
9217 		}
9218 
9219 		if (object->phys_contiguous) {
9220 			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
9221 				return KERN_INVALID_ADDRESS;
9222 			}
9223 
9224 			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
9225 				return KERN_INVALID_ADDRESS;
9226 			}
9227 		}
9228 	}
9229 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9230 		no_zero_fill = TRUE;
9231 	}
9232 
9233 	if (cntrl_flags & UPL_COPYOUT_FROM) {
9234 		prot = VM_PROT_READ;
9235 	} else {
9236 		prot = VM_PROT_READ | VM_PROT_WRITE;
9237 	}
9238 
9239 	if ((!object->internal) && (object->paging_offset != 0)) {
9240 		panic("vm_object_iopl_request: external object with non-zero paging offset");
9241 	}
9242 
9243 
9244 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
9245 
9246 #if CONFIG_IOSCHED || UPL_DEBUG
9247 	if ((object->io_tracking && object != kernel_object) || upl_debug_enabled) {
9248 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
9249 	}
9250 #endif
9251 
9252 #if CONFIG_IOSCHED
9253 	if (object->io_tracking) {
9254 		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
9255 		if (object != kernel_object) {
9256 			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
9257 		}
9258 	}
9259 #endif
9260 
	/*
	 * A physically contiguous object gets a UPL sized for a single
	 * page and uses no delayed-work state.  Otherwise obtain a
	 * delayed-work context; if none is returned, fall back to the
	 * single on-stack entry with a limit of 1 (and remember not to
	 * "finish" a context we never got).
	 */
9261 	if (object->phys_contiguous) {
9262 		psize = PAGE_SIZE;
9263 	} else {
9264 		psize = size;
9265 
9266 		dw_count = 0;
9267 		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
9268 		dwp_start = vm_page_delayed_work_get_ctx();
9269 		if (dwp_start == NULL) {
9270 			dwp_start = &dw_array;
9271 			dw_limit = 1;
9272 			dwp_finish_ctx = FALSE;
9273 		}
9274 
9275 		dwp = dwp_start;
9276 	}
9277 
9278 	if (cntrl_flags & UPL_SET_INTERNAL) {
9279 		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9280 		user_page_list = size ? upl->page_list : NULL;
9281 	} else {
9282 		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9283 	}
9284 	if (user_page_list) {
9285 		user_page_list[0].device = FALSE;
9286 	}
	/*
	 * NOTE(review): the UPL is published to the caller here, and the
	 * return_err path below destroys it without resetting *upl_ptr.
	 * Presumably callers ignore *upl_ptr on failure -- confirm.
	 */
9287 	*upl_ptr = upl;
9288 
9289 	if (cntrl_flags & UPL_NOZEROFILLIO) {
9290 		DTRACE_VM4(upl_nozerofillio,
9291 		    vm_object_t, object,
9292 		    vm_object_offset_t, offset,
9293 		    upl_size_t, size,
9294 		    upl_t, upl);
9295 	}
9296 
9297 	upl->map_object = object;
9298 	upl->u_offset = original_offset;
9299 	upl->u_size = original_size;
9300 
9301 	size_in_pages = size / PAGE_SIZE;
9302 
9303 	if (object == kernel_object &&
9304 	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
9305 		upl->flags |= UPL_KERNEL_OBJECT;
9306 #if UPL_DEBUG
9307 		vm_object_lock(object);
9308 #else
9309 		vm_object_lock_shared(object);
9310 #endif
9311 	} else {
9312 		vm_object_lock(object);
9313 		vm_object_activity_begin(object);
9314 	}
9315 	/*
9316 	 * paging in progress also protects the paging_offset
9317 	 */
9318 	upl->u_offset = original_offset + object->paging_offset;
9319 
9320 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
9321 		/*
9322 		 * The user requested that access to the pages in this UPL
9323 		 * be blocked until the UPL is committed or aborted.
9324 		 */
9325 		upl->flags |= UPL_ACCESS_BLOCKED;
9326 	}
9327 
9328 #if CONFIG_IOSCHED || UPL_DEBUG
9329 	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9330 		vm_object_activity_begin(object);
9331 		queue_enter(&object->uplq, upl, upl_t, uplq);
9332 	}
9333 #endif
9334 
	/*
	 * Physically contiguous object: no per-page work is done.  Only
	 * the first page-list entry and the highest page number are
	 * recorded, then we return straight away.
	 */
9335 	if (object->phys_contiguous) {
9336 		if (upl->flags & UPL_ACCESS_BLOCKED) {
9337 			assert(!object->blocked_access);
9338 			object->blocked_access = TRUE;
9339 		}
9340 
9341 		vm_object_unlock(object);
9342 
9343 		/*
9344 		 * don't need any shadow mappings for this one
9345 		 * since it is already I/O memory
9346 		 */
9347 		upl->flags |= UPL_DEVICE_MEMORY;
9348 
9349 		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
9350 
9351 		if (user_page_list) {
9352 			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
9353 			user_page_list[0].device = TRUE;
9354 		}
9355 		if (page_list_count != NULL) {
9356 			if (upl->flags & UPL_INTERNAL) {
9357 				*page_list_count = 0;
9358 			} else {
9359 				*page_list_count = 1;
9360 			}
9361 		}
9362 
9363 		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9364 #if DEVELOPMENT || DEBUG
9365 		if (task != NULL) {
9366 			ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9367 		}
9368 #endif /* DEVELOPMENT || DEBUG */
9369 		return KERN_SUCCESS;
9370 	}
9371 	if (object != kernel_object && object != compressor_object) {
9372 		/*
9373 		 * Protect user space from future COW operations
9374 		 */
9375 #if VM_OBJECT_TRACKING_OP_TRUESHARE
9376 		if (!object->true_share &&
9377 		    vm_object_tracking_btlog) {
9378 			btlog_record(vm_object_tracking_btlog, object,
9379 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
9380 			    btref_get(__builtin_frame_address(0), 0));
9381 		}
9382 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
9383 
9384 		vm_object_lock_assert_exclusive(object);
9385 		object->true_share = TRUE;
9386 
9387 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
9388 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
9389 		}
9390 	}
9391 
9392 	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
9393 	    object->copy != VM_OBJECT_NULL) {
9394 		/*
9395 		 * Honor copy-on-write obligations
9396 		 *
9397 		 * The caller is gathering these pages and
9398 		 * might modify their contents.  We need to
9399 		 * make sure that the copy object has its own
9400 		 * private copies of these pages before we let
9401 		 * the caller modify them.
9402 		 *
9403 		 * NOTE: someone else could map the original object
9404 		 * after we've done this copy-on-write here, and they
9405 		 * could then see an inconsistent picture of the memory
9406 		 * while it's being modified via the UPL.  To prevent this,
9407 		 * we would have to block access to these pages until the
9408 		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
9409 		 * code path for that...
9410 		 */
9411 		vm_object_update(object,
9412 		    offset,
9413 		    size,
9414 		    NULL,
9415 		    NULL,
9416 		    FALSE,              /* should_return */
9417 		    MEMORY_OBJECT_COPY_SYNC,
9418 		    VM_PROT_NO_CHANGE);
9419 		VM_PAGEOUT_DEBUG(iopl_cow, 1);
9420 		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
9421 	}
	/*
	 * Fast-path eligibility: the request must cover the whole object
	 * (offset 0, size == vo_size), the object must have no copy,
	 * shadow or pager and must not be volatile/empty purgeable, and
	 * neither UPL_NEED_32BIT_ADDR nor UPL_BLOCK_ACCESS may be set.
	 * Then either every page is already resident ("full") or none
	 * is ("empty").
	 */
9422 	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
9423 	    object->purgable != VM_PURGABLE_VOLATILE &&
9424 	    object->purgable != VM_PURGABLE_EMPTY &&
9425 	    object->copy == NULL &&
9426 	    size == object->vo_size &&
9427 	    offset == 0 &&
9428 	    object->shadow == NULL &&
9429 	    object->pager == NULL) {
9430 		if (object->resident_page_count == size_in_pages) {
9431 			assert(object != compressor_object);
9432 			assert(object != kernel_object);
9433 			fast_path_full_req = TRUE;
9434 		} else if (object->resident_page_count == 0) {
9435 			assert(object != compressor_object);
9436 			assert(object != kernel_object);
9437 			fast_path_empty_req = TRUE;
9438 			set_cache_attr_needed = TRUE;
9439 		}
9440 	}
9441 
9442 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9443 		interruptible = THREAD_ABORTSAFE;
9444 	} else {
9445 		interruptible = THREAD_UNINT;
9446 	}
9447 
9448 	entry = 0;
9449 
9450 	xfer_size = size;
9451 	dst_offset = offset;
9452 
9453 	if (fast_path_full_req) {
9454 		if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
9455 			goto finish;
9456 		}
9457 		/*
9458 		 * we couldn't complete the processing of this request on the fast path
9459 		 * so fall through to the slow path and finish up
9460 		 */
9461 	} else if (fast_path_empty_req) {
9462 		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9463 			ret = KERN_MEMORY_ERROR;
9464 			goto return_err;
9465 		}
9466 		ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
9467 		    cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
9468 
9469 		if (ret) {
			/* pages inserted by the empty fast path are freed, not just unwired, in return_err */
9470 			free_wired_pages = TRUE;
9471 			goto return_err;
9472 		}
9473 		goto finish;
9474 	}
9475 
9476 	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
9477 	fault_info.lo_offset = offset;
9478 	fault_info.hi_offset = offset + xfer_size;
9479 	fault_info.mark_zf_absent = TRUE;
9480 	fault_info.interruptible = interruptible;
9481 	fault_info.batch_pmap_op = TRUE;
9482 
	/*
	 * Slow path: walk the range one page at a time, faulting pages in
	 * as needed and queueing wire/reference updates as delayed work.
	 */
9483 	while (xfer_size) {
9484 		vm_fault_return_t       result;
9485 
9486 		dwp->dw_mask = 0;
9487 
9488 		if (fast_path_full_req) {
9489 			/*
9490 			 * if we get here, it means that we ran into a page
9491 			 * state we couldn't handle in the fast path and
9492 			 * bailed out to the slow path... since the order
9493 			 * we look at pages is different between the 2 paths,
9494 			 * the following check is needed to determine whether
9495 			 * this page was already processed in the fast path
9496 			 */
9497 			if (bitmap_test(upl->lite_list, entry)) {
9498 				goto skip_page;
9499 			}
9500 		}
9501 		dst_page = vm_page_lookup(object, dst_offset);
9502 
9503 		if (dst_page == VM_PAGE_NULL ||
9504 		    dst_page->vmp_busy ||
9505 		    VMP_ERROR_GET(dst_page) ||
9506 		    dst_page->vmp_restart ||
9507 		    dst_page->vmp_absent ||
9508 		    dst_page->vmp_fictitious) {
9509 			if (object == kernel_object) {
9510 				panic("vm_object_iopl_request: missing/bad page in kernel object");
9511 			}
9512 			if (object == compressor_object) {
9513 				panic("vm_object_iopl_request: missing/bad page in compressor object");
9514 			}
9515 
9516 			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9517 				ret = KERN_MEMORY_ERROR;
9518 				goto return_err;
9519 			}
9520 			set_cache_attr_needed = TRUE;
9521 
9522 			/*
9523 			 * We just looked up the page and the result remains valid
9524 			 * until the object lock is released, so send it to
9525 			 * vm_fault_page() (as "dst_page"), to avoid having to
9526 			 * look it up again there.
9527 			 */
9528 			caller_lookup = TRUE;
9529 
9530 			do {
9531 				vm_page_t       top_page;
9532 				kern_return_t   error_code;
9533 
9534 				fault_info.cluster_size = xfer_size;
9535 
9536 				vm_object_paging_begin(object);
9537 
9538 				result = vm_fault_page(object, dst_offset,
9539 				    prot | VM_PROT_WRITE, FALSE,
9540 				    caller_lookup,
9541 				    &prot, &dst_page, &top_page,
9542 				    (int *)0,
9543 				    &error_code, no_zero_fill,
9544 				    &fault_info);
9545 
9546 				/* our lookup is no longer valid at this point */
9547 				caller_lookup = FALSE;
9548 
9549 				switch (result) {
9550 				case VM_FAULT_SUCCESS:
9551 					page_grab_count++;
9552 
9553 					if (!dst_page->vmp_absent) {
9554 						PAGE_WAKEUP_DONE(dst_page);
9555 					} else {
9556 						/*
9557 						 * we only get back an absent page if we
9558 						 * requested that it not be zero-filled
9559 						 * because we are about to fill it via I/O
9560 						 *
9561 						 * absent pages should be left BUSY
9562 						 * to prevent them from being faulted
9563 						 * into an address space before we've
9564 						 * had a chance to complete the I/O on
9565 						 * them since they may contain info that
9566 						 * shouldn't be seen by the faulting task
9567 						 */
9568 					}
9569 					/*
9570 					 *	Release paging references and
9571 					 *	top-level placeholder page, if any.
9572 					 */
9573 					if (top_page != VM_PAGE_NULL) {
9574 						vm_object_t local_object;
9575 
9576 						local_object = VM_PAGE_OBJECT(top_page);
9577 
9578 						/*
9579 						 * comparing 2 packed pointers
9580 						 */
9581 						if (top_page->vmp_object != dst_page->vmp_object) {
9582 							vm_object_lock(local_object);
9583 							VM_PAGE_FREE(top_page);
9584 							vm_object_paging_end(local_object);
9585 							vm_object_unlock(local_object);
9586 						} else {
9587 							VM_PAGE_FREE(top_page);
9588 							vm_object_paging_end(local_object);
9589 						}
9590 					}
9591 					vm_object_paging_end(object);
9592 					break;
9593 
9594 				case VM_FAULT_RETRY:
9595 					vm_object_lock(object);
9596 					break;
9597 
9598 				case VM_FAULT_MEMORY_SHORTAGE:
9599 					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
9600 
9601 					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9602 
9603 					if (vm_page_wait(interruptible)) {
9604 						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9605 
9606 						VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9607 						vm_object_lock(object);
9608 
9609 						break;
9610 					}
9611 					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9612 
9613 					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9614 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
9615 					OS_FALLTHROUGH;
9616 
9617 				case VM_FAULT_INTERRUPTED:
9618 					error_code = MACH_SEND_INTERRUPTED;
9619 					OS_FALLTHROUGH;
9620 				case VM_FAULT_MEMORY_ERROR:
9621 memory_error:
9622 					ret = (error_code ? error_code: KERN_MEMORY_ERROR);
9623 
					/* return_err expects the object to be locked */
9624 					vm_object_lock(object);
9625 					goto return_err;
9626 
9627 				case VM_FAULT_SUCCESS_NO_VM_PAGE:
9628 					/* success but no page: fail */
9629 					vm_object_paging_end(object);
9630 					vm_object_unlock(object);
9631 					goto memory_error;
9632 
9633 				default:
9634 					panic("vm_object_iopl_request: unexpected error"
9635 					    " 0x%x from vm_fault_page()\n", result);
9636 				}
9637 			} while (result != VM_FAULT_SUCCESS);
9638 		}
9639 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9640 
9641 		if (upl->flags & UPL_KERNEL_OBJECT) {
9642 			goto record_phys_addr;
9643 		}
9644 
9645 		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
9646 			dst_page->vmp_busy = TRUE;
9647 			goto record_phys_addr;
9648 		}
9649 
9650 		if (dst_page->vmp_cleaning) {
9651 			/*
9652 			 * Someone else is cleaning this page in place.
9653 			 * In theory, we should be able to  proceed and use this
9654 			 * page but they'll probably end up clearing the "busy"
9655 			 * bit on it in upl_commit_range() but they didn't set
9656 			 * it, so they would clear our "busy" bit and open
9657 			 * us to race conditions.
9658 			 * We'd better wait for the cleaning to complete and
9659 			 * then try again.
9660 			 */
9661 			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
9662 			PAGE_SLEEP(object, dst_page, THREAD_UNINT);
9663 			continue;
9664 		}
9665 		if (dst_page->vmp_laundry) {
9666 			vm_pageout_steal_laundry(dst_page, FALSE);
9667 		}
9668 
9669 		if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
9670 		    phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
9671 			vm_page_t       low_page;
9672 			int             refmod;
9673 
9674 			/*
9675 			 * support devices that can't DMA above 32 bits
9676 			 * by substituting pages from a pool of low address
9677 			 * memory for any pages we find above the 4G mark
9678 			 * can't substitute if the page is already wired because
9679 			 * we don't know whether that physical address has been
9680 			 * handed out to some other 64 bit capable DMA device to use
9681 			 */
9682 			if (VM_PAGE_WIRED(dst_page)) {
9683 				ret = KERN_PROTECTION_FAILURE;
9684 				goto return_err;
9685 			}
9686 			low_page = vm_page_grablo();
9687 
9688 			if (low_page == VM_PAGE_NULL) {
9689 				ret = KERN_RESOURCE_SHORTAGE;
9690 				goto return_err;
9691 			}
9692 			/*
9693 			 * from here until the vm_page_replace completes
9694 			 * we mustn't drop the object lock... we don't
9695 			 * want anyone refaulting this page in and using
9696 			 * it after we disconnect it... we want the fault
9697 			 * to find the new page being substituted.
9698 			 */
9699 			if (dst_page->vmp_pmapped) {
9700 				refmod = pmap_disconnect(phys_page);
9701 			} else {
9702 				refmod = 0;
9703 			}
9704 
9705 			if (!dst_page->vmp_absent) {
9706 				vm_page_copy(dst_page, low_page);
9707 			}
9708 
9709 			low_page->vmp_reference = dst_page->vmp_reference;
9710 			low_page->vmp_dirty     = dst_page->vmp_dirty;
9711 			low_page->vmp_absent    = dst_page->vmp_absent;
9712 
9713 			if (refmod & VM_MEM_REFERENCED) {
9714 				low_page->vmp_reference = TRUE;
9715 			}
9716 			if (refmod & VM_MEM_MODIFIED) {
9717 				SET_PAGE_DIRTY(low_page, FALSE);
9718 			}
9719 
9720 			vm_page_replace(low_page, object, dst_offset);
9721 
9722 			dst_page = low_page;
9723 			/*
9724 			 * vm_page_grablo returned the page marked
9725 			 * BUSY... we don't need a PAGE_WAKEUP_DONE
9726 			 * here, because we've never dropped the object lock
9727 			 */
9728 			if (!dst_page->vmp_absent) {
9729 				dst_page->vmp_busy = FALSE;
9730 			}
9731 
9732 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9733 		}
		/* busy pages are not wired here; wiring is queued as delayed work for the rest */
9734 		if (!dst_page->vmp_busy) {
9735 			dwp->dw_mask |= DW_vm_page_wire;
9736 		}
9737 
9738 		if (cntrl_flags & UPL_BLOCK_ACCESS) {
9739 			/*
9740 			 * Mark the page "busy" to block any future page fault
9741 			 * on this page in addition to wiring it.
9742 			 * We'll also remove the mapping
9743 			 * of all these pages before leaving this routine.
9744 			 */
9745 			assert(!dst_page->vmp_fictitious);
9746 			dst_page->vmp_busy = TRUE;
9747 		}
9748 		/*
9749 		 * expect the page to be used
9750 		 * page queues lock must be held to set 'reference'
9751 		 */
9752 		dwp->dw_mask |= DW_set_reference;
9753 
9754 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9755 			SET_PAGE_DIRTY(dst_page, TRUE);
9756 			/*
9757 			 * Page belonging to a code-signed object is about to
9758 			 * be written. Mark it tainted and disconnect it from
9759 			 * all pmaps so processes have to fault it back in and
9760 			 * deal with the tainted bit.
9761 			 */
9762 			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
9763 				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
9764 				vm_page_iopl_tainted++;
9765 				if (dst_page->vmp_pmapped) {
9766 					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
9767 					if (refmod & VM_MEM_REFERENCED) {
9768 						dst_page->vmp_reference = TRUE;
9769 					}
9770 				}
9771 			}
9772 		}
9773 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
9774 			pmap_sync_page_attributes_phys(phys_page);
9775 			dst_page->vmp_written_by_kernel = FALSE;
9776 		}
9777 
9778 record_phys_addr:
9779 		if (dst_page->vmp_busy) {
9780 			upl->flags |= UPL_HAS_BUSY;
9781 		}
9782 
9783 		bitmap_set(upl->lite_list, entry);
9784 
9785 		if (phys_page > upl->highest_page) {
9786 			upl->highest_page = phys_page;
9787 		}
9788 
9789 		if (user_page_list) {
9790 			user_page_list[entry].phys_addr = phys_page;
9791 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
9792 			user_page_list[entry].absent    = dst_page->vmp_absent;
9793 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
9794 			user_page_list[entry].precious  = dst_page->vmp_precious;
9795 			user_page_list[entry].device    = FALSE;
9796 			user_page_list[entry].needed    = FALSE;
9797 			if (dst_page->vmp_clustered == TRUE) {
9798 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
9799 			} else {
9800 				user_page_list[entry].speculative = FALSE;
9801 			}
9802 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
9803 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
9804 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
9805 			user_page_list[entry].mark      = FALSE;
9806 		}
9807 		if (object != kernel_object && object != compressor_object) {
9808 			/*
9809 			 * someone is explicitly grabbing this page...
9810 			 * update clustered and speculative state
9811 			 *
9812 			 */
9813 			if (dst_page->vmp_clustered) {
9814 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
9815 			}
9816 		}
9817 skip_page:
9818 		entry++;
9819 		dst_offset += PAGE_SIZE_64;
9820 		xfer_size -= PAGE_SIZE;
9821 
		/* dw_mask is still 0 for pages that jumped to skip_page or straight to record_phys_addr */
9822 		if (dwp->dw_mask) {
9823 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
9824 
9825 			if (dw_count >= dw_limit) {
9826 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9827 
9828 				dwp = dwp_start;
9829 				dw_count = 0;
9830 			}
9831 		}
9832 	}
9833 	assert(entry == size_in_pages);
9834 
9835 	if (dw_count) {
9836 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9837 		dwp = dwp_start;
9838 		dw_count = 0;
9839 	}
9840 finish:
9841 	if (user_page_list && set_cache_attr_needed == TRUE) {
9842 		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
9843 	}
9844 
9845 	if (page_list_count != NULL) {
9846 		if (upl->flags & UPL_INTERNAL) {
9847 			*page_list_count = 0;
9848 		} else if (*page_list_count > size_in_pages) {
9849 			*page_list_count = size_in_pages;
9850 		}
9851 	}
9852 	vm_object_unlock(object);
9853 
9854 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
9855 		/*
9856 		 * We've marked all the pages "busy" so that future
9857 		 * page faults will block.
9858 		 * Now remove the mapping for these pages, so that they
9859 		 * can't be accessed without causing a page fault.
9860 		 */
9861 		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
9862 		    PMAP_NULL,
9863 		    PAGE_SIZE,
9864 		    0, VM_PROT_NONE);
9865 		assert(!object->blocked_access);
9866 		object->blocked_access = TRUE;
9867 	}
9868 
9869 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9870 #if DEVELOPMENT || DEBUG
9871 	if (task != NULL) {
9872 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9873 	}
9874 #endif /* DEVELOPMENT || DEBUG */
9875 
9876 	if (dwp_start && dwp_finish_ctx) {
9877 		vm_page_delayed_work_finish_ctx(dwp_start);
9878 		dwp_start = dwp = NULL;
9879 	}
9880 
9881 	return KERN_SUCCESS;
9882 
9883 return_err:
9884 	dw_index = 0;
9885 
	/*
	 * Undo the work done for every page from the start of the
	 * (page-aligned) request up to, but not including, dst_offset.
	 * Note that "offset" is consumed as the loop cursor here.
	 */
9886 	for (; offset < dst_offset; offset += PAGE_SIZE) {
9887 		boolean_t need_unwire;
9888 
9889 		dst_page = vm_page_lookup(object, offset);
9890 
9891 		if (dst_page == VM_PAGE_NULL) {
9892 			panic("vm_object_iopl_request: Wired page missing.");
9893 		}
9894 
9895 		/*
9896 		 * if we've already processed this page in an earlier
9897 		 * dw_do_work, we need to undo the wiring... we will
9898 		 * leave the dirty and reference bits on if they
9899 		 * were set, since we don't have a good way of knowing
9900 		 * what the previous state was and we won't get here
9901 		 * under any normal circumstances...  we will always
9902 		 * clear BUSY and wakeup any waiters via vm_page_free
9903 		 * or PAGE_WAKEUP_DONE
9904 		 */
9905 		need_unwire = TRUE;
9906 
9907 		if (dw_count) {
9908 			if ((dwp_start)[dw_index].dw_m == dst_page) {
9909 				/*
9910 				 * still in the deferred work list
9911 				 * which means we haven't yet called
9912 				 * vm_page_wire on this page
9913 				 */
9914 				need_unwire = FALSE;
9915 
9916 				dw_index++;
9917 				dw_count--;
9918 			}
9919 		}
9920 		vm_page_lock_queues();
9921 
9922 		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
9923 			vm_page_free(dst_page);
9924 
9925 			need_unwire = FALSE;
9926 		} else {
9927 			if (need_unwire == TRUE) {
9928 				vm_page_unwire(dst_page, TRUE);
9929 			}
9930 
9931 			PAGE_WAKEUP_DONE(dst_page);
9932 		}
9933 		vm_page_unlock_queues();
9934 
9935 		if (need_unwire == TRUE) {
9936 			counter_inc(&vm_statistics_reactivations);
9937 		}
9938 	}
9939 #if UPL_DEBUG
9940 	upl->upl_state = 2;
9941 #endif
9942 	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
9943 		vm_object_activity_end(object);
9944 		vm_object_collapse(object, 0, TRUE);
9945 	}
9946 	vm_object_unlock(object);
9947 	upl_destroy(upl);
9948 
9949 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
9950 #if DEVELOPMENT || DEBUG
9951 	if (task != NULL) {
9952 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9953 	}
9954 #endif /* DEVELOPMENT || DEBUG */
9955 
9956 	if (dwp_start && dwp_finish_ctx) {
9957 		vm_page_delayed_work_finish_ctx(dwp_start);
9958 		dwp_start = dwp = NULL;
9959 	}
9960 	return ret;
9961 }
9962 
9963 kern_return_t
upl_transpose(upl_t upl1,upl_t upl2)9964 upl_transpose(
9965 	upl_t           upl1,
9966 	upl_t           upl2)
9967 {
9968 	kern_return_t           retval;
9969 	boolean_t               upls_locked;
9970 	vm_object_t             object1, object2;
9971 
9972 	/* LD: Should mapped UPLs be eligible for a transpose? */
9973 	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
9974 		return KERN_INVALID_ARGUMENT;
9975 	}
9976 
9977 	upls_locked = FALSE;
9978 
9979 	/*
9980 	 * Since we need to lock both UPLs at the same time,
9981 	 * avoid deadlocks by always taking locks in the same order.
9982 	 */
9983 	if (upl1 < upl2) {
9984 		upl_lock(upl1);
9985 		upl_lock(upl2);
9986 	} else {
9987 		upl_lock(upl2);
9988 		upl_lock(upl1);
9989 	}
9990 	upls_locked = TRUE;     /* the UPLs will need to be unlocked */
9991 
9992 	object1 = upl1->map_object;
9993 	object2 = upl2->map_object;
9994 
9995 	if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
9996 	    upl1->u_size != upl2->u_size) {
9997 		/*
9998 		 * We deal only with full objects, not subsets.
9999 		 * That's because we exchange the entire backing store info
10000 		 * for the objects: pager, resident pages, etc...  We can't do
10001 		 * only part of it.
10002 		 */
10003 		retval = KERN_INVALID_VALUE;
10004 		goto done;
10005 	}
10006 
10007 	/*
10008 	 * Tranpose the VM objects' backing store.
10009 	 */
10010 	retval = vm_object_transpose(object1, object2,
10011 	    upl_adjusted_size(upl1, PAGE_MASK));
10012 
10013 	if (retval == KERN_SUCCESS) {
10014 		/*
10015 		 * Make each UPL point to the correct VM object, i.e. the
10016 		 * object holding the pages that the UPL refers to...
10017 		 */
10018 #if CONFIG_IOSCHED || UPL_DEBUG
10019 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10020 			vm_object_lock(object1);
10021 			vm_object_lock(object2);
10022 		}
10023 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10024 			queue_remove(&object1->uplq, upl1, upl_t, uplq);
10025 		}
10026 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10027 			queue_remove(&object2->uplq, upl2, upl_t, uplq);
10028 		}
10029 #endif
10030 		upl1->map_object = object2;
10031 		upl2->map_object = object1;
10032 
10033 #if CONFIG_IOSCHED || UPL_DEBUG
10034 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10035 			queue_enter(&object2->uplq, upl1, upl_t, uplq);
10036 		}
10037 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10038 			queue_enter(&object1->uplq, upl2, upl_t, uplq);
10039 		}
10040 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10041 			vm_object_unlock(object2);
10042 			vm_object_unlock(object1);
10043 		}
10044 #endif
10045 	}
10046 
10047 done:
10048 	/*
10049 	 * Cleanup.
10050 	 */
10051 	if (upls_locked) {
10052 		upl_unlock(upl1);
10053 		upl_unlock(upl2);
10054 		upls_locked = FALSE;
10055 	}
10056 
10057 	return retval;
10058 }
10059 
10060 void
upl_range_needed(upl_t upl,int index,int count)10061 upl_range_needed(
10062 	upl_t           upl,
10063 	int             index,
10064 	int             count)
10065 {
10066 	int             size_in_pages;
10067 
10068 	if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
10069 		return;
10070 	}
10071 
10072 	size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
10073 
10074 	while (count-- && index < size_in_pages) {
10075 		upl->page_list[index++].needed = TRUE;
10076 	}
10077 }
10078 
10079 
10080 /*
10081  * Reserve of virtual addresses in the kernel address space.
10082  * We need to map the physical pages in the kernel, so that we
10083  * can call the code-signing or slide routines with a kernel
10084  * virtual address.  We keep this pool of pre-allocated kernel
10085  * virtual addresses so that we don't have to scan the kernel's
 * virtual address space each time we need to work with
10087  * a physical page.
10088  */
/* Spin lock held around accesses to the pool bookkeeping below. */
SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
/* Number of page-sized slots in the pre-allocated pool. */
#define VM_PAGING_NUM_PAGES     64
/* Base kernel virtual address of the pool; filled in by vm_paging_map_init(). */
SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
/* Per-slot "in use" flag; slot i covers base + (i * PAGE_SIZE). */
bool            vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
/* Highest slot index handed out so far. */
int             vm_paging_max_index = 0;
/* Threads currently waiting for a free slot; its address is the wait event. */
int             vm_paging_page_waiter = 0;
/* Running total of the waits counted above. */
int             vm_paging_page_waiter_total = 0;

/* Statistics maintained by vm_paging_map_object(). */
unsigned long   vm_paging_no_kernel_page = 0;           /* pool had no free slot */
unsigned long   vm_paging_objects_mapped = 0;           /* mappings made through the pool */
unsigned long   vm_paging_pages_mapped = 0;             /* pages mapped through the pool */
unsigned long   vm_paging_objects_mapped_slow = 0;      /* mappings made through vm_map_enter() */
unsigned long   vm_paging_pages_mapped_slow = 0;        /* pages mapped through vm_map_enter() */
10102 
10103 __startup_func
10104 static void
vm_paging_map_init(void)10105 vm_paging_map_init(void)
10106 {
10107 	kmem_alloc(kernel_map, &vm_paging_base_address,
10108 	    ptoa(VM_PAGING_NUM_PAGES),
10109 	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
10110 	    VM_KERN_MEMORY_NONE);
10111 }
10112 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
10113 
10114 /*
10115  * vm_paging_map_object:
10116  *	Maps part of a VM object's pages in the kernel
10117  *      virtual address space, using the pre-allocated
10118  *	kernel virtual addresses, if possible.
10119  * Context:
10120  *      The VM object is locked.  This lock will get
10121  *      dropped and re-acquired though, so the caller
10122  *      must make sure the VM object is kept alive
10123  *	(by holding a VM map that has a reference
10124  *      on it, for example, or taking an extra reference).
10125  *      The page should also be kept busy to prevent
10126  *	it from being reclaimed.
10127  */
10128 kern_return_t
vm_paging_map_object(vm_page_t page,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection,boolean_t can_unlock_object,vm_map_size_t * size,vm_map_offset_t * address,boolean_t * need_unmap)10129 vm_paging_map_object(
10130 	vm_page_t               page,
10131 	vm_object_t             object,
10132 	vm_object_offset_t      offset,
10133 	vm_prot_t               protection,
10134 	boolean_t               can_unlock_object,
10135 	vm_map_size_t           *size,          /* IN/OUT */
10136 	vm_map_offset_t         *address,       /* OUT */
10137 	boolean_t               *need_unmap)    /* OUT */
10138 {
10139 	kern_return_t           kr;
10140 	vm_map_offset_t         page_map_offset;
10141 	vm_map_size_t           map_size;
10142 	vm_object_offset_t      object_offset;
10143 	int                     i;
10144 
10145 	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
10146 		/* use permanent 1-to-1 kernel mapping of physical memory ? */
10147 		*address = (vm_map_offset_t)
10148 		    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
10149 		*need_unmap = FALSE;
10150 		return KERN_SUCCESS;
10151 
10152 		assert(page->vmp_busy);
10153 		/*
10154 		 * Use one of the pre-allocated kernel virtual addresses
10155 		 * and just enter the VM page in the kernel address space
10156 		 * at that virtual address.
10157 		 */
10158 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10159 
10160 		/*
10161 		 * Try and find an available kernel virtual address
10162 		 * from our pre-allocated pool.
10163 		 */
10164 		page_map_offset = 0;
10165 		for (;;) {
10166 			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
10167 				if (vm_paging_page_inuse[i] == FALSE) {
10168 					page_map_offset =
10169 					    vm_paging_base_address +
10170 					    (i * PAGE_SIZE);
10171 					break;
10172 				}
10173 			}
10174 			if (page_map_offset != 0) {
10175 				/* found a space to map our page ! */
10176 				break;
10177 			}
10178 
10179 			if (can_unlock_object) {
10180 				/*
10181 				 * If we can afford to unlock the VM object,
10182 				 * let's take the slow path now...
10183 				 */
10184 				break;
10185 			}
10186 			/*
10187 			 * We can't afford to unlock the VM object, so
10188 			 * let's wait for a space to become available...
10189 			 */
10190 			vm_paging_page_waiter_total++;
10191 			vm_paging_page_waiter++;
10192 			kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
10193 			if (kr == THREAD_WAITING) {
10194 				simple_unlock(&vm_paging_lock);
10195 				kr = thread_block(THREAD_CONTINUE_NULL);
10196 				simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10197 			}
10198 			vm_paging_page_waiter--;
10199 			/* ... and try again */
10200 		}
10201 
10202 		if (page_map_offset != 0) {
10203 			/*
10204 			 * We found a kernel virtual address;
10205 			 * map the physical page to that virtual address.
10206 			 */
10207 			if (i > vm_paging_max_index) {
10208 				vm_paging_max_index = i;
10209 			}
10210 			vm_paging_page_inuse[i] = TRUE;
10211 			simple_unlock(&vm_paging_lock);
10212 
10213 			page->vmp_pmapped = TRUE;
10214 
10215 			/*
10216 			 * Keep the VM object locked over the PMAP_ENTER
10217 			 * and the actual use of the page by the kernel,
10218 			 * or this pmap mapping might get undone by a
10219 			 * vm_object_pmap_protect() call...
10220 			 */
10221 			PMAP_ENTER(kernel_pmap,
10222 			    page_map_offset,
10223 			    page,
10224 			    protection,
10225 			    VM_PROT_NONE,
10226 			    0,
10227 			    TRUE,
10228 			    kr);
10229 			assert(kr == KERN_SUCCESS);
10230 			vm_paging_objects_mapped++;
10231 			vm_paging_pages_mapped++;
10232 			*address = page_map_offset;
10233 			*need_unmap = TRUE;
10234 
10235 #if KASAN
10236 			kasan_notify_address(page_map_offset, PAGE_SIZE);
10237 #endif
10238 
10239 			/* all done and mapped, ready to use ! */
10240 			return KERN_SUCCESS;
10241 		}
10242 
10243 		/*
10244 		 * We ran out of pre-allocated kernel virtual
10245 		 * addresses.  Just map the page in the kernel
10246 		 * the slow and regular way.
10247 		 */
10248 		vm_paging_no_kernel_page++;
10249 		simple_unlock(&vm_paging_lock);
10250 	}
10251 
10252 	if (!can_unlock_object) {
10253 		*address = 0;
10254 		*size = 0;
10255 		*need_unmap = FALSE;
10256 		return KERN_NOT_SUPPORTED;
10257 	}
10258 
10259 	object_offset = vm_object_trunc_page(offset);
10260 	map_size = vm_map_round_page(*size,
10261 	    VM_MAP_PAGE_MASK(kernel_map));
10262 
10263 	/*
10264 	 * Try and map the required range of the object
10265 	 * in the kernel_map. Given that allocation is
10266 	 * for pageable memory, it shouldn't contain
10267 	 * pointers and is mapped into the data range.
10268 	 */
10269 
10270 	vm_object_reference_locked(object);     /* for the map entry */
10271 	vm_object_unlock(object);
10272 
10273 	kr = vm_map_enter(kernel_map,
10274 	    address,
10275 	    map_size,
10276 	    0,
10277 	    VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
10278 	    object,
10279 	    object_offset,
10280 	    FALSE,
10281 	    protection,
10282 	    VM_PROT_ALL,
10283 	    VM_INHERIT_NONE);
10284 	if (kr != KERN_SUCCESS) {
10285 		*address = 0;
10286 		*size = 0;
10287 		*need_unmap = FALSE;
10288 		vm_object_deallocate(object);   /* for the map entry */
10289 		vm_object_lock(object);
10290 		return kr;
10291 	}
10292 
10293 	*size = map_size;
10294 
10295 	/*
10296 	 * Enter the mapped pages in the page table now.
10297 	 */
10298 	vm_object_lock(object);
10299 	/*
10300 	 * VM object must be kept locked from before PMAP_ENTER()
10301 	 * until after the kernel is done accessing the page(s).
10302 	 * Otherwise, the pmap mappings in the kernel could be
10303 	 * undone by a call to vm_object_pmap_protect().
10304 	 */
10305 
10306 	for (page_map_offset = 0;
10307 	    map_size != 0;
10308 	    map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
10309 		page = vm_page_lookup(object, offset + page_map_offset);
10310 		if (page == VM_PAGE_NULL) {
10311 			printf("vm_paging_map_object: no page !?");
10312 			vm_object_unlock(object);
10313 			vm_map_remove(kernel_map, *address, *size);
10314 			*address = 0;
10315 			*size = 0;
10316 			*need_unmap = FALSE;
10317 			vm_object_lock(object);
10318 			return KERN_MEMORY_ERROR;
10319 		}
10320 		page->vmp_pmapped = TRUE;
10321 
10322 		PMAP_ENTER(kernel_pmap,
10323 		    *address + page_map_offset,
10324 		    page,
10325 		    protection,
10326 		    VM_PROT_NONE,
10327 		    0,
10328 		    TRUE,
10329 		    kr);
10330 		assert(kr == KERN_SUCCESS);
10331 #if KASAN
10332 		kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
10333 #endif
10334 	}
10335 
10336 	vm_paging_objects_mapped_slow++;
10337 	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
10338 
10339 	*need_unmap = TRUE;
10340 
10341 	return KERN_SUCCESS;
10342 }
10343 
10344 /*
10345  * vm_paging_unmap_object:
10346  *	Unmaps part of a VM object's pages from the kernel
10347  *      virtual address space.
10348  * Context:
10349  *      The VM object is locked.  This lock will get
10350  *      dropped and re-acquired though.
10351  */
10352 void
vm_paging_unmap_object(vm_object_t object,vm_map_offset_t start,vm_map_offset_t end)10353 vm_paging_unmap_object(
10354 	vm_object_t     object,
10355 	vm_map_offset_t start,
10356 	vm_map_offset_t end)
10357 {
10358 	int             i;
10359 
10360 	if ((vm_paging_base_address == 0) ||
10361 	    (start < vm_paging_base_address) ||
10362 	    (end > (vm_paging_base_address
10363 	    + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
10364 		/*
10365 		 * We didn't use our pre-allocated pool of
10366 		 * kernel virtual address.  Deallocate the
10367 		 * virtual memory.
10368 		 */
10369 		if (object != VM_OBJECT_NULL) {
10370 			vm_object_unlock(object);
10371 		}
10372 		vm_map_remove(kernel_map, start, end);
10373 		if (object != VM_OBJECT_NULL) {
10374 			vm_object_lock(object);
10375 		}
10376 	} else {
10377 		/*
10378 		 * We used a kernel virtual address from our
10379 		 * pre-allocated pool.  Put it back in the pool
10380 		 * for next time.
10381 		 */
10382 		assert(end - start == PAGE_SIZE);
10383 		i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
10384 		assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
10385 
10386 		/* undo the pmap mapping */
10387 		pmap_remove(kernel_pmap, start, end);
10388 
10389 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10390 		vm_paging_page_inuse[i] = FALSE;
10391 		if (vm_paging_page_waiter) {
10392 			thread_wakeup(&vm_paging_page_waiter);
10393 		}
10394 		simple_unlock(&vm_paging_lock);
10395 	}
10396 }
10397 
10398 
10399 /*
10400  * page->vmp_object must be locked
10401  */
10402 void
vm_pageout_steal_laundry(vm_page_t page,boolean_t queues_locked)10403 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10404 {
10405 	if (!queues_locked) {
10406 		vm_page_lockspin_queues();
10407 	}
10408 
10409 	page->vmp_free_when_done = FALSE;
10410 	/*
10411 	 * need to drop the laundry count...
10412 	 * we may also need to remove it
10413 	 * from the I/O paging queue...
10414 	 * vm_pageout_throttle_up handles both cases
10415 	 *
10416 	 * the laundry and pageout_queue flags are cleared...
10417 	 */
10418 	vm_pageout_throttle_up(page);
10419 
10420 	if (!queues_locked) {
10421 		vm_page_unlock_queues();
10422 	}
10423 }
10424 
10425 #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
10426 
10427 upl_t
vector_upl_create(vm_offset_t upl_offset,uint32_t max_upls)10428 vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
10429 {
10430 	int i = 0;
10431 	upl_t   upl;
10432 
10433 	assert(max_upls > 0);
10434 	if (max_upls == 0) {
10435 		return NULL;
10436 	}
10437 
10438 	if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
10439 		max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
10440 	}
10441 	vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
10442 
10443 	upl = upl_create(0, UPL_VECTOR, 0);
10444 	upl->vector_upl = vector_upl;
10445 	upl->u_offset = upl_offset;
10446 	vector_upl->size = 0;
10447 	vector_upl->offset = upl_offset;
10448 	vector_upl->invalid_upls = 0;
10449 	vector_upl->num_upls = 0;
10450 	vector_upl->pagelist = NULL;
10451 	vector_upl->max_upls = max_upls;
10452 
10453 	for (i = 0; i < max_upls; i++) {
10454 		vector_upl->upls[i].iostate.size = 0;
10455 		vector_upl->upls[i].iostate.offset = 0;
10456 	}
10457 	return upl;
10458 }
10459 
10460 uint32_t
vector_upl_max_upls(const upl_t upl)10461 vector_upl_max_upls(const upl_t upl)
10462 {
10463 	if (!vector_upl_is_valid(upl)) {
10464 		return 0;
10465 	}
10466 	return ((vector_upl_t)(upl->vector_upl))->max_upls;
10467 }
10468 
10469 void
vector_upl_deallocate(upl_t upl)10470 vector_upl_deallocate(upl_t upl)
10471 {
10472 	vector_upl_t vector_upl = upl->vector_upl;
10473 
10474 	assert(vector_upl_is_valid(upl));
10475 
10476 	if (vector_upl->invalid_upls != vector_upl->num_upls) {
10477 		panic("Deallocating non-empty Vectored UPL");
10478 	}
10479 	uint32_t max_upls = vector_upl->max_upls;
10480 	kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
10481 	kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
10482 	upl->vector_upl = NULL;
10483 }
10484 
10485 boolean_t
vector_upl_is_valid(upl_t upl)10486 vector_upl_is_valid(upl_t upl)
10487 {
10488 	return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
10489 }
10490 
10491 boolean_t
vector_upl_set_subupl(upl_t upl,upl_t subupl,uint32_t io_size)10492 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
10493 {
10494 	if (vector_upl_is_valid(upl)) {
10495 		vector_upl_t vector_upl = upl->vector_upl;
10496 
10497 		if (vector_upl) {
10498 			if (subupl) {
10499 				if (io_size) {
10500 					if (io_size < PAGE_SIZE) {
10501 						io_size = PAGE_SIZE;
10502 					}
10503 					subupl->vector_upl = (void*)vector_upl;
10504 					vector_upl->upls[vector_upl->num_upls++].elem = subupl;
10505 					vector_upl->size += io_size;
10506 					upl->u_size += io_size;
10507 				} else {
10508 					uint32_t i = 0, invalid_upls = 0;
10509 					for (i = 0; i < vector_upl->num_upls; i++) {
10510 						if (vector_upl->upls[i].elem == subupl) {
10511 							break;
10512 						}
10513 					}
10514 					if (i == vector_upl->num_upls) {
10515 						panic("Trying to remove sub-upl when none exists");
10516 					}
10517 
10518 					vector_upl->upls[i].elem = NULL;
10519 					invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
10520 					    relaxed);
10521 					if (invalid_upls == vector_upl->num_upls) {
10522 						return TRUE;
10523 					} else {
10524 						return FALSE;
10525 					}
10526 				}
10527 			} else {
10528 				panic("vector_upl_set_subupl was passed a NULL upl element");
10529 			}
10530 		} else {
10531 			panic("vector_upl_set_subupl was passed a non-vectored upl");
10532 		}
10533 	} else {
10534 		panic("vector_upl_set_subupl was passed a NULL upl");
10535 	}
10536 
10537 	return FALSE;
10538 }
10539 
10540 void
vector_upl_set_pagelist(upl_t upl)10541 vector_upl_set_pagelist(upl_t upl)
10542 {
10543 	if (vector_upl_is_valid(upl)) {
10544 		uint32_t i = 0;
10545 		vector_upl_t vector_upl = upl->vector_upl;
10546 
10547 		if (vector_upl) {
10548 			vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
10549 
10550 			vector_upl->pagelist = kalloc_type(struct upl_page_info,
10551 			    atop(vector_upl->size), Z_WAITOK);
10552 
10553 			for (i = 0; i < vector_upl->num_upls; i++) {
10554 				cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
10555 				bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10556 				pagelist_size += cur_upl_pagelist_size;
10557 				if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
10558 					upl->highest_page = vector_upl->upls[i].elem->highest_page;
10559 				}
10560 			}
10561 			assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10562 		} else {
10563 			panic("vector_upl_set_pagelist was passed a non-vectored upl");
10564 		}
10565 	} else {
10566 		panic("vector_upl_set_pagelist was passed a NULL upl");
10567 	}
10568 }
10569 
10570 upl_t
vector_upl_subupl_byindex(upl_t upl,uint32_t index)10571 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10572 {
10573 	if (vector_upl_is_valid(upl)) {
10574 		vector_upl_t vector_upl = upl->vector_upl;
10575 		if (vector_upl) {
10576 			if (index < vector_upl->num_upls) {
10577 				return vector_upl->upls[index].elem;
10578 			}
10579 		} else {
10580 			panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10581 		}
10582 	}
10583 	return NULL;
10584 }
10585 
10586 upl_t
vector_upl_subupl_byoffset(upl_t upl,upl_offset_t * upl_offset,upl_size_t * upl_size)10587 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
10588 {
10589 	if (vector_upl_is_valid(upl)) {
10590 		uint32_t i = 0;
10591 		vector_upl_t vector_upl = upl->vector_upl;
10592 
10593 		if (vector_upl) {
10594 			upl_t subupl = NULL;
10595 			vector_upl_iostates_t subupl_state;
10596 
10597 			for (i = 0; i < vector_upl->num_upls; i++) {
10598 				subupl = vector_upl->upls[i].elem;
10599 				subupl_state = vector_upl->upls[i].iostate;
10600 				if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
10601 					/* We could have been passed an offset/size pair that belongs
10602 					 * to an UPL element that has already been committed/aborted.
10603 					 * If so, return NULL.
10604 					 */
10605 					if (subupl == NULL) {
10606 						return NULL;
10607 					}
10608 					if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
10609 						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
10610 						if (*upl_size > subupl_state.size) {
10611 							*upl_size = subupl_state.size;
10612 						}
10613 					}
10614 					if (*upl_offset >= subupl_state.offset) {
10615 						*upl_offset -= subupl_state.offset;
10616 					} else if (i) {
10617 						panic("Vector UPL offset miscalculation");
10618 					}
10619 					return subupl;
10620 				}
10621 			}
10622 		} else {
10623 			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
10624 		}
10625 	}
10626 	return NULL;
10627 }
10628 
10629 void
vector_upl_get_submap(upl_t upl,vm_map_t * v_upl_submap,vm_offset_t * submap_dst_addr)10630 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10631 {
10632 	*v_upl_submap = NULL;
10633 
10634 	if (vector_upl_is_valid(upl)) {
10635 		vector_upl_t vector_upl = upl->vector_upl;
10636 		if (vector_upl) {
10637 			*v_upl_submap = vector_upl->submap;
10638 			*submap_dst_addr = vector_upl->submap_dst_addr;
10639 		} else {
10640 			panic("vector_upl_get_submap was passed a non-vectored UPL");
10641 		}
10642 	} else {
10643 		panic("vector_upl_get_submap was passed a null UPL");
10644 	}
10645 }
10646 
10647 void
vector_upl_set_submap(upl_t upl,vm_map_t submap,vm_offset_t submap_dst_addr)10648 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10649 {
10650 	if (vector_upl_is_valid(upl)) {
10651 		vector_upl_t vector_upl = upl->vector_upl;
10652 		if (vector_upl) {
10653 			vector_upl->submap = submap;
10654 			vector_upl->submap_dst_addr = submap_dst_addr;
10655 		} else {
10656 			panic("vector_upl_get_submap was passed a non-vectored UPL");
10657 		}
10658 	} else {
10659 		panic("vector_upl_get_submap was passed a NULL UPL");
10660 	}
10661 }
10662 
10663 void
vector_upl_set_iostate(upl_t upl,upl_t subupl,upl_offset_t offset,upl_size_t size)10664 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10665 {
10666 	if (vector_upl_is_valid(upl)) {
10667 		uint32_t i = 0;
10668 		vector_upl_t vector_upl = upl->vector_upl;
10669 
10670 		if (vector_upl) {
10671 			for (i = 0; i < vector_upl->num_upls; i++) {
10672 				if (vector_upl->upls[i].elem == subupl) {
10673 					break;
10674 				}
10675 			}
10676 
10677 			if (i == vector_upl->num_upls) {
10678 				panic("setting sub-upl iostate when none exists");
10679 			}
10680 
10681 			vector_upl->upls[i].iostate.offset = offset;
10682 			if (size < PAGE_SIZE) {
10683 				size = PAGE_SIZE;
10684 			}
10685 			vector_upl->upls[i].iostate.size = size;
10686 		} else {
10687 			panic("vector_upl_set_iostate was passed a non-vectored UPL");
10688 		}
10689 	} else {
10690 		panic("vector_upl_set_iostate was passed a NULL UPL");
10691 	}
10692 }
10693 
10694 void
vector_upl_get_iostate(upl_t upl,upl_t subupl,upl_offset_t * offset,upl_size_t * size)10695 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10696 {
10697 	if (vector_upl_is_valid(upl)) {
10698 		uint32_t i = 0;
10699 		vector_upl_t vector_upl = upl->vector_upl;
10700 
10701 		if (vector_upl) {
10702 			for (i = 0; i < vector_upl->num_upls; i++) {
10703 				if (vector_upl->upls[i].elem == subupl) {
10704 					break;
10705 				}
10706 			}
10707 
10708 			if (i == vector_upl->num_upls) {
10709 				panic("getting sub-upl iostate when none exists");
10710 			}
10711 
10712 			*offset = vector_upl->upls[i].iostate.offset;
10713 			*size = vector_upl->upls[i].iostate.size;
10714 		} else {
10715 			panic("vector_upl_get_iostate was passed a non-vectored UPL");
10716 		}
10717 	} else {
10718 		panic("vector_upl_get_iostate was passed a NULL UPL");
10719 	}
10720 }
10721 
10722 void
vector_upl_get_iostate_byindex(upl_t upl,uint32_t index,upl_offset_t * offset,upl_size_t * size)10723 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10724 {
10725 	if (vector_upl_is_valid(upl)) {
10726 		vector_upl_t vector_upl = upl->vector_upl;
10727 		if (vector_upl) {
10728 			if (index < vector_upl->num_upls) {
10729 				*offset = vector_upl->upls[index].iostate.offset;
10730 				*size = vector_upl->upls[index].iostate.size;
10731 			} else {
10732 				*offset = *size = 0;
10733 			}
10734 		} else {
10735 			panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
10736 		}
10737 	} else {
10738 		panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10739 	}
10740 }
10741 
10742 void *
upl_get_internal_vectorupl(upl_t upl)10743 upl_get_internal_vectorupl(upl_t upl)
10744 {
10745 	return upl->vector_upl;
10746 }
10747 
10748 upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)10749 upl_get_internal_vectorupl_pagelist(upl_t upl)
10750 {
10751 	return upl->vector_upl->pagelist;
10752 }
10753 
10754 upl_page_info_t *
upl_get_internal_page_list(upl_t upl)10755 upl_get_internal_page_list(upl_t upl)
10756 {
10757 	return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
10758 }
10759 
10760 void
upl_clear_dirty(upl_t upl,boolean_t value)10761 upl_clear_dirty(
10762 	upl_t           upl,
10763 	boolean_t       value)
10764 {
10765 	if (value) {
10766 		upl->flags |= UPL_CLEAR_DIRTY;
10767 	} else {
10768 		upl->flags &= ~UPL_CLEAR_DIRTY;
10769 	}
10770 }
10771 
10772 void
upl_set_referenced(upl_t upl,boolean_t value)10773 upl_set_referenced(
10774 	upl_t           upl,
10775 	boolean_t       value)
10776 {
10777 	upl_lock(upl);
10778 	if (value) {
10779 		upl->ext_ref_count++;
10780 	} else {
10781 		if (!upl->ext_ref_count) {
10782 			panic("upl_set_referenced not %p", upl);
10783 		}
10784 		upl->ext_ref_count--;
10785 	}
10786 	upl_unlock(upl);
10787 }
10788 
10789 #if CONFIG_IOSCHED
10790 void
upl_set_blkno(upl_t upl,vm_offset_t upl_offset,int io_size,int64_t blkno)10791 upl_set_blkno(
10792 	upl_t           upl,
10793 	vm_offset_t     upl_offset,
10794 	int             io_size,
10795 	int64_t         blkno)
10796 {
10797 	int i, j;
10798 	if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
10799 		return;
10800 	}
10801 
10802 	assert(upl->upl_reprio_info != 0);
10803 	for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
10804 		UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
10805 	}
10806 }
10807 #endif
10808 
10809 void inline
memoryshot(unsigned int event,unsigned int control)10810 memoryshot(unsigned int event, unsigned int control)
10811 {
10812 	if (vm_debug_events) {
10813 		KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10814 		    vm_page_active_count, vm_page_inactive_count,
10815 		    vm_page_free_count, vm_page_speculative_count,
10816 		    vm_page_throttled_count);
10817 	} else {
10818 		(void) event;
10819 		(void) control;
10820 	}
10821 }
10822 
10823 #ifdef MACH_BSD
10824 
10825 boolean_t
upl_device_page(upl_page_info_t * upl)10826 upl_device_page(upl_page_info_t *upl)
10827 {
10828 	return UPL_DEVICE_PAGE(upl);
10829 }
10830 boolean_t
upl_page_present(upl_page_info_t * upl,int index)10831 upl_page_present(upl_page_info_t *upl, int index)
10832 {
10833 	return UPL_PAGE_PRESENT(upl, index);
10834 }
10835 boolean_t
upl_speculative_page(upl_page_info_t * upl,int index)10836 upl_speculative_page(upl_page_info_t *upl, int index)
10837 {
10838 	return UPL_SPECULATIVE_PAGE(upl, index);
10839 }
10840 boolean_t
upl_dirty_page(upl_page_info_t * upl,int index)10841 upl_dirty_page(upl_page_info_t *upl, int index)
10842 {
10843 	return UPL_DIRTY_PAGE(upl, index);
10844 }
10845 boolean_t
upl_valid_page(upl_page_info_t * upl,int index)10846 upl_valid_page(upl_page_info_t *upl, int index)
10847 {
10848 	return UPL_VALID_PAGE(upl, index);
10849 }
10850 ppnum_t
upl_phys_page(upl_page_info_t * upl,int index)10851 upl_phys_page(upl_page_info_t *upl, int index)
10852 {
10853 	return UPL_PHYS_PAGE(upl, index);
10854 }
10855 
10856 void
upl_page_set_mark(upl_page_info_t * upl,int index,boolean_t v)10857 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
10858 {
10859 	upl[index].mark = v;
10860 }
10861 
10862 boolean_t
upl_page_get_mark(upl_page_info_t * upl,int index)10863 upl_page_get_mark(upl_page_info_t *upl, int index)
10864 {
10865 	return upl[index].mark;
10866 }
10867 
10868 void
vm_countdirtypages(void)10869 vm_countdirtypages(void)
10870 {
10871 	vm_page_t m;
10872 	int dpages;
10873 	int pgopages;
10874 	int precpages;
10875 
10876 
10877 	dpages = 0;
10878 	pgopages = 0;
10879 	precpages = 0;
10880 
10881 	vm_page_lock_queues();
10882 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10883 	do {
10884 		if (m == (vm_page_t)0) {
10885 			break;
10886 		}
10887 
10888 		if (m->vmp_dirty) {
10889 			dpages++;
10890 		}
10891 		if (m->vmp_free_when_done) {
10892 			pgopages++;
10893 		}
10894 		if (m->vmp_precious) {
10895 			precpages++;
10896 		}
10897 
10898 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10899 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10900 		if (m == (vm_page_t)0) {
10901 			break;
10902 		}
10903 	} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
10904 	vm_page_unlock_queues();
10905 
10906 	vm_page_lock_queues();
10907 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
10908 	do {
10909 		if (m == (vm_page_t)0) {
10910 			break;
10911 		}
10912 
10913 		dpages++;
10914 		assert(m->vmp_dirty);
10915 		assert(!m->vmp_free_when_done);
10916 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10917 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10918 		if (m == (vm_page_t)0) {
10919 			break;
10920 		}
10921 	} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
10922 	vm_page_unlock_queues();
10923 
10924 	vm_page_lock_queues();
10925 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10926 	do {
10927 		if (m == (vm_page_t)0) {
10928 			break;
10929 		}
10930 
10931 		if (m->vmp_dirty) {
10932 			dpages++;
10933 		}
10934 		if (m->vmp_free_when_done) {
10935 			pgopages++;
10936 		}
10937 		if (m->vmp_precious) {
10938 			precpages++;
10939 		}
10940 
10941 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10942 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10943 		if (m == (vm_page_t)0) {
10944 			break;
10945 		}
10946 	} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
10947 	vm_page_unlock_queues();
10948 
10949 	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10950 
10951 	dpages = 0;
10952 	pgopages = 0;
10953 	precpages = 0;
10954 
10955 	vm_page_lock_queues();
10956 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10957 
10958 	do {
10959 		if (m == (vm_page_t)0) {
10960 			break;
10961 		}
10962 		if (m->vmp_dirty) {
10963 			dpages++;
10964 		}
10965 		if (m->vmp_free_when_done) {
10966 			pgopages++;
10967 		}
10968 		if (m->vmp_precious) {
10969 			precpages++;
10970 		}
10971 
10972 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10973 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10974 		if (m == (vm_page_t)0) {
10975 			break;
10976 		}
10977 	} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
10978 	vm_page_unlock_queues();
10979 
10980 	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
10981 }
10982 #endif /* MACH_BSD */
10983 
10984 
10985 #if CONFIG_IOSCHED
10986 int
upl_get_cached_tier(upl_t upl)10987 upl_get_cached_tier(upl_t  upl)
10988 {
10989 	assert(upl);
10990 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
10991 		return upl->upl_priority;
10992 	}
10993 	return -1;
10994 }
10995 #endif /* CONFIG_IOSCHED */
10996 
10997 
10998 void
upl_callout_iodone(upl_t upl)10999 upl_callout_iodone(upl_t upl)
11000 {
11001 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
11002 
11003 	if (upl_ctx) {
11004 		void    (*iodone_func)(void *, int) = upl_ctx->io_done;
11005 
11006 		assert(upl_ctx->io_done);
11007 
11008 		(*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
11009 	}
11010 }
11011 
11012 void
upl_set_iodone(upl_t upl,void * upl_iodone)11013 upl_set_iodone(upl_t upl, void *upl_iodone)
11014 {
11015 	upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
11016 }
11017 
11018 void
upl_set_iodone_error(upl_t upl,int error)11019 upl_set_iodone_error(upl_t upl, int error)
11020 {
11021 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
11022 
11023 	if (upl_ctx) {
11024 		upl_ctx->io_error = error;
11025 	}
11026 }
11027 
11028 
11029 ppnum_t
upl_get_highest_page(upl_t upl)11030 upl_get_highest_page(
11031 	upl_t                      upl)
11032 {
11033 	return upl->highest_page;
11034 }
11035 
11036 upl_size_t
upl_get_size(upl_t upl)11037 upl_get_size(
11038 	upl_t                      upl)
11039 {
11040 	return upl_adjusted_size(upl, PAGE_MASK);
11041 }
11042 
11043 upl_size_t
upl_adjusted_size(upl_t upl,vm_map_offset_t pgmask)11044 upl_adjusted_size(
11045 	upl_t upl,
11046 	vm_map_offset_t pgmask)
11047 {
11048 	vm_object_offset_t start_offset, end_offset;
11049 
11050 	start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
11051 	end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
11052 
11053 	return (upl_size_t)(end_offset - start_offset);
11054 }
11055 
11056 vm_object_offset_t
upl_adjusted_offset(upl_t upl,vm_map_offset_t pgmask)11057 upl_adjusted_offset(
11058 	upl_t upl,
11059 	vm_map_offset_t pgmask)
11060 {
11061 	return trunc_page_mask_64(upl->u_offset, pgmask);
11062 }
11063 
11064 vm_object_offset_t
upl_get_data_offset(upl_t upl)11065 upl_get_data_offset(
11066 	upl_t upl)
11067 {
11068 	return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
11069 }
11070 
11071 upl_t
upl_associated_upl(upl_t upl)11072 upl_associated_upl(upl_t upl)
11073 {
11074 	return upl->associated_upl;
11075 }
11076 
11077 void
upl_set_associated_upl(upl_t upl,upl_t associated_upl)11078 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
11079 {
11080 	upl->associated_upl = associated_upl;
11081 }
11082 
11083 struct vnode *
upl_lookup_vnode(upl_t upl)11084 upl_lookup_vnode(upl_t upl)
11085 {
11086 	if (!upl->map_object->internal) {
11087 		return vnode_pager_lookup_vnode(upl->map_object->pager);
11088 	} else {
11089 		return NULL;
11090 	}
11091 }
11092 
11093 #if UPL_DEBUG
11094 kern_return_t
upl_ubc_alias_set(upl_t upl,uintptr_t alias1,uintptr_t alias2)11095 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
11096 {
11097 	upl->ubc_alias1 = alias1;
11098 	upl->ubc_alias2 = alias2;
11099 	return KERN_SUCCESS;
11100 }
11101 int
upl_ubc_alias_get(upl_t upl,uintptr_t * al,uintptr_t * al2)11102 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
11103 {
11104 	if (al) {
11105 		*al = upl->ubc_alias1;
11106 	}
11107 	if (al2) {
11108 		*al2 = upl->ubc_alias2;
11109 	}
11110 	return KERN_SUCCESS;
11111 }
11112 #endif /* UPL_DEBUG */
11113 
11114 #if VM_PRESSURE_EVENTS
11115 /*
11116  * Upward trajectory.
11117  */
11118 extern boolean_t vm_compressor_low_on_space(void);
11119 
11120 boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)11121 VM_PRESSURE_NORMAL_TO_WARNING(void)
11122 {
11123 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11124 		/* Available pages below our threshold */
11125 		if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11126 			/* No frozen processes to kill */
11127 			if (memorystatus_frozen_count == 0) {
11128 				/* Not enough suspended processes available. */
11129 				if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11130 					return TRUE;
11131 				}
11132 			}
11133 		}
11134 		return FALSE;
11135 	} else {
11136 		return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
11137 	}
11138 }
11139 
11140 boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)11141 VM_PRESSURE_WARNING_TO_CRITICAL(void)
11142 {
11143 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11144 		/* Available pages below our threshold */
11145 		if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11146 			return TRUE;
11147 		}
11148 		return FALSE;
11149 	} else {
11150 		return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11151 	}
11152 }
11153 
11154 /*
11155  * Downward trajectory.
11156  */
11157 boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)11158 VM_PRESSURE_WARNING_TO_NORMAL(void)
11159 {
11160 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11161 		/* Available pages above our threshold */
11162 		unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
11163 		if (memorystatus_available_pages > target_threshold) {
11164 			return TRUE;
11165 		}
11166 		return FALSE;
11167 	} else {
11168 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
11169 	}
11170 }
11171 
11172 boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)11173 VM_PRESSURE_CRITICAL_TO_WARNING(void)
11174 {
11175 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11176 		/* Available pages above our threshold */
11177 		unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
11178 		if (memorystatus_available_pages > target_threshold) {
11179 			return TRUE;
11180 		}
11181 		return FALSE;
11182 	} else {
11183 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11184 	}
11185 }
11186 #endif /* VM_PRESSURE_EVENTS */
11187 
11188 #if DEVELOPMENT || DEBUG
11189 bool compressor_running_perf_test;
11190 uint64_t compressor_perf_test_pages_processed;
11191 
11192 kern_return_t
11193 run_compressor_perf_test(
11194 	user_addr_t buf,
11195 	size_t buffer_size,
11196 	uint64_t *time,
11197 	uint64_t *bytes_compressed,
11198 	uint64_t *compressor_growth);
11199 
11200 static kern_return_t
move_pages_to_queue(vm_map_t map,user_addr_t start_addr,size_t buffer_size,vm_page_queue_head_t * queue,size_t * pages_moved)11201 move_pages_to_queue(
11202 	vm_map_t map,
11203 	user_addr_t start_addr,
11204 	size_t buffer_size,
11205 	vm_page_queue_head_t *queue,
11206 	size_t *pages_moved)
11207 {
11208 	kern_return_t err = KERN_SUCCESS;
11209 	vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
11210 	boolean_t addr_in_map = FALSE;
11211 	user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
11212 	vm_object_t curr_object = VM_OBJECT_NULL;
11213 	*pages_moved = 0;
11214 
11215 
11216 	if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
11217 		/*
11218 		 * We don't currently support benchmarking maps with a different page size
11219 		 * than the kernel.
11220 		 */
11221 		return KERN_INVALID_ARGUMENT;
11222 	}
11223 
11224 	if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
11225 		return KERN_INVALID_ARGUMENT;
11226 	}
11227 
11228 	vm_map_lock_read(map);
11229 	curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
11230 	end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
11231 
11232 
11233 	while (curr_addr < end_addr) {
11234 		addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
11235 		if (!addr_in_map) {
11236 			err = KERN_INVALID_ARGUMENT;
11237 			break;
11238 		}
11239 		curr_object = VME_OBJECT(curr_entry);
11240 		if (curr_object) {
11241 			vm_object_lock(curr_object);
11242 			/* We really only want anonymous memory that's in the top level map and object here. */
11243 			if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
11244 			    curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
11245 				err = KERN_INVALID_ARGUMENT;
11246 				vm_object_unlock(curr_object);
11247 				break;
11248 			}
11249 			vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
11250 			vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
11251 			    (curr_entry->vme_start + VME_OFFSET(curr_entry));
11252 			vm_map_offset_t curr_offset = start_offset;
11253 			vm_page_t curr_page;
11254 			while (curr_offset < end_offset) {
11255 				curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
11256 				if (curr_page != VM_PAGE_NULL) {
11257 					vm_page_lock_queues();
11258 					if (curr_page->vmp_laundry) {
11259 						vm_pageout_steal_laundry(curr_page, TRUE);
11260 					}
11261 					/*
11262 					 * we've already factored out pages in the laundry which
11263 					 * means this page can't be on the pageout queue so it's
11264 					 * safe to do the vm_page_queues_remove
11265 					 */
11266 					bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
11267 					vm_page_queues_remove(curr_page, TRUE);
11268 					if (donate) {
11269 						/*
11270 						 * The compressor needs to see this bit to know
11271 						 * where this page needs to land. Also if stolen,
11272 						 * this bit helps put the page back in the right
11273 						 * special queue where it belongs.
11274 						 */
11275 						curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
11276 					}
11277 					// Clear the referenced bit so we ensure this gets paged out
11278 					curr_page->vmp_reference = false;
11279 					if (curr_page->vmp_pmapped) {
11280 						pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
11281 						    VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
11282 					}
11283 					vm_page_queue_enter(queue, curr_page, vmp_pageq);
11284 					vm_page_unlock_queues();
11285 					*pages_moved += 1;
11286 				}
11287 				curr_offset += PAGE_SIZE_64;
11288 				curr_addr += PAGE_SIZE_64;
11289 			}
11290 		}
11291 		vm_object_unlock(curr_object);
11292 	}
11293 	vm_map_unlock_read(map);
11294 	return err;
11295 }
11296 
11297 /*
11298  * Local queue for processing benchmark pages.
11299  * Can't be allocated on the stack because the pointer has to
11300  * be packable.
11301  */
11302 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
11303 kern_return_t
run_compressor_perf_test(user_addr_t buf,size_t buffer_size,uint64_t * time,uint64_t * bytes_compressed,uint64_t * compressor_growth)11304 run_compressor_perf_test(
11305 	user_addr_t buf,
11306 	size_t buffer_size,
11307 	uint64_t *time,
11308 	uint64_t *bytes_compressed,
11309 	uint64_t *compressor_growth)
11310 {
11311 	kern_return_t err = KERN_SUCCESS;
11312 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11313 		return KERN_NOT_SUPPORTED;
11314 	}
11315 	if (current_task() == kernel_task) {
11316 		return KERN_INVALID_ARGUMENT;
11317 	}
11318 	vm_page_lock_queues();
11319 	if (compressor_running_perf_test) {
11320 		/* Only run one instance of the benchmark at a time. */
11321 		vm_page_unlock_queues();
11322 		return KERN_RESOURCE_SHORTAGE;
11323 	}
11324 	vm_page_unlock_queues();
11325 	size_t page_count = 0;
11326 	vm_map_t map;
11327 	vm_page_t p, next;
11328 	uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
11329 	uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
11330 	*bytes_compressed = *compressor_growth = 0;
11331 
11332 	vm_page_queue_init(&compressor_perf_test_queue);
11333 	map = current_task()->map;
11334 	err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
11335 	if (err != KERN_SUCCESS) {
11336 		goto out;
11337 	}
11338 
11339 	vm_page_lock_queues();
11340 	compressor_running_perf_test = true;
11341 	compressor_perf_test_pages_processed = 0;
11342 	/*
11343 	 * At this point the compressor threads should only process the benchmark queue
11344 	 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
11345 	 * to determine how many compressed bytes we ended up using.
11346 	 */
11347 	compressed_bytes_start = c_segment_compressed_bytes;
11348 	vm_page_unlock_queues();
11349 
11350 	page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);
11351 
11352 	vm_page_lock_queues();
11353 	compressor_perf_test_start = mach_absolute_time();
11354 
11355 	// Wake up the compressor thread(s)
11356 	sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
11357 	    pgo_iothread_internal_state[0].pgo_iothread);
11358 
11359 	/*
11360 	 * Depending on when this test is run we could overshoot or be right on the mark
11361 	 * with our page_count. So the comparison is of the _less than_ variety.
11362 	 */
11363 	while (compressor_perf_test_pages_processed < page_count) {
11364 		assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
11365 		vm_page_unlock_queues();
11366 		thread_block(THREAD_CONTINUE_NULL);
11367 		vm_page_lock_queues();
11368 	}
11369 	compressor_perf_test_end = mach_absolute_time();
11370 	compressed_bytes_end = c_segment_compressed_bytes;
11371 	vm_page_unlock_queues();
11372 
11373 
11374 out:
11375 	/*
11376 	 * If we errored out above, then we could still have some pages
11377 	 * on the local queue. Make sure to put them back on the active queue before
11378 	 * returning so they're not orphaned.
11379 	 */
11380 	vm_page_lock_queues();
11381 	absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
11382 	p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
11383 	while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
11384 		next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);
11385 
11386 		vm_page_enqueue_active(p, FALSE);
11387 		p = next;
11388 	}
11389 
11390 	compressor_running_perf_test = false;
11391 	vm_page_unlock_queues();
11392 	if (err == KERN_SUCCESS) {
11393 		*bytes_compressed = page_count * PAGE_SIZE_64;
11394 		*compressor_growth = compressed_bytes_end - compressed_bytes_start;
11395 	}
11396 
11397 	/*
11398 	 * pageout_scan will consider waking the compactor swapper
11399 	 * before it blocks. Do the same thing here before we return
11400 	 * to ensure that back to back benchmark runs can't overly fragment the
11401 	 * compressor pool.
11402 	 */
11403 	vm_consider_waking_compactor_swapper();
11404 	return err;
11405 }
11406 #endif /* DEVELOPMENT || DEBUG */
11407