xref: /xnu-8792.41.9/osfmk/vm/vm_pageout.c (revision 5c2921b07a2480ab43ec66f5b9e41cb872bc554f)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_pageout.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	The proverbial page-out daemon.
64  */
65 
66 #include <stdint.h>
67 #include <ptrauth.h>
68 
69 #include <debug.h>
70 
71 #include <mach/mach_types.h>
72 #include <mach/memory_object.h>
73 #include <mach/mach_host_server.h>
74 #include <mach/upl.h>
75 #include <mach/vm_map.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/sdt.h>
79 
80 #include <kern/kern_types.h>
81 #include <kern/counter.h>
82 #include <kern/host_statistics.h>
83 #include <kern/machine.h>
84 #include <kern/misc_protos.h>
85 #include <kern/sched.h>
86 #include <kern/thread.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/policy_internal.h>
90 #include <kern/thread_group.h>
91 
92 #include <sys/kdebug_triage.h>
93 
94 #include <machine/vm_tuning.h>
95 #include <machine/commpage.h>
96 
97 #include <vm/pmap.h>
98 #include <vm/vm_compressor_pager.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map_internal.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_page.h>
103 #include <vm/vm_pageout.h>
104 #include <vm/vm_protos.h> /* must be last */
105 #include <vm/memory_object.h>
106 #include <vm/vm_purgeable_internal.h>
107 #include <vm/vm_shared_region.h>
108 #include <vm/vm_compressor.h>
109 
110 #include <san/kasan.h>
111 
112 #if CONFIG_PHANTOM_CACHE
113 #include <vm/vm_phantom_cache.h>
114 #endif
115 
116 #if UPL_DEBUG
117 #include <libkern/OSDebug.h>
118 #endif
119 
120 extern int cs_debug;
121 
122 extern void mbuf_drain(boolean_t);
123 
124 #if VM_PRESSURE_EVENTS
125 #if CONFIG_JETSAM
126 extern unsigned int memorystatus_available_pages;
127 extern unsigned int memorystatus_available_pages_pressure;
128 extern unsigned int memorystatus_available_pages_critical;
129 #else /* CONFIG_JETSAM */
130 extern uint64_t memorystatus_available_pages;
131 extern uint64_t memorystatus_available_pages_pressure;
132 extern uint64_t memorystatus_available_pages_critical;
133 #endif /* CONFIG_JETSAM */
134 
135 extern unsigned int memorystatus_frozen_count;
136 extern unsigned int memorystatus_suspended_count;
137 extern vm_pressure_level_t memorystatus_vm_pressure_level;
138 
139 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
140 extern uint32_t memorystatus_jetsam_fg_band_waiters;
141 
142 void vm_pressure_response(void);
143 extern void consider_vm_pressure_events(void);
144 
145 #define MEMORYSTATUS_SUSPENDED_THRESHOLD  4
146 #endif /* VM_PRESSURE_EVENTS */
147 
148 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
149 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
150 boolean_t vps_dynamic_priority_enabled = FALSE;
151 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
152 
153 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
154 #if !XNU_TARGET_OS_OSX
155 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
156 #else /* !XNU_TARGET_OS_OSX */
157 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
158 #endif /* !XNU_TARGET_OS_OSX */
159 #endif
160 
161 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
162 #define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
163 #endif
164 
165 #ifndef VM_PAGE_LAUNDRY_MAX
166 #define VM_PAGE_LAUNDRY_MAX     128UL   /* maximum pageouts on a given pageout queue */
#endif  /* VM_PAGE_LAUNDRY_MAX */
168 
169 #ifndef VM_PAGEOUT_BURST_WAIT
170 #define VM_PAGEOUT_BURST_WAIT   1       /* milliseconds */
171 #endif  /* VM_PAGEOUT_BURST_WAIT */
172 
173 #ifndef VM_PAGEOUT_EMPTY_WAIT
174 #define VM_PAGEOUT_EMPTY_WAIT   50      /* milliseconds */
175 #endif  /* VM_PAGEOUT_EMPTY_WAIT */
176 
177 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
178 #define VM_PAGEOUT_DEADLOCK_WAIT 100    /* milliseconds */
179 #endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
180 
181 #ifndef VM_PAGEOUT_IDLE_WAIT
182 #define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
183 #endif  /* VM_PAGEOUT_IDLE_WAIT */
184 
185 #ifndef VM_PAGEOUT_SWAP_WAIT
186 #define VM_PAGEOUT_SWAP_WAIT    10      /* milliseconds */
187 #endif  /* VM_PAGEOUT_SWAP_WAIT */
188 
189 
190 #ifndef VM_PAGE_SPECULATIVE_TARGET
191 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
192 #endif /* VM_PAGE_SPECULATIVE_TARGET */
193 
194 
195 /*
196  *	To obtain a reasonable LRU approximation, the inactive queue
197  *	needs to be large enough to give pages on it a chance to be
198  *	referenced a second time.  This macro defines the fraction
199  *	of active+inactive pages that should be inactive.
200  *	The pageout daemon uses it to update vm_page_inactive_target.
201  *
202  *	If vm_page_free_count falls below vm_page_free_target and
203  *	vm_page_inactive_count is below vm_page_inactive_target,
204  *	then the pageout daemon starts running.
205  */
206 
207 #ifndef VM_PAGE_INACTIVE_TARGET
208 #define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
209 #endif  /* VM_PAGE_INACTIVE_TARGET */
210 
211 /*
212  *	Once the pageout daemon starts running, it keeps going
213  *	until vm_page_free_count meets or exceeds vm_page_free_target.
214  */
215 
216 #ifndef VM_PAGE_FREE_TARGET
217 #if !XNU_TARGET_OS_OSX
218 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
219 #else /* !XNU_TARGET_OS_OSX */
220 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
221 #endif /* !XNU_TARGET_OS_OSX */
222 #endif  /* VM_PAGE_FREE_TARGET */
223 
224 
225 /*
226  *	The pageout daemon always starts running once vm_page_free_count
227  *	falls below vm_page_free_min.
228  */
229 
230 #ifndef VM_PAGE_FREE_MIN
231 #if !XNU_TARGET_OS_OSX
232 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
233 #else /* !XNU_TARGET_OS_OSX */
234 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
235 #endif /* !XNU_TARGET_OS_OSX */
236 #endif  /* VM_PAGE_FREE_MIN */
237 
238 #if !XNU_TARGET_OS_OSX
239 #define VM_PAGE_FREE_RESERVED_LIMIT     100
240 #define VM_PAGE_FREE_MIN_LIMIT          1500
241 #define VM_PAGE_FREE_TARGET_LIMIT       2000
242 #else /* !XNU_TARGET_OS_OSX */
243 #define VM_PAGE_FREE_RESERVED_LIMIT     1700
244 #define VM_PAGE_FREE_MIN_LIMIT          3500
245 #define VM_PAGE_FREE_TARGET_LIMIT       4000
246 #endif /* !XNU_TARGET_OS_OSX */
247 
248 /*
249  *	When vm_page_free_count falls below vm_page_free_reserved,
250  *	only vm-privileged threads can allocate pages.  vm-privilege
251  *	allows the pageout daemon and default pager (and any other
252  *	associated threads needed for default pageout) to continue
253  *	operation by dipping into the reserved pool of pages.
254  */
255 
256 #ifndef VM_PAGE_FREE_RESERVED
257 #define VM_PAGE_FREE_RESERVED(n)        \
258 	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
259 #endif  /* VM_PAGE_FREE_RESERVED */
260 
261 /*
262  *	When we dequeue pages from the inactive list, they are
263  *	reactivated (ie, put back on the active queue) if referenced.
264  *	However, it is possible to starve the free list if other
265  *	processors are referencing pages faster than we can turn off
266  *	the referenced bit.  So we limit the number of reactivations
267  *	we will make per call of vm_pageout_scan().
268  */
269 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
270 
271 #ifndef VM_PAGE_REACTIVATE_LIMIT
272 #if !XNU_TARGET_OS_OSX
273 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
274 #else /* !XNU_TARGET_OS_OSX */
275 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
276 #endif /* !XNU_TARGET_OS_OSX */
277 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
278 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
279 
280 extern boolean_t hibernate_cleaning_in_progress;
281 
282 struct cq ciq[MAX_COMPRESSOR_THREAD_COUNT];
283 
284 #if VM_PRESSURE_EVENTS
285 void vm_pressure_thread(void);
286 
287 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
288 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
289 
290 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
291 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
292 #endif
293 
294 static void vm_pageout_iothread_external(struct cq *, wait_result_t);
295 static void vm_pageout_iothread_internal(struct cq *, wait_result_t);
296 static void vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *, boolean_t);
297 
298 extern void vm_pageout_continue(void);
299 extern void vm_pageout_scan(void);
300 
301 boolean_t vm_pageout_running = FALSE;
302 
303 uint32_t vm_page_upl_tainted = 0;
304 uint32_t vm_page_iopl_tainted = 0;
305 
306 #if XNU_TARGET_OS_OSX
307 static boolean_t vm_pageout_waiter  = FALSE;
308 #endif /* XNU_TARGET_OS_OSX */
309 
310 
311 #if DEVELOPMENT || DEBUG
312 struct vm_pageout_debug vm_pageout_debug;
313 #endif
314 struct vm_pageout_vminfo vm_pageout_vminfo;
315 struct vm_pageout_state  vm_pageout_state;
316 struct vm_config         vm_config;
317 
318 struct  vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
319 struct  vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
320 #if DEVELOPMENT || DEBUG
321 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
322 #endif /* DEVELOPMENT || DEBUG */
323 
324 int         vm_upl_wait_for_pages = 0;
325 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
326 
327 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
328 
329 int     vm_debug_events = 0;
330 
331 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
332 
333 #if CONFIG_MEMORYSTATUS
334 extern boolean_t memorystatus_kill_on_VM_page_shortage(void);
335 
336 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
337 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
338 
339 #endif
340 
341 #if __AMP__
342 int vm_compressor_ebound = 1;
343 int vm_pgo_pbound = 0;
344 extern void thread_bind_cluster_type(thread_t, char, bool);
345 #endif /* __AMP__ */
346 
347 
/*
 *	Routine:	vm_pageout_object_terminate
 *	Purpose:
 *		Destroy the pageout_object, and perform all of the
 *		required cleanup actions.
 *
 *		A pageout object is a transient object used for
 *		cleaning-in-place: each of its pages is a private
 *		shadow of a real page in the shadow object.  On the
 *		last reference we free those shadow pages and finish
 *		the disposition of the corresponding real pages.
 *
 *	In/Out conditions:
 *		The object must be locked, and will be returned locked.
 */
void
vm_pageout_object_terminate(
	vm_object_t     object)
{
	vm_object_t     shadow_object;

	/*
	 * Deal with the deallocation (last reference) of a pageout object
	 * (used for cleaning-in-place) by dropping the paging references/
	 * freeing pages in the original object.
	 */

	assert(object->pageout);
	shadow_object = object->shadow;
	vm_object_lock(shadow_object);

	while (!vm_page_queue_empty(&object->memq)) {
		vm_page_t               p, m;
		vm_object_offset_t      offset;

		/* p: the private shadow page owned by the pageout object */
		p = (vm_page_t) vm_page_queue_first(&object->memq);

		assert(p->vmp_private);
		assert(p->vmp_free_when_done);
		p->vmp_free_when_done = FALSE;
		assert(!p->vmp_cleaning);
		assert(!p->vmp_laundry);

		offset = p->vmp_offset;
		VM_PAGE_FREE(p);
		p = VM_PAGE_NULL;

		/* m: the real page that was being cleaned in place */
		m = vm_page_lookup(shadow_object,
		    offset + object->vo_shadow_offset);

		if (m == VM_PAGE_NULL) {
			continue;
		}

		assert((m->vmp_dirty) || (m->vmp_precious) ||
		    (m->vmp_busy && m->vmp_cleaning));

		/*
		 * Handle the trusted pager throttle.
		 * Also decrement the burst throttle (if external).
		 */
		vm_page_lock_queues();
		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
			vm_pageout_throttle_up(m);
		}

		/*
		 * Handle the "target" page(s). These pages are to be freed if
		 * successfully cleaned. Target pages are always busy, and are
		 * wired exactly once. The initial target pages are not mapped,
		 * (so cannot be referenced or modified) but converted target
		 * pages may have been modified between the selection as an
		 * adjacent page and conversion to a target.
		 */
		if (m->vmp_free_when_done) {
			assert(m->vmp_busy);
			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
			assert(m->vmp_wire_count == 1);
			m->vmp_cleaning = FALSE;
			m->vmp_free_when_done = FALSE;
			/*
			 * Revoke all access to the page. Since the object is
			 * locked, and the page is busy, this prevents the page
			 * from being dirtied after the pmap_disconnect() call
			 * returns.
			 *
			 * Since the page is left "dirty" but "not modified", we
			 * can detect whether the page was redirtied during
			 * pageout by checking the modify state.
			 */
			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			} else {
				m->vmp_dirty = FALSE;
			}

			if (m->vmp_dirty) {
				/* redirtied while being paged out: keep it */
				vm_page_unwire(m, TRUE);        /* reactivates */
				counter_inc(&vm_statistics_reactivations);
				PAGE_WAKEUP_DONE(m);
			} else {
				vm_page_free(m);  /* clears busy, etc. */
			}
			vm_page_unlock_queues();
			continue;
		}
		/*
		 * Handle the "adjacent" pages. These pages were cleaned in
		 * place, and should be left alone.
		 * If prep_pin_count is nonzero, then someone is using the
		 * page, so make it active.
		 */
		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
			if (m->vmp_reference) {
				vm_page_activate(m);
			} else {
				vm_page_deactivate(m);
			}
		}
		if (m->vmp_overwriting) {
			/*
			 * the (COPY_OUT_FROM == FALSE) request_page_list case
			 */
			if (m->vmp_busy) {
				/*
				 * We do not re-set m->vmp_dirty !
				 * The page was busy so no extraneous activity
				 * could have occurred. COPY_INTO is a read into the
				 * new pages. CLEAN_IN_PLACE does actually write
				 * out the pages but handling outside of this code
				 * will take care of resetting dirty. We clear the
				 * modify however for the Programmed I/O case.
				 */
				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

				m->vmp_busy = FALSE;
				m->vmp_absent = FALSE;
			} else {
				/*
				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
				 * Occurs when the original page was wired
				 * at the time of the list request
				 */
				assert(VM_PAGE_WIRED(m));
				vm_page_unwire(m, TRUE);        /* reactivates */
			}
			m->vmp_overwriting = FALSE;
		} else {
			m->vmp_dirty = FALSE;
		}
		m->vmp_cleaning = FALSE;

		/*
		 * Wakeup any thread waiting for the page to be un-cleaning.
		 */
		PAGE_WAKEUP(m);
		vm_page_unlock_queues();
	}
	/*
	 * Account for the paging reference taken in vm_paging_object_allocate.
	 */
	vm_object_activity_end(shadow_object);
	vm_object_unlock(shadow_object);

	/* the pageout object itself must now be fully drained */
	assert(object->ref_count == 0);
	assert(object->paging_in_progress == 0);
	assert(object->activity_in_progress == 0);
	assert(object->resident_page_count == 0);
	return;
}
512 
/*
 * Routine:	vm_pageclean_setup
 *
 * Purpose:	setup a page to be cleaned (made non-dirty), but not
 *		necessarily flushed from the VM page cache.
 *		This is accomplished by cleaning in place.
 *
 *		The page must not be busy, and new_object
 *		must be locked.
 *
 *		m:          the real page to be cleaned in place
 *		new_m:      a fictitious page, converted here into a private
 *		            shadow of "m" (shares m's physical page)
 *		new_object: the (pageout) object new_m is inserted into
 *		new_offset: offset of new_m within new_object
 */
static void
vm_pageclean_setup(
	vm_page_t               m,
	vm_page_t               new_m,
	vm_object_t             new_object,
	vm_object_offset_t      new_offset)
{
	assert(!m->vmp_busy);
#if 0
	assert(!m->vmp_cleaning);
#endif

	/* drop any pmap modify state before we start tracking redirtying */
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

	/*
	 * Mark original page as cleaning in place.
	 */
	m->vmp_cleaning = TRUE;
	SET_PAGE_DIRTY(m, FALSE);
	m->vmp_precious = FALSE;

	/*
	 * Convert the fictitious page to a private shadow of
	 * the real page.
	 */
	assert(new_m->vmp_fictitious);
	assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
	new_m->vmp_fictitious = FALSE;
	new_m->vmp_private = TRUE;
	/* the shadow page is freed when the pageout completes */
	new_m->vmp_free_when_done = TRUE;
	VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));

	vm_page_lockspin_queues();
	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
	vm_page_unlock_queues();

	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
	assert(!new_m->vmp_wanted);
	new_m->vmp_busy = FALSE;
}
564 
/*
 *	Routine:	vm_pageout_initialize_page
 *	Purpose:
 *		Causes the specified page to be initialized in
 *		the appropriate memory object. This routine is used to push
 *		pages into a copy-object when they are modified in the
 *		permanent object.
 *
 *		The page is moved to a temporary object and paged out.
 *
 *	In/out conditions:
 *		The page in question must not be on any pageout queues.
 *		The object to which it belongs must be locked.
 *		The page must be busy, but not hold a paging reference.
 *
 *	Implementation:
 *		Move this page to a completely new object.
 */
void
vm_pageout_initialize_page(
	vm_page_t       m)
{
	vm_object_t             object;
	vm_object_offset_t      paging_offset;
	memory_object_t         pager;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	object = VM_PAGE_OBJECT(m);

	assert(m->vmp_busy);
	assert(object->internal);

	/*
	 *	Verify that we really want to clean this page
	 */
	assert(!m->vmp_absent);
	assert(m->vmp_dirty);

	/*
	 *	Create a paging reference to let us play with the object.
	 */
	paging_offset = m->vmp_offset + object->paging_offset;

	if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
		panic("reservation without pageout?"); /* alan */

		/* NOTE(review): unreachable — panic() does not return. */
		VM_PAGE_FREE(m);
		vm_object_unlock(object);

		return;
	}

	/*
	 * If there's no pager, then we can't clean the page.  This should
	 * never happen since this should be a copy object and therefore not
	 * an external object, so the pager should always be there.
	 */

	pager = object->pager;

	if (pager == MEMORY_OBJECT_NULL) {
		panic("missing pager for copy object");

		/* NOTE(review): unreachable — panic() does not return. */
		VM_PAGE_FREE(m);
		return;
	}

	/*
	 * set the page for future call to vm_fault_list_request
	 */
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
	SET_PAGE_DIRTY(m, FALSE);

	/*
	 * keep the object from collapsing or terminating
	 */
	vm_object_paging_begin(object);
	vm_object_unlock(object);

	/*
	 *	Write the data to its pager.
	 *	Note that the data is passed by naming the new object,
	 *	not a virtual address; the pager interface has been
	 *	manipulated to use the "internal memory" data type.
	 *	[The object reference from its allocation is donated
	 *	to the eventual recipient.]
	 */
	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);

	vm_object_lock(object);
	vm_object_paging_end(object);
}
658 
659 
660 /*
661  * vm_pageout_cluster:
662  *
663  * Given a page, queue it to the appropriate I/O thread,
664  * which will page it out and attempt to clean adjacent pages
665  * in the same operation.
666  *
667  * The object and queues must be locked. We will take a
668  * paging reference to prevent deallocation or collapse when we
669  * release the object lock back at the call site.  The I/O thread
670  * is responsible for consuming this reference
671  *
672  * The page must not be on any pageout queue.
673  */
#if DEVELOPMENT || DEBUG
/* Compressor-thread statistics, gathered only on development/debug kernels. */
vmct_stats_t vmct_stats;

int32_t vmct_active = 0;                 /* number of compressor threads currently active */
/* epoch boundary timestamps for compressor activity — maintained by the
 * compressor threads (not visible in this file section) */
uint64_t vm_compressor_epoch_start = 0;
uint64_t vm_compressor_epoch_stop = 0;

/* Lifecycle state of each compressor thread. */
typedef enum vmct_state_t {
	VMCT_IDLE,      /* parked, waiting for work */
	VMCT_AWAKENED,  /* woken, not yet running */
	VMCT_ACTIVE,    /* processing its queue */
} vmct_state_t;
vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
#endif
688 
689 
690 
/*
 * vm_pageout_cluster_to_queue:
 *
 * Queue page "m" on pageout queue "q" for laundering by the
 * corresponding I/O thread.
 *
 * The object and the page queues must be locked.  An activity
 * reference is taken on the object here; per the contract above,
 * the I/O thread is responsible for consuming it.
 */
static void
vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
{
	event_t     wakeup_event;
	vm_object_t object = VM_PAGE_OBJECT(m);

	VM_PAGE_CHECK(m);
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	/*
	 * Make sure it's OK to page this out.
	 */
	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
	assert(!m->vmp_cleaning && !m->vmp_laundry);
	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);

	/*
	 * protect the object from collapse or termination
	 */
	vm_object_activity_begin(object);

	/*
	 * NOTE: the wakeup event is selected from the global internal/
	 * external queues based on the object type, independently of the
	 * queue "q" the page is actually enqueued on.
	 */
	if (object->internal == TRUE) {
		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

		/* internal pages are marked busy while queued — presumably
		 * until the compressor thread is done with them; see the
		 * internal I/O thread for confirmation */
		m->vmp_busy = TRUE;
		wakeup_event = (event_t) &(vm_pageout_queue_internal.pgo_pending);
	} else {
		wakeup_event = (event_t) &(vm_pageout_queue_external.pgo_pending);
	}

	/*
	 * pgo_laundry count is tied to the laundry bit
	 */
	m->vmp_laundry = TRUE;
	q->pgo_laundry++;

	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);

	/* kick the I/O thread if it has gone idle */
	if (q->pgo_idle == TRUE) {
		q->pgo_idle = FALSE;
		thread_wakeup(wakeup_event);
	}
	VM_PAGE_CHECK(m);
}
737 
738 void
vm_pageout_cluster(vm_page_t m)739 vm_pageout_cluster(vm_page_t m)
740 {
741 	struct          vm_pageout_queue *q;
742 	vm_object_t     object = VM_PAGE_OBJECT(m);
743 	if (object->internal) {
744 		q = &vm_pageout_queue_internal;
745 	} else {
746 		q = &vm_pageout_queue_external;
747 	}
748 	vm_pageout_cluster_to_queue(m, q);
749 }
750 
751 
/*
 * A page is back from laundry or we are stealing it back from
 * the laundering state.  See if there are some pages waiting to
 * go to laundry and if we can let some of them go now.
 *
 * Object and page queues must be locked.
 */
void
vm_pageout_throttle_up(
	vm_page_t       m)
{
	struct vm_pageout_queue *q;
	vm_object_t      m_object;

	m_object = VM_PAGE_OBJECT(m);

	assert(m_object != VM_OBJECT_NULL);
	assert(m_object != kernel_object);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(m_object);

	/* select the pageout queue this page belongs to */
	if (m_object->internal == TRUE) {
		q = &vm_pageout_queue_internal;
	} else {
		q = &vm_pageout_queue_external;
	}

	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
		/*
		 * Steal the page back off the pageout queue before the I/O
		 * thread got to it, and drop the activity reference taken
		 * when it was queued (vm_pageout_cluster_to_queue).
		 */
		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
		m->vmp_q_state = VM_PAGE_NOT_ON_Q;

		VM_PAGE_ZERO_PAGEQ_ENTRY(m);

		vm_object_activity_end(m_object);

		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
	}
	if (m->vmp_laundry == TRUE) {
		/* pgo_laundry count is tied to the laundry bit */
		m->vmp_laundry = FALSE;
		q->pgo_laundry--;

		if (q->pgo_throttled == TRUE) {
			q->pgo_throttled = FALSE;
			thread_wakeup((event_t) &q->pgo_laundry);
		}
		/*
		 * "&q->pgo_laundry + 1" is a distinct event address, used
		 * by waiters draining the queue to empty.
		 */
		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
			q->pgo_draining = FALSE;
			thread_wakeup((event_t) (&q->pgo_laundry + 1));
		}
		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
	}
}
805 
806 
/*
 * Batched equivalent of the laundry-count maintenance in
 * vm_pageout_throttle_up(): retire "batch_cnt" pages' worth of
 * laundry from queue "q" in one step and deliver the same wakeups.
 *
 * The page queues lock must be held.
 */
static void
vm_pageout_throttle_up_batch(
	struct vm_pageout_queue *q,
	int             batch_cnt)
{
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);

	q->pgo_laundry -= batch_cnt;

	/* wake anyone throttled waiting for laundry to drop */
	if (q->pgo_throttled == TRUE) {
		q->pgo_throttled = FALSE;
		thread_wakeup((event_t) &q->pgo_laundry);
	}
	/*
	 * "&q->pgo_laundry + 1" is a distinct event address, used by
	 * waiters draining the queue to empty.
	 */
	if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
		q->pgo_draining = FALSE;
		thread_wakeup((event_t) (&q->pgo_laundry + 1));
	}
}
827 
828 
829 
830 /*
831  * VM memory pressure monitoring.
832  *
833  * vm_pageout_scan() keeps track of the number of pages it considers and
834  * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
835  *
836  * compute_memory_pressure() is called every second from compute_averages()
837  * and moves "vm_pageout_stat_now" forward, to start accumulating the number
 * of reclaimed pages in a new vm_pageout_stat[] bucket.
839  *
840  * mach_vm_pressure_monitor() collects past statistics about memory pressure.
841  * The caller provides the number of seconds ("nsecs") worth of statistics
842  * it wants, up to 30 seconds.
843  * It computes the number of pages reclaimed in the past "nsecs" seconds and
844  * also returns the number of pages the system still needs to reclaim at this
845  * moment in time.
846  */
/*
 * Size of the rolling window of pageout statistics buckets.
 * One bucket is accumulated per sampling tick (the monitor works in
 * eighths of a second — see mach_vm_pressure_monitor()); keep 30
 * seconds of history on development/debug kernels and 1 second on
 * release kernels, plus one slot for the bucket being accumulated.
 *
 * Fully parenthesized so the macro expands safely inside any
 * surrounding expression (CERT C PRE02-C).
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif
struct vm_pageout_stat {
	/* queue-size snapshots */
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	/* per-bucket activity counters */
	unsigned int pages_grabbed;
	unsigned int pages_freed;

	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, };

/* index of the bucket currently being accumulated */
unsigned int vm_pageout_stat_now = 0;

/* circular index arithmetic on the stats window */
#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)

#if VM_PAGE_BUCKETS_CHECK
int vm_page_buckets_check_interval = 80; /* in eighths of a second */
#endif /* VM_PAGE_BUCKETS_CHECK */
915 
916 
917 void
918 record_memory_pressure(void);
919 void
record_memory_pressure(void)920 record_memory_pressure(void)
921 {
922 	unsigned int vm_pageout_next;
923 
924 #if VM_PAGE_BUCKETS_CHECK
925 	/* check the consistency of VM page buckets at regular interval */
926 	static int counter = 0;
927 	if ((++counter % vm_page_buckets_check_interval) == 0) {
928 		vm_page_buckets_check();
929 	}
930 #endif /* VM_PAGE_BUCKETS_CHECK */
931 
932 	vm_pageout_state.vm_memory_pressure =
933 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
934 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
935 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
936 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
937 
938 	commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
939 
940 	/* move "now" forward */
941 	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
942 
943 	bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
944 
945 	vm_pageout_stat_now = vm_pageout_next;
946 }
947 
948 
949 /*
950  * IMPORTANT
951  * mach_vm_ctl_page_free_wanted() is called indirectly, via
952  * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
953  * it must be safe in the restricted stackshot context. Locks and/or
954  * blocking are not allowable.
955  */
956 unsigned int
mach_vm_ctl_page_free_wanted(void)957 mach_vm_ctl_page_free_wanted(void)
958 {
959 	unsigned int page_free_target, page_free_count, page_free_wanted;
960 
961 	page_free_target = vm_page_free_target;
962 	page_free_count = vm_page_free_count;
963 	if (page_free_target > page_free_count) {
964 		page_free_wanted = page_free_target - page_free_count;
965 	} else {
966 		page_free_wanted = 0;
967 	}
968 
969 	return page_free_wanted;
970 }
971 
972 
973 /*
974  * IMPORTANT:
975  * mach_vm_pressure_monitor() is called when taking a stackshot, with
976  * wait_for_pressure FALSE, so that code path must remain safe in the
977  * restricted stackshot context. No blocking or locks are allowable.
978  * on that code path.
979  */
980 
/*
 * Report memory-pressure information to the caller.
 *
 * wait_for_pressure:   if TRUE, block (interruptibly) until the free-page
 *                      count drops below target; if FALSE this function
 *                      must not block or take locks (stackshot path).
 * nsecs_monitored:     size of the look-back window for pages_reclaimed;
 *                      multiplied by 8 below, which matches the stats ring
 *                      advancing in eighths of a second — NOTE(review):
 *                      despite the name this appears to be seconds, confirm.
 * pages_reclaimed_p:   out (may be NULL): pages reclaimed over the window.
 * pages_wanted_p:      out (may be NULL): current free-page shortfall.
 *
 * Returns KERN_ABORTED if the wait is interrupted, else KERN_SUCCESS.
 */
kern_return_t
mach_vm_pressure_monitor(
	boolean_t       wait_for_pressure,
	unsigned int    nsecs_monitored,
	unsigned int    *pages_reclaimed_p,
	unsigned int    *pages_wanted_p)
{
	wait_result_t   wr;
	unsigned int    vm_pageout_then, vm_pageout_now;
	unsigned int    pages_reclaimed;
	unsigned int    units_of_monitor;

	/* number of stats-ring slots to walk for the reclaim count */
	units_of_monitor = 8 * nsecs_monitored;
	/*
	 * We don't take the vm_page_queue_lock here because we don't want
	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
	 * thread when it's trying to reclaim memory.  We don't need fully
	 * accurate monitoring anyway...
	 */

	if (wait_for_pressure) {
		/* wait until there's memory pressure */
		while (vm_page_free_count >= vm_page_free_target) {
			wr = assert_wait((event_t) &vm_page_free_wanted,
			    THREAD_INTERRUPTIBLE);
			if (wr == THREAD_WAITING) {
				wr = thread_block(THREAD_CONTINUE_NULL);
			}
			/* caller was interrupted (e.g. signal): give up */
			if (wr == THREAD_INTERRUPTED) {
				return KERN_ABORTED;
			}
			if (wr == THREAD_AWAKENED) {
				/*
				 * The memory pressure might have already
				 * been relieved but let's not block again
				 * and let's report that there was memory
				 * pressure at some point.
				 */
				break;
			}
		}
	}

	/* provide the number of pages the system wants to reclaim */
	if (pages_wanted_p != NULL) {
		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
	}

	if (pages_reclaimed_p == NULL) {
		return KERN_SUCCESS;
	}

	/* provide number of pages reclaimed in the last "nsecs_monitored" */
	vm_pageout_now = vm_pageout_stat_now;
	pages_reclaimed = 0;
	/*
	 * Walk backwards through the stats ring, summing every reclaim
	 * category, until we either wrap back to "now" or exhaust the
	 * requested monitoring window.
	 */
	for (vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
	    vm_pageout_then != vm_pageout_now &&
	    units_of_monitor-- != 0;
	    vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
	}
	*pages_reclaimed_p = pages_reclaimed;

	return KERN_SUCCESS;
}
1051 
1052 
1053 
1054 #if DEVELOPMENT || DEBUG
1055 
1056 static void
1057 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1058 
1059 /*
1060  * condition variable used to make sure there is
1061  * only a single sweep going on at a time
1062  */
1063 boolean_t       vm_pageout_disconnect_all_pages_active = FALSE;
1064 
1065 
1066 void
vm_pageout_disconnect_all_pages()1067 vm_pageout_disconnect_all_pages()
1068 {
1069 	vm_page_lock_queues();
1070 
1071 	if (vm_pageout_disconnect_all_pages_active == TRUE) {
1072 		vm_page_unlock_queues();
1073 		return;
1074 	}
1075 	vm_pageout_disconnect_all_pages_active = TRUE;
1076 	vm_page_unlock_queues();
1077 
1078 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1079 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1080 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);
1081 
1082 	vm_pageout_disconnect_all_pages_active = FALSE;
1083 }
1084 
1085 
/*
 * Examine up to "qcount" pages starting at the head of queue "q" and call
 * pmap_disconnect() on each page that is alive, quiescent and currently
 * pmapped.  Every examined page is removed from and re-entered on the
 * queue, so the loop makes forward progress and terminates after qcount
 * iterations even though pages are never taken off the queue for good.
 *
 * Locking: runs with the vm_page_queue_lock held; object locks are taken
 * with try-locks only (we already hold the queues lock), backing off with
 * mutex_pause() when contended.  Emits kdebug begin/end events with the
 * disconnect / lock / pause counters for post-hoc analysis.
 */
void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t       m;
	vm_object_t     t_object = NULL;        /* last object whose try-lock failed */
	vm_object_t     l_object = NULL;        /* object we currently hold locked */
	vm_object_t     m_object = NULL;        /* object of the candidate page */
	int             delayed_unlock = 0;
	int             try_failed_count = 0;
	int             disconnected_count = 0;
	int             paused_count = 0;
	int             object_locked_count = 0;

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
	    q, qcount, 0, 0, 0);

	vm_page_lock_queues();

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			/* new object: reset the contention back-off counter */
			if (m_object != t_object) {
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've alread got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				/* give up on this page after ~20 failed attempts */
				if (try_failed_count > 20) {
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				paused_count++;

				t_object = m_object;
				continue;
			}
			object_locked_count++;

			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		/* page has (or had) pmap mappings: tear them all down */
		if (m->vmp_pmapped == TRUE) {
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

			disconnected_count++;
		}
reenter_pg_on_q:
		/* rotate the page so the walk advances past it */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);

		qcount--;
		try_failed_count = 0;

		/* periodically yield the queues lock to avoid starving others */
		if (delayed_unlock++ > 128) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
	    q, disconnected_count, object_locked_count, paused_count, 0);
}
1190 
1191 extern char* proc_best_name(struct proc* proc);
1192 
1193 int
vm_toggle_task_selfdonate_pages(task_t task)1194 vm_toggle_task_selfdonate_pages(task_t task)
1195 {
1196 	int state = 0;
1197 	if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1198 		printf("VM Donation mode is OFF on the system\n");
1199 		return state;
1200 	}
1201 	if (task != kernel_task) {
1202 		task_lock(task);
1203 		if (!task->donates_own_pages) {
1204 			printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1205 			task->donates_own_pages = true;
1206 			state = 1;
1207 		} else if (task->donates_own_pages) {
1208 			printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1209 			task->donates_own_pages = false;
1210 			state = 0;
1211 		}
1212 		task_unlock(task);
1213 	}
1214 	return state;
1215 }
1216 #endif /* DEVELOPMENT || DEBUG */
1217 
/*
 * Set (rather than toggle) a task's "donates its own pages" flag.
 * Callers must ensure donation is enabled system-wide and must not
 * pass the kernel task; both are asserted here.
 */
void
vm_task_set_selfdonate_pages(task_t task, bool donate)
{
	assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
	assert(task != kernel_task);

	/* task_lock serializes updates against vm_toggle_task_selfdonate_pages() */
	task_lock(task);
	task->donates_own_pages = donate;
	task_unlock(task);
}
1228 
1229 
1230 
1231 static size_t
1232 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1233 
1234 /*
1235  * condition variable used to make sure there is
1236  * only a single sweep going on at a time
1237  */
1238 boolean_t       vm_pageout_anonymous_pages_active = FALSE;
1239 
1240 
1241 void
vm_pageout_anonymous_pages()1242 vm_pageout_anonymous_pages()
1243 {
1244 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1245 		vm_page_lock_queues();
1246 
1247 		if (vm_pageout_anonymous_pages_active == TRUE) {
1248 			vm_page_unlock_queues();
1249 			return;
1250 		}
1251 		vm_pageout_anonymous_pages_active = TRUE;
1252 		vm_page_unlock_queues();
1253 
1254 		vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1255 		vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1256 		vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1257 
1258 		if (VM_CONFIG_SWAP_IS_PRESENT) {
1259 			vm_consider_swapping();
1260 		}
1261 
1262 		vm_page_lock_queues();
1263 		vm_pageout_anonymous_pages_active = FALSE;
1264 		vm_page_unlock_queues();
1265 	}
1266 }
1267 
1268 
/*
 * Walk up to "qcount" pages from the head of queue "q" and hand every
 * internal, unreferenced, dirty/precious page to the compressor pageout
 * queue (vm_pageout_queue_internal, or the benchmark queue when
 * perf_test is set on DEVELOPMENT/DEBUG kernels).  Clean pages are freed
 * outright; referenced or otherwise busy pages are rotated back onto the
 * queue.  Returns the number of pages moved to the pageout queue.
 *
 * Locking: holds the vm_page_queue_lock across the walk, taking object
 * locks with try-locks only and backing off via mutex_pause() when
 * contended.  Blocks (interruptibly) when the target pageout queue is
 * throttled, waiting for the pageout thread to drain it.
 */
size_t
vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
{
	vm_page_t       m;
	vm_object_t     t_object = NULL;        /* last object whose try-lock failed */
	vm_object_t     l_object = NULL;        /* object we currently hold locked */
	vm_object_t     m_object = NULL;        /* object of the candidate page */
	int             delayed_unlock = 0;
	int             try_failed_count = 0;
	int             refmod_state;
	int             pmap_options;
	struct          vm_pageout_queue *iq;
	ppnum_t         phys_page;
	size_t          pages_moved = 0;


	iq = &vm_pageout_queue_internal;

	vm_page_lock_queues();

#if DEVELOPMENT || DEBUG
	if (perf_test) {
		iq = &vm_pageout_queue_benchmark;
	}
#endif /* DEVELOPMENT ||DEBUG */

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		if (VM_PAGE_Q_THROTTLED(iq)) {
			/*
			 * target queue is full: drop our locks and sleep until
			 * the pageout thread signals that it has drained.
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			iq->pgo_draining = TRUE;

			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
			vm_page_unlock_queues();

			thread_block(THREAD_CONTINUE_NULL);

			vm_page_lock_queues();
			delayed_unlock = 0;
			continue;
		}
		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/* only internal (anonymous) objects go to the compressor */
			if (!m_object->internal) {
				goto reenter_pg_on_q;
			}

			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've alread got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				t_object = m_object;
				continue;
			}
			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(m);

		/* pull current referenced/modified state up from the pmap */
		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(phys_page);

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}
		/* recently referenced: clear the bit and give it another pass */
		if (m->vmp_reference == TRUE) {
			m->vmp_reference = FALSE;
			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			if (m->vmp_dirty || m->vmp_precious) {
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		/* clean and not precious: no need to compress, just free it */
		if (!m->vmp_dirty && !m->vmp_precious) {
			vm_page_unlock_queues();
			VM_PAGE_FREE(m);
			vm_page_lock_queues();
			delayed_unlock = 0;

			goto next_pg;
		}
		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
			if (!m_object->pager_initialized) {
				vm_page_unlock_queues();

				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);

				if (!m_object->pager_initialized) {
					vm_object_compressor_pager_create(m_object);
				}

				vm_page_lock_queues();
				delayed_unlock = 0;
			}
			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
				goto reenter_pg_on_q;
			}
			/*
			 * vm_object_compressor_pager_create will drop the object lock
			 * which means 'm' may no longer be valid to use
			 */
			continue;
		}

		if (!perf_test) {
			/*
			 * we've already factored out pages in the laundry which
			 * means this page can't be on the pageout queue so it's
			 * safe to do the vm_page_queues_remove
			 */
			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			vm_page_queues_remove(m, TRUE);
			if (donate) {
				/*
				 * The compressor needs to see this bit to know
				 * where this page needs to land. Also if stolen,
				 * this bit helps put the page back in the right
				 * special queue where it belongs.
				 */
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
			}
		} else {
			vm_page_queue_remove(q, m, vmp_pageq);
		}

		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		vm_pageout_cluster_to_queue(m, iq);

		pages_moved++;
		goto next_pg;

reenter_pg_on_q:
		/* rotate the page so the walk advances past it */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);
next_pg:
		qcount--;
		try_failed_count = 0;

		/* periodically yield the queues lock to avoid starving others */
		if (delayed_unlock++ > 128) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();
	return pages_moved;
}
1479 
1480 
1481 
1482 /*
1483  * function in BSD to apply I/O throttle to the pageout thread
1484  */
1485 extern void vm_pageout_io_throttle(void);
1486 
/*
 * Revert a "reusable" page (or a whole all-reusable object's range around
 * it) back to normal accounting when the page turns out to be in use.
 */
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
	MACRO_BEGIN                                                     \
	/* \
	 * If a "reusable" page somehow made it back into \
	 * the active queue, it's been re-used and is not \
	 * quite re-usable. \
	 * If the VM object was "all_reusable", consider it \
	 * as "all re-used" instead of converting it to \
	 * "partially re-used", which could be expensive. \
	 */                                                             \
	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
	if ((m)->vmp_reusable ||                                        \
	    (obj)->all_reusable) {                                      \
	        vm_object_reuse_pages((obj),                            \
	                              (m)->vmp_offset,                  \
	                              (m)->vmp_offset + PAGE_SIZE_64,   \
	                              FALSE);                           \
	}                                                               \
	MACRO_END


/* how many pages may be processed before the page-queue lock is yielded */
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT         64
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024

/* flow-control states used by vm_pageout_scan's throttle logic */
#define FCS_IDLE                0
#define FCS_DELAYED             1
#define FCS_DEADLOCK_DETECTED   2

struct flow_control {
	int             state;          /* one of the FCS_* values above */
	mach_timespec_t ts;             /* deadline for the current delay */
};


/* counters: pages rejected/skipped on the internal/external background queues */
uint64_t vm_pageout_rejected_bq_internal = 0;
uint64_t vm_pageout_rejected_bq_external = 0;
uint64_t vm_pageout_skipped_bq_internal = 0;
uint64_t vm_pageout_skipped_bq_external = 0;

/* max anonymous pages grabbed in a row before considering file-backed pages */
#define ANONS_GRABBED_LIMIT     2


#if 0
static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
#endif
static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);

/* "action" argument values for vm_pageout_prepare_to_block() */
#define VM_PAGEOUT_PB_NO_ACTION                         0
#define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
#define VM_PAGEOUT_PB_THREAD_YIELD                      2
1538 
1539 #if 0
1540 static void
1541 vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
1542 {
1543 	if (*local_freeq) {
1544 		vm_page_unlock_queues();
1545 
1546 		VM_DEBUG_CONSTANT_EVENT(
1547 			vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1548 			vm_page_free_count, 0, 0, 1);
1549 
1550 		vm_page_free_list(*local_freeq, TRUE);
1551 
1552 		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1553 		    vm_page_free_count, *local_freed, 0, 1);
1554 
1555 		*local_freeq = NULL;
1556 		*local_freed = 0;
1557 
1558 		vm_page_lock_queues();
1559 	} else {
1560 		lck_mtx_yield(&vm_page_queue_lock);
1561 	}
1562 	*delayed_unlock = 1;
1563 }
1564 #endif
1565 
1566 
1567 static void
vm_pageout_prepare_to_block(vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int action)1568 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1569     vm_page_t *local_freeq, int *local_freed, int action)
1570 {
1571 	vm_page_unlock_queues();
1572 
1573 	if (*object != NULL) {
1574 		vm_object_unlock(*object);
1575 		*object = NULL;
1576 	}
1577 	if (*local_freeq) {
1578 		vm_page_free_list(*local_freeq, TRUE);
1579 
1580 		*local_freeq = NULL;
1581 		*local_freed = 0;
1582 	}
1583 	*delayed_unlock = 1;
1584 
1585 	switch (action) {
1586 	case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1587 		vm_consider_waking_compactor_swapper();
1588 		break;
1589 	case VM_PAGEOUT_PB_THREAD_YIELD:
1590 		thread_yield_internal(1);
1591 		break;
1592 	case VM_PAGEOUT_PB_NO_ACTION:
1593 	default:
1594 		break;
1595 	}
1596 	vm_page_lock_queues();
1597 }
1598 
1599 
1600 static struct vm_pageout_vminfo last;
1601 
1602 uint64_t last_vm_page_pages_grabbed = 0;
1603 
1604 extern  uint32_t c_segment_pages_compressed;
1605 
1606 extern uint64_t shared_region_pager_reclaimed;
1607 extern struct memory_object_pager_ops shared_region_pager_ops;
1608 
/*
 * Snapshot the current VM state into the active slot of vm_pageout_stats.
 *
 * Absolute page counts are copied directly; monotonically increasing
 * counters from vm_pageout_vminfo are converted to per-interval deltas by
 * subtracting the value remembered in "last" from the previous call
 * (unsigned subtraction, so counter wrap is benign).  The snapshot is then
 * emitted via kdebug VM_INFO* tracepoints, and record_memory_pressure()
 * closes out the slot and advances the ring.
 */
void
update_vm_info(void)
{
	unsigned long tmp;
	uint64_t tmp64;

	/* absolute queue/page counts for this interval */
	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;


	/* cumulative counters -> per-interval deltas (update "last" as we go) */
	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
	last.vm_pageout_considered_page = tmp;

	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
	last.vm_pageout_compressions = tmp64;

	tmp = vm_pageout_vminfo.vm_compressor_failed;
	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
	last.vm_compressor_failed = tmp;

	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
	last.vm_compressor_pages_grabbed = tmp64;

	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
	last.vm_phantom_cache_found_ghost = tmp;

	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
	last.vm_phantom_cache_added_ghost = tmp;

	tmp64 = counter_load(&vm_page_grab_count);
	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
	last_vm_page_pages_grabbed = tmp64;

	tmp = vm_pageout_vminfo.vm_page_pages_freed;
	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
	last.vm_page_pages_freed = tmp;


	/* detailed pageout-scan deltas are only tracked when any page was considered */
	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
		last.vm_pageout_pages_evicted = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
		last.vm_pageout_pages_purged = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
		last.vm_pageout_freed_speculative = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
		last.vm_pageout_freed_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
		last.vm_pageout_inactive_referenced = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
		last.vm_pageout_scan_inactive_throttled_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
		last.vm_pageout_inactive_dirty_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
		last.vm_pageout_freed_cleaned = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
		last.vm_pageout_inactive_nolock = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
		last.vm_pageout_scan_inactive_throttled_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
		last.vm_pageout_skipped_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
		last.vm_pageout_skipped_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
		last.vm_pageout_reactivation_limit_exceeded = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
		last.vm_pageout_inactive_force_reclaim = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
		last.vm_pageout_freed_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
		last.vm_pageout_considered_bq_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
		last.vm_pageout_considered_bq_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
		last.vm_pageout_filecache_min_reactivated = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
		last.vm_pageout_inactive_dirty_internal = tmp;
	}

	/* publish the snapshot through the kdebug VM_INFO* tracepoints */
	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
	    0);

	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
	    0,
	    0);

	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
	    0);

	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].considered,
		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
		    vm_pageout_stats[vm_pageout_stat_now].skipped_external,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
		    vm_pageout_stats[vm_pageout_stat_now].freed_internal,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
		    0);
	}
	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
	    0);

	/* close out this slot and advance the stats ring */
	record_memory_pressure();
}
1809 
1810 extern boolean_t hibernation_vmqueues_inspection;
1811 
1812 /*
1813  * Return values for functions called by vm_pageout_scan
1814  * that control its flow.
1815  *
1816  * PROCEED -- vm_pageout_scan will keep making forward progress.
1817  * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1818  * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1819  */
1820 
1821 #define VM_PAGEOUT_SCAN_PROCEED                 (0)
1822 #define VM_PAGEOUT_SCAN_DONE_RETURN             (1)
1823 #define VM_PAGEOUT_SCAN_NEXT_ITERATION          (2)
1824 
/*
 * This function is called only from vm_pageout_scan and
 * it moves overflow secluded pages (one-at-a-time) to the
 * batched 'local' free Q or active Q.
 *
 * local_freeq / local_freed: the caller's batched free list (linked
 * through vmp_snext) and its length; pages pushed here are freed in
 * bulk later by the caller.
 *
 * NOTE(review): the queue manipulation below (vm_page_queue_first,
 * vm_page_queues_remove, vm_page_enqueue_active) implies the vm_page
 * queues lock is held on entry — confirm against vm_pageout_scan.
 */
static void
vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
{
#if CONFIG_SECLUDED_MEMORY
	/*
	 * Deal with secluded_q overflow.
	 */
	if (vm_page_secluded_count > vm_page_secluded_target) {
		vm_page_t secluded_page;

		/*
		 * SECLUDED_AGING_BEFORE_ACTIVE:
		 * Excess secluded pages go to the active queue and
		 * will later go to the inactive queue.
		 */
		assert((vm_page_secluded_count_free +
		    vm_page_secluded_count_inuse) ==
		    vm_page_secluded_count);
		/* only the oldest secluded page is moved per call */
		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);

		vm_page_queues_remove(secluded_page, FALSE);
		assert(!secluded_page->vmp_fictitious);
		assert(!VM_PAGE_WIRED(secluded_page));

		if (secluded_page->vmp_object == 0) {
			/* transfer to free queue */
			assert(secluded_page->vmp_busy);
			secluded_page->vmp_snext = *local_freeq;
			*local_freeq = secluded_page;
			*local_freed += 1;
		} else {
			/* transfer to head of active queue */
			vm_page_enqueue_active(secluded_page, FALSE);
			secluded_page = VM_PAGE_NULL;
		}
	}
#else /* CONFIG_SECLUDED_MEMORY */

#pragma unused(local_freeq)
#pragma unused(local_freed)

	return;

#endif /* CONFIG_SECLUDED_MEMORY */
}
1876 
1877 /*
1878  * This function is called only from vm_pageout_scan and
1879  * it initializes the loop targets for vm_pageout_scan().
1880  */
1881 static void
vps_init_page_targets(void)1882 vps_init_page_targets(void)
1883 {
1884 	/*
1885 	 * LD TODO: Other page targets should be calculated here too.
1886 	 */
1887 	vm_page_anonymous_min = vm_page_inactive_target / 20;
1888 
1889 	if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1890 		vm_pageout_state.vm_page_speculative_percentage = 50;
1891 	} else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1892 		vm_pageout_state.vm_page_speculative_percentage = 1;
1893 	}
1894 
1895 	vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1896 	    vm_page_inactive_count);
1897 }
1898 
1899 /*
1900  * This function is called only from vm_pageout_scan and
1901  * it purges a single VM object at-a-time and will either
1902  * make vm_pageout_scan() restart the loop or keeping moving forward.
1903  */
1904 static int
vps_purge_object()1905 vps_purge_object()
1906 {
1907 	int             force_purge;
1908 
1909 	assert(available_for_purge >= 0);
1910 	force_purge = 0; /* no force-purging */
1911 
1912 #if VM_PRESSURE_EVENTS
1913 	vm_pressure_level_t pressure_level;
1914 
1915 	pressure_level = memorystatus_vm_pressure_level;
1916 
1917 	if (pressure_level > kVMPressureNormal) {
1918 		if (pressure_level >= kVMPressureCritical) {
1919 			force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1920 		} else if (pressure_level >= kVMPressureUrgent) {
1921 			force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
1922 		} else if (pressure_level >= kVMPressureWarning) {
1923 			force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1924 		}
1925 	}
1926 #endif /* VM_PRESSURE_EVENTS */
1927 
1928 	if (available_for_purge || force_purge) {
1929 		memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
1930 
1931 		VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1932 		if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
1933 			VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
1934 			VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1935 			memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1936 
1937 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1938 		}
1939 		VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
1940 		memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1941 	}
1942 
1943 	return VM_PAGEOUT_SCAN_PROCEED;
1944 }
1945 
/*
 * This function is called only from vm_pageout_scan and
 * it will try to age the next speculative Q if the oldest
 * one is empty.
 *
 * force_speculative_aging: when TRUE, age the current bin regardless
 * of the speculative target or the bin's elapsed age.
 *
 * Returns VM_PAGEOUT_SCAN_NEXT_ITERATION when every speculative queue
 * turned out to be empty (stale count), otherwise
 * VM_PAGEOUT_SCAN_PROCEED.
 */
static int
vps_age_speculative_queue(boolean_t force_speculative_aging)
{
/* # of calls between re-checks of the oldest bin's age (see below) */
#define DELAY_SPECULATIVE_AGE   1000

	/*
	 * try to pull pages from the aging bins...
	 * see vm_page.h for an explanation of how
	 * this mechanism works
	 */
	boolean_t                       can_steal = FALSE;
	int                             num_scanned_queues;
	static int                      delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/
	mach_timespec_t                 ts;
	struct vm_speculative_age_q     *aq;
	struct vm_speculative_age_q     *sq;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	aq = &vm_page_queue_speculative[speculative_steal_index];

	/*
	 * advance speculative_steal_index (with wraparound) until we find
	 * a non-empty bin or we've looked at every bin once
	 */
	num_scanned_queues = 0;
	while (vm_page_queue_empty(&aq->age_q) &&
	    num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
		speculative_steal_index++;

		if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
		}

		aq = &vm_page_queue_speculative[speculative_steal_index];
	}

	if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
		/*
		 * XXX We've scanned all the speculative
		 * queues but still haven't found one
		 * that is not empty, even though
		 * vm_page_speculative_count is not 0.
		 */
		if (!vm_page_queue_empty(&sq->age_q)) {
			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
		}
#if DEVELOPMENT || DEBUG
		panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
#endif
		/* readjust... (release builds only: the count was stale) */
		vm_page_speculative_count = 0;
		/* ... and continue */
		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
	}

	if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
		/* over target (or forced): steal unconditionally */
		can_steal = TRUE;
	} else {
		if (!delay_speculative_age) {
			/*
			 * compute the time at which the current bin is fully aged
			 * (its creation time plus the max aging interval) and
			 * compare against "now"
			 */
			mach_timespec_t ts_fully_aged;

			ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
			ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
			    * 1000 * NSEC_PER_USEC;

			ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);

			clock_sec_t sec;
			clock_nsec_t nsec;
			clock_get_system_nanotime(&sec, &nsec);
			ts.tv_sec = (unsigned int) sec;
			ts.tv_nsec = nsec;

			if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
				can_steal = TRUE;
			} else {
				/* not aged yet: start skipping the time check */
				delay_speculative_age++;
			}
		} else {
			/* skip the (relatively expensive) time check for
			 * DELAY_SPECULATIVE_AGE calls, then re-arm it */
			delay_speculative_age++;
			if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
				delay_speculative_age = 0;
			}
		}
	}
	if (can_steal == TRUE) {
		vm_page_speculate_ageit(aq);
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
2039 
2040 /*
2041  * This function is called only from vm_pageout_scan and
2042  * it evicts a single VM object from the cache.
2043  */
2044 static int inline
vps_object_cache_evict(vm_object_t * object_to_unlock)2045 vps_object_cache_evict(vm_object_t *object_to_unlock)
2046 {
2047 	static int                      cache_evict_throttle = 0;
2048 	struct vm_speculative_age_q     *sq;
2049 
2050 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2051 
2052 	if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2053 		int     pages_evicted;
2054 
2055 		if (*object_to_unlock != NULL) {
2056 			vm_object_unlock(*object_to_unlock);
2057 			*object_to_unlock = NULL;
2058 		}
2059 		KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
2060 
2061 		pages_evicted = vm_object_cache_evict(100, 10);
2062 
2063 		KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
2064 
2065 		if (pages_evicted) {
2066 			vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2067 
2068 			VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2069 			    vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2070 			memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2071 
2072 			/*
2073 			 * we just freed up to 100 pages,
2074 			 * so go back to the top of the main loop
2075 			 * and re-evaulate the memory situation
2076 			 */
2077 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2078 		} else {
2079 			cache_evict_throttle = 1000;
2080 		}
2081 	}
2082 	if (cache_evict_throttle) {
2083 		cache_evict_throttle--;
2084 	}
2085 
2086 	return VM_PAGEOUT_SCAN_PROCEED;
2087 }
2088 
2089 
2090 /*
2091  * This function is called only from vm_pageout_scan and
2092  * it calculates the filecache min. that needs to be maintained
2093  * as we start to steal pages.
2094  */
2095 static void
vps_calculate_filecache_min(void)2096 vps_calculate_filecache_min(void)
2097 {
2098 	int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2099 
2100 #if CONFIG_JETSAM
2101 	/*
2102 	 * don't let the filecache_min fall below 15% of available memory
2103 	 * on systems with an active compressor that isn't nearing its
2104 	 * limits w/r to accepting new data
2105 	 *
2106 	 * on systems w/o the compressor/swapper, the filecache is always
2107 	 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2108 	 * since most (if not all) of the anonymous pages are in the
2109 	 * throttled queue (which isn't counted as available) which
2110 	 * effectively disables this filter
2111 	 */
2112 	if (vm_compressor_low_on_space() || divisor == 0) {
2113 		vm_pageout_state.vm_page_filecache_min = 0;
2114 	} else {
2115 		vm_pageout_state.vm_page_filecache_min =
2116 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2117 	}
2118 #else
2119 	if (vm_compressor_out_of_space() || divisor == 0) {
2120 		vm_pageout_state.vm_page_filecache_min = 0;
2121 	} else {
2122 		/*
2123 		 * don't let the filecache_min fall below the specified critical level
2124 		 */
2125 		vm_pageout_state.vm_page_filecache_min =
2126 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2127 	}
2128 #endif
2129 	if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2130 		vm_pageout_state.vm_page_filecache_min = 0;
2131 	}
2132 }
2133 
2134 /*
2135  * This function is called only from vm_pageout_scan and
2136  * it updates the flow control time to detect if VM pageoutscan
2137  * isn't making progress.
2138  */
2139 static void
vps_flow_control_reset_deadlock_timer(struct flow_control * flow_control)2140 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2141 {
2142 	mach_timespec_t ts;
2143 	clock_sec_t sec;
2144 	clock_nsec_t nsec;
2145 
2146 	ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2147 	ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2148 	clock_get_system_nanotime(&sec, &nsec);
2149 	flow_control->ts.tv_sec = (unsigned int) sec;
2150 	flow_control->ts.tv_nsec = nsec;
2151 	ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2152 
2153 	flow_control->state = FCS_DELAYED;
2154 
2155 	vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2156 }
2157 
/*
 * This function is called only from vm_pageout_scan and
 * it is the flow control logic of VM pageout scan which
 * controls if it should block and for how long.
 * Any blocking of vm_pageout_scan happens ONLY in this function.
 *
 * Entered with the vm_page queues lock held; the lock is dropped and
 * re-taken around the actual thread_block() below.
 *
 * Returns:
 *   VM_PAGEOUT_SCAN_PROCEED        -- no pause needed, keep scanning.
 *   VM_PAGEOUT_SCAN_NEXT_ITERATION -- we blocked and were woken up;
 *                                     restart the main scan loop.
 *   VM_PAGEOUT_SCAN_DONE_RETURN    -- free target met and nobody is
 *                                     waiting for pages.  NOTE(review):
 *                                     this path returns WITHOUT calling
 *                                     vm_free_page_unlock(), i.e. with
 *                                     the free-page lock still held --
 *                                     presumably vm_pageout_scan's done
 *                                     path expects that; confirm before
 *                                     changing.
 */
static int
vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
    vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
{
	boolean_t       exceeded_burst_throttle = FALSE;
	unsigned int    msecs = 0;
	uint32_t        inactive_external_count;
	mach_timespec_t ts;
	struct  vm_pageout_queue *iq;
	struct  vm_pageout_queue *eq;
	struct  vm_speculative_age_q *sq;

	iq = &vm_pageout_queue_internal;
	eq = &vm_pageout_queue_external;
	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/*
	 * Sometimes we have to pause:
	 *	1) No inactive pages - nothing to do.
	 *	2) Loop control - no acceptable pages found on the inactive queue
	 *         within the last vm_pageout_burst_inactive_throttle iterations
	 *	3) Flow control - default pageout queue is full
	 */
	if (vm_page_queue_empty(&vm_page_queue_inactive) &&
	    vm_page_queue_empty(&vm_page_queue_anonymous) &&
	    vm_page_queue_empty(&vm_page_queue_cleaned) &&
	    vm_page_queue_empty(&sq->age_q)) {
		/* case 1: every reclaim source is empty */
		VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
		msecs = vm_pageout_state.vm_pageout_empty_wait;
	} else if (inactive_burst_count >=
	    MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
	    (vm_page_inactive_count +
	    vm_page_speculative_count))) {
		/* case 2: too many consecutive pages considered without progress */
		VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
		msecs = vm_pageout_state.vm_pageout_burst_wait;

		exceeded_burst_throttle = TRUE;
	} else if (VM_PAGE_Q_THROTTLED(iq) &&
	    VM_DYNAMIC_PAGING_ENABLED()) {
		/* case 3: the internal (compressor) pageout queue is full */
		clock_sec_t sec;
		clock_nsec_t nsec;

		switch (flow_control->state) {
		case FCS_IDLE:
			if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
			    vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
				/*
				 * since the compressor is running independently of vm_pageout_scan
				 * let's not wait for it just yet... as long as we have a healthy supply
				 * of filecache pages to work with, let's keep stealing those.
				 */
				inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;

				if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
				    (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
					*anons_grabbed = ANONS_GRABBED_LIMIT;
					VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
					return VM_PAGEOUT_SCAN_PROCEED;
				}
			}

			/* arm the deadlock-detection timer and go to FCS_DELAYED */
			vps_flow_control_reset_deadlock_timer(flow_control);
			msecs = vm_pageout_state.vm_pageout_deadlock_wait;

			break;

		case FCS_DELAYED:
			clock_get_system_nanotime(&sec, &nsec);
			ts.tv_sec = (unsigned int) sec;
			ts.tv_nsec = nsec;

			if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
				/*
				 * the pageout thread for the default pager is potentially
				 * deadlocked since the
				 * default pager queue has been throttled for more than the
				 * allowable time... we need to move some clean pages or dirty
				 * pages belonging to the external pagers if they aren't throttled
				 * vm_page_free_wanted represents the number of threads currently
				 * blocked waiting for pages... we'll move one page for each of
				 * these plus a fixed amount to break the logjam... once we're done
				 * moving this number of pages, we'll re-enter the FCS_DELAYED state
				 * with a new timeout target since we have no way of knowing
				 * whether we've broken the deadlock except through observation
				 * of the queue associated with the default pager... we need to
				 * stop moving pages and allow the system to run to see what
				 * state it settles into.
				 */

				*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
				    vm_page_free_wanted + vm_page_free_wanted_privileged;
				VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
				flow_control->state = FCS_DEADLOCK_DETECTED;
				thread_wakeup(VM_PAGEOUT_GC_EVENT);
				return VM_PAGEOUT_SCAN_PROCEED;
			}
			/*
			 * just resniff instead of trying
			 * to compute a new delay time... we're going to be
			 * awakened immediately upon a laundry completion,
			 * so we won't wait any longer than necessary
			 */
			msecs = vm_pageout_state.vm_pageout_idle_wait;
			break;

		case FCS_DEADLOCK_DETECTED:
			/* keep going until the deadlock-relief quota is consumed */
			if (*vm_pageout_deadlock_target) {
				return VM_PAGEOUT_SCAN_PROCEED;
			}

			/* quota consumed: re-arm the timer and wait again */
			vps_flow_control_reset_deadlock_timer(flow_control);
			msecs = vm_pageout_state.vm_pageout_deadlock_wait;

			break;
		}
	} else {
		/*
		 * No need to pause...
		 */
		return VM_PAGEOUT_SCAN_PROCEED;
	}

	vm_pageout_scan_wants_object = VM_OBJECT_NULL;

	/* flush the local free batch and drop the object lock before we stall */
	vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
	    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);

	if (vm_page_free_count >= vm_page_free_target) {
		/*
		 * we're here because
		 *  1) someone else freed up some pages while we had
		 *     the queues unlocked above
		 * and we've hit one of the 3 conditions that
		 * cause us to pause the pageout scan thread
		 *
		 * since we already have enough free pages,
		 * let's avoid stalling and return normally
		 *
		 * before we return, make sure the pageout I/O threads
		 * are running throttled in case there are still requests
		 * in the laundry... since we have enough free pages
		 * we don't need the laundry to be cleaned in a timely
		 * fashion... so let's avoid interfering with foreground
		 * activity
		 *
		 * we don't want to hold vm_page_queue_free_lock when
		 * calling vm_pageout_adjust_eq_iothrottle (since it
		 * may cause other locks to be taken), we do the initial
		 * check outside of the lock.  Once we take the lock,
		 * we recheck the condition since it may have changed.
		 * if it has, no problem, we will make the threads
		 * non-throttled before actually blocking
		 */
		vm_pageout_adjust_eq_iothrottle(eq, TRUE);
	}
	vm_free_page_lock();

	if (vm_page_free_count >= vm_page_free_target &&
	    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
		/* free-page lock intentionally stays held -- see header comment */
		return VM_PAGEOUT_SCAN_DONE_RETURN;
	}
	vm_free_page_unlock();

	if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
		/*
		 * we're most likely about to block due to one of
		 * the 3 conditions that cause vm_pageout_scan to
		 * not be able to make forward progress w/r
		 * to providing new pages to the free queue,
		 * so unthrottle the I/O threads in case we
		 * have laundry to be cleaned... it needs
		 * to be completed ASAP.
		 *
		 * even if we don't block, we want the io threads
		 * running unthrottled since the sum of free +
		 * clean pages is still under our free target
		 */
		vm_pageout_adjust_eq_iothrottle(eq, FALSE);
	}
	if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
		/*
		 * if we get here we're below our free target and
		 * we're stalling due to a full laundry queue or
		 * we don't have any inactive pages other then
		 * those in the clean queue...
		 * however, we have pages on the clean queue that
		 * can be moved to the free queue, so let's not
		 * stall the pageout scan
		 */
		flow_control->state = FCS_IDLE;
		return VM_PAGEOUT_SCAN_PROCEED;
	}
	if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
		/* the laundry drained while we were getting ready to block */
		flow_control->state = FCS_IDLE;
		return VM_PAGEOUT_SCAN_PROCEED;
	}

	VM_CHECK_MEMORYSTATUS;

	if (flow_control->state != FCS_IDLE) {
		VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
	}

	/* the only place vm_pageout_scan actually sleeps */
	iq->pgo_throttled = TRUE;
	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);

	vm_page_unlock_queues();

	assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);

	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);

	thread_block(THREAD_CONTINUE_NULL);

	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);

	vm_page_lock_queues();

	iq->pgo_throttled = FALSE;

	/* conditions may have shifted while we slept: recompute loop targets */
	vps_init_page_targets();

	return VM_PAGEOUT_SCAN_NEXT_ITERATION;
}
2392 
2393 extern boolean_t vm_darkwake_mode;
2394 /*
2395  * This function is called only from vm_pageout_scan and
2396  * it will find and return the most appropriate page to be
2397  * reclaimed.
2398  */
2399 static int
vps_choose_victim_page(vm_page_t * victim_page,int * anons_grabbed,boolean_t * grab_anonymous,boolean_t force_anonymous,boolean_t * is_page_from_bg_q,unsigned int * reactivated_this_call)2400 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2401     boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2402 {
2403 	vm_page_t                       m = NULL;
2404 	vm_object_t                     m_object = VM_OBJECT_NULL;
2405 	uint32_t                        inactive_external_count;
2406 	struct vm_speculative_age_q     *sq;
2407 	struct vm_pageout_queue         *iq;
2408 	int                             retval = VM_PAGEOUT_SCAN_PROCEED;
2409 
2410 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2411 	iq = &vm_pageout_queue_internal;
2412 
2413 	*is_page_from_bg_q = FALSE;
2414 
2415 	m = NULL;
2416 	m_object = VM_OBJECT_NULL;
2417 
2418 	if (VM_DYNAMIC_PAGING_ENABLED()) {
2419 		assert(vm_page_throttled_count == 0);
2420 		assert(vm_page_queue_empty(&vm_page_queue_throttled));
2421 	}
2422 
2423 	/*
2424 	 * Try for a clean-queue inactive page.
2425 	 * These are pages that vm_pageout_scan tried to steal earlier, but
2426 	 * were dirty and had to be cleaned.  Pick them up now that they are clean.
2427 	 */
2428 	if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2429 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2430 
2431 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2432 
2433 		goto found_page;
2434 	}
2435 
2436 	/*
2437 	 * The next most eligible pages are ones we paged in speculatively,
2438 	 * but which have not yet been touched and have been aged out.
2439 	 */
2440 	if (!vm_page_queue_empty(&sq->age_q)) {
2441 		m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2442 
2443 		assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2444 
2445 		if (!m->vmp_dirty || force_anonymous == FALSE) {
2446 			goto found_page;
2447 		} else {
2448 			m = NULL;
2449 		}
2450 	}
2451 
2452 #if !CONFIG_JETSAM
2453 	if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2454 		if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2455 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2456 			assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2457 			goto found_page;
2458 		}
2459 	}
2460 #endif /* !CONFIG_JETSAM */
2461 
2462 	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2463 		vm_object_t     bg_m_object = NULL;
2464 
2465 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2466 
2467 		bg_m_object = VM_PAGE_OBJECT(m);
2468 
2469 		if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2470 			/*
2471 			 * This page is on the background queue
2472 			 * but not on a pageable queue OR is busy during
2473 			 * darkwake mode when the target is artificially lowered.
2474 			 * If it is busy during darkwake mode, and we don't skip it,
2475 			 * we will just swing back around and try again with the same
2476 			 * queue and might hit the same page or its neighbor in a
2477 			 * similar state. Both of these are transient states and will
2478 			 * get resolved, but, at this point let's ignore this page.
2479 			 */
2480 			if (vm_darkwake_mode && m->vmp_busy) {
2481 				if (bg_m_object->internal) {
2482 					vm_pageout_skipped_bq_internal++;
2483 				} else {
2484 					vm_pageout_skipped_bq_external++;
2485 				}
2486 			}
2487 		} else if (force_anonymous == FALSE || bg_m_object->internal) {
2488 			if (bg_m_object->internal &&
2489 			    (VM_PAGE_Q_THROTTLED(iq) ||
2490 			    vm_compressor_out_of_space() == TRUE ||
2491 			    vm_page_free_count < (vm_page_free_reserved / 4))) {
2492 				vm_pageout_skipped_bq_internal++;
2493 			} else {
2494 				*is_page_from_bg_q = TRUE;
2495 
2496 				if (bg_m_object->internal) {
2497 					vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2498 				} else {
2499 					vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2500 				}
2501 				goto found_page;
2502 			}
2503 		}
2504 	}
2505 
2506 	inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2507 
2508 	if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2509 	    (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2510 		*grab_anonymous = TRUE;
2511 		*anons_grabbed = 0;
2512 
2513 		if (VM_CONFIG_SWAP_IS_ACTIVE) {
2514 			vm_pageout_vminfo.vm_pageout_skipped_external++;
2515 		} else {
2516 			if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2517 				/*
2518 				 * No swap and we are in dangerously low levels of free memory.
2519 				 * If we keep going ahead with anonymous pages, we are going to run into a situation
2520 				 * where the compressor will be stuck waiting for free pages (if it isn't already).
2521 				 *
2522 				 * So, pick a file backed page...
2523 				 */
2524 				*grab_anonymous = FALSE;
2525 				*anons_grabbed = ANONS_GRABBED_LIMIT;
2526 				vm_pageout_vminfo.vm_pageout_skipped_internal++;
2527 			}
2528 		}
2529 		goto want_anonymous;
2530 	}
2531 	*grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2532 
2533 #if CONFIG_JETSAM
2534 	/* If the file-backed pool has accumulated
2535 	 * significantly more pages than the jetsam
2536 	 * threshold, prefer to reclaim those
2537 	 * inline to minimise compute overhead of reclaiming
2538 	 * anonymous pages.
2539 	 * This calculation does not account for the CPU local
2540 	 * external page queues, as those are expected to be
2541 	 * much smaller relative to the global pools.
2542 	 */
2543 
2544 	struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2545 
2546 	if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2547 		if (vm_page_pageable_external_count >
2548 		    vm_pageout_state.vm_page_filecache_min) {
2549 			if ((vm_page_pageable_external_count *
2550 			    vm_pageout_memorystatus_fb_factor_dr) >
2551 			    (memorystatus_available_pages_critical *
2552 			    vm_pageout_memorystatus_fb_factor_nr)) {
2553 				*grab_anonymous = FALSE;
2554 
2555 				VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2556 			}
2557 		}
2558 		if (*grab_anonymous) {
2559 			VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2560 		}
2561 	}
2562 #endif /* CONFIG_JETSAM */
2563 
2564 want_anonymous:
2565 	if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2566 		if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2567 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2568 
2569 			assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2570 			*anons_grabbed = 0;
2571 
2572 			if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2573 				if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2574 					if ((++(*reactivated_this_call) % 100)) {
2575 						vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2576 
2577 						vm_page_activate(m);
2578 						counter_inc(&vm_statistics_reactivations);
2579 #if DEVELOPMENT || DEBUG
2580 						if (*is_page_from_bg_q == TRUE) {
2581 							if (m_object->internal) {
2582 								vm_pageout_rejected_bq_internal++;
2583 							} else {
2584 								vm_pageout_rejected_bq_external++;
2585 							}
2586 						}
2587 #endif /* DEVELOPMENT || DEBUG */
2588 						vm_pageout_state.vm_pageout_inactive_used++;
2589 
2590 						m = NULL;
2591 						retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2592 
2593 						goto found_page;
2594 					}
2595 
2596 					/*
2597 					 * steal 1 of the file backed pages even if
2598 					 * we are under the limit that has been set
2599 					 * for a healthy filecache
2600 					 */
2601 				}
2602 			}
2603 			goto found_page;
2604 		}
2605 	}
2606 	if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2607 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2608 
2609 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2610 		*anons_grabbed += 1;
2611 
2612 		goto found_page;
2613 	}
2614 
2615 	m = NULL;
2616 
2617 found_page:
2618 	*victim_page = m;
2619 
2620 	return retval;
2621 }
2622 
2623 /*
2624  * This function is called only from vm_pageout_scan and
2625  * it will put a page back on the active/inactive queue
2626  * if we can't reclaim it for some reason.
2627  */
2628 static void
vps_requeue_page(vm_page_t m,int page_prev_q_state,__unused boolean_t page_from_bg_q)2629 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2630 {
2631 	if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2632 		vm_page_enqueue_inactive(m, FALSE);
2633 	} else {
2634 		vm_page_activate(m);
2635 	}
2636 
2637 #if DEVELOPMENT || DEBUG
2638 	vm_object_t m_object = VM_PAGE_OBJECT(m);
2639 
2640 	if (page_from_bg_q == TRUE) {
2641 		if (m_object->internal) {
2642 			vm_pageout_rejected_bq_internal++;
2643 		} else {
2644 			vm_pageout_rejected_bq_external++;
2645 		}
2646 	}
2647 #endif /* DEVELOPMENT || DEBUG */
2648 }
2649 
/*
 * This function is called only from vm_pageout_scan and
 * it will try to grab the victim page's VM object (m_object)
 * which differs from the previous victim page's object (object).
 *
 * Called with the page queues lock held (see the trylock comment
 * below) and with *object either NULL or locked by the caller.
 *
 * Returns VM_PAGEOUT_SCAN_PROCEED with *object = m_object (locked)
 * on success.  On trylock failure it requeues the page, publishes
 * the next object of interest in vm_pageout_scan_wants_object and
 * returns VM_PAGEOUT_SCAN_NEXT_ITERATION with *object = NULL.
 */
static int
vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
{
	struct vm_speculative_age_q *sq;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/*
	 * the object associated with candidate page is
	 * different from the one we were just working
	 * with... dump the lock if we still own it
	 */
	if (*object != NULL) {
		vm_object_unlock(*object);
		*object = NULL;
	}
	/*
	 * Try to lock object; since we've already got the
	 * page queues lock, we can only 'try' for this one.
	 * if the 'try' fails, we need to do a mutex_pause
	 * to allow the owner of the object lock a chance to
	 * run... otherwise, we're likely to trip over this
	 * object in the same state as we work our way through
	 * the queue... clumps of pages associated with the same
	 * object are fairly typical on the inactive and active queues
	 */
	if (!vm_object_lock_try_scan(m_object)) {
		vm_page_t m_want = NULL;

		vm_pageout_vminfo.vm_pageout_inactive_nolock++;

		if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
			VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
		}

		/*
		 * Clear the hardware and software reference state so the
		 * page doesn't look "recently used" when we next consider
		 * it — we couldn't evaluate it this time around.
		 */
		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));

		m->vmp_reference = FALSE;

		if (!m_object->object_is_shared_cache) {
			/*
			 * don't apply this optimization if this is the shared cache
			 * object, it's too easy to get rid of very hot and important
			 * pages...
			 * m->vmp_object must be stable since we hold the page queues lock...
			 * we can update the scan_collisions field sans the object lock
			 * since it is a separate field and this is the only spot that does
			 * a read-modify-write operation and it is never executed concurrently...
			 * we can asynchronously set this field to 0 when creating a UPL, so it
			 * is possible for the value to be a bit non-deterministic, but that's ok
			 * since it's only used as a hint
			 */
			m_object->scan_collisions = 1;
		}
		/*
		 * Peek at the queue we'll most likely pull the next victim
		 * from, in the same priority order vm_pageout_scan uses:
		 * background, cleaned, aged speculative, then inactive or
		 * anonymous depending on whether we're avoiding anon pages.
		 */
		if (page_from_bg_q) {
			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
		} else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
		} else if (!vm_page_queue_empty(&sq->age_q)) {
			m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
		} else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
		    !vm_page_queue_empty(&vm_page_queue_inactive)) {
			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
		} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
		}

		/*
		 * this is the next object we're going to be interested in
		 * try to make sure its available after the mutex_pause
		 * returns control
		 */
		if (m_want) {
			vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
		}

		vps_requeue_page(m, page_prev_q_state, page_from_bg_q);

		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
	} else {
		*object = m_object;
		vm_pageout_scan_wants_object = VM_OBJECT_NULL;
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
2741 
/*
 * This function is called only from vm_pageout_scan and
 * it notices that pageout scan may be rendered ineffective
 * due to a FS deadlock and will jetsam a process if possible.
 * If jetsam isn't supported, it'll move the page to the active
 * queue to try and get some different pages pushed onwards so
 * we can try to get out of this scenario.
 *
 * Entered with the page queues lock held and *object locked
 * (both may be dropped and reacquired on the jetsam path; in
 * that case *object is set to VM_OBJECT_NULL and *delayed_unlock
 * reset to 1 so the caller knows the lock state changed).
 */
static void
vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
    int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
{
	struct  vm_pageout_queue *eq;
	vm_object_t cur_object = VM_OBJECT_NULL;

	cur_object = *object;

	eq = &vm_pageout_queue_external;

	if (cur_object->internal == FALSE) {
		/*
		 * we need to break up the following potential deadlock case...
		 *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
		 *  b) The thread doing the writing is waiting for pages while holding the truncate lock
		 *  c) Most of the pages in the inactive queue belong to this file.
		 *
		 * we are potentially in this deadlock because...
		 *  a) the external pageout queue is throttled
		 *  b) we're done with the active queue and moved on to the inactive queue
		 *  c) we've got a dirty external page
		 *
		 * since we don't know the reason for the external pageout queue being throttled we
		 * must suspect that we are deadlocked, so move the current page onto the active queue
		 * in an effort to cause a page from the active queue to 'age' to the inactive queue
		 *
		 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
		 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
		 * pool the next time we select a victim page... if we can make enough new free pages,
		 * the deadlock will break, the external pageout queue will empty and it will no longer
		 * be throttled
		 *
		 * if we have jetsam configured, keep a count of the pages reactivated this way so
		 * that we can try to find clean pages in the active/inactive queues before
		 * deciding to jetsam a process
		 */
		vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;

		/* manually re-enter the page on the active queue */
		vm_page_check_pageable_safe(m);
		assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
		vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
		m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
		vm_page_active_count++;
		vm_page_pageable_external_count++;

		vm_pageout_adjust_eq_iothrottle(eq, FALSE);

#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM

#pragma unused(force_anonymous)

		*vm_pageout_inactive_external_forced_reactivate_limit -= 1;

		if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
			/* re-arm the limit before taking the jetsam path */
			*vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
			/*
			 * Possible deadlock scenario so request jetsam action
			 */

			assert(cur_object);
			vm_object_unlock(cur_object);

			cur_object = VM_OBJECT_NULL;

			/*
			 * VM pageout scan needs to know we have dropped this lock and so set the
			 * object variable we got passed in to NULL.
			 */
			*object = VM_OBJECT_NULL;

			vm_page_unlock_queues();

			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);

			/* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
			if (memorystatus_kill_on_VM_page_shortage() == TRUE) {
				VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
			}

			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);

			/* reacquire the queues lock for the caller */
			vm_page_lock_queues();
			*delayed_unlock = 1;
		}
#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */

#pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
#pragma unused(delayed_unlock)

		*force_anonymous = TRUE;
#endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
	} else {
		/*
		 * internal page: just reactivate it and count the reuse.
		 */
		vm_page_activate(m);
		counter_inc(&vm_statistics_reactivations);

#if DEVELOPMENT || DEBUG
		if (is_page_from_bg_q == TRUE) {
			if (cur_object->internal) {
				vm_pageout_rejected_bq_internal++;
			} else {
				vm_pageout_rejected_bq_external++;
			}
		}
#endif /* DEVELOPMENT || DEBUG */

		vm_pageout_state.vm_pageout_inactive_used++;
	}
}
2861 
2862 
2863 void
vm_page_balance_inactive(int max_to_move)2864 vm_page_balance_inactive(int max_to_move)
2865 {
2866 	vm_page_t m;
2867 
2868 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2869 
2870 	if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2871 		/*
2872 		 * It is likely that the hibernation code path is
2873 		 * dealing with these very queues as we are about
2874 		 * to move pages around in/from them and completely
2875 		 * change the linkage of the pages.
2876 		 *
2877 		 * And so we skip the rebalancing of these queues.
2878 		 */
2879 		return;
2880 	}
2881 	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2882 	    vm_page_inactive_count +
2883 	    vm_page_speculative_count);
2884 
2885 	while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2886 		VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2887 
2888 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2889 
2890 		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2891 		assert(!m->vmp_laundry);
2892 		assert(VM_PAGE_OBJECT(m) != kernel_object);
2893 		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2894 
2895 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2896 
2897 		/*
2898 		 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2899 		 *
2900 		 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2901 		 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2902 		 * new reference happens. If no futher references happen on the page after that remote TLB flushes
2903 		 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2904 		 * by pageout_scan, which is just fine since the last reference would have happened quite far
2905 		 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2906 		 * have happened before we moved the page
2907 		 */
2908 		if (m->vmp_pmapped == TRUE) {
2909 			/*
2910 			 * We might be holding the page queue lock as a
2911 			 * spin lock and clearing the "referenced" bit could
2912 			 * take a while if there are lots of mappings of
2913 			 * that page, so make sure we acquire the lock as
2914 			 * as mutex to avoid a spinlock timeout.
2915 			 */
2916 			vm_page_lockconvert_queues();
2917 			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2918 		}
2919 
2920 		/*
2921 		 * The page might be absent or busy,
2922 		 * but vm_page_deactivate can handle that.
2923 		 * FALSE indicates that we don't want a H/W clear reference
2924 		 */
2925 		vm_page_deactivate_internal(m, FALSE);
2926 	}
2927 }
2928 
2929 /*
2930  *	vm_pageout_scan does the dirty work for the pageout daemon.
2931  *	It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2932  *	held and vm_page_free_wanted == 0.
2933  */
2934 void
vm_pageout_scan(void)2935 vm_pageout_scan(void)
2936 {
2937 	unsigned int loop_count = 0;
2938 	unsigned int inactive_burst_count = 0;
2939 	unsigned int reactivated_this_call;
2940 	unsigned int reactivate_limit;
2941 	vm_page_t   local_freeq = NULL;
2942 	int         local_freed = 0;
2943 	int         delayed_unlock;
2944 	int         delayed_unlock_limit = 0;
2945 	int         refmod_state = 0;
2946 	int     vm_pageout_deadlock_target = 0;
2947 	struct  vm_pageout_queue *iq;
2948 	struct  vm_pageout_queue *eq;
2949 	struct  vm_speculative_age_q *sq;
2950 	struct  flow_control    flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
2951 	boolean_t inactive_throttled = FALSE;
2952 	vm_object_t     object = NULL;
2953 	uint32_t        inactive_reclaim_run;
2954 	boolean_t       grab_anonymous = FALSE;
2955 	boolean_t       force_anonymous = FALSE;
2956 	boolean_t       force_speculative_aging = FALSE;
2957 	int             anons_grabbed = 0;
2958 	int             page_prev_q_state = 0;
2959 	boolean_t       page_from_bg_q = FALSE;
2960 	uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
2961 	vm_object_t     m_object = VM_OBJECT_NULL;
2962 	int             retval = 0;
2963 	boolean_t       lock_yield_check = FALSE;
2964 
2965 
2966 	VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
2967 	    vm_pageout_vminfo.vm_pageout_freed_speculative,
2968 	    vm_pageout_state.vm_pageout_inactive_clean,
2969 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
2970 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
2971 
2972 	flow_control.state = FCS_IDLE;
2973 	iq = &vm_pageout_queue_internal;
2974 	eq = &vm_pageout_queue_external;
2975 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2976 
2977 	/* Ask the pmap layer to return any pages it no longer needs. */
2978 	uint64_t pmap_wired_pages_freed = pmap_release_pages_fast();
2979 
2980 	vm_page_lock_queues();
2981 
2982 	vm_page_wire_count -= pmap_wired_pages_freed;
2983 
2984 	delayed_unlock = 1;
2985 
2986 	/*
2987 	 *	Calculate the max number of referenced pages on the inactive
2988 	 *	queue that we will reactivate.
2989 	 */
2990 	reactivated_this_call = 0;
2991 	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
2992 	    vm_page_inactive_count);
2993 	inactive_reclaim_run = 0;
2994 
2995 	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2996 
2997 	/*
2998 	 *	We must limit the rate at which we send pages to the pagers
2999 	 *	so that we don't tie up too many pages in the I/O queues.
3000 	 *	We implement a throttling mechanism using the laundry count
3001 	 *      to limit the number of pages outstanding to the default
3002 	 *	and external pagers.  We can bypass the throttles and look
3003 	 *	for clean pages if the pageout queues don't drain in a timely
3004 	 *	fashion since this may indicate that the pageout paths are
3005 	 *	stalled waiting for memory, which only we can provide.
3006 	 */
3007 
3008 	vps_init_page_targets();
3009 	assert(object == NULL);
3010 	assert(delayed_unlock != 0);
3011 
3012 	for (;;) {
3013 		vm_page_t m;
3014 
3015 		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3016 
3017 		if (lock_yield_check) {
3018 			lock_yield_check = FALSE;
3019 
3020 			if (delayed_unlock++ > delayed_unlock_limit) {
3021 				vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3022 				    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3023 			} else if (vm_pageout_scan_wants_object) {
3024 				vm_page_unlock_queues();
3025 				mutex_pause(0);
3026 				vm_page_lock_queues();
3027 			} else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3028 				VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3029 			}
3030 		}
3031 
3032 		if (vm_upl_wait_for_pages < 0) {
3033 			vm_upl_wait_for_pages = 0;
3034 		}
3035 
3036 		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3037 
3038 		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3039 			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3040 		}
3041 
3042 		vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3043 
3044 		assert(delayed_unlock);
3045 
3046 		/*
3047 		 * maintain our balance
3048 		 */
3049 		vm_page_balance_inactive(1);
3050 
3051 
3052 		/**********************************************************************
3053 		* above this point we're playing with the active and secluded queues
3054 		* below this point we're playing with the throttling mechanisms
3055 		* and the inactive queue
3056 		**********************************************************************/
3057 
3058 		if (vm_page_free_count + local_freed >= vm_page_free_target) {
3059 			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3060 
3061 			vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3062 			    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3063 			/*
3064 			 * make sure the pageout I/O threads are running
3065 			 * throttled in case there are still requests
3066 			 * in the laundry... since we have met our targets
3067 			 * we don't need the laundry to be cleaned in a timely
3068 			 * fashion... so let's avoid interfering with foreground
3069 			 * activity
3070 			 */
3071 			vm_pageout_adjust_eq_iothrottle(eq, TRUE);
3072 
3073 			vm_free_page_lock();
3074 
3075 			if ((vm_page_free_count >= vm_page_free_target) &&
3076 			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3077 				/*
3078 				 * done - we have met our target *and*
3079 				 * there is no one waiting for a page.
3080 				 */
3081 return_from_scan:
3082 				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3083 
3084 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3085 				    vm_pageout_state.vm_pageout_inactive,
3086 				    vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3087 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
3088 				    vm_pageout_vminfo.vm_pageout_freed_speculative,
3089 				    vm_pageout_state.vm_pageout_inactive_clean,
3090 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3091 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3092 
3093 				return;
3094 			}
3095 			vm_free_page_unlock();
3096 		}
3097 
3098 		/*
3099 		 * Before anything, we check if we have any ripe volatile
3100 		 * objects around. If so, try to purge the first object.
3101 		 * If the purge fails, fall through to reclaim a page instead.
3102 		 * If the purge succeeds, go back to the top and reevalute
3103 		 * the new memory situation.
3104 		 */
3105 		retval = vps_purge_object();
3106 
3107 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3108 			/*
3109 			 * Success
3110 			 */
3111 			if (object != NULL) {
3112 				vm_object_unlock(object);
3113 				object = NULL;
3114 			}
3115 
3116 			lock_yield_check = FALSE;
3117 			continue;
3118 		}
3119 
3120 		/*
3121 		 * If our 'aged' queue is empty and we have some speculative pages
3122 		 * in the other queues, let's go through and see if we need to age
3123 		 * them.
3124 		 *
3125 		 * If we succeeded in aging a speculative Q or just that everything
3126 		 * looks normal w.r.t queue age and queue counts, we keep going onward.
3127 		 *
3128 		 * If, for some reason, we seem to have a mismatch between the spec.
3129 		 * page count and the page queues, we reset those variables and
3130 		 * restart the loop (LD TODO: Track this better?).
3131 		 */
3132 		if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3133 			retval = vps_age_speculative_queue(force_speculative_aging);
3134 
3135 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3136 				lock_yield_check = FALSE;
3137 				continue;
3138 			}
3139 		}
3140 		force_speculative_aging = FALSE;
3141 
3142 		/*
3143 		 * Check to see if we need to evict objects from the cache.
3144 		 *
3145 		 * Note: 'object' here doesn't have anything to do with
3146 		 * the eviction part. We just need to make sure we have dropped
3147 		 * any object lock we might be holding if we need to go down
3148 		 * into the eviction logic.
3149 		 */
3150 		retval = vps_object_cache_evict(&object);
3151 
3152 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3153 			lock_yield_check = FALSE;
3154 			continue;
3155 		}
3156 
3157 
3158 		/*
3159 		 * Calculate our filecache_min that will affect the loop
3160 		 * going forward.
3161 		 */
3162 		vps_calculate_filecache_min();
3163 
3164 		/*
3165 		 * LD TODO: Use a structure to hold all state variables for a single
3166 		 * vm_pageout_scan iteration and pass that structure to this function instead.
3167 		 */
3168 		retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3169 		    &delayed_unlock, &local_freeq, &local_freed,
3170 		    &vm_pageout_deadlock_target, inactive_burst_count);
3171 
3172 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3173 			if (loop_count >= vm_page_inactive_count) {
3174 				loop_count = 0;
3175 			}
3176 
3177 			inactive_burst_count = 0;
3178 
3179 			assert(object == NULL);
3180 			assert(delayed_unlock != 0);
3181 
3182 			lock_yield_check = FALSE;
3183 			continue;
3184 		} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3185 			goto return_from_scan;
3186 		}
3187 
3188 		flow_control.state = FCS_IDLE;
3189 
3190 		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3191 		    vm_pageout_inactive_external_forced_reactivate_limit);
3192 		loop_count++;
3193 		inactive_burst_count++;
3194 		vm_pageout_state.vm_pageout_inactive++;
3195 
3196 		/*
3197 		 * Choose a victim.
3198 		 */
3199 
3200 		m = NULL;
3201 		retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3202 
3203 		if (m == NULL) {
3204 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3205 				inactive_burst_count = 0;
3206 
3207 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3208 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3209 				}
3210 
3211 				lock_yield_check = TRUE;
3212 				continue;
3213 			}
3214 
3215 			/*
3216 			 * if we've gotten here, we have no victim page.
3217 			 * check to see if we've not finished balancing the queues
3218 			 * or we have a page on the aged speculative queue that we
3219 			 * skipped due to force_anonymous == TRUE.. or we have
3220 			 * speculative  pages that we can prematurely age... if
3221 			 * one of these cases we'll keep going, else panic
3222 			 */
3223 			force_anonymous = FALSE;
3224 			VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3225 
3226 			if (!vm_page_queue_empty(&sq->age_q)) {
3227 				lock_yield_check = TRUE;
3228 				continue;
3229 			}
3230 
3231 			if (vm_page_speculative_count) {
3232 				force_speculative_aging = TRUE;
3233 				lock_yield_check = TRUE;
3234 				continue;
3235 			}
3236 			panic("vm_pageout: no victim");
3237 
3238 			/* NOTREACHED */
3239 		}
3240 
3241 		assert(VM_PAGE_PAGEABLE(m));
3242 		m_object = VM_PAGE_OBJECT(m);
3243 		force_anonymous = FALSE;
3244 
3245 		page_prev_q_state = m->vmp_q_state;
3246 		/*
3247 		 * we just found this page on one of our queues...
3248 		 * it can't also be on the pageout queue, so safe
3249 		 * to call vm_page_queues_remove
3250 		 */
3251 		bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3252 		vm_page_queues_remove(m, TRUE);
3253 		if (donate) {
3254 			/*
3255 			 * The compressor needs to see this bit to know
3256 			 * where this page needs to land. Also if stolen,
3257 			 * this bit helps put the page back in the right
3258 			 * special queue where it belongs.
3259 			 */
3260 			m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3261 		}
3262 
3263 		assert(!m->vmp_laundry);
3264 		assert(!m->vmp_private);
3265 		assert(!m->vmp_fictitious);
3266 		assert(m_object != kernel_object);
3267 		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
3268 
3269 		vm_pageout_vminfo.vm_pageout_considered_page++;
3270 
3271 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3272 
3273 		/*
3274 		 * check to see if we currently are working
3275 		 * with the same object... if so, we've
3276 		 * already got the lock
3277 		 */
3278 		if (m_object != object) {
3279 			boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3280 
3281 			/*
3282 			 * vps_switch_object() will always drop the 'object' lock first
3283 			 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3284 			 * either 'm_object' or NULL.
3285 			 */
3286 			retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3287 
3288 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3289 				lock_yield_check = TRUE;
3290 				continue;
3291 			}
3292 		}
3293 		assert(m_object == object);
3294 		assert(VM_PAGE_OBJECT(m) == m_object);
3295 
3296 		if (m->vmp_busy) {
3297 			/*
3298 			 *	Somebody is already playing with this page.
3299 			 *	Put it back on the appropriate queue
3300 			 *
3301 			 */
3302 			VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3303 
3304 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3305 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3306 			}
3307 
3308 			vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3309 
3310 			lock_yield_check = TRUE;
3311 			continue;
3312 		}
3313 
3314 		/*
3315 		 *   if (m->vmp_cleaning && !m->vmp_free_when_done)
3316 		 *	If already cleaning this page in place
3317 		 *	just leave if off the paging queues.
3318 		 *	We can leave the page mapped, and upl_commit_range
3319 		 *	will put it on the clean queue.
3320 		 *
3321 		 *   if (m->vmp_free_when_done && !m->vmp_cleaning)
3322 		 *	an msync INVALIDATE is in progress...
3323 		 *	this page has been marked for destruction
3324 		 *      after it has been cleaned,
3325 		 *      but not yet gathered into a UPL
3326 		 *	where 'cleaning' will be set...
3327 		 *	just leave it off the paging queues
3328 		 *
3329 		 *   if (m->vmp_free_when_done && m->vmp_clenaing)
3330 		 *	an msync INVALIDATE is in progress
3331 		 *	and the UPL has already gathered this page...
3332 		 *	just leave it off the paging queues
3333 		 */
3334 		if (m->vmp_free_when_done || m->vmp_cleaning) {
3335 			lock_yield_check = TRUE;
3336 			continue;
3337 		}
3338 
3339 
3340 		/*
3341 		 *	If it's absent, in error or the object is no longer alive,
3342 		 *	we can reclaim the page... in the no longer alive case,
3343 		 *	there are 2 states the page can be in that preclude us
3344 		 *	from reclaiming it - busy or cleaning - that we've already
3345 		 *	dealt with
3346 		 */
3347 		if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3348 		    (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3349 			if (m->vmp_absent) {
3350 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3351 			} else if (!object->alive ||
3352 			    (!object->internal &&
3353 			    object->pager == MEMORY_OBJECT_NULL)) {
3354 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3355 			} else {
3356 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3357 			}
3358 reclaim_page:
3359 			if (vm_pageout_deadlock_target) {
3360 				VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3361 				vm_pageout_deadlock_target--;
3362 			}
3363 
3364 			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3365 
3366 			if (object->internal) {
3367 				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3368 			} else {
3369 				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3370 			}
3371 			assert(!m->vmp_cleaning);
3372 			assert(!m->vmp_laundry);
3373 
3374 			if (!object->internal &&
3375 			    object->pager != NULL &&
3376 			    object->pager->mo_pager_ops == &shared_region_pager_ops) {
3377 				shared_region_pager_reclaimed++;
3378 			}
3379 
3380 			m->vmp_busy = TRUE;
3381 
3382 			/*
3383 			 * remove page from object here since we're already
3384 			 * behind the object lock... defer the rest of the work
3385 			 * we'd normally do in vm_page_free_prepare_object
3386 			 * until 'vm_page_free_list' is called
3387 			 */
3388 			if (m->vmp_tabled) {
3389 				vm_page_remove(m, TRUE);
3390 			}
3391 
3392 			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3393 			m->vmp_snext = local_freeq;
3394 			local_freeq = m;
3395 			local_freed++;
3396 
3397 			if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3398 				vm_pageout_vminfo.vm_pageout_freed_speculative++;
3399 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3400 				vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3401 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3402 				vm_pageout_vminfo.vm_pageout_freed_internal++;
3403 			} else {
3404 				vm_pageout_vminfo.vm_pageout_freed_external++;
3405 			}
3406 
3407 			inactive_burst_count = 0;
3408 
3409 			lock_yield_check = TRUE;
3410 			continue;
3411 		}
3412 		if (object->copy == VM_OBJECT_NULL) {
3413 			/*
3414 			 * No one else can have any interest in this page.
3415 			 * If this is an empty purgable object, the page can be
3416 			 * reclaimed even if dirty.
3417 			 * If the page belongs to a volatile purgable object, we
3418 			 * reactivate it if the compressor isn't active.
3419 			 */
3420 			if (object->purgable == VM_PURGABLE_EMPTY) {
3421 				if (m->vmp_pmapped == TRUE) {
3422 					/* unmap the page */
3423 					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3424 					if (refmod_state & VM_MEM_MODIFIED) {
3425 						SET_PAGE_DIRTY(m, FALSE);
3426 					}
3427 				}
3428 				if (m->vmp_dirty || m->vmp_precious) {
3429 					/* we saved the cost of cleaning this page ! */
3430 					vm_page_purged_count++;
3431 				}
3432 				goto reclaim_page;
3433 			}
3434 
3435 			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3436 				/*
3437 				 * With the VM compressor, the cost of
3438 				 * reclaiming a page is much lower (no I/O),
3439 				 * so if we find a "volatile" page, it's better
3440 				 * to let it get compressed rather than letting
3441 				 * it occupy a full page until it gets purged.
3442 				 * So no need to check for "volatile" here.
3443 				 */
3444 			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
3445 				/*
3446 				 * Avoid cleaning a "volatile" page which might
3447 				 * be purged soon.
3448 				 */
3449 
3450 				/* if it's wired, we can't put it on our queue */
3451 				assert(!VM_PAGE_WIRED(m));
3452 
3453 				/* just stick it back on! */
3454 				reactivated_this_call++;
3455 
3456 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3457 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3458 				}
3459 
3460 				goto reactivate_page;
3461 			}
3462 		}
3463 		/*
3464 		 *	If it's being used, reactivate.
3465 		 *	(Fictitious pages are either busy or absent.)
3466 		 *	First, update the reference and dirty bits
3467 		 *	to make sure the page is unreferenced.
3468 		 */
3469 		refmod_state = -1;
3470 
3471 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3472 			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3473 
3474 			if (refmod_state & VM_MEM_REFERENCED) {
3475 				m->vmp_reference = TRUE;
3476 			}
3477 			if (refmod_state & VM_MEM_MODIFIED) {
3478 				SET_PAGE_DIRTY(m, FALSE);
3479 			}
3480 		}
3481 
3482 		if (m->vmp_reference || m->vmp_dirty) {
3483 			/* deal with a rogue "reusable" page */
3484 			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3485 		}
3486 
3487 		if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3488 			vm_pageout_state.vm_page_xpmapped_min = 0;
3489 		} else {
3490 			vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
3491 		}
3492 
3493 		if (!m->vmp_no_cache &&
3494 		    page_from_bg_q == FALSE &&
3495 		    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3496 		    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3497 			/*
3498 			 * The page we pulled off the inactive list has
3499 			 * been referenced.  It is possible for other
3500 			 * processors to be touching pages faster than we
3501 			 * can clear the referenced bit and traverse the
3502 			 * inactive queue, so we limit the number of
3503 			 * reactivations.
3504 			 */
3505 			if (++reactivated_this_call >= reactivate_limit) {
3506 				vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3507 			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3508 				vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3509 			} else {
3510 				uint32_t isinuse;
3511 
3512 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3513 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3514 				}
3515 
3516 				vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3517 reactivate_page:
3518 				if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3519 				    vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3520 					/*
3521 					 * no explict mappings of this object exist
3522 					 * and it's not open via the filesystem
3523 					 */
3524 					vm_page_deactivate(m);
3525 					VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3526 				} else {
3527 					/*
3528 					 * The page was/is being used, so put back on active list.
3529 					 */
3530 					vm_page_activate(m);
3531 					counter_inc(&vm_statistics_reactivations);
3532 					inactive_burst_count = 0;
3533 				}
3534 #if DEVELOPMENT || DEBUG
3535 				if (page_from_bg_q == TRUE) {
3536 					if (m_object->internal) {
3537 						vm_pageout_rejected_bq_internal++;
3538 					} else {
3539 						vm_pageout_rejected_bq_external++;
3540 					}
3541 				}
3542 #endif /* DEVELOPMENT || DEBUG */
3543 
3544 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3545 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3546 				}
3547 				vm_pageout_state.vm_pageout_inactive_used++;
3548 
3549 				lock_yield_check = TRUE;
3550 				continue;
3551 			}
3552 			/*
3553 			 * Make sure we call pmap_get_refmod() if it
3554 			 * wasn't already called just above, to update
3555 			 * the dirty bit.
3556 			 */
3557 			if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3558 				refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3559 				if (refmod_state & VM_MEM_MODIFIED) {
3560 					SET_PAGE_DIRTY(m, FALSE);
3561 				}
3562 			}
3563 		}
3564 
3565 		/*
3566 		 * we've got a candidate page to steal...
3567 		 *
3568 		 * m->vmp_dirty is up to date courtesy of the
3569 		 * preceding check for m->vmp_reference... if
3570 		 * we get here, then m->vmp_reference had to be
3571 		 * FALSE (or possibly "reactivate_limit" was
3572 		 * exceeded), but in either case we called
3573 		 * pmap_get_refmod() and updated both
3574 		 * m->vmp_reference and m->vmp_dirty
3575 		 *
3576 		 * if it's dirty or precious we need to
3577 		 * see if the target queue is throtttled
3578 		 * it if is, we need to skip over it by moving it back
3579 		 * to the end of the inactive queue
3580 		 */
3581 
3582 		inactive_throttled = FALSE;
3583 
3584 		if (m->vmp_dirty || m->vmp_precious) {
3585 			if (object->internal) {
3586 				if (VM_PAGE_Q_THROTTLED(iq)) {
3587 					inactive_throttled = TRUE;
3588 				}
3589 			} else if (VM_PAGE_Q_THROTTLED(eq)) {
3590 				inactive_throttled = TRUE;
3591 			}
3592 		}
3593 throttle_inactive:
3594 		if (!VM_DYNAMIC_PAGING_ENABLED() &&
3595 		    object->internal && m->vmp_dirty &&
3596 		    (object->purgable == VM_PURGABLE_DENY ||
3597 		    object->purgable == VM_PURGABLE_NONVOLATILE ||
3598 		    object->purgable == VM_PURGABLE_VOLATILE)) {
3599 			vm_page_check_pageable_safe(m);
3600 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3601 			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3602 			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3603 			vm_page_throttled_count++;
3604 
3605 			VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3606 
3607 			inactive_burst_count = 0;
3608 
3609 			lock_yield_check = TRUE;
3610 			continue;
3611 		}
3612 		if (inactive_throttled == TRUE) {
3613 			vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3614 			    &delayed_unlock, &force_anonymous, page_from_bg_q);
3615 
3616 			inactive_burst_count = 0;
3617 
3618 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3619 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3620 			}
3621 
3622 			lock_yield_check = TRUE;
3623 			continue;
3624 		}
3625 
3626 		/*
3627 		 * we've got a page that we can steal...
3628 		 * eliminate all mappings and make sure
3629 		 * we have the up-to-date modified state
3630 		 *
3631 		 * if we need to do a pmap_disconnect then we
3632 		 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3633 		 * provides the true state atomically... the
3634 		 * page was still mapped up to the pmap_disconnect
3635 		 * and may have been dirtied at the last microsecond
3636 		 *
3637 		 * Note that if 'pmapped' is FALSE then the page is not
3638 		 * and has not been in any map, so there is no point calling
3639 		 * pmap_disconnect().  m->vmp_dirty could have been set in anticipation
3640 		 * of likely usage of the page.
3641 		 */
3642 		if (m->vmp_pmapped == TRUE) {
3643 			int pmap_options;
3644 
3645 			/*
3646 			 * Don't count this page as going into the compressor
3647 			 * if any of these are true:
3648 			 * 1) compressed pager isn't enabled
3649 			 * 2) Freezer enabled device with compressed pager
3650 			 *    backend (exclusive use) i.e. most of the VM system
3651 			 *    (including vm_pageout_scan) has no knowledge of
3652 			 *    the compressor
3653 			 * 3) This page belongs to a file and hence will not be
3654 			 *    sent into the compressor
3655 			 */
3656 			if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3657 			    object->internal == FALSE) {
3658 				pmap_options = 0;
3659 			} else if (m->vmp_dirty || m->vmp_precious) {
3660 				/*
3661 				 * VM knows that this page is dirty (or
3662 				 * precious) and needs to be compressed
3663 				 * rather than freed.
3664 				 * Tell the pmap layer to count this page
3665 				 * as "compressed".
3666 				 */
3667 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
3668 			} else {
3669 				/*
3670 				 * VM does not know if the page needs to
3671 				 * be preserved but the pmap layer might tell
3672 				 * us if any mapping has "modified" it.
3673 				 * Let's the pmap layer to count this page
3674 				 * as compressed if and only if it has been
3675 				 * modified.
3676 				 */
3677 				pmap_options =
3678 				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3679 			}
3680 			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3681 			    pmap_options,
3682 			    NULL);
3683 			if (refmod_state & VM_MEM_MODIFIED) {
3684 				SET_PAGE_DIRTY(m, FALSE);
3685 			}
3686 		}
3687 
3688 		/*
3689 		 * reset our count of pages that have been reclaimed
3690 		 * since the last page was 'stolen'
3691 		 */
3692 		inactive_reclaim_run = 0;
3693 
3694 		/*
3695 		 *	If it's clean and not precious, we can free the page.
3696 		 */
3697 		if (!m->vmp_dirty && !m->vmp_precious) {
3698 			vm_pageout_state.vm_pageout_inactive_clean++;
3699 
3700 			/*
3701 			 * OK, at this point we have found a page we are going to free.
3702 			 */
3703 #if CONFIG_PHANTOM_CACHE
3704 			if (!object->internal) {
3705 				vm_phantom_cache_add_ghost(m);
3706 			}
3707 #endif
3708 			goto reclaim_page;
3709 		}
3710 
3711 		/*
3712 		 * The page may have been dirtied since the last check
3713 		 * for a throttled target queue (which may have been skipped
3714 		 * if the page was clean then).  With the dirty page
3715 		 * disconnected here, we can make one final check.
3716 		 */
3717 		if (object->internal) {
3718 			if (VM_PAGE_Q_THROTTLED(iq)) {
3719 				inactive_throttled = TRUE;
3720 			}
3721 		} else if (VM_PAGE_Q_THROTTLED(eq)) {
3722 			inactive_throttled = TRUE;
3723 		}
3724 
3725 		if (inactive_throttled == TRUE) {
3726 			goto throttle_inactive;
3727 		}
3728 
3729 #if VM_PRESSURE_EVENTS
3730 #if CONFIG_JETSAM
3731 
3732 		/*
3733 		 * If Jetsam is enabled, then the sending
3734 		 * of memory pressure notifications is handled
3735 		 * from the same thread that takes care of high-water
3736 		 * and other jetsams i.e. the memorystatus_thread.
3737 		 */
3738 
3739 #else /* CONFIG_JETSAM */
3740 
3741 		vm_pressure_response();
3742 
3743 #endif /* CONFIG_JETSAM */
3744 #endif /* VM_PRESSURE_EVENTS */
3745 
3746 		if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3747 			VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3748 		}
3749 
3750 		if (object->internal) {
3751 			vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3752 		} else {
3753 			vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3754 		}
3755 
3756 		/*
3757 		 * internal pages will go to the compressor...
3758 		 * external pages will go to the appropriate pager to be cleaned
3759 		 * and upon completion will end up on 'vm_page_queue_cleaned' which
3760 		 * is a preferred queue to steal from
3761 		 */
3762 		vm_pageout_cluster(m);
3763 		inactive_burst_count = 0;
3764 
3765 		/*
3766 		 * back to top of pageout scan loop
3767 		 */
3768 	}
3769 }
3770 
3771 
3772 void
vm_page_free_reserve(int pages)3773 vm_page_free_reserve(
3774 	int pages)
3775 {
3776 	int             free_after_reserve;
3777 
3778 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3779 		if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3780 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3781 		} else {
3782 			vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3783 		}
3784 	} else {
3785 		if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3786 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3787 		} else {
3788 			vm_page_free_reserved += pages;
3789 		}
3790 	}
3791 	free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3792 
3793 	vm_page_free_min = vm_page_free_reserved +
3794 	    VM_PAGE_FREE_MIN(free_after_reserve);
3795 
3796 	if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3797 		vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3798 	}
3799 
3800 	vm_page_free_target = vm_page_free_reserved +
3801 	    VM_PAGE_FREE_TARGET(free_after_reserve);
3802 
3803 	if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3804 		vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3805 	}
3806 
3807 	if (vm_page_free_target < vm_page_free_min + 5) {
3808 		vm_page_free_target = vm_page_free_min + 5;
3809 	}
3810 
3811 	vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3812 }
3813 
3814 /*
3815  *	vm_pageout is the high level pageout daemon.
3816  */
3817 
void
vm_pageout_continue(void)
{
	/*
	 * Continuation for the pageout daemon: run one pass of
	 * vm_pageout_scan(), publish completion to any waiter, then
	 * block until more free pages are wanted.  Never returns.
	 */
	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);

	/* mark the daemon as running before starting the scan */
	vm_free_page_lock();
	vm_pageout_running = TRUE;
	vm_free_page_unlock();

	vm_pageout_scan();
	/*
	 * we hold both the vm_page_queue_free_lock
	 * and the vm_page_queues_lock at this point
	 */
	assert(vm_page_free_wanted == 0);
	assert(vm_page_free_wanted_privileged == 0);
	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);

	vm_pageout_running = FALSE;
#if XNU_TARGET_OS_OSX
	/* wake any thread blocked in vm_pageout_wait() for this pass */
	if (vm_pageout_waiter) {
		vm_pageout_waiter = FALSE;
		thread_wakeup((event_t)&vm_pageout_waiter);
	}
#endif /* XNU_TARGET_OS_OSX */

	vm_free_page_unlock();
	vm_page_unlock_queues();

	/* block with ourselves as the continuation; restarts from the top */
	thread_block((thread_continue_t)vm_pageout_continue);
	/*NOTREACHED*/
}
3851 
#if XNU_TARGET_OS_OSX
/*
 * Wait for the in-flight pageout pass to finish, or until "deadline"
 * (mach absolute time) expires, whichever comes first.
 *
 * Returns KERN_SUCCESS once the daemon is no longer running, or
 * KERN_OPERATION_TIMED_OUT if the deadline elapsed first.
 */
kern_return_t
vm_pageout_wait(uint64_t deadline)
{
	kern_return_t kr = KERN_SUCCESS;

	vm_free_page_lock();
	while (vm_pageout_running && kr == KERN_SUCCESS) {
		/* flag our interest so vm_pageout_continue() wakes us */
		vm_pageout_waiter = TRUE;
		if (lck_mtx_sleep_deadline(
			    &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
			    (event_t) &vm_pageout_waiter, THREAD_UNINT,
			    deadline) != THREAD_AWAKENED) {
			kr = KERN_OPERATION_TIMED_OUT;
		}
	}
	vm_free_page_unlock();

	return kr;
}
#endif /* XNU_TARGET_OS_OSX */
3872 
3873 
/*
 * Continuation for the external pageout I/O thread: drain the external
 * pageout queue by handing each laundered file-backed page to its pager
 * via memory_object_data_return(), then block until more work is queued.
 * Never returns.
 */
static void
vm_pageout_iothread_external_continue(struct vm_pageout_queue *q, __unused wait_result_t w)
{
	vm_page_t       m = NULL;
	vm_object_t     object;
	vm_object_offset_t offset;
	memory_object_t pager;

	/* On systems with a compressor, the external IO thread clears its
	 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
	 * creation)
	 */
	if (vm_pageout_state.vm_pageout_internal_iothread != THREAD_NULL) {
		current_thread()->options &= ~TH_OPT_VMPRIV;
	}

	vm_page_lockspin_queues();

	while (!vm_page_queue_empty(&q->pgo_pending)) {
		q->pgo_busy = TRUE;
		vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);

		assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
		VM_PAGE_CHECK(m);
		/*
		 * grab a snapshot of the object and offset this
		 * page is tabled in so that we can relookup this
		 * page after we've taken the object lock - these
		 * fields are stable while we hold the page queues lock
		 * but as soon as we drop it, there is nothing to keep
		 * this page in this object... we hold an activity_in_progress
		 * on this object which will keep it from terminating
		 */
		object = VM_PAGE_OBJECT(m);
		offset = m->vmp_offset;

		m->vmp_q_state = VM_PAGE_NOT_ON_Q;
		VM_PAGE_ZERO_PAGEQ_ENTRY(m);

		vm_page_unlock_queues();

		vm_object_lock(object);

		/* relookup: the page may have moved while the queues lock was dropped */
		m = vm_page_lookup(object, offset);

		if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
		    !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
			/*
			 * it's either the same page that someone else has
			 * started cleaning (or it's finished cleaning or
			 * been put back on the pageout queue), or
			 * the page has been freed or we have found a
			 * new page at this offset... in all of these cases
			 * we merely need to release the activity_in_progress
			 * we took when we put the page on the pageout queue
			 */
			vm_object_activity_end(object);
			vm_object_unlock(object);

			vm_page_lockspin_queues();
			continue;
		}
		pager = object->pager;

		if (pager == MEMORY_OBJECT_NULL) {
			/*
			 * This pager has been destroyed by either
			 * memory_object_destroy or vm_object_destroy, and
			 * so there is nowhere for the page to go.
			 */
			if (m->vmp_free_when_done) {
				/*
				 * Just free the page... VM_PAGE_FREE takes
				 * care of cleaning up all the state...
				 * including doing the vm_pageout_throttle_up
				 */
				VM_PAGE_FREE(m);
			} else {
				vm_page_lockspin_queues();

				vm_pageout_throttle_up(m);
				vm_page_activate(m);

				vm_page_unlock_queues();

				/*
				 *	And we are done with it.
				 */
			}
			vm_object_activity_end(object);
			vm_object_unlock(object);

			vm_page_lockspin_queues();
			continue;
		}
#if 0
		/*
		 * we don't hold the page queue lock
		 * so this check isn't safe to make
		 */
		VM_PAGE_CHECK(m);
#endif
		/*
		 * give back the activity_in_progress reference we
		 * took when we queued up this page and replace it
		 * it with a paging_in_progress reference that will
		 * also hold the paging offset from changing and
		 * prevent the object from terminating
		 */
		vm_object_activity_end(object);
		vm_object_paging_begin(object);
		vm_object_unlock(object);

		/*
		 * Send the data to the pager.
		 * any pageout clustering happens there
		 */
		memory_object_data_return(pager,
		    m->vmp_offset + object->paging_offset,
		    PAGE_SIZE,
		    NULL,
		    NULL,
		    FALSE,
		    FALSE,
		    0);

		vm_object_lock(object);
		vm_object_paging_end(object);
		vm_object_unlock(object);

		/* pace ourselves so we don't saturate the I/O subsystem */
		vm_pageout_io_throttle();

		vm_page_lockspin_queues();
	}
	/* queue drained: mark idle and block until new work is enqueued */
	q->pgo_busy = FALSE;
	q->pgo_idle = TRUE;

	assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
	vm_page_unlock_queues();

	thread_block_parameter((thread_continue_t)vm_pageout_iothread_external_continue, (void *) q);
	/*NOTREACHED*/
}
4017 
4018 
/* max pages accumulated on a local freelist before a vm_page_free_list() flush */
#define         MAX_FREE_BATCH          32
uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
                                     * this thread.
                                     */
4023 
4024 
void
vm_pageout_iothread_internal_continue(struct cq *, __unused wait_result_t);
/*
 * Continuation for a compressor (internal pageout) thread: pull batches
 * of pages off the internal pageout queue, compress each one via
 * vm_pageout_compress_page(), and free successfully-compressed pages in
 * batches.  Blocks on its per-thread event when the queue is empty.
 * Never returns.
 */
void
vm_pageout_iothread_internal_continue(struct cq *cq, __unused wait_result_t w)
{
	struct vm_pageout_queue *q;
	vm_page_t       m = NULL;
	boolean_t       pgo_draining;
	vm_page_t   local_q;          /* batch pulled off q->pgo_pending, linked via vmp_snext */
	int         local_cnt;
	vm_page_t   local_freeq = NULL; /* compressed pages awaiting a batched free */
	int         local_freed = 0;
	int         local_batch_size;
#if DEVELOPMENT || DEBUG
	int       ncomps = 0;
	boolean_t marked_active = FALSE;
	int       num_pages_processed = 0;
#endif
	void *chead = NULL;           /* compressor segment head to fill */

	KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);

	q = cq->q;
#if DEVELOPMENT || DEBUG
	bool benchmark_accounting = false;
	/*
	 * If we're running the compressor perf test, only process the benchmark pages.
	 * We'll get back to our regular queue once the benchmark is done
	 */
	if (compressor_running_perf_test) {
		q = cq->benchmark_q;
		if (!vm_page_queue_empty(&q->pgo_pending)) {
			benchmark_accounting = true;
		} else {
			q = cq->q;
			benchmark_accounting = false;
		}
	}
#endif /* DEVELOPMENT || DEBUG */

#if __AMP__
	if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
		local_batch_size = (q->pgo_maxlaundry >> 3);
		local_batch_size = MAX(local_batch_size, 16);
	} else {
		local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
	}
#else
	local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
#endif

#if RECORD_THE_COMPRESSED_DATA
	if (q->pgo_laundry) {
		c_compressed_record_init();
	}
#endif
	while (TRUE) {
		int     pages_left_on_q = 0;

		local_cnt = 0;
		local_q = NULL;

		KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);

		vm_page_lock_queues();
#if DEVELOPMENT || DEBUG
		if (marked_active == FALSE) {
			vmct_active++;
			vmct_state[cq->id] = VMCT_ACTIVE;
			marked_active = TRUE;
			if (vmct_active == 1) {
				/* first compressor thread to go active starts the epoch clock */
				vm_compressor_epoch_start = mach_absolute_time();
			}
		}
#endif
		KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);

		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);

		/* pull up to local_batch_size pages off the pending queue under the lock */
		while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
			vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
			assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
			VM_PAGE_CHECK(m);

			m->vmp_q_state = VM_PAGE_NOT_ON_Q;
			VM_PAGE_ZERO_PAGEQ_ENTRY(m);
			m->vmp_laundry = FALSE;

			m->vmp_snext = local_q;
			local_q = m;
			local_cnt++;
		}
		if (local_q == NULL) {
			break;
		}

		q->pgo_busy = TRUE;

		if ((pgo_draining = q->pgo_draining) == FALSE) {
			vm_pageout_throttle_up_batch(q, local_cnt);
			pages_left_on_q = q->pgo_laundry;
		} else {
			pages_left_on_q = q->pgo_laundry - local_cnt;
		}

		vm_page_unlock_queues();

#if !RECORD_THE_COMPRESSED_DATA
		/* enough work remains: kick the next compressor thread too */
		if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
			thread_wakeup((event_t) ((uintptr_t)&cq->q->pgo_pending + cq->id + 1));
		}
#endif
		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);

		while (local_q) {
			KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);

			m = local_q;
			local_q = m->vmp_snext;
			m->vmp_snext = NULL;

			/*
			 * Technically we need the pageq locks to manipulate this field.
			 * However, this page has been removed from all queues and is only
			 * known to this compressor thread dealing with this local queue.
			 *
			 * TODO LIONEL: Add a second localq that is the early localq and
			 * put special pages like this one on that queue in the block above
			 * under the pageq lock to avoid this 'works but not clean' logic.
			 */
			void *donate_queue_head;
#if XNU_TARGET_OS_OSX
			donate_queue_head = &cq->current_early_swapout_chead;
#else /* XNU_TARGET_OS_OSX */
			donate_queue_head = &cq->current_late_swapout_chead;
#endif /* XNU_TARGET_OS_OSX */
			if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
				chead = donate_queue_head;
			} else {
				chead = &cq->current_regular_swapout_chead;
			}

			if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
#if DEVELOPMENT || DEBUG
				ncomps++;
#endif
				KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);

				/* compressed OK: stage the page for a batched free */
				m->vmp_snext = local_freeq;
				local_freeq = m;
				local_freed++;

				if (local_freed >= MAX_FREE_BATCH) {
					OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

					vm_page_free_list(local_freeq, TRUE);

					local_freeq = NULL;
					local_freed = 0;
				}
			}
#if DEVELOPMENT || DEBUG
			num_pages_processed++;
#endif /* DEVELOPMENT || DEBUG */
#if !CONFIG_JETSAM
			/*
			 * free pages are critically low: flush our local freelist,
			 * then wait (as a privileged waiter) for pages to come back.
			 */
			while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
				kern_return_t   wait_result;
				int             need_wakeup = 0;

				if (local_freeq) {
					OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

					vm_page_free_list(local_freeq, TRUE);
					local_freeq = NULL;
					local_freed = 0;

					continue;
				}
				vm_free_page_lock_spin();

				if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
					if (vm_page_free_wanted_privileged++ == 0) {
						need_wakeup = 1;
					}
					wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);

					vm_free_page_unlock();

					if (need_wakeup) {
						thread_wakeup((event_t)&vm_page_free_wanted);
					}

					if (wait_result == THREAD_WAITING) {
						thread_block(THREAD_CONTINUE_NULL);
					}
				} else {
					vm_free_page_unlock();
				}
			}
#endif
		}
		/* flush whatever is left on the local freelist */
		if (local_freeq) {
			OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

			vm_page_free_list(local_freeq, TRUE);
			local_freeq = NULL;
			local_freed = 0;
		}
		if (pgo_draining == TRUE) {
			vm_page_lockspin_queues();
			vm_pageout_throttle_up_batch(q, local_cnt);
			vm_page_unlock_queues();
		}
	}
	KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * queue lock is held and our q is empty
	 */
	q->pgo_busy = FALSE;
	q->pgo_idle = TRUE;

	assert_wait((event_t) ((uintptr_t)&cq->q->pgo_pending + cq->id), THREAD_UNINT);
#if DEVELOPMENT || DEBUG
	if (marked_active == TRUE) {
		vmct_active--;
		vmct_state[cq->id] = VMCT_IDLE;

		if (vmct_active == 0) {
			/* last compressor thread going idle closes the epoch */
			vm_compressor_epoch_stop = mach_absolute_time();
			assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
			    "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
			    vm_compressor_epoch_start, vm_compressor_epoch_stop);
			/* This interval includes intervals where one or more
			 * compressor threads were pre-empted
			 */
			vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
		}
	}
	if (compressor_running_perf_test && benchmark_accounting) {
		/*
		 * We could turn ON compressor_running_perf_test while still processing
		 * regular non-benchmark pages. We shouldn't count them here else we
		 * could overshoot. We might also still be populating that benchmark Q
		 * and be under pressure. So we will go back to the regular queues. And
		 * benchmark accounting will be off for that case too.
		 */
		compressor_perf_test_pages_processed += num_pages_processed;
		thread_wakeup(&compressor_perf_test_pages_processed);
	}
#endif
	vm_page_unlock_queues();
#if DEVELOPMENT || DEBUG
	if (__improbable(vm_compressor_time_thread)) {
		vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
		vmct_stats.vmct_pages[cq->id] += ncomps;
		vmct_stats.vmct_iterations[cq->id]++;
		if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
			vmct_stats.vmct_maxpages[cq->id] = ncomps;
		}
		if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
			vmct_stats.vmct_minpages[cq->id] = ncomps;
		}
	}
#endif

	KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);

	thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
	/*NOTREACHED*/
}
4297 
4298 
/*
 * Compress a single page into the compressor pager.
 *
 * current_chead: in/out compressor segment head being filled.
 * scratch_buf:   per-thread scratch buffer for the compressor.
 * m:             the page to compress; caller holds an activity_in_progress
 *                reference on its object (asserted below).
 *
 * On success the page is removed from its object and the caller is
 * expected to free it.  On failure the page is reactivated.
 * Returns the result of vm_compressor_pager_put(), or KERN_FAILURE if
 * no compressor pager could be created for the object.
 */
kern_return_t
vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
{
	vm_object_t     object;
	memory_object_t pager;
	int             compressed_count_delta;
	kern_return_t   retval;

	object = VM_PAGE_OBJECT(m);

	assert(!m->vmp_free_when_done);
	assert(!m->vmp_laundry);

	pager = object->pager;

	if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);

		vm_object_lock(object);

		/*
		 * If there is no memory object for the page, create
		 * one and hand it to the compression pager.
		 */

		if (!object->pager_initialized) {
			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
		}
		if (!object->pager_initialized) {
			vm_object_compressor_pager_create(object);
		}

		pager = object->pager;

		if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
			/*
			 * Still no pager for the object,
			 * or the pager has been destroyed.
			 * Reactivate the page.
			 *
			 * Should only happen if there is no
			 * compression pager
			 */
			PAGE_WAKEUP_DONE(m);

			vm_page_lockspin_queues();
			vm_page_activate(m);
			VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
			vm_page_unlock_queues();

			/*
			 *	And we are done with it.
			 */
			vm_object_activity_end(object);
			vm_object_unlock(object);

			return KERN_FAILURE;
		}
		vm_object_unlock(object);

		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
	}
	assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
	assert(object->activity_in_progress > 0);

	/* hand the page's contents to the compressor; object lock NOT held here */
	retval = vm_compressor_pager_put(
		pager,
		m->vmp_offset + object->paging_offset,
		VM_PAGE_GET_PHYS_PAGE(m),
		current_chead,
		scratch_buf,
		&compressed_count_delta);

	vm_object_lock(object);

	assert(object->activity_in_progress > 0);
	assert(VM_PAGE_OBJECT(m) == object);
	assert( !VM_PAGE_WIRED(m));

	vm_compressor_pager_count(pager,
	    compressed_count_delta,
	    FALSE,                       /* shared_lock */
	    object);

	if (retval == KERN_SUCCESS) {
		/*
		 * If the object is purgeable, its owner's
		 * purgeable ledgers will be updated in
		 * vm_page_remove() but the page still
		 * contributes to the owner's memory footprint,
		 * so account for it as such.
		 */
		if ((object->purgable != VM_PURGABLE_DENY ||
		    object->vo_ledger_tag) &&
		    object->vo_owner != NULL) {
			/* one more compressed purgeable/tagged page */
			vm_object_owner_compressed_update(object,
			    +1);
		}
		counter_inc(&vm_statistics_compressions);

		if (m->vmp_tabled) {
			vm_page_remove(m, TRUE);
		}
	} else {
		/* compression failed: put the page back on the active queue */
		PAGE_WAKEUP_DONE(m);

		vm_page_lockspin_queues();

		vm_page_activate(m);
		vm_pageout_vminfo.vm_compressor_failed++;

		vm_page_unlock_queues();
	}
	vm_object_activity_end(object);
	vm_object_unlock(object);

	return retval;
}
4418 
4419 
4420 static void
vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue * eq,boolean_t req_lowpriority)4421 vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *eq, boolean_t req_lowpriority)
4422 {
4423 	uint32_t        policy;
4424 
4425 	if (hibernate_cleaning_in_progress == TRUE) {
4426 		req_lowpriority = FALSE;
4427 	}
4428 
4429 	if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority) {
4430 		vm_page_unlock_queues();
4431 
4432 		if (req_lowpriority == TRUE) {
4433 			policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4434 			DTRACE_VM(laundrythrottle);
4435 		} else {
4436 			policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4437 			DTRACE_VM(laundryunthrottle);
4438 		}
4439 		proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid,
4440 		    TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4441 
4442 		vm_page_lock_queues();
4443 		eq->pgo_lowpriority = req_lowpriority;
4444 	}
4445 }
4446 
4447 
/*
 * One-time setup for the external pageout I/O thread: grant the VM
 * privilege, set the throttled I/O policy, publish this thread's id and
 * state into vm_pageout_queue_external, then enter the work loop via
 * vm_pageout_iothread_external_continue().  Never returns.
 */
static void
vm_pageout_iothread_external(__unused struct cq *c, __unused wait_result_t w)
{
	thread_t        self = current_thread();

	/* allow allocations from the VM-privileged reserve */
	self->options |= TH_OPT_VMPRIV;

	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);

	proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
	    TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);

	vm_page_lock_queues();

	vm_pageout_queue_external.pgo_tid = self->thread_id;
	vm_pageout_queue_external.pgo_lowpriority = TRUE;
	vm_pageout_queue_external.pgo_inited = TRUE;

	vm_page_unlock_queues();

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	vm_pageout_iothread_external_continue(&vm_pageout_queue_external, 0);

	/*NOTREACHED*/
}
4476 
4477 
/*
 * Startup entry for an internal (compressor) pageout I/O thread.
 *
 * Registers the thread with vm_pageout_queue_internal (and mirrors that
 * state into the benchmark queue on DEVELOPMENT/DEBUG kernels), applies
 * CPU binding/thread-group policy, then enters the processing loop in
 * vm_pageout_iothread_internal_continue(), which never returns.
 */
static void
vm_pageout_iothread_internal(struct cq *cq, __unused wait_result_t w)
{
	thread_t        self = current_thread();

	self->options |= TH_OPT_VMPRIV;

	vm_page_lock_queues();

	/* Publish this thread's identity/state under the queues lock. */
	vm_pageout_queue_internal.pgo_tid = self->thread_id;
	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
	vm_pageout_queue_internal.pgo_inited = TRUE;

#if DEVELOPMENT || DEBUG
	/* The benchmark queue shares this thread; keep its bookkeeping in sync. */
	vm_pageout_queue_benchmark.pgo_tid = vm_pageout_queue_internal.pgo_tid;
	vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
	vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
	vm_pageout_queue_benchmark.pgo_idle = TRUE;
	vm_pageout_queue_benchmark.pgo_busy = FALSE;
#endif /* DEVELOPMENT || DEBUG */

	vm_page_unlock_queues();

	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
		thread_vm_bind_group_add();
	}

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
	if (vm_compressor_ebound) {
		/*
		 * Use the soft bound option for vm_compressor to allow it to run on
		 * P-cores if E-cluster is unavailable.
		 */
		thread_bind_cluster_type(self, 'E', true);
	}
#endif /* __AMP__ */

	thread_set_thread_name(current_thread(), "VM_compressor");
#if DEVELOPMENT || DEBUG
	/* Seed the per-thread minimum-pages stat so the first sample always wins. */
	vmct_stats.vmct_minpages[cq->id] = INT32_MAX;
#endif
	vm_pageout_iothread_internal_continue(cq, 0);

	/*NOTREACHED*/
}
4527 
4528 kern_return_t
vm_set_buffer_cleanup_callout(boolean_t (* func)(int))4529 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4530 {
4531 	if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4532 		return KERN_SUCCESS;
4533 	} else {
4534 		return KERN_FAILURE; /* Already set */
4535 	}
4536 }
4537 
4538 extern boolean_t        memorystatus_manual_testing_on;
4539 extern unsigned int     memorystatus_level;
4540 
4541 
4542 #if VM_PRESSURE_EVENTS
4543 
4544 boolean_t vm_pressure_events_enabled = FALSE;
4545 
4546 extern uint64_t next_warning_notification_sent_at_ts;
4547 extern uint64_t next_critical_notification_sent_at_ts;
4548 
4549 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS    (30)    /* 30 minutes. */
4550 
4551 /*
4552  * The last time there was change in pressure level OR we forced a check
4553  * because the system is stuck in a non-normal pressure level.
4554  */
4555 uint64_t  vm_pressure_last_level_transition_abs = 0;
4556 
4557 /*
 * This is how long the system waits 'stuck' in an unchanged non-normal pressure
4559  * level before resending out notifications for that level again.
4560  */
4561 int  vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4562 
/*
 * Recompute the system's memory-pressure level and, when warranted, wake
 * the pressure thread to dispatch notifications.
 *
 * Updates memorystatus_level (percentage of available memory) and walks a
 * small state machine over memorystatus_vm_pressure_level.  If the system
 * has been stuck at a non-normal level longer than
 * vm_pressure_level_transition_threshold minutes, notifications for the
 * current level are re-armed ("force_check") even without a transition.
 * No-op when pressure events are disabled or manual testing is active.
 */
void
vm_pressure_response(void)
{
	vm_pressure_level_t     old_level = kVMPressureNormal;
	int                     new_level = -1;
	unsigned int            total_pages;
	uint64_t                available_memory = 0;
	uint64_t                curr_ts, abs_time_since_level_transition, time_in_ns;
	bool                    force_check = false;
	int                     time_in_mins;


	if (vm_pressure_events_enabled == FALSE) {
		return;
	}

#if !XNU_TARGET_OS_OSX

	available_memory = (uint64_t) memorystatus_available_pages;

#else /* !XNU_TARGET_OS_OSX */

	available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
	memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;

#endif /* !XNU_TARGET_OS_OSX */

	total_pages = (unsigned int) atop_64(max_mem);
#if CONFIG_SECLUDED_MEMORY
	total_pages -= vm_page_secluded_count;
#endif /* CONFIG_SECLUDED_MEMORY */
	/* memorystatus_level is the percentage of memory still available. */
	memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);

	if (memorystatus_manual_testing_on) {
		return;
	}

	/* Determine whether we've been stuck at this level long enough to re-notify. */
	curr_ts = mach_absolute_time();
	abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;

	absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
	time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
	force_check = (time_in_mins >= vm_pressure_level_transition_threshold);

	old_level = memorystatus_vm_pressure_level;

	switch (memorystatus_vm_pressure_level) {
	case kVMPressureNormal:
	{
		if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			new_level = kVMPressureCritical;
		} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
			new_level = kVMPressureWarning;
		}
		break;
	}

	case kVMPressureWarning:
	case kVMPressureUrgent:
	{
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			new_level = kVMPressureNormal;
		} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			new_level = kVMPressureCritical;
		} else if (force_check) {
			/* Stuck at warning: re-arm warning notifications. */
			new_level = kVMPressureWarning;
			next_warning_notification_sent_at_ts = curr_ts;
		}
		break;
	}

	case kVMPressureCritical:
	{
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			new_level = kVMPressureNormal;
		} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
			new_level = kVMPressureWarning;
		} else if (force_check) {
			/* Stuck at critical: re-arm critical notifications. */
			new_level = kVMPressureCritical;
			next_critical_notification_sent_at_ts = curr_ts;
		}
		break;
	}

	default:
		return;
	}

	if (new_level != -1 || force_check) {
		if (new_level != -1) {
			memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;

			if (new_level != (int) old_level) {
				VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
				    new_level, old_level, 0, 0);
			}
		} else {
			VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
			    new_level, old_level, force_check, 0);
		}

		if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
			/*
			 * We don't want to schedule a wakeup while hibernation is in progress
			 * because that could collide with checks for non-monotonicity in the scheduler.
			 * We do however do all the updates to memorystatus_vm_pressure_level because
			 * we _might_ want to use that for decisions regarding which pages or how
			 * many pages we want to dump in hibernation.
			 */
			return;
		}

		if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
			/* Wake the pressure thread unless it is already processing events. */
			if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
				thread_wakeup(&vm_pressure_thread);
			}

			/* Unblock mach_vm_pressure_level_monitor() waiters on a level change. */
			if (old_level != memorystatus_vm_pressure_level) {
				thread_wakeup(&vm_pageout_state.vm_pressure_changed);
			}
			vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
		}
	}
}
4687 #endif /* VM_PRESSURE_EVENTS */
4688 
4689 /*
4690  * Function called by a kernel thread to either get the current pressure level or
4691  * wait until memory pressure changes from a given level.
4692  */
/*
 * Function called by a kernel thread to either get the current pressure level or
 * wait until memory pressure changes from a given level.
 *
 * With wait_for_pressure == FALSE the current level is returned immediately
 * in *pressure_level.  With wait_for_pressure == TRUE the caller blocks
 * (interruptibly) until the level differs from *pressure_level; a request
 * for kVMPressureJetsam instead waits on the jetsam foreground-band event.
 *
 * Returns KERN_FAILURE when pressure events are compiled out,
 * KERN_INVALID_ARGUMENT for a NULL pointer or a non-waiting jetsam query,
 * KERN_ABORTED if the wait was interrupted, KERN_SUCCESS otherwise.
 */
kern_return_t
mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
{
#if !VM_PRESSURE_EVENTS

	return KERN_FAILURE;

#else /* VM_PRESSURE_EVENTS */

	wait_result_t       wr = 0;
	vm_pressure_level_t old_level = memorystatus_vm_pressure_level;

	if (pressure_level == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (*pressure_level == kVMPressureJetsam) {
		/* Jetsam monitoring is wait-only. */
		if (!wait_for_pressure) {
			return KERN_INVALID_ARGUMENT;
		}

		lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
		wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
		    THREAD_INTERRUPTIBLE);
		if (wr == THREAD_WAITING) {
			/* Count ourselves as a waiter before dropping the lock and blocking. */
			++memorystatus_jetsam_fg_band_waiters;
			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
			wr = thread_block(THREAD_CONTINUE_NULL);
		} else {
			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
		}
		if (wr != THREAD_AWAKENED) {
			return KERN_ABORTED;
		}
		*pressure_level = kVMPressureJetsam;
		return KERN_SUCCESS;
	}

	if (wait_for_pressure == TRUE) {
		/* Loop: spurious wakeups re-check whether the level really changed. */
		while (old_level == *pressure_level) {
			wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
			    THREAD_INTERRUPTIBLE);
			if (wr == THREAD_WAITING) {
				wr = thread_block(THREAD_CONTINUE_NULL);
			}
			if (wr == THREAD_INTERRUPTED) {
				return KERN_ABORTED;
			}

			if (wr == THREAD_AWAKENED) {
				old_level = memorystatus_vm_pressure_level;
			}
		}
	}

	*pressure_level = old_level;
	return KERN_SUCCESS;
#endif /* VM_PRESSURE_EVENTS */
}
4752 
4753 #if VM_PRESSURE_EVENTS
/*
 * Body of the kernel memory-pressure thread.
 *
 * Uses itself as its own continuation: the very first entry only performs
 * one-time setup (thread group, name) and then blocks; every subsequent
 * wakeup (see vm_pressure_response()) runs consider_vm_pressure_events()
 * with vm_pressure_thread_running set around the call, then blocks again.
 */
void
vm_pressure_thread(void)
{
	static boolean_t thread_initialized = FALSE;

	if (thread_initialized == TRUE) {
		vm_pageout_state.vm_pressure_thread_running = TRUE;
		consider_vm_pressure_events();
		vm_pageout_state.vm_pressure_thread_running = FALSE;
	}

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	thread_set_thread_name(current_thread(), "VM_pressure");
	thread_initialized = TRUE;
	/* Sleep until the next wakeup; re-enter this function as the continuation. */
	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
	thread_block((thread_continue_t)vm_pressure_thread);
}
4774 #endif /* VM_PRESSURE_EVENTS */
4775 
4776 
4777 /*
4778  * called once per-second via "compute_averages"
4779  */
4780 void
compute_pageout_gc_throttle(__unused void * arg)4781 compute_pageout_gc_throttle(__unused void *arg)
4782 {
4783 	if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4784 		vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4785 
4786 		thread_wakeup(VM_PAGEOUT_GC_EVENT);
4787 	}
4788 }
4789 
4790 /*
4791  * vm_pageout_garbage_collect can also be called when the zone allocator needs
4792  * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4793  * jetsams. We need to check if the zone map size is above its jetsam limit to
4794  * decide if this was indeed the case.
4795  *
4796  * We need to do this on a different thread because of the following reasons:
4797  *
4798  * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4799  * itself causing the system to hang. We perform synchronous jetsams if we're
4800  * leaking in the VM map entries zone, so the leaking process could be doing a
4801  * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4802  * jetsam itself. We also need the vm_map lock on the process termination path,
4803  * which would now lead the dying process to deadlock against itself.
4804  *
4805  * 2. The jetsam path might need to allocate zone memory itself. We could try
4806  * using the non-blocking variant of zalloc for this path, but we can still
4807  * end up trying to do a kmem_alloc when the zone maps are almost full.
4808  */
/*
 * Body of the pageout garbage-collection thread; never returns (it blocks
 * with itself as continuation).  First invocation (VM_PAGEOUT_GC_INIT) only
 * does setup; later wakeups either run a zone-map-exhaustion jetsam pass or
 * a full cache/zone trimming pass (see the block comment above).
 */
__dead2
void
vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
{
	assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);

	if (step == VM_PAGEOUT_GC_INIT) {
		/* first time being called is not about GC */
#if CONFIG_THREAD_GROUPS
		thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */
	} else if (zone_map_nearing_exhaustion()) {
		/*
		 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
		 *
		 * Bail out after calling zone_gc (which triggers the
		 * zone-map-exhaustion jetsams). If we fall through, the subsequent
		 * operations that clear out a bunch of caches might allocate zone
		 * memory themselves (for eg. vm_map operations would need VM map
		 * entries). Since the zone map is almost full at this point, we
		 * could end up with a panic. We just need to quickly jetsam a
		 * process and exit here.
		 *
		 * It could so happen that we were woken up to relieve memory
		 * pressure and the zone map also happened to be near its limit at
		 * the time, in which case we'll skip out early. But that should be
		 * ok; if memory pressure persists, the thread will simply be woken
		 * up again.
		 */
		zone_gc(ZONE_GC_JETSAM);
	} else {
		/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
		boolean_t buf_large_zfree = FALSE;
		boolean_t first_try = TRUE;

		stack_collect();

		consider_machine_collect();
		mbuf_drain(FALSE);

		/* Keep trimming while the buffer cache keeps freeing and memory is short. */
		do {
			if (consider_buffer_cache_collect != NULL) {
				buf_large_zfree = (*consider_buffer_cache_collect)(0);
			}
			if (first_try == TRUE || buf_large_zfree == TRUE) {
				/*
				 * zone_gc should be last, because the other operations
				 * might return memory to zones.
				 */
				zone_gc(ZONE_GC_TRIM);
			}
			first_try = FALSE;
		} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);

		consider_machine_adjust();
	}

	/* Block until the next wakeup; re-enter as a COLLECT step. */
	assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);

	thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
	__builtin_unreachable();
}
4871 
4872 
4873 #if VM_PAGE_BUCKETS_CHECK
4874 #if VM_PAGE_FAKE_BUCKETS
4875 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4876 #endif /* VM_PAGE_FAKE_BUCKETS */
4877 #endif /* VM_PAGE_BUCKETS_CHECK */
4878 
4879 
4880 
4881 void
vm_set_restrictions(unsigned int num_cpus)4882 vm_set_restrictions(unsigned int num_cpus)
4883 {
4884 	int vm_restricted_to_single_processor = 0;
4885 
4886 	if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
4887 		kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
4888 		vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
4889 	} else {
4890 		assert(num_cpus > 0);
4891 
4892 		if (num_cpus <= 3) {
4893 			/*
4894 			 * on systems with a limited number of CPUS, bind the
4895 			 * 4 major threads that can free memory and that tend to use
4896 			 * a fair bit of CPU under pressured conditions to a single processor.
4897 			 * This insures that these threads don't hog all of the available CPUs
4898 			 * (important for camera launch), while allowing them to run independently
4899 			 * w/r to locks... the 4 threads are
4900 			 * vm_pageout_scan,  vm_pageout_iothread_internal (compressor),
4901 			 * vm_compressor_swap_trigger_thread (minor and major compactions),
4902 			 * memorystatus_thread (jetsams).
4903 			 *
4904 			 * the first time the thread is run, it is responsible for checking the
4905 			 * state of vm_restricted_to_single_processor, and if TRUE it calls
4906 			 * thread_bind_master...  someday this should be replaced with a group
4907 			 * scheduling mechanism and KPI.
4908 			 */
4909 			vm_pageout_state.vm_restricted_to_single_processor = TRUE;
4910 		} else {
4911 			vm_pageout_state.vm_restricted_to_single_processor = FALSE;
4912 		}
4913 	}
4914 }
4915 
4916 /*
4917  * Set up vm_config based on the vm_compressor_mode.
4918  * Must run BEFORE the pageout thread starts up.
4919  */
4920 __startup_func
4921 void
vm_config_init(void)4922 vm_config_init(void)
4923 {
4924 	bzero(&vm_config, sizeof(vm_config));
4925 
4926 	switch (vm_compressor_mode) {
4927 	case VM_PAGER_DEFAULT:
4928 		printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
4929 		OS_FALLTHROUGH;
4930 
4931 	case VM_PAGER_COMPRESSOR_WITH_SWAP:
4932 		vm_config.compressor_is_present = TRUE;
4933 		vm_config.swap_is_present = TRUE;
4934 		vm_config.compressor_is_active = TRUE;
4935 		vm_config.swap_is_active = TRUE;
4936 		break;
4937 
4938 	case VM_PAGER_COMPRESSOR_NO_SWAP:
4939 		vm_config.compressor_is_present = TRUE;
4940 		vm_config.swap_is_present = TRUE;
4941 		vm_config.compressor_is_active = TRUE;
4942 		break;
4943 
4944 	case VM_PAGER_FREEZER_DEFAULT:
4945 		printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
4946 		OS_FALLTHROUGH;
4947 
4948 	case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
4949 		vm_config.compressor_is_present = TRUE;
4950 		vm_config.swap_is_present = TRUE;
4951 		break;
4952 
4953 	case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
4954 		vm_config.compressor_is_present = TRUE;
4955 		vm_config.swap_is_present = TRUE;
4956 		vm_config.compressor_is_active = TRUE;
4957 		vm_config.freezer_swap_is_active = TRUE;
4958 		break;
4959 
4960 	case VM_PAGER_NOT_CONFIGURED:
4961 		break;
4962 
4963 	default:
4964 		printf("unknown compressor mode - %x\n", vm_compressor_mode);
4965 		break;
4966 	}
4967 }
4968 
4969 __startup_func
4970 static void
vm_pageout_create_gc_thread(void)4971 vm_pageout_create_gc_thread(void)
4972 {
4973 	thread_t thread;
4974 
4975 	if (kernel_thread_create(vm_pageout_garbage_collect,
4976 	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
4977 		panic("vm_pageout_garbage_collect: create failed");
4978 	}
4979 	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
4980 	if (thread->reserved_stack == 0) {
4981 		assert(thread->kernel_stack);
4982 		thread->reserved_stack = thread->kernel_stack;
4983 	}
4984 
4985 	/* thread is started in vm_pageout() */
4986 	vm_pageout_gc_thread = thread;
4987 }
4988 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
4989 
/*
 * Main entry of the pageout daemon (the VM_pageout_scan thread).
 *
 * Performs one-time setup in a strict order: scheduling policy and
 * privileges for this thread, initialization of vm_pageout_state defaults,
 * free-page reserve recalculation, pageout queue initialization, creation
 * of the external I/O thread and (if configured) pressure thread, starting
 * the pre-created GC thread, compressor/reaper/phantom-cache init, and
 * finally enters vm_pageout_continue(), which never returns.
 */
void
vm_pageout(void)
{
	thread_t        self = current_thread();
	thread_t        thread;
	kern_return_t   result;
	spl_t           s;

	/*
	 * Set thread privileges.
	 */
	s = splsched();

#if CONFIG_VPS_DYNAMIC_PRIO

	int             vps_dynprio_bootarg = 0;

	/* Boot-arg override; otherwise follow the single-processor restriction. */
	if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) {
		vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? TRUE : FALSE);
		kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled);
	} else {
		if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
			vps_dynamic_priority_enabled = TRUE;
		} else {
			vps_dynamic_priority_enabled = FALSE;
		}
	}

	if (vps_dynamic_priority_enabled) {
		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
		thread_set_eager_preempt(self);
	} else {
		sched_set_kernel_thread_priority(self, BASEPRI_VM);
	}

#else /* CONFIG_VPS_DYNAMIC_PRIO */

	vps_dynamic_priority_enabled = FALSE;
	sched_set_kernel_thread_priority(self, BASEPRI_VM);

#endif /* CONFIG_VPS_DYNAMIC_PRIO */

	thread_lock(self);
	self->options |= TH_OPT_VMPRIV;
	thread_unlock(self);

	/* Keep this stack reserved so pageout can always make progress. */
	if (!self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
	    vps_dynamic_priority_enabled == FALSE) {
		thread_vm_bind_group_add();
	}


#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
	if (vm_pgo_pbound) {
		/*
		 * Use the soft bound option for vm pageout to allow it to run on
		 * E-cores if P-cluster is unavailable.
		 */
		thread_bind_cluster_type(self, 'P', true);
	}
#endif /* __AMP__ */

	splx(s);

	thread_set_thread_name(current_thread(), "VM_pageout_scan");

	/*
	 *	Initialize some paging parameters.
	 */

	vm_pageout_state.vm_pressure_thread_running = FALSE;
	vm_pageout_state.vm_pressure_changed = FALSE;
	vm_pageout_state.memorystatus_purge_on_warning = 2;
	vm_pageout_state.memorystatus_purge_on_urgent = 5;
	vm_pageout_state.memorystatus_purge_on_critical = 8;
	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
	vm_pageout_state.vm_page_speculative_percentage = 5;
	vm_pageout_state.vm_page_speculative_target = 0;

	vm_pageout_state.vm_pageout_external_iothread = THREAD_NULL;
	vm_pageout_state.vm_pageout_internal_iothread = THREAD_NULL;

	vm_pageout_state.vm_pageout_swap_wait = 0;
	vm_pageout_state.vm_pageout_idle_wait = 0;
	vm_pageout_state.vm_pageout_empty_wait = 0;
	vm_pageout_state.vm_pageout_burst_wait = 0;
	vm_pageout_state.vm_pageout_deadlock_wait = 0;
	vm_pageout_state.vm_pageout_deadlock_relief = 0;
	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;

	vm_pageout_state.vm_pageout_inactive = 0;
	vm_pageout_state.vm_pageout_inactive_used = 0;
	vm_pageout_state.vm_pageout_inactive_clean = 0;

	vm_pageout_state.vm_memory_pressure = 0;
	vm_pageout_state.vm_page_filecache_min = 0;
#if CONFIG_JETSAM
	vm_pageout_state.vm_page_filecache_min_divisor = 70;
	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
#else
	vm_pageout_state.vm_page_filecache_min_divisor = 27;
	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
#endif
	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;

	vm_pageout_state.vm_pageout_considered_page_last = 0;

	/* Fill in any wait parameters still at zero with their defaults. */
	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
	}

	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
	}

	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
	}

	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
	}

	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
	}

	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
	}

	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
	}
	/*
	 * even if we've already called vm_page_free_reserve
	 * call it again here to insure that the targets are
	 * accurately calculated (it uses vm_page_free_count_init)
	 * calling it with an arg of 0 will not change the reserve
	 * but will re-calculate free_min and free_target
	 */
	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
	} else {
		vm_page_free_reserve(0);
	}

	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));

	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
	vm_pageout_queue_external.pgo_tid = -1;

	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
	vm_pageout_queue_internal.pgo_tid = -1;

#if DEVELOPMENT || DEBUG
	bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
	vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
	vm_pageout_queue_internal.pgo_tid = -1;
#endif /* DEVELOPMENT || DEBUG */


	/* internal pageout thread started when default pager registered first time */
	/* external pageout and garbage collection threads started here */

	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
	    BASEPRI_VM,
	    &vm_pageout_state.vm_pageout_external_iothread);
	if (result != KERN_SUCCESS) {
		panic("vm_pageout_iothread_external: create failed");
	}
	thread_set_thread_name(vm_pageout_state.vm_pageout_external_iothread, "VM_pageout_external_iothread");
	thread_deallocate(vm_pageout_state.vm_pageout_external_iothread);

	/* Start the GC thread created earlier by vm_pageout_create_gc_thread(). */
	thread_mtx_lock(vm_pageout_gc_thread );
	thread_start(vm_pageout_gc_thread );
	thread_mtx_unlock(vm_pageout_gc_thread);

#if VM_PRESSURE_EVENTS
	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
	    BASEPRI_DEFAULT,
	    &thread);

	if (result != KERN_SUCCESS) {
		panic("vm_pressure_thread: create failed");
	}

	thread_deallocate(thread);
#endif

	vm_object_reaper_init();


	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
		vm_compressor_init();
	}

#if VM_PRESSURE_EVENTS
	vm_pressure_events_enabled = TRUE;
#endif /* VM_PRESSURE_EVENTS */

#if CONFIG_PHANTOM_CACHE
	vm_phantom_cache_init();
#endif
#if VM_PAGE_BUCKETS_CHECK
#if VM_PAGE_FAKE_BUCKETS
	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
	    (uint64_t) vm_page_fake_buckets_start,
	    (uint64_t) vm_page_fake_buckets_end);
	pmap_protect(kernel_pmap,
	    vm_page_fake_buckets_start,
	    vm_page_fake_buckets_end,
	    VM_PROT_READ);
//	*(char *) vm_page_fake_buckets_start = 'x';	/* panic! */
#endif /* VM_PAGE_FAKE_BUCKETS */
#endif /* VM_PAGE_BUCKETS_CHECK */

#if VM_OBJECT_TRACKING
	vm_object_tracking_init();
#endif /* VM_OBJECT_TRACKING */

#if __arm64__
//	vm_tests();
#endif /* __arm64__ */

	vm_pageout_continue();

	/*
	 * Unreached code!
	 *
	 * The vm_pageout_continue() call above never returns, so the code below is never
	 * executed.  We take advantage of this to declare several DTrace VM related probe
	 * points that our kernel doesn't have an analog for.  These are probe points that
	 * exist in Solaris and are in the DTrace documentation, so people may have written
	 * scripts that use them.  Declaring the probe points here means their scripts will
	 * compile and execute which we want for portability of the scripts, but since this
	 * section of code is never reached, the probe points will simply never fire.  Yes,
	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
	 * Solaris specific VM events in mind, not portability to different VM implementations.
	 */

	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
	/*NOTREACHED*/
}
5251 
5252 
5253 
/*
 * Start the internal (compressor) pageout I/O threads.
 *
 * Picks a compressor thread count from the platform/CPU count (overridable
 * via the "vmcomp_threads" boot-arg), clamps it to
 * [1, MAX_COMPRESSOR_THREAD_COUNT) and below the CPU count, sizes the
 * internal queue's laundry limit accordingly, carves one scratch buffer per
 * thread out of a single permanent kernel allocation, and spawns one
 * vm_pageout_iothread_internal per ciq[] slot.  Returns the result of the
 * first failed thread creation, or KERN_SUCCESS.
 */
kern_return_t
vm_pageout_internal_start(void)
{
	kern_return_t   result = KERN_SUCCESS;
	host_basic_info_data_t hinfo;
	vm_offset_t     buf, bufsize;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
#define BSD_HOST 1
	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);

	assert(hinfo.max_cpus > 0);

#if !XNU_TARGET_OS_OSX
	vm_pageout_state.vm_compressor_thread_count = 1;
#else /* !XNU_TARGET_OS_OSX */
	if (hinfo.max_cpus > 4) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	} else {
		vm_pageout_state.vm_compressor_thread_count = 1;
	}
#endif /* !XNU_TARGET_OS_OSX */
	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
	    sizeof(vm_pageout_state.vm_compressor_thread_count));

#if     __AMP__
	PE_parse_boot_argn("vmcomp_ecluster", &vm_compressor_ebound, sizeof(vm_compressor_ebound));
	if (vm_compressor_ebound) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	}
#endif
	/* Clamp: always leave at least one CPU free, keep count in [1, MAX]. */
	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
	}
	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
		vm_pageout_state.vm_compressor_thread_count = 1;
	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
	}

	vm_pageout_queue_internal.pgo_maxlaundry =
	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;

	PE_parse_boot_argn("vmpgoi_maxlaundry",
	    &vm_pageout_queue_internal.pgo_maxlaundry,
	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));

#if DEVELOPMENT || DEBUG
	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
#endif /* DEVELOPMENT || DEBUG */

	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;

	/* One permanent allocation, later subdivided per compressor thread. */
	kmem_alloc(kernel_map, &buf,
	    bufsize * vm_pageout_state.vm_compressor_thread_count,
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_COMPRESSOR);

	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
		ciq[i].id = i;
		ciq[i].q = &vm_pageout_queue_internal;
		ciq[i].current_early_swapout_chead = NULL;
		ciq[i].current_regular_swapout_chead = NULL;
		ciq[i].current_late_swapout_chead = NULL;
		ciq[i].scratch_buf = (char *)(buf + i * bufsize);
#if DEVELOPMENT || DEBUG
		ciq[i].benchmark_q = &vm_pageout_queue_benchmark;
#endif /* DEVELOPMENT || DEBUG */

		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
		    (void *)&ciq[i], BASEPRI_VM,
		    &vm_pageout_state.vm_pageout_internal_iothread);

		if (result == KERN_SUCCESS) {
			thread_deallocate(vm_pageout_state.vm_pageout_internal_iothread);
		} else {
			break;
		}
	}
	return result;
}
5337 
5338 #if CONFIG_IOSCHED
5339 /*
5340  * To support I/O Expedite for compressed files we mark the upls with special flags.
5341  * The way decmpfs works is that we create a big upl which marks all the pages needed to
5342  * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5343  * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5344  * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5345  * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5346  * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
 * by the req upl lock (the reverse link doesn't need synch. since we never inspect this link
5348  * unless the real I/O upl is being destroyed).
5349  */
5350 
5351 
/*
 * Link a real-I/O UPL to its originating decmpfs request UPL so the I/O
 * scheduler can expedite the smaller real I/Os on behalf of the big request.
 * Takes an extra reference on src_upl; it is dropped in upl_destroy() when
 * the real-I/O UPL goes away.
 */
static void
upl_set_decmp_info(upl_t upl, upl_t src_upl)
{
	/* caller must pass a UPL previously marked as a decmpfs request */
	assert((src_upl->flags & UPL_DECMP_REQ) != 0);

	upl_lock(src_upl);
	if (src_upl->decmp_io_upl) {
		/*
		 * If there is already an alive real I/O UPL, ignore this new UPL.
		 * This case should rarely happen and even if it does, it just means
		 * that we might issue a spurious expedite which the driver is expected
		 * to handle.
		 */
		upl_unlock(src_upl);
		return;
	}
	/* forward link (req -> real I/O), protected by the req upl's lock */
	src_upl->decmp_io_upl = (void *)upl;
	src_upl->ref_count++;

	upl->flags |= UPL_DECMP_REAL_IO;
	/* reverse link (real I/O -> req); only read when upl is destroyed */
	upl->decmp_io_upl = (void *)src_upl;
	upl_unlock(src_upl);
}
5375 #endif /* CONFIG_IOSCHED */
5376 
/*
 * Non-zero on UPL_DEBUG builds: forces every UPL to be tracked by its
 * object (UPL_TRACKED_BY_OBJECT) so creation backtraces and the object's
 * uplq bookkeeping are maintained (see upl_create/upl_destroy).
 */
#if UPL_DEBUG
int     upl_debug_enabled = 1;
#else
int     upl_debug_enabled = 0;
#endif
5382 
/*
 * Allocate and initialize a new UPL.
 *
 * type:  UPL_CREATE_* bits selecting the memory layout (LITE page bitmap,
 *        INTERNAL upl_page_info array) and optional I/O tracking /
 *        expedite support.
 * flags: initial UPL_* flags OR'ed into upl->flags.
 * size:  byte size of the range the UPL will describe (page aligned).
 *
 * The upl structure, the internal page-info array and the lite bitmap are
 * carved out of one kheap allocation; upl_destroy() recomputes the same
 * layout to free it.  Returns a UPL holding one reference; release with
 * upl_deallocate().
 */
static upl_t
upl_create(int type, int flags, upl_size_t size)
{
	upl_t   upl;
	vm_size_t       page_field_size = 0;    /* bytes for the LITE page bitmap */
	int     upl_flags = 0;
	vm_size_t       upl_size  = sizeof(struct upl);

	assert(page_aligned(size));

	size = round_page_32(size);

	if (type & UPL_CREATE_LITE) {
		/* one bit per page, rounded up to a 4-byte multiple */
		page_field_size = (atop(size) + 7) >> 3;
		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;

		upl_flags |= UPL_LITE;
	}
	if (type & UPL_CREATE_INTERNAL) {
		/* append a upl_page_info entry per page after the upl struct */
		upl_size += sizeof(struct upl_page_info) * atop(size);

		upl_flags |= UPL_INTERNAL;
	}
	// rdar://88964158
	/* BEGIN IGNORE CODESTYLE */
	__typed_allocators_ignore_push
	upl = (upl_t)kheap_alloc(KHEAP_DEFAULT, upl_size + page_field_size, Z_WAITOK | Z_ZERO);
	__typed_allocators_ignore_pop
	/* END IGNORE CODESTYLE */

	/* Z_ZERO above zero-fills; only non-zero fields need explicit init.
	 * NOTE(review): no NULL check on the allocation — presumably Z_WAITOK
	 * is treated as must-succeed here; confirm against kheap_alloc contract. */
	upl->flags = upl_flags | flags;
	upl->ref_count = 1;
	upl_lock_init(upl);
#if CONFIG_IOSCHED
	if (type & UPL_CREATE_IO_TRACKING) {
		/* record the issuing thread's I/O policy for the scheduler */
		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
	}

	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
		/* Only support expedite on internal UPLs */
		thread_t        curthread = current_thread();
		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * atop(size), Z_WAITOK | Z_ZERO);
		upl->flags |= UPL_EXPEDITE_SUPPORTED;
		/* if this thread has a pending decmpfs request UPL, link to it */
		if (curthread->decmp_upl != NULL) {
			upl_set_decmp_info(upl, curthread->decmp_upl);
		}
	}
#endif
#if CONFIG_IOSCHED || UPL_DEBUG
	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
		upl->upl_creator = current_thread();
		upl->flags |= UPL_TRACKED_BY_OBJECT;
	}
#endif

#if UPL_DEBUG
	(void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
#endif /* UPL_DEBUG */

	return upl;
}
5444 
/*
 * Tear down a UPL whose last reference has been dropped (called from
 * upl_deallocate()).  Unlinks any decmpfs request UPL, removes the UPL
 * from its object's tracking queue, and frees the combined allocation
 * made in upl_create().  Panics if external references remain.
 */
static void
upl_destroy(upl_t upl)
{
	int     page_field_size;  /* bit field in word size buf */
	int     size;

//	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);

	if (upl->ext_ref_count) {
		panic("upl(%p) ext_ref_count", upl);
	}

#if CONFIG_IOSCHED
	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
		/*
		 * We are a real-I/O UPL linked to a decmpfs request UPL:
		 * break the forward link under the request UPL's lock and
		 * drop the reference taken in upl_set_decmp_info().
		 */
		upl_t src_upl;
		src_upl = upl->decmp_io_upl;
		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
		upl_lock(src_upl);
		src_upl->decmp_io_upl = NULL;
		upl_unlock(src_upl);
		upl_deallocate(src_upl);
	}
#endif /* CONFIG_IOSCHED */

#if CONFIG_IOSCHED || UPL_DEBUG
	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
	    !(upl->flags & UPL_VECTOR)) {
		vm_object_t     object;

		/* for shadowed UPLs the tracked object is the shadow's target */
		if (upl->flags & UPL_SHADOWED) {
			object = upl->map_object->shadow;
		} else {
			object = upl->map_object;
		}

		/* remove from the object's list of outstanding UPLs */
		vm_object_lock(object);
		queue_remove(&object->uplq, upl, upl_t, uplq);
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
		vm_object_unlock(object);
	}
#endif
	/*
	 * drop a reference on the map_object whether or
	 * not a pageout object is inserted
	 */
	if (upl->flags & UPL_SHADOWED) {
		vm_object_deallocate(upl->map_object);
	}

	/* recompute the trailer sizes exactly as upl_create() laid them out */
	if (upl->flags & UPL_DEVICE_MEMORY) {
		size = PAGE_SIZE;
	} else {
		size = upl_adjusted_size(upl, PAGE_MASK);
	}
	page_field_size = 0;

	if (upl->flags & UPL_LITE) {
		page_field_size = ((size / PAGE_SIZE) + 7) >> 3;
		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
	}
	upl_lock_destroy(upl);
	/* poison the vector pointer to catch use-after-free */
	upl->vector_upl = (vector_upl_t) 0xfeedbeef;

#if CONFIG_IOSCHED
	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * (size / PAGE_SIZE));
	}
#endif

	// rdar://88964158
	__typed_allocators_ignore_push
	if (upl->flags & UPL_INTERNAL) {
		kheap_free(KHEAP_DEFAULT, upl,
		    sizeof(struct upl) +
		    (sizeof(struct upl_page_info) * (size / PAGE_SIZE))
		    + page_field_size);
	} else {
		kheap_free(KHEAP_DEFAULT, upl, sizeof(struct upl) + page_field_size);
	}
	__typed_allocators_ignore_pop
}
5527 
/*
 * Drop one reference on a UPL.  When the last reference goes away, run
 * any pending iodone callout and destroy the UPL.
 */
void
upl_deallocate(upl_t upl)
{
	upl_lock(upl);

	if (--upl->ref_count == 0) {
		if (vector_upl_is_valid(upl)) {
			vector_upl_deallocate(upl);
		}
		/*
		 * Safe to drop the lock before touching upl further:
		 * we hold the last reference, so nobody else can see it.
		 */
		upl_unlock(upl);

		if (upl->upl_iodone) {
			upl_callout_iodone(upl);
		}

		upl_destroy(upl);
	} else {
		upl_unlock(upl);
	}
}
5548 
5549 #if CONFIG_IOSCHED
5550 void
upl_mark_decmp(upl_t upl)5551 upl_mark_decmp(upl_t upl)
5552 {
5553 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5554 		upl->flags |= UPL_DECMP_REQ;
5555 		upl->upl_creator->decmp_upl = (void *)upl;
5556 	}
5557 }
5558 
5559 void
upl_unmark_decmp(upl_t upl)5560 upl_unmark_decmp(upl_t upl)
5561 {
5562 	if (upl && (upl->flags & UPL_DECMP_REQ)) {
5563 		upl->upl_creator->decmp_upl = NULL;
5564 	}
5565 }
5566 
5567 #endif /* CONFIG_IOSCHED */
5568 
5569 #define VM_PAGE_Q_BACKING_UP(q)         \
5570 	((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5571 
5572 boolean_t must_throttle_writes(void);
5573 
5574 boolean_t
must_throttle_writes()5575 must_throttle_writes()
5576 {
5577 	if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5578 	    vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5579 		return TRUE;
5580 	}
5581 
5582 	return FALSE;
5583 }
5584 
/* count of delayed-work context allocation misses (zone was exhausted) */
int vm_page_delayed_work_ctx_needed = 0;
/* typed zone backing vm_page_delayed_work_get_ctx / _finish_ctx */
KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5587 
5588 __startup_func
5589 static void
vm_page_delayed_work_init_ctx(void)5590 vm_page_delayed_work_init_ctx(void)
5591 {
5592 	uint16_t min_delayed_work_ctx_allocated = 16;
5593 
5594 	/*
5595 	 * try really hard to always keep NCPU elements around in the zone
5596 	 * in order for the UPL code to almost always get an element.
5597 	 */
5598 	if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5599 		min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5600 	}
5601 
5602 	zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5603 }
5604 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5605 
5606 struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)5607 vm_page_delayed_work_get_ctx(void)
5608 {
5609 	struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5610 
5611 	dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5612 
5613 	if (__probable(dw_ctx)) {
5614 		dw_ctx->delayed_owner = current_thread();
5615 	} else {
5616 		vm_page_delayed_work_ctx_needed++;
5617 	}
5618 	return dw_ctx ? dw_ctx->dwp : NULL;
5619 }
5620 
5621 void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work * dwp)5622 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5623 {
5624 	struct  vm_page_delayed_work_ctx *ldw_ctx;
5625 
5626 	ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5627 	ldw_ctx->delayed_owner = NULL;
5628 
5629 	zfree(dw_ctx_zone, ldw_ctx);
5630 }
5631 
5632 /*
5633  *	Routine:	vm_object_upl_request
5634  *	Purpose:
5635  *		Cause the population of a portion of a vm_object.
5636  *		Depending on the nature of the request, the pages
 *		returned may contain valid data or be uninitialized.
5638  *		A page list structure, listing the physical pages
5639  *		will be returned upon request.
5640  *		This function is called by the file system or any other
5641  *		supplier of backing store to a pager.
5642  *		IMPORTANT NOTE: The caller must still respect the relationship
5643  *		between the vm_object and its backing memory object.  The
5644  *		caller MUST NOT substitute changes in the backing file
5645  *		without first doing a memory_object_lock_request on the
 *		target range unless it is known that the pages are not
5647  *		shared with another entity at the pager level.
5648  *		Copy_in_to:
5649  *			if a page list structure is present
5650  *			return the mapped physical pages, where a
5651  *			page is not present, return a non-initialized
5652  *			one.  If the no_sync bit is turned on, don't
5653  *			call the pager unlock to synchronize with other
5654  *			possible copies of the page. Leave pages busy
5655  *			in the original object, if a page list structure
5656  *			was specified.  When a commit of the page list
5657  *			pages is done, the dirty bit will be set for each one.
5658  *		Copy_out_from:
5659  *			If a page list structure is present, return
5660  *			all mapped pages.  Where a page does not exist
5661  *			map a zero filled one. Leave pages busy in
5662  *			the original object.  If a page list structure
5663  *			is not specified, this call is a no-op.
5664  *
5665  *		Note:  access of default pager objects has a rather interesting
5666  *		twist.  The caller of this routine, presumably the file system
5667  *		page cache handling code, will never actually make a request
5668  *		against a default pager backed object.  Only the default
5669  *		pager will make requests on backing store related vm_objects
5670  *		In this way the default pager can maintain the relationship
5671  *		between backing store files (abstract memory objects) and
5672  *		the vm_objects (cache objects), they support.
5673  *
5674  */
5675 
5676 __private_extern__ kern_return_t
vm_object_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)5677 vm_object_upl_request(
5678 	vm_object_t             object,
5679 	vm_object_offset_t      offset,
5680 	upl_size_t              size,
5681 	upl_t                   *upl_ptr,
5682 	upl_page_info_array_t   user_page_list,
5683 	unsigned int            *page_list_count,
5684 	upl_control_flags_t     cntrl_flags,
5685 	vm_tag_t                tag)
5686 {
5687 	vm_page_t               dst_page = VM_PAGE_NULL;
5688 	vm_object_offset_t      dst_offset;
5689 	upl_size_t              xfer_size;
5690 	unsigned int            size_in_pages;
5691 	boolean_t               dirty;
5692 	boolean_t               hw_dirty;
5693 	upl_t                   upl = NULL;
5694 	unsigned int            entry;
5695 	vm_page_t               alias_page = NULL;
5696 	int                     refmod_state = 0;
5697 	wpl_array_t             lite_list = NULL;
5698 	vm_object_t             last_copy_object;
5699 	struct  vm_page_delayed_work    dw_array;
5700 	struct  vm_page_delayed_work    *dwp, *dwp_start;
5701 	bool                    dwp_finish_ctx = TRUE;
5702 	int                     dw_count;
5703 	int                     dw_limit;
5704 	int                     io_tracking_flag = 0;
5705 	int                     grab_options;
5706 	int                     page_grab_count = 0;
5707 	ppnum_t                 phys_page;
5708 	pmap_flush_context      pmap_flush_context_storage;
5709 	boolean_t               pmap_flushes_delayed = FALSE;
5710 #if DEVELOPMENT || DEBUG
5711 	task_t                  task = current_task();
5712 #endif /* DEVELOPMENT || DEBUG */
5713 
5714 	dwp_start = dwp = NULL;
5715 
5716 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
5717 		/*
5718 		 * For forward compatibility's sake,
5719 		 * reject any unknown flag.
5720 		 */
5721 		return KERN_INVALID_VALUE;
5722 	}
5723 	if ((!object->internal) && (object->paging_offset != 0)) {
5724 		panic("vm_object_upl_request: external object with non-zero paging offset");
5725 	}
5726 	if (object->phys_contiguous) {
5727 		panic("vm_object_upl_request: contiguous object specified");
5728 	}
5729 
5730 	assertf(page_aligned(offset) && page_aligned(size),
5731 	    "offset 0x%llx size 0x%x",
5732 	    offset, size);
5733 
5734 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5735 
5736 	dw_count = 0;
5737 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5738 	dwp_start = vm_page_delayed_work_get_ctx();
5739 	if (dwp_start == NULL) {
5740 		dwp_start = &dw_array;
5741 		dw_limit = 1;
5742 		dwp_finish_ctx = FALSE;
5743 	}
5744 
5745 	dwp = dwp_start;
5746 
5747 	if (size > MAX_UPL_SIZE_BYTES) {
5748 		size = MAX_UPL_SIZE_BYTES;
5749 	}
5750 
5751 	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5752 		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5753 	}
5754 
5755 #if CONFIG_IOSCHED || UPL_DEBUG
5756 	if (object->io_tracking || upl_debug_enabled) {
5757 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5758 	}
5759 #endif
5760 #if CONFIG_IOSCHED
5761 	if (object->io_tracking) {
5762 		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5763 	}
5764 #endif
5765 
5766 	if (cntrl_flags & UPL_SET_INTERNAL) {
5767 		if (cntrl_flags & UPL_SET_LITE) {
5768 			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5769 
5770 			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5771 			lite_list = (wpl_array_t)
5772 			    (((uintptr_t)user_page_list) +
5773 			    ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
5774 			if (size == 0) {
5775 				user_page_list = NULL;
5776 				lite_list = NULL;
5777 			}
5778 		} else {
5779 			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5780 
5781 			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5782 			if (size == 0) {
5783 				user_page_list = NULL;
5784 			}
5785 		}
5786 	} else {
5787 		if (cntrl_flags & UPL_SET_LITE) {
5788 			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5789 
5790 			lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5791 			if (size == 0) {
5792 				lite_list = NULL;
5793 			}
5794 		} else {
5795 			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5796 		}
5797 	}
5798 	*upl_ptr = upl;
5799 
5800 	if (user_page_list) {
5801 		user_page_list[0].device = FALSE;
5802 	}
5803 
5804 	if (cntrl_flags & UPL_SET_LITE) {
5805 		upl->map_object = object;
5806 	} else {
5807 		upl->map_object = vm_object_allocate(size);
5808 		/*
		 * No need to lock the new object: nobody else knows
5810 		 * about it yet, so it's all ours so far.
5811 		 */
5812 		upl->map_object->shadow = object;
5813 		upl->map_object->pageout = TRUE;
5814 		upl->map_object->can_persist = FALSE;
5815 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5816 		upl->map_object->vo_shadow_offset = offset;
5817 		upl->map_object->wimg_bits = object->wimg_bits;
5818 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
5819 		    "object %p shadow_offset 0x%llx",
5820 		    upl->map_object, upl->map_object->vo_shadow_offset);
5821 
5822 		alias_page = vm_page_grab_fictitious(TRUE);
5823 
5824 		upl->flags |= UPL_SHADOWED;
5825 	}
5826 	if (cntrl_flags & UPL_FOR_PAGEOUT) {
5827 		upl->flags |= UPL_PAGEOUT;
5828 	}
5829 
5830 	vm_object_lock(object);
5831 	vm_object_activity_begin(object);
5832 
5833 	grab_options = 0;
5834 #if CONFIG_SECLUDED_MEMORY
5835 	if (object->can_grab_secluded) {
5836 		grab_options |= VM_PAGE_GRAB_SECLUDED;
5837 	}
5838 #endif /* CONFIG_SECLUDED_MEMORY */
5839 
5840 	/*
5841 	 * we can lock in the paging_offset once paging_in_progress is set
5842 	 */
5843 	upl->u_size = size;
5844 	upl->u_offset = offset + object->paging_offset;
5845 
5846 #if CONFIG_IOSCHED || UPL_DEBUG
5847 	if (object->io_tracking || upl_debug_enabled) {
5848 		vm_object_activity_begin(object);
5849 		queue_enter(&object->uplq, upl, upl_t, uplq);
5850 	}
5851 #endif
5852 	if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
5853 		/*
5854 		 * Honor copy-on-write obligations
5855 		 *
5856 		 * The caller is gathering these pages and
5857 		 * might modify their contents.  We need to
5858 		 * make sure that the copy object has its own
5859 		 * private copies of these pages before we let
5860 		 * the caller modify them.
5861 		 */
5862 		vm_object_update(object,
5863 		    offset,
5864 		    size,
5865 		    NULL,
5866 		    NULL,
5867 		    FALSE,              /* should_return */
5868 		    MEMORY_OBJECT_COPY_SYNC,
5869 		    VM_PROT_NO_CHANGE);
5870 
5871 		VM_PAGEOUT_DEBUG(upl_cow, 1);
5872 		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
5873 	}
5874 	/*
5875 	 * remember which copy object we synchronized with
5876 	 */
5877 	last_copy_object = object->copy;
5878 	entry = 0;
5879 
5880 	xfer_size = size;
5881 	dst_offset = offset;
5882 	size_in_pages = size / PAGE_SIZE;
5883 
5884 	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5885 	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
5886 		object->scan_collisions = 0;
5887 	}
5888 
5889 	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5890 		boolean_t       isSSD = FALSE;
5891 
5892 #if !XNU_TARGET_OS_OSX
5893 		isSSD = TRUE;
5894 #else /* !XNU_TARGET_OS_OSX */
5895 		vnode_pager_get_isSSD(object->pager, &isSSD);
5896 #endif /* !XNU_TARGET_OS_OSX */
5897 		vm_object_unlock(object);
5898 
5899 		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5900 
5901 		if (isSSD == TRUE) {
5902 			delay(1000 * size_in_pages);
5903 		} else {
5904 			delay(5000 * size_in_pages);
5905 		}
5906 		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5907 
5908 		vm_object_lock(object);
5909 	}
5910 
5911 	while (xfer_size) {
5912 		dwp->dw_mask = 0;
5913 
5914 		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
5915 			vm_object_unlock(object);
5916 			alias_page = vm_page_grab_fictitious(TRUE);
5917 			vm_object_lock(object);
5918 		}
5919 		if (cntrl_flags & UPL_COPYOUT_FROM) {
5920 			upl->flags |= UPL_PAGE_SYNC_DONE;
5921 
5922 			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
5923 			    dst_page->vmp_fictitious ||
5924 			    dst_page->vmp_absent ||
5925 			    VMP_ERROR_GET(dst_page) ||
5926 			    dst_page->vmp_cleaning ||
5927 			    (VM_PAGE_WIRED(dst_page))) {
5928 				if (user_page_list) {
5929 					user_page_list[entry].phys_addr = 0;
5930 				}
5931 
5932 				goto try_next_page;
5933 			}
5934 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
5935 
5936 			/*
5937 			 * grab this up front...
			 * a high percentage of the time we're going to
5939 			 * need the hardware modification state a bit later
5940 			 * anyway... so we can eliminate an extra call into
5941 			 * the pmap layer by grabbing it here and recording it
5942 			 */
5943 			if (dst_page->vmp_pmapped) {
5944 				refmod_state = pmap_get_refmod(phys_page);
5945 			} else {
5946 				refmod_state = 0;
5947 			}
5948 
5949 			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
5950 				/*
5951 				 * page is on inactive list and referenced...
5952 				 * reactivate it now... this gets it out of the
5953 				 * way of vm_pageout_scan which would have to
5954 				 * reactivate it upon tripping over it
5955 				 */
5956 				dwp->dw_mask |= DW_vm_page_activate;
5957 			}
5958 			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
5959 				/*
5960 				 * we're only asking for DIRTY pages to be returned
5961 				 */
5962 				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
5963 					/*
5964 					 * if we were the page stolen by vm_pageout_scan to be
5965 					 * cleaned (as opposed to a buddy being clustered in
5966 					 * or this request is not being driven by a PAGEOUT cluster
5967 					 * then we only need to check for the page being dirty or
5968 					 * precious to decide whether to return it
5969 					 */
5970 					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
5971 						goto check_busy;
5972 					}
5973 					goto dont_return;
5974 				}
5975 				/*
5976 				 * this is a request for a PAGEOUT cluster and this page
5977 				 * is merely along for the ride as a 'buddy'... not only
5978 				 * does it have to be dirty to be returned, but it also
5979 				 * can't have been referenced recently...
5980 				 */
5981 				if ((hibernate_cleaning_in_progress == TRUE ||
5982 				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
5983 				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
5984 				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
5985 					goto check_busy;
5986 				}
5987 dont_return:
5988 				/*
5989 				 * if we reach here, we're not to return
5990 				 * the page... go on to the next one
5991 				 */
5992 				if (dst_page->vmp_laundry == TRUE) {
5993 					/*
5994 					 * if we get here, the page is not 'cleaning' (filtered out above).
5995 					 * since it has been referenced, remove it from the laundry
5996 					 * so we don't pay the cost of an I/O to clean a page
5997 					 * we're just going to take back
5998 					 */
5999 					vm_page_lockspin_queues();
6000 
6001 					vm_pageout_steal_laundry(dst_page, TRUE);
6002 					vm_page_activate(dst_page);
6003 
6004 					vm_page_unlock_queues();
6005 				}
6006 				if (user_page_list) {
6007 					user_page_list[entry].phys_addr = 0;
6008 				}
6009 
6010 				goto try_next_page;
6011 			}
6012 check_busy:
6013 			if (dst_page->vmp_busy) {
6014 				if (cntrl_flags & UPL_NOBLOCK) {
6015 					if (user_page_list) {
6016 						user_page_list[entry].phys_addr = 0;
6017 					}
6018 					dwp->dw_mask = 0;
6019 
6020 					goto try_next_page;
6021 				}
6022 				/*
6023 				 * someone else is playing with the
6024 				 * page.  We will have to wait.
6025 				 */
6026 				PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6027 
6028 				continue;
6029 			}
6030 			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6031 				vm_page_lockspin_queues();
6032 
6033 				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6034 					/*
6035 					 * we've buddied up a page for a clustered pageout
6036 					 * that has already been moved to the pageout
6037 					 * queue by pageout_scan... we need to remove
6038 					 * it from the queue and drop the laundry count
6039 					 * on that queue
6040 					 */
6041 					vm_pageout_throttle_up(dst_page);
6042 				}
6043 				vm_page_unlock_queues();
6044 			}
6045 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6046 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6047 
6048 			if (phys_page > upl->highest_page) {
6049 				upl->highest_page = phys_page;
6050 			}
6051 
6052 			assert(!pmap_is_noencrypt(phys_page));
6053 
6054 			if (cntrl_flags & UPL_SET_LITE) {
6055 				unsigned int    pg_num;
6056 
6057 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6058 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6059 				lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
6060 
6061 				if (hw_dirty) {
6062 					if (pmap_flushes_delayed == FALSE) {
6063 						pmap_flush_context_init(&pmap_flush_context_storage);
6064 						pmap_flushes_delayed = TRUE;
6065 					}
6066 					pmap_clear_refmod_options(phys_page,
6067 					    VM_MEM_MODIFIED,
6068 					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6069 					    &pmap_flush_context_storage);
6070 				}
6071 
6072 				/*
6073 				 * Mark original page as cleaning
6074 				 * in place.
6075 				 */
6076 				dst_page->vmp_cleaning = TRUE;
6077 				dst_page->vmp_precious = FALSE;
6078 			} else {
6079 				/*
6080 				 * use pageclean setup, it is more
6081 				 * convenient even for the pageout
6082 				 * cases here
6083 				 */
6084 				vm_object_lock(upl->map_object);
6085 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6086 				vm_object_unlock(upl->map_object);
6087 
6088 				alias_page->vmp_absent = FALSE;
6089 				alias_page = NULL;
6090 			}
6091 			if (dirty) {
6092 				SET_PAGE_DIRTY(dst_page, FALSE);
6093 			} else {
6094 				dst_page->vmp_dirty = FALSE;
6095 			}
6096 
6097 			if (!dirty) {
6098 				dst_page->vmp_precious = TRUE;
6099 			}
6100 
6101 			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6102 				if (!VM_PAGE_WIRED(dst_page)) {
6103 					dst_page->vmp_free_when_done = TRUE;
6104 				}
6105 			}
6106 		} else {
6107 			if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
6108 				/*
6109 				 * Honor copy-on-write obligations
6110 				 *
6111 				 * The copy object has changed since we
6112 				 * last synchronized for copy-on-write.
6113 				 * Another copy object might have been
6114 				 * inserted while we released the object's
6115 				 * lock.  Since someone could have seen the
6116 				 * original contents of the remaining pages
6117 				 * through that new object, we have to
6118 				 * synchronize with it again for the remaining
6119 				 * pages only.  The previous pages are "busy"
6120 				 * so they can not be seen through the new
6121 				 * mapping.  The new mapping will see our
6122 				 * upcoming changes for those previous pages,
6123 				 * but that's OK since they couldn't see what
6124 				 * was there before.  It's just a race anyway
6125 				 * and there's no guarantee of consistency or
6126 				 * atomicity.  We just don't want new mappings
6127 				 * to see both the *before* and *after* pages.
6128 				 */
6129 				if (object->copy != VM_OBJECT_NULL) {
6130 					vm_object_update(
6131 						object,
6132 						dst_offset,/* current offset */
6133 						xfer_size, /* remaining size */
6134 						NULL,
6135 						NULL,
6136 						FALSE,     /* should_return */
6137 						MEMORY_OBJECT_COPY_SYNC,
6138 						VM_PROT_NO_CHANGE);
6139 
6140 					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6141 					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6142 				}
6143 				/*
6144 				 * remember the copy object we synced with
6145 				 */
6146 				last_copy_object = object->copy;
6147 			}
6148 			dst_page = vm_page_lookup(object, dst_offset);
6149 
6150 			if (dst_page != VM_PAGE_NULL) {
6151 				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6152 					/*
6153 					 * skip over pages already present in the cache
6154 					 */
6155 					if (user_page_list) {
6156 						user_page_list[entry].phys_addr = 0;
6157 					}
6158 
6159 					goto try_next_page;
6160 				}
6161 				if (dst_page->vmp_fictitious) {
6162 					panic("need corner case for fictitious page");
6163 				}
6164 
6165 				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6166 					/*
6167 					 * someone else is playing with the
6168 					 * page.  We will have to wait.
6169 					 */
6170 					PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6171 
6172 					continue;
6173 				}
6174 				if (dst_page->vmp_laundry) {
6175 					vm_pageout_steal_laundry(dst_page, FALSE);
6176 				}
6177 			} else {
6178 				if (object->private) {
6179 					/*
6180 					 * This is a nasty wrinkle for users
6181 					 * of upl who encounter device or
6182 					 * private memory however, it is
6183 					 * unavoidable, only a fault can
6184 					 * resolve the actual backing
6185 					 * physical page by asking the
6186 					 * backing device.
6187 					 */
6188 					if (user_page_list) {
6189 						user_page_list[entry].phys_addr = 0;
6190 					}
6191 
6192 					goto try_next_page;
6193 				}
6194 				if (object->scan_collisions) {
6195 					/*
6196 					 * the pageout_scan thread is trying to steal
6197 					 * pages from this object, but has run into our
6198 					 * lock... grab 2 pages from the head of the object...
6199 					 * the first is freed on behalf of pageout_scan, the
6200 					 * 2nd is for our own use... we use vm_object_page_grab
6201 					 * in both cases to avoid taking pages from the free
6202 					 * list since we are under memory pressure and our
6203 					 * lock on this object is getting in the way of
6204 					 * relieving it
6205 					 */
6206 					dst_page = vm_object_page_grab(object);
6207 
6208 					if (dst_page != VM_PAGE_NULL) {
6209 						vm_page_release(dst_page,
6210 						    FALSE);
6211 					}
6212 
6213 					dst_page = vm_object_page_grab(object);
6214 				}
6215 				if (dst_page == VM_PAGE_NULL) {
6216 					/*
6217 					 * need to allocate a page
6218 					 */
6219 					dst_page = vm_page_grab_options(grab_options);
6220 					if (dst_page != VM_PAGE_NULL) {
6221 						page_grab_count++;
6222 					}
6223 				}
6224 				if (dst_page == VM_PAGE_NULL) {
6225 					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6226 						/*
6227 						 * we don't want to stall waiting for pages to come onto the free list
6228 						 * while we're already holding absent pages in this UPL
6229 						 * the caller will deal with the empty slots
6230 						 */
6231 						if (user_page_list) {
6232 							user_page_list[entry].phys_addr = 0;
6233 						}
6234 
6235 						goto try_next_page;
6236 					}
6237 					/*
6238 					 * no pages available... wait
6239 					 * then try again for the same
6240 					 * offset...
6241 					 */
6242 					vm_object_unlock(object);
6243 
6244 					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6245 
6246 					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6247 
6248 					VM_PAGE_WAIT();
6249 					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6250 
6251 					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6252 
6253 					vm_object_lock(object);
6254 
6255 					continue;
6256 				}
6257 				vm_page_insert(dst_page, object, dst_offset);
6258 
6259 				dst_page->vmp_absent = TRUE;
6260 				dst_page->vmp_busy = FALSE;
6261 
6262 				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6263 					/*
6264 					 * if UPL_RET_ONLY_ABSENT was specified,
					 * then we're definitely setting up a
6266 					 * upl for a clustered read/pagein
6267 					 * operation... mark the pages as clustered
6268 					 * so upl_commit_range can put them on the
6269 					 * speculative list
6270 					 */
6271 					dst_page->vmp_clustered = TRUE;
6272 
6273 					if (!(cntrl_flags & UPL_FILE_IO)) {
6274 						counter_inc(&vm_statistics_pageins);
6275 					}
6276 				}
6277 			}
6278 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6279 
6280 			dst_page->vmp_overwriting = TRUE;
6281 
6282 			if (dst_page->vmp_pmapped) {
6283 				if (!(cntrl_flags & UPL_FILE_IO)) {
6284 					/*
6285 					 * eliminate all mappings from the
6286 					 * original object and its prodigy
6287 					 */
6288 					refmod_state = pmap_disconnect(phys_page);
6289 				} else {
6290 					refmod_state = pmap_get_refmod(phys_page);
6291 				}
6292 			} else {
6293 				refmod_state = 0;
6294 			}
6295 
6296 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6297 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6298 
6299 			if (cntrl_flags & UPL_SET_LITE) {
6300 				unsigned int    pg_num;
6301 
6302 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6303 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6304 				lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
6305 
6306 				if (hw_dirty) {
6307 					pmap_clear_modify(phys_page);
6308 				}
6309 
6310 				/*
6311 				 * Mark original page as cleaning
6312 				 * in place.
6313 				 */
6314 				dst_page->vmp_cleaning = TRUE;
6315 				dst_page->vmp_precious = FALSE;
6316 			} else {
6317 				/*
6318 				 * use pageclean setup, it is more
6319 				 * convenient even for the pageout
6320 				 * cases here
6321 				 */
6322 				vm_object_lock(upl->map_object);
6323 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6324 				vm_object_unlock(upl->map_object);
6325 
6326 				alias_page->vmp_absent = FALSE;
6327 				alias_page = NULL;
6328 			}
6329 
6330 			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6331 				upl->flags &= ~UPL_CLEAR_DIRTY;
6332 				upl->flags |= UPL_SET_DIRTY;
6333 				dirty = TRUE;
6334 				/*
6335 				 * Page belonging to a code-signed object is about to
6336 				 * be written. Mark it tainted and disconnect it from
6337 				 * all pmaps so processes have to fault it back in and
6338 				 * deal with the tainted bit.
6339 				 */
6340 				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6341 					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6342 					vm_page_upl_tainted++;
6343 					if (dst_page->vmp_pmapped) {
6344 						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6345 						if (refmod_state & VM_MEM_REFERENCED) {
6346 							dst_page->vmp_reference = TRUE;
6347 						}
6348 					}
6349 				}
6350 			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6351 				/*
6352 				 * clean in place for read implies
6353 				 * that a write will be done on all
6354 				 * the pages that are dirty before
6355 				 * a upl commit is done.  The caller
6356 				 * is obligated to preserve the
6357 				 * contents of all pages marked dirty
6358 				 */
6359 				upl->flags |= UPL_CLEAR_DIRTY;
6360 			}
6361 			dst_page->vmp_dirty = dirty;
6362 
6363 			if (!dirty) {
6364 				dst_page->vmp_precious = TRUE;
6365 			}
6366 
6367 			if (!VM_PAGE_WIRED(dst_page)) {
6368 				/*
6369 				 * deny access to the target page while
6370 				 * it is being worked on
6371 				 */
6372 				dst_page->vmp_busy = TRUE;
6373 			} else {
6374 				dwp->dw_mask |= DW_vm_page_wire;
6375 			}
6376 
6377 			/*
6378 			 * We might be about to satisfy a fault which has been
6379 			 * requested. So no need for the "restart" bit.
6380 			 */
6381 			dst_page->vmp_restart = FALSE;
6382 			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6383 				/*
6384 				 * expect the page to be used
6385 				 */
6386 				dwp->dw_mask |= DW_set_reference;
6387 			}
6388 			if (cntrl_flags & UPL_PRECIOUS) {
6389 				if (object->internal) {
6390 					SET_PAGE_DIRTY(dst_page, FALSE);
6391 					dst_page->vmp_precious = FALSE;
6392 				} else {
6393 					dst_page->vmp_precious = TRUE;
6394 				}
6395 			} else {
6396 				dst_page->vmp_precious = FALSE;
6397 			}
6398 		}
6399 		if (dst_page->vmp_busy) {
6400 			upl->flags |= UPL_HAS_BUSY;
6401 		}
6402 
6403 		if (phys_page > upl->highest_page) {
6404 			upl->highest_page = phys_page;
6405 		}
6406 		assert(!pmap_is_noencrypt(phys_page));
6407 		if (user_page_list) {
6408 			user_page_list[entry].phys_addr = phys_page;
6409 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
6410 			user_page_list[entry].absent    = dst_page->vmp_absent;
6411 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
6412 			user_page_list[entry].precious  = dst_page->vmp_precious;
6413 			user_page_list[entry].device    = FALSE;
6414 			user_page_list[entry].needed    = FALSE;
6415 			if (dst_page->vmp_clustered == TRUE) {
6416 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6417 			} else {
6418 				user_page_list[entry].speculative = FALSE;
6419 			}
6420 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6421 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6422 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6423 			user_page_list[entry].mark      = FALSE;
6424 		}
6425 		/*
6426 		 * if UPL_RET_ONLY_ABSENT is set, then
6427 		 * we are working with a fresh page and we've
6428 		 * just set the clustered flag on it to
6429 		 * indicate that it was drug in as part of a
6430 		 * speculative cluster... so leave it alone
6431 		 */
6432 		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6433 			/*
6434 			 * someone is explicitly grabbing this page...
6435 			 * update clustered and speculative state
6436 			 *
6437 			 */
6438 			if (dst_page->vmp_clustered) {
6439 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
6440 			}
6441 		}
6442 try_next_page:
6443 		if (dwp->dw_mask) {
6444 			if (dwp->dw_mask & DW_vm_page_activate) {
6445 				counter_inc(&vm_statistics_reactivations);
6446 			}
6447 
6448 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6449 
6450 			if (dw_count >= dw_limit) {
6451 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6452 
6453 				dwp = dwp_start;
6454 				dw_count = 0;
6455 			}
6456 		}
6457 		entry++;
6458 		dst_offset += PAGE_SIZE_64;
6459 		xfer_size -= PAGE_SIZE;
6460 	}
6461 	if (dw_count) {
6462 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6463 		dwp = dwp_start;
6464 		dw_count = 0;
6465 	}
6466 
6467 	if (alias_page != NULL) {
6468 		VM_PAGE_FREE(alias_page);
6469 	}
6470 	if (pmap_flushes_delayed == TRUE) {
6471 		pmap_flush(&pmap_flush_context_storage);
6472 	}
6473 
6474 	if (page_list_count != NULL) {
6475 		if (upl->flags & UPL_INTERNAL) {
6476 			*page_list_count = 0;
6477 		} else if (*page_list_count > entry) {
6478 			*page_list_count = entry;
6479 		}
6480 	}
6481 #if UPL_DEBUG
6482 	upl->upl_state = 1;
6483 #endif
6484 	vm_object_unlock(object);
6485 
6486 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6487 #if DEVELOPMENT || DEBUG
6488 	if (task != NULL) {
6489 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
6490 	}
6491 #endif /* DEVELOPMENT || DEBUG */
6492 
6493 	if (dwp_start && dwp_finish_ctx) {
6494 		vm_page_delayed_work_finish_ctx(dwp_start);
6495 		dwp_start = dwp = NULL;
6496 	}
6497 
6498 	return KERN_SUCCESS;
6499 }
6500 
/*
 *	Routine:	vm_object_super_upl_request
 *	Purpose:
 *		Cause the population of a portion of a vm_object
 *		in much the same way as memory_object_upl_request.
 *		Depending on the nature of the request, the pages
 *		returned may contain valid data or be uninitialized.
 *		However, the region may be expanded up to the super
 *		cluster size provided.
 */

__private_extern__ kern_return_t
vm_object_super_upl_request(
	vm_object_t object,
	vm_object_offset_t      offset,
	upl_size_t              size,
	upl_size_t              super_cluster,
	upl_t                   *upl,
	upl_page_info_t         *user_page_list,
	unsigned int            *page_list_count,
	upl_control_flags_t     cntrl_flags,
	vm_tag_t                tag)
{
	/*
	 * "offset" must not precede the pager-backed range of the object,
	 * and vector UPLs are not supported at this level.
	 */
	if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
		return KERN_FAILURE;
	}

	assert(object->paging_in_progress);
	/* convert from a memory-object (pager) offset to a vm_object offset */
	offset = offset - object->paging_offset;

	if (super_cluster > size) {
		vm_object_offset_t      base_offset;
		upl_size_t              super_size;
		vm_object_size_t        super_size_64;

		/* align the start of the request down to a super-cluster boundary */
		base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
		/* if the request spills past this cluster, take two clusters */
		super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
		/* never extend past the end of the object */
		super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
		super_size = (upl_size_t) super_size_64;
		assert(super_size == super_size_64);

		if (offset > (base_offset + super_size)) {
			panic("vm_object_super_upl_request: Missed target pageout"
			    " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
			    offset, base_offset, super_size, super_cluster,
			    size, object->paging_offset);
		}
		/*
		 * apparently there is a case where the vm requests a
		 * page to be written out whose offset is beyond the
		 * object size
		 */
		if ((offset + size) > (base_offset + super_size)) {
			super_size_64 = (offset + size) - base_offset;
			super_size = (upl_size_t) super_size_64;
			assert(super_size == super_size_64);
		}

		offset = base_offset;
		size = super_size;
	}
	return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
}
6564 
/*
 * Count of UPLs created over executable mappings; incremented in
 * vm_map_create_upl() when a UPL is requested against a range with
 * VM_PROT_EXECUTE (may later surface as code-signing violations).
 */
int cs_executable_create_upl = 0;
/* BSD-layer helpers used for diagnostic output below */
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
6568 
/*
 * vm_map_create_upl:
 *
 * Create a UPL describing the memory backing [offset, offset + *upl_size)
 * in "map".  This routine resolves the map entry for the range (descending
 * through submaps, honoring copy-on-write, and forcing the backing object
 * to "true_share"/COPY_DELAY where needed so the UPL points at the same
 * physical memory as the mapping), then hands off to
 * vm_object_iopl_request() on the resolved VM object.
 *
 * On return, *upl_size may have been reduced to what the map entry could
 * cover, and *flags reports UPL_DEV_MEMORY / UPL_PHYS_CONTIG properties
 * of the backing object.  Returns KERN_SUCCESS or a failure code; with
 * UPL_QUERY_OBJECT_TYPE only *flags is computed and no UPL is created.
 */
kern_return_t
vm_map_create_upl(
	vm_map_t                map,
	vm_map_address_t        offset,
	upl_size_t              *upl_size,
	upl_t                   *upl,
	upl_page_info_array_t   page_list,
	unsigned int            *count,
	upl_control_flags_t     *flags,
	vm_tag_t                tag)
{
	vm_map_entry_t          entry;
	upl_control_flags_t     caller_flags;
	int                     force_data_sync;
	int                     sync_cow_data;
	vm_object_t             local_object;
	vm_map_offset_t         local_offset;
	vm_map_offset_t         local_start;
	kern_return_t           ret;
	vm_map_address_t        original_offset;
	vm_map_size_t           original_size, adjusted_size;
	vm_map_offset_t         local_entry_start;
	vm_object_offset_t      local_entry_offset;
	vm_object_offset_t      offset_in_mapped_page;
	boolean_t               release_map = FALSE;

	/* re-entered with a new "map"/"offset" when we descend into a submap */
start_with_map:

	original_offset = offset;
	original_size = *upl_size;
	adjusted_size = original_size;

	caller_flags = *flags;

	if (caller_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		ret = KERN_INVALID_VALUE;
		goto done;
	}
	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);

	if (upl == NULL) {
		ret = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/*
	 * Re-entered whenever the map lock has been dropped and the
	 * entry must be looked up again from scratch.
	 */
REDISCOVER_ENTRY:
	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, offset, &entry)) {
		vm_map_unlock_read(map);
		ret = KERN_FAILURE;
		goto done;
	}

	local_entry_start = entry->vme_start;
	local_entry_offset = VME_OFFSET(entry);

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
	}

	/* clamp the request so it doesn't extend past the end of this entry */
	if (entry->vme_end - original_offset < adjusted_size) {
		adjusted_size = entry->vme_end - original_offset;
		assert(adjusted_size > 0);
		*upl_size = (upl_size_t) adjusted_size;
		assert(*upl_size == adjusted_size);
	}

	if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
		/*
		 * Caller only wants to know what kind of memory backs
		 * this range; report it in *flags and return without
		 * creating a UPL.
		 */
		*flags = 0;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) != VM_OBJECT_NULL) {
			if (VME_OBJECT(entry)->private) {
				*flags = UPL_DEV_MEMORY;
			}

			if (VME_OBJECT(entry)->phys_contiguous) {
				*flags |= UPL_PHYS_CONTIG;
			}
		}
		vm_map_unlock_read(map);
		ret = KERN_SUCCESS;
		goto done;
	}

	offset_in_mapped_page = 0;
	if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
		/*
		 * Map uses sub-system-page-size pages: round the request out
		 * to system page boundaries and remember how far into the
		 * first mapped page the original request started.
		 */
		offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
		*upl_size = (upl_size_t)
		    (vm_map_round_page(original_offset + adjusted_size,
		    VM_MAP_PAGE_MASK(map))
		    - offset);

		offset_in_mapped_page = original_offset - offset;
		assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));

		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
	}

	if (!entry->is_sub_map) {
		if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
		    !VME_OBJECT(entry)->phys_contiguous) {
			if (*upl_size > MAX_UPL_SIZE_BYTES) {
				*upl_size = MAX_UPL_SIZE_BYTES;
			}
		}

		/*
		 *      Create an object if necessary.
		 */
		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			if (vm_map_lock_read_to_write(map)) {
				/* lost the lock during upgrade: look the entry up again */
				goto REDISCOVER_ENTRY;
			}

			VME_OBJECT_SET(entry,
			    vm_object_allocate((vm_size_t)
			    vm_object_round_page((entry->vme_end - entry->vme_start))),
			    false, 0);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);

			vm_map_lock_write_to_read(map);
		}

		/* writing into the UPL requires a writable mapping */
		if (!(caller_flags & UPL_COPYOUT_FROM) &&
		    !(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock_read(map);
			ret = KERN_PROTECTION_FAILURE;
			goto done;
		}
	}

#if !XNU_TARGET_OS_OSX
	if (map->pmap != kernel_pmap &&
	    (caller_flags & UPL_COPYOUT_FROM) &&
	    (entry->protection & VM_PROT_EXECUTE) &&
	    !(entry->protection & VM_PROT_WRITE)) {
		vm_offset_t     kaddr;
		vm_size_t       ksize;

		/*
		 * We're about to create a read-only UPL backed by
		 * memory from an executable mapping.
		 * Wiring the pages would result in the pages being copied
		 * (due to the "MAP_PRIVATE" mapping) and no longer
		 * code-signed, so no longer eligible for execution.
		 * Instead, let's copy the data into a kernel buffer and
		 * create the UPL from this kernel buffer.
		 * The kernel buffer is then freed, leaving the UPL holding
		 * the last reference on the VM object, so the memory will
		 * be released when the UPL is committed.
		 */

		vm_map_unlock_read(map);
		entry = VM_MAP_ENTRY_NULL;
		/* allocate kernel buffer */
		ksize = round_page(*upl_size);
		kaddr = 0;
		ret = kmem_alloc(kernel_map, &kaddr, ksize,
		    KMA_PAGEABLE | KMA_DATA, tag);
		if (ret == KERN_SUCCESS) {
			/* copyin the user data */
			ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
		}
		if (ret == KERN_SUCCESS) {
			if (ksize > *upl_size) {
				/* zero out the extra space in kernel buffer */
				memset((void *)(kaddr + *upl_size),
				    0,
				    ksize - *upl_size);
			}
			/* create the UPL from the kernel buffer */
			vm_object_offset_t      offset_in_object;
			vm_object_offset_t      offset_in_object_page;

			offset_in_object = offset - local_entry_start + local_entry_offset;
			offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
			assert(offset_in_object_page < PAGE_SIZE);
			assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
			*upl_size -= offset_in_object_page + offset_in_mapped_page;
			/* recurse on the kernel mapping of the copied data */
			ret = vm_map_create_upl(kernel_map,
			    (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
			    upl_size, upl, page_list, count, flags, tag);
		}
		if (kaddr != 0) {
			/* free the kernel buffer */
			kmem_free(kernel_map, kaddr, ksize);
			kaddr = 0;
			ksize = 0;
		}
#if DEVELOPMENT || DEBUG
		DTRACE_VM4(create_upl_from_executable,
		    vm_map_t, map,
		    vm_map_address_t, offset,
		    upl_size_t, *upl_size,
		    kern_return_t, ret);
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}
#endif /* !XNU_TARGET_OS_OSX */

	if (!entry->is_sub_map) {
		local_object = VME_OBJECT(entry);
		assert(local_object != VM_OBJECT_NULL);
	}

	if (!entry->is_sub_map &&
	    !entry->needs_copy &&
	    *upl_size != 0 &&
	    local_object->vo_size > *upl_size && /* partial UPL */
	    entry->wired_count == 0 && /* No COW for entries that are wired */
	    (map->pmap != kernel_pmap) && /* alias checks */
	    (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
	    ||
	    ( /* case 2 */
		    local_object->internal &&
		    (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
		    local_object->ref_count > 1))) {
		vm_prot_t       prot;

		/*
		 * Case 1:
		 * Set up the targeted range for copy-on-write to avoid
		 * applying true_share/copy_delay to the entire object.
		 *
		 * Case 2:
		 * This map entry covers only part of an internal
		 * object.  There could be other map entries covering
		 * other areas of this object and some of these map
		 * entries could be marked as "needs_copy", which
		 * assumes that the object is COPY_SYMMETRIC.
		 * To avoid marking this object as COPY_DELAY and
		 * "true_share", let's shadow it and mark the new
		 * (smaller) object as "true_share" and COPY_DELAY.
		 */

		if (vm_map_lock_read_to_write(map)) {
			/* lost the lock during upgrade: look the entry up again */
			goto REDISCOVER_ENTRY;
		}
		vm_map_lock_assert_exclusive(map);
		assert(VME_OBJECT(entry) == local_object);

		/* narrow the entry to just the targeted range */
		vm_map_clip_start(map,
		    entry,
		    vm_map_trunc_page(offset,
		    VM_MAP_PAGE_MASK(map)));
		vm_map_clip_end(map,
		    entry,
		    vm_map_round_page(offset + *upl_size,
		    VM_MAP_PAGE_MASK(map)));
		if ((entry->vme_end - offset) < *upl_size) {
			*upl_size = (upl_size_t) (entry->vme_end - offset);
			assert(*upl_size == entry->vme_end - offset);
		}

		prot = entry->protection & ~VM_PROT_WRITE;
		if (override_nx(map, VME_ALIAS(entry)) && prot) {
			prot |= VM_PROT_EXECUTE;
		}
		/* write-protect existing mappings so future writes fault and COW */
		vm_object_pmap_protect(local_object,
		    VME_OFFSET(entry),
		    entry->vme_end - entry->vme_start,
		    ((entry->is_shared ||
		    map->mapped_in_other_pmaps)
		    ? PMAP_NULL
		    : map->pmap),
		    VM_MAP_PAGE_SIZE(map),
		    entry->vme_start,
		    prot);

		assert(entry->wired_count == 0);

		/*
		 * Lock the VM object and re-check its status: if it's mapped
		 * in another address space, we could still be racing with
		 * another thread holding that other VM map exclusively.
		 */
		vm_object_lock(local_object);
		if (local_object->true_share) {
			/* object is already in proper state: no COW needed */
			assert(local_object->copy_strategy !=
			    MEMORY_OBJECT_COPY_SYMMETRIC);
		} else {
			/* not true_share: ask for copy-on-write below */
			assert(local_object->copy_strategy ==
			    MEMORY_OBJECT_COPY_SYMMETRIC);
			entry->needs_copy = TRUE;
		}
		vm_object_unlock(local_object);

		vm_map_lock_write_to_read(map);
	}

	if (entry->needs_copy) {
		/*
		 * Honor copy-on-write for COPY_SYMMETRIC
		 * strategy.
		 */
		vm_map_t                local_map;
		vm_object_t             object;
		vm_object_offset_t      new_offset;
		vm_prot_t               prot;
		boolean_t               wired;
		vm_map_version_t        version;
		vm_map_t                real_map;
		vm_prot_t               fault_type;

		local_map = map;

		if (caller_flags & UPL_COPYOUT_FROM) {
			fault_type = VM_PROT_READ | VM_PROT_COPY;
			vm_counters.create_upl_extra_cow++;
			vm_counters.create_upl_extra_cow_pages +=
			    (entry->vme_end - entry->vme_start) / PAGE_SIZE;
		} else {
			fault_type = VM_PROT_WRITE;
		}
		/* fault in (and possibly copy) the range to resolve the COW */
		if (vm_map_lookup_and_lock_object(&local_map,
		    offset, fault_type,
		    OBJECT_LOCK_EXCLUSIVE,
		    &version, &object,
		    &new_offset, &prot, &wired,
		    NULL,
		    &real_map, NULL) != KERN_SUCCESS) {
			if (fault_type == VM_PROT_WRITE) {
				vm_counters.create_upl_lookup_failure_write++;
			} else {
				vm_counters.create_upl_lookup_failure_copy++;
			}
			vm_map_unlock_read(local_map);
			ret = KERN_FAILURE;
			goto done;
		}
		if (real_map != local_map) {
			vm_map_unlock(real_map);
		}
		vm_map_unlock_read(local_map);

		vm_object_unlock(object);

		goto REDISCOVER_ENTRY;
	}

	if (entry->is_sub_map) {
		vm_map_t        submap;

		submap = VME_SUBMAP(entry);
		local_start = entry->vme_start;
		local_offset = (vm_map_offset_t)VME_OFFSET(entry);

		/* take a ref on the submap before dropping the parent map lock */
		vm_map_reference(submap);
		vm_map_unlock_read(map);

		DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
		offset += offset_in_mapped_page;
		*upl_size -= offset_in_mapped_page;

		if (release_map) {
			vm_map_deallocate(map);
		}
		/* restart the whole lookup in the submap's address space */
		map = submap;
		release_map = TRUE;
		offset = local_offset + (offset - local_start);
		goto start_with_map;
	}

	if (sync_cow_data &&
	    (VME_OBJECT(entry)->shadow ||
	    VME_OBJECT(entry)->copy)) {
		local_object = VME_OBJECT(entry);
		local_start = entry->vme_start;
		local_offset = (vm_map_offset_t)VME_OFFSET(entry);

		vm_object_reference(local_object);
		vm_map_unlock_read(map);

		/* push any modified data in the shadow chain back before reading */
		if (local_object->shadow && local_object->copy) {
			vm_object_lock_request(local_object->shadow,
			    ((vm_object_offset_t)
			    ((offset - local_start) +
			    local_offset) +
			    local_object->vo_shadow_offset),
			    *upl_size, FALSE,
			    MEMORY_OBJECT_DATA_SYNC,
			    VM_PROT_NO_CHANGE);
		}
		sync_cow_data = FALSE;
		vm_object_deallocate(local_object);

		goto REDISCOVER_ENTRY;
	}
	if (force_data_sync) {
		local_object = VME_OBJECT(entry);
		local_start = entry->vme_start;
		local_offset = (vm_map_offset_t)VME_OFFSET(entry);

		vm_object_reference(local_object);
		vm_map_unlock_read(map);

		/* UPL_FORCE_DATA_SYNC: flush the object's data before building the UPL */
		vm_object_lock_request(local_object,
		    ((vm_object_offset_t)
		    ((offset - local_start) +
		    local_offset)),
		    (vm_object_size_t)*upl_size,
		    FALSE,
		    MEMORY_OBJECT_DATA_SYNC,
		    VM_PROT_NO_CHANGE);

		force_data_sync = FALSE;
		vm_object_deallocate(local_object);

		goto REDISCOVER_ENTRY;
	}
	/* report the nature of the backing object to the caller */
	if (VME_OBJECT(entry)->private) {
		*flags = UPL_DEV_MEMORY;
	} else {
		*flags = 0;
	}

	if (VME_OBJECT(entry)->phys_contiguous) {
		*flags |= UPL_PHYS_CONTIG;
	}

	local_object = VME_OBJECT(entry);
	local_offset = (vm_map_offset_t)VME_OFFSET(entry);
	local_start = entry->vme_start;

	/*
	 * Wiring will copy the pages to the shadow object.
	 * The shadow object will not be code-signed so
	 * attempting to execute code from these copied pages
	 * would trigger a code-signing violation.
	 */
	if (entry->protection & VM_PROT_EXECUTE) {
#if MACH_ASSERT
		printf("pid %d[%s] create_upl out of executable range from "
		    "0x%llx to 0x%llx: side effects may include "
		    "code-signing violations later on\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    (uint64_t) entry->vme_start,
		    (uint64_t) entry->vme_end);
#endif /* MACH_ASSERT */
		DTRACE_VM2(cs_executable_create_upl,
		    uint64_t, (uint64_t)entry->vme_start,
		    uint64_t, (uint64_t)entry->vme_end);
		cs_executable_create_upl++;
	}

	vm_object_lock(local_object);

	/*
	 * Ensure that this object is "true_share" and "copy_delay" now,
	 * while we're still holding the VM map lock.  After we unlock the map,
	 * anything could happen to that mapping, including some copy-on-write
	 * activity.  We need to make sure that the IOPL will point at the
	 * same memory as the mapping.
	 */
	if (local_object->true_share) {
		assert(local_object->copy_strategy !=
		    MEMORY_OBJECT_COPY_SYMMETRIC);
	} else if (local_object != kernel_object &&
	    local_object != compressor_object &&
	    !local_object->phys_contiguous) {
#if VM_OBJECT_TRACKING_OP_TRUESHARE
		if (!local_object->true_share &&
		    vm_object_tracking_btlog) {
			btlog_record(vm_object_tracking_btlog, local_object,
			    VM_OBJECT_TRACKING_OP_TRUESHARE,
			    btref_get(__builtin_frame_address(0), 0));
		}
#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
		local_object->true_share = TRUE;
		if (local_object->copy_strategy ==
		    MEMORY_OBJECT_COPY_SYMMETRIC) {
			local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
	}

	/* keep the object alive across the map unlock and iopl request */
	vm_object_reference_locked(local_object);
	vm_object_unlock(local_object);

	vm_map_unlock_read(map);

	offset += offset_in_mapped_page;
	assert(*upl_size > offset_in_mapped_page);
	*upl_size -= offset_in_mapped_page;

	ret = vm_object_iopl_request(local_object,
	    ((vm_object_offset_t)
	    ((offset - local_start) + local_offset)),
	    *upl_size,
	    upl,
	    page_list,
	    count,
	    caller_flags,
	    tag);
	vm_object_deallocate(local_object);

done:
	/* drop the submap reference taken when we descended into it */
	if (release_map) {
		vm_map_deallocate(map);
	}

	return ret;
}
7084 
7085 /*
7086  * Internal routine to enter a UPL into a VM map.
7087  *
7088  * JMM - This should just be doable through the standard
7089  * vm_map_enter() API.
7090  */
7091 kern_return_t
vm_map_enter_upl_range(vm_map_t map,upl_t upl,vm_object_offset_t offset_to_map,upl_size_t size_to_map,vm_prot_t prot_to_map,vm_map_offset_t * dst_addr)7092 vm_map_enter_upl_range(
7093 	vm_map_t                map,
7094 	upl_t                   upl,
7095 	vm_object_offset_t      offset_to_map,
7096 	upl_size_t              size_to_map,
7097 	vm_prot_t               prot_to_map,
7098 	vm_map_offset_t         *dst_addr)
7099 {
7100 	vm_map_size_t           size;
7101 	vm_object_offset_t      offset;
7102 	vm_map_offset_t         addr;
7103 	vm_page_t               m;
7104 	kern_return_t           kr;
7105 	int                     isVectorUPL = 0, curr_upl = 0;
7106 	upl_t                   vector_upl = NULL;
7107 	mach_vm_offset_t        vector_upl_dst_addr = 0;
7108 	vm_map_t                vector_upl_submap = NULL;
7109 	upl_offset_t            subupl_offset = 0;
7110 	upl_size_t              subupl_size = 0;
7111 
7112 	if (upl == UPL_NULL) {
7113 		return KERN_INVALID_ARGUMENT;
7114 	}
7115 
7116 	DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7117 	assert(map == kernel_map);
7118 
7119 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7120 		int mapped = 0, valid_upls = 0;
7121 		vector_upl = upl;
7122 
7123 		upl_lock(vector_upl);
7124 		for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
7125 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7126 			if (upl == NULL) {
7127 				continue;
7128 			}
7129 			valid_upls++;
7130 			if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7131 				mapped++;
7132 			}
7133 		}
7134 
7135 		if (mapped) {
7136 			if (mapped != valid_upls) {
7137 				panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7138 			} else {
7139 				upl_unlock(vector_upl);
7140 				return KERN_FAILURE;
7141 			}
7142 		}
7143 
7144 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7145 			panic("TODO4K: vector UPL not implemented");
7146 		}
7147 
7148 		vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7149 		    vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7150 		    VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7151 		    VM_KERN_MEMORY_NONE).kmr_submap;
7152 		map = vector_upl_submap;
7153 		vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7154 		curr_upl = 0;
7155 	} else {
7156 		upl_lock(upl);
7157 	}
7158 
7159 process_upl_to_enter:
7160 	if (isVectorUPL) {
7161 		if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
7162 			*dst_addr = vector_upl_dst_addr;
7163 			upl_unlock(vector_upl);
7164 			return KERN_SUCCESS;
7165 		}
7166 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7167 		if (upl == NULL) {
7168 			goto process_upl_to_enter;
7169 		}
7170 
7171 		vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7172 		*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7173 	} else {
7174 		/*
7175 		 * check to see if already mapped
7176 		 */
7177 		if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7178 			upl_unlock(upl);
7179 			return KERN_FAILURE;
7180 		}
7181 	}
7182 
7183 	if ((!(upl->flags & UPL_SHADOWED)) &&
7184 	    ((upl->flags & UPL_HAS_BUSY) ||
7185 	    !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7186 		vm_object_t             object;
7187 		vm_page_t               alias_page;
7188 		vm_object_offset_t      new_offset;
7189 		unsigned int            pg_num;
7190 		wpl_array_t             lite_list;
7191 
7192 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7193 		if (upl->flags & UPL_INTERNAL) {
7194 			lite_list = (wpl_array_t)
7195 			    ((((uintptr_t)upl) + sizeof(struct upl))
7196 			    + ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
7197 		} else {
7198 			lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
7199 		}
7200 		object = upl->map_object;
7201 		upl->map_object = vm_object_allocate(vm_object_round_page(size));
7202 
7203 		vm_object_lock(upl->map_object);
7204 
7205 		upl->map_object->shadow = object;
7206 		upl->map_object->pageout = TRUE;
7207 		upl->map_object->can_persist = FALSE;
7208 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7209 		upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7210 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
7211 		    "object %p shadow_offset 0x%llx",
7212 		    upl->map_object,
7213 		    (uint64_t)upl->map_object->vo_shadow_offset);
7214 		upl->map_object->wimg_bits = object->wimg_bits;
7215 		offset = upl->map_object->vo_shadow_offset;
7216 		new_offset = 0;
7217 
7218 		upl->flags |= UPL_SHADOWED;
7219 
7220 		while (size) {
7221 			pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7222 			assert(pg_num == new_offset / PAGE_SIZE);
7223 
7224 			if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
7225 				alias_page = vm_page_grab_fictitious(TRUE);
7226 
7227 				vm_object_lock(object);
7228 
7229 				m = vm_page_lookup(object, offset);
7230 				if (m == VM_PAGE_NULL) {
7231 					panic("vm_upl_map: page missing");
7232 				}
7233 
7234 				/*
7235 				 * Convert the fictitious page to a private
7236 				 * shadow of the real page.
7237 				 */
7238 				assert(alias_page->vmp_fictitious);
7239 				alias_page->vmp_fictitious = FALSE;
7240 				alias_page->vmp_private = TRUE;
7241 				alias_page->vmp_free_when_done = TRUE;
7242 				/*
7243 				 * since m is a page in the upl it must
7244 				 * already be wired or BUSY, so it's
7245 				 * safe to assign the underlying physical
7246 				 * page to the alias
7247 				 */
7248 				VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7249 
7250 				vm_object_unlock(object);
7251 
7252 				vm_page_lockspin_queues();
7253 				vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7254 				vm_page_unlock_queues();
7255 
7256 				vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7257 
7258 				assert(!alias_page->vmp_wanted);
7259 				alias_page->vmp_busy = FALSE;
7260 				alias_page->vmp_absent = FALSE;
7261 			}
7262 			size -= PAGE_SIZE;
7263 			offset += PAGE_SIZE_64;
7264 			new_offset += PAGE_SIZE_64;
7265 		}
7266 		vm_object_unlock(upl->map_object);
7267 	}
7268 	if (upl->flags & UPL_SHADOWED) {
7269 		if (isVectorUPL) {
7270 			offset = 0;
7271 		} else {
7272 			offset = offset_to_map;
7273 		}
7274 	} else {
7275 		offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7276 		if (!isVectorUPL) {
7277 			offset += offset_to_map;
7278 		}
7279 	}
7280 
7281 	if (isVectorUPL) {
7282 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7283 	} else {
7284 		size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7285 	}
7286 
7287 	vm_object_reference(upl->map_object);
7288 
7289 	if (!isVectorUPL) {
7290 		*dst_addr = 0;
7291 		/*
7292 		 * NEED A UPL_MAP ALIAS
7293 		 */
7294 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7295 		    VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_DATA, VM_KERN_MEMORY_OSFMK,
7296 		    upl->map_object, offset, FALSE,
7297 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7298 
7299 		if (kr != KERN_SUCCESS) {
7300 			vm_object_deallocate(upl->map_object);
7301 			upl_unlock(upl);
7302 			return kr;
7303 		}
7304 	} else {
7305 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7306 		    VM_FLAGS_FIXED, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_OSFMK,
7307 		    upl->map_object, offset, FALSE,
7308 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7309 		if (kr) {
7310 			panic("vm_map_enter failed for a Vector UPL");
7311 		}
7312 	}
7313 	upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7314 	                                        /* this will have to be an increment rather than */
7315 	                                        /* an assignment. */
7316 	vm_object_lock(upl->map_object);
7317 
7318 	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7319 		m = vm_page_lookup(upl->map_object, offset);
7320 
7321 		if (m) {
7322 			m->vmp_pmapped = TRUE;
7323 
7324 			/* CODE SIGNING ENFORCEMENT: page has been wpmapped,
7325 			 * but only in kernel space. If this was on a user map,
7326 			 * we'd have to set the wpmapped bit. */
7327 			/* m->vmp_wpmapped = TRUE; */
7328 			assert(map->pmap == kernel_pmap);
7329 
7330 			PMAP_ENTER(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE, kr);
7331 
7332 			assert(kr == KERN_SUCCESS);
7333 #if KASAN
7334 			kasan_notify_address(addr, PAGE_SIZE_64);
7335 #endif
7336 		}
7337 		offset += PAGE_SIZE_64;
7338 	}
7339 	vm_object_unlock(upl->map_object);
7340 
7341 	/*
7342 	 * hold a reference for the mapping
7343 	 */
7344 	upl->ref_count++;
7345 	upl->flags |= UPL_PAGE_LIST_MAPPED;
7346 	upl->kaddr = (vm_offset_t) *dst_addr;
7347 	assert(upl->kaddr == *dst_addr);
7348 
7349 	if (isVectorUPL) {
7350 		goto process_upl_to_enter;
7351 	}
7352 
7353 	if (!isVectorUPL) {
7354 		vm_map_offset_t addr_adjustment;
7355 
7356 		addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7357 		if (addr_adjustment) {
7358 			assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7359 			DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7360 			*dst_addr += addr_adjustment;
7361 		}
7362 	}
7363 
7364 	upl_unlock(upl);
7365 
7366 	return KERN_SUCCESS;
7367 }
7368 
7369 kern_return_t
vm_map_enter_upl(vm_map_t map,upl_t upl,vm_map_offset_t * dst_addr)7370 vm_map_enter_upl(
7371 	vm_map_t                map,
7372 	upl_t                   upl,
7373 	vm_map_offset_t         *dst_addr)
7374 {
7375 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7376 	return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7377 }
7378 
7379 /*
7380  * Internal routine to remove a UPL mapping from a VM map.
7381  *
7382  * XXX - This should just be doable through a standard
7383  * vm_map_remove() operation.  Otherwise, implicit clean-up
7384  * of the target map won't be able to correctly remove
7385  * these (and release the reference on the UPL).  Having
7386  * to do this means we can't map these into user-space
7387  * maps yet.
7388  */
kern_return_t
vm_map_remove_upl_range(
	vm_map_t        map,
	upl_t           upl,
	__unused vm_object_offset_t    offset_to_unmap,
	__unused upl_size_t      size_to_unmap)
{
	/*
	 * Undo a mapping established by vm_map_enter_upl_range():
	 * remove the UPL's pages from "map" and drop the mapping
	 * reference taken when the UPL was mapped.
	 *
	 * offset_to_unmap / size_to_unmap are currently unused: the
	 * entire mapped range (upl->kaddr / upl->u_mapped_size) is
	 * always removed.
	 *
	 * Returns KERN_INVALID_ARGUMENT for a NULL UPL, KERN_FAILURE
	 * if the UPL (or every sub-UPL of a vector UPL) is not mapped,
	 * KERN_SUCCESS otherwise.
	 */
	vm_address_t    addr;
	upl_size_t      size;
	int             isVectorUPL = 0, curr_upl = 0;
	upl_t           vector_upl = NULL;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		int     unmapped = 0, valid_upls = 0;
		vector_upl = upl;
		upl_lock(vector_upl);
		/*
		 * First pass over the sub-UPLs: count how many exist and
		 * how many of those are not currently mapped.  A vector
		 * UPL must be unmapped as a unit, so a partial mapping
		 * state is a fatal inconsistency.
		 */
		for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
			if (upl == NULL) {
				continue;
			}
			valid_upls++;
			if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
				unmapped++;
			}
		}

		if (unmapped) {
			if (unmapped != valid_upls) {
				/* some mapped, some not: inconsistent vector UPL */
				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
			} else {
				/* none mapped: nothing to remove */
				upl_unlock(vector_upl);
				return KERN_FAILURE;
			}
		}
		curr_upl = 0;
	} else {
		upl_lock(upl);
	}

process_upl_to_remove:
	if (isVectorUPL) {
		if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
			/*
			 * All sub-UPLs have been processed; now tear down
			 * the submap that held the vector UPL's mappings
			 * and release its reference.
			 */
			vm_map_t v_upl_submap;
			vm_offset_t v_upl_submap_dst_addr;
			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);

			kmem_free_guard(map, v_upl_submap_dst_addr,
			    vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
			vm_map_deallocate(v_upl_submap);
			upl_unlock(vector_upl);
			return KERN_SUCCESS;
		}

		/* advance to the next sub-UPL (slots may be sparse) */
		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
		if (upl == NULL) {
			goto process_upl_to_remove;
		}
	}

	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
		addr = upl->kaddr;
		size = upl->u_mapped_size;

		assert(upl->ref_count > 1);
		upl->ref_count--;               /* removing mapping ref */

		/* clear the mapping state before dropping the UPL lock */
		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
		upl->kaddr = (vm_offset_t) 0;
		upl->u_mapped_size = 0;

		if (isVectorUPL) {
			/*
			 * If it's a Vectored UPL, we'll be removing the entire
			 * submap anyways, so no need to remove individual UPL
			 * element mappings from within the submap
			 */
			goto process_upl_to_remove;
		}

		/* drop the UPL lock before calling into the VM map layer */
		upl_unlock(upl);

		vm_map_remove(map,
		    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
		    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
		return KERN_SUCCESS;
	}
	/* UPL was not mapped */
	upl_unlock(upl);

	return KERN_FAILURE;
}
7484 
7485 kern_return_t
vm_map_remove_upl(vm_map_t map,upl_t upl)7486 vm_map_remove_upl(
7487 	vm_map_t        map,
7488 	upl_t           upl)
7489 {
7490 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7491 	return vm_map_remove_upl_range(map, upl, 0, upl_size);
7492 }
7493 
7494 kern_return_t
upl_commit_range(upl_t upl,upl_offset_t offset,upl_size_t size,int flags,upl_page_info_t * page_list,mach_msg_type_number_t count,boolean_t * empty)7495 upl_commit_range(
7496 	upl_t                   upl,
7497 	upl_offset_t            offset,
7498 	upl_size_t              size,
7499 	int                     flags,
7500 	upl_page_info_t         *page_list,
7501 	mach_msg_type_number_t  count,
7502 	boolean_t               *empty)
7503 {
7504 	upl_size_t              xfer_size, subupl_size;
7505 	vm_object_t             shadow_object;
7506 	vm_object_t             object;
7507 	vm_object_t             m_object;
7508 	vm_object_offset_t      target_offset;
7509 	upl_offset_t            subupl_offset = offset;
7510 	int                     entry;
7511 	wpl_array_t             lite_list;
7512 	int                     occupied;
7513 	int                     clear_refmod = 0;
7514 	int                     pgpgout_count = 0;
7515 	struct  vm_page_delayed_work    dw_array;
7516 	struct  vm_page_delayed_work    *dwp, *dwp_start;
7517 	bool                    dwp_finish_ctx = TRUE;
7518 	int                     dw_count;
7519 	int                     dw_limit;
7520 	int                     isVectorUPL = 0;
7521 	upl_t                   vector_upl = NULL;
7522 	boolean_t               should_be_throttled = FALSE;
7523 
7524 	vm_page_t               nxt_page = VM_PAGE_NULL;
7525 	int                     fast_path_possible = 0;
7526 	int                     fast_path_full_commit = 0;
7527 	int                     throttle_page = 0;
7528 	int                     unwired_count = 0;
7529 	int                     local_queue_count = 0;
7530 	vm_page_t               first_local, last_local;
7531 	vm_object_offset_t      obj_start, obj_end, obj_offset;
7532 	kern_return_t           kr = KERN_SUCCESS;
7533 
7534 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7535 
7536 	dwp_start = dwp = NULL;
7537 
7538 	subupl_size = size;
7539 	*empty = FALSE;
7540 
7541 	if (upl == UPL_NULL) {
7542 		return KERN_INVALID_ARGUMENT;
7543 	}
7544 
7545 	dw_count = 0;
7546 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7547 	dwp_start = vm_page_delayed_work_get_ctx();
7548 	if (dwp_start == NULL) {
7549 		dwp_start = &dw_array;
7550 		dw_limit = 1;
7551 		dwp_finish_ctx = FALSE;
7552 	}
7553 
7554 	dwp = dwp_start;
7555 
7556 	if (count == 0) {
7557 		page_list = NULL;
7558 	}
7559 
7560 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7561 		vector_upl = upl;
7562 		upl_lock(vector_upl);
7563 	} else {
7564 		upl_lock(upl);
7565 	}
7566 
7567 process_upl_to_commit:
7568 
7569 	if (isVectorUPL) {
7570 		size = subupl_size;
7571 		offset = subupl_offset;
7572 		if (size == 0) {
7573 			upl_unlock(vector_upl);
7574 			kr = KERN_SUCCESS;
7575 			goto done;
7576 		}
7577 		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7578 		if (upl == NULL) {
7579 			upl_unlock(vector_upl);
7580 			kr = KERN_FAILURE;
7581 			goto done;
7582 		}
7583 		page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
7584 		subupl_size -= size;
7585 		subupl_offset += size;
7586 	}
7587 
7588 #if UPL_DEBUG
7589 	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7590 		(void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7591 
7592 		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7593 		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7594 
7595 		upl->upl_commit_index++;
7596 	}
7597 #endif
7598 	if (upl->flags & UPL_DEVICE_MEMORY) {
7599 		xfer_size = 0;
7600 	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7601 		xfer_size = size;
7602 	} else {
7603 		if (!isVectorUPL) {
7604 			upl_unlock(upl);
7605 		} else {
7606 			upl_unlock(vector_upl);
7607 		}
7608 		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7609 		kr = KERN_FAILURE;
7610 		goto done;
7611 	}
7612 	if (upl->flags & UPL_SET_DIRTY) {
7613 		flags |= UPL_COMMIT_SET_DIRTY;
7614 	}
7615 	if (upl->flags & UPL_CLEAR_DIRTY) {
7616 		flags |= UPL_COMMIT_CLEAR_DIRTY;
7617 	}
7618 
7619 	if (upl->flags & UPL_INTERNAL) {
7620 		lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
7621 		    + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));
7622 	} else {
7623 		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
7624 	}
7625 
7626 	object = upl->map_object;
7627 
7628 	if (upl->flags & UPL_SHADOWED) {
7629 		vm_object_lock(object);
7630 		shadow_object = object->shadow;
7631 	} else {
7632 		shadow_object = object;
7633 	}
7634 	entry = offset / PAGE_SIZE;
7635 	target_offset = (vm_object_offset_t)offset;
7636 
7637 	if (upl->flags & UPL_KERNEL_OBJECT) {
7638 		vm_object_lock_shared(shadow_object);
7639 	} else {
7640 		vm_object_lock(shadow_object);
7641 	}
7642 
7643 	VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7644 
7645 	if (upl->flags & UPL_ACCESS_BLOCKED) {
7646 		assert(shadow_object->blocked_access);
7647 		shadow_object->blocked_access = FALSE;
7648 		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7649 	}
7650 
7651 	if (shadow_object->code_signed) {
7652 		/*
7653 		 * CODE SIGNING:
7654 		 * If the object is code-signed, do not let this UPL tell
7655 		 * us if the pages are valid or not.  Let the pages be
7656 		 * validated by VM the normal way (when they get mapped or
7657 		 * copied).
7658 		 */
7659 		flags &= ~UPL_COMMIT_CS_VALIDATED;
7660 	}
7661 	if (!page_list) {
7662 		/*
7663 		 * No page list to get the code-signing info from !?
7664 		 */
7665 		flags &= ~UPL_COMMIT_CS_VALIDATED;
7666 	}
7667 	if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7668 		should_be_throttled = TRUE;
7669 	}
7670 
7671 	if ((upl->flags & UPL_IO_WIRE) &&
7672 	    !(flags & UPL_COMMIT_FREE_ABSENT) &&
7673 	    !isVectorUPL &&
7674 	    shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7675 	    shadow_object->purgable != VM_PURGABLE_EMPTY) {
7676 		if (!vm_page_queue_empty(&shadow_object->memq)) {
7677 			if (size == shadow_object->vo_size) {
7678 				nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7679 				fast_path_full_commit = 1;
7680 			}
7681 			fast_path_possible = 1;
7682 
7683 			if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7684 			    (shadow_object->purgable == VM_PURGABLE_DENY ||
7685 			    shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7686 			    shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7687 				throttle_page = 1;
7688 			}
7689 		}
7690 	}
7691 	first_local = VM_PAGE_NULL;
7692 	last_local = VM_PAGE_NULL;
7693 
7694 	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7695 	obj_end = obj_start + xfer_size;
7696 	obj_start = vm_object_trunc_page(obj_start);
7697 	obj_end = vm_object_round_page(obj_end);
7698 	for (obj_offset = obj_start;
7699 	    obj_offset < obj_end;
7700 	    obj_offset += PAGE_SIZE) {
7701 		vm_page_t       t, m;
7702 
7703 		dwp->dw_mask = 0;
7704 		clear_refmod = 0;
7705 
7706 		m = VM_PAGE_NULL;
7707 
7708 		if (upl->flags & UPL_LITE) {
7709 			unsigned int    pg_num;
7710 
7711 			if (nxt_page != VM_PAGE_NULL) {
7712 				m = nxt_page;
7713 				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7714 				target_offset = m->vmp_offset;
7715 			}
7716 			pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7717 			assert(pg_num == target_offset / PAGE_SIZE);
7718 
7719 			if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
7720 				lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
7721 
7722 				if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7723 					m = vm_page_lookup(shadow_object, obj_offset);
7724 				}
7725 			} else {
7726 				m = NULL;
7727 			}
7728 		}
7729 		if (upl->flags & UPL_SHADOWED) {
7730 			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7731 				t->vmp_free_when_done = FALSE;
7732 
7733 				VM_PAGE_FREE(t);
7734 
7735 				if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7736 					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7737 				}
7738 			}
7739 		}
7740 		if (m == VM_PAGE_NULL) {
7741 			goto commit_next_page;
7742 		}
7743 
7744 		m_object = VM_PAGE_OBJECT(m);
7745 
7746 		if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7747 			assert(m->vmp_busy);
7748 
7749 			dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7750 			goto commit_next_page;
7751 		}
7752 
7753 		if (flags & UPL_COMMIT_CS_VALIDATED) {
7754 			/*
7755 			 * CODE SIGNING:
7756 			 * Set the code signing bits according to
7757 			 * what the UPL says they should be.
7758 			 */
7759 			m->vmp_cs_validated |= page_list[entry].cs_validated;
7760 			m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7761 			m->vmp_cs_nx |= page_list[entry].cs_nx;
7762 		}
7763 		if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7764 			m->vmp_written_by_kernel = TRUE;
7765 		}
7766 
7767 		if (upl->flags & UPL_IO_WIRE) {
7768 			if (page_list) {
7769 				page_list[entry].phys_addr = 0;
7770 			}
7771 
7772 			if (flags & UPL_COMMIT_SET_DIRTY) {
7773 				SET_PAGE_DIRTY(m, FALSE);
7774 			} else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7775 				m->vmp_dirty = FALSE;
7776 
7777 				if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7778 				    m->vmp_cs_validated &&
7779 				    m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7780 					/*
7781 					 * CODE SIGNING:
7782 					 * This page is no longer dirty
7783 					 * but could have been modified,
7784 					 * so it will need to be
7785 					 * re-validated.
7786 					 */
7787 					m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7788 
7789 					VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7790 
7791 					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7792 				}
7793 				clear_refmod |= VM_MEM_MODIFIED;
7794 			}
7795 			if (upl->flags & UPL_ACCESS_BLOCKED) {
7796 				/*
7797 				 * We blocked access to the pages in this UPL.
7798 				 * Clear the "busy" bit and wake up any waiter
7799 				 * for this page.
7800 				 */
7801 				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7802 			}
7803 			if (fast_path_possible) {
7804 				assert(m_object->purgable != VM_PURGABLE_EMPTY);
7805 				assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7806 				if (m->vmp_absent) {
7807 					assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7808 					assert(m->vmp_wire_count == 0);
7809 					assert(m->vmp_busy);
7810 
7811 					m->vmp_absent = FALSE;
7812 					dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7813 				} else {
7814 					if (m->vmp_wire_count == 0) {
7815 						panic("wire_count == 0, m = %p, obj = %p", m, shadow_object);
7816 					}
7817 					assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7818 
7819 					/*
7820 					 * XXX FBDP need to update some other
7821 					 * counters here (purgeable_wired_count)
7822 					 * (ledgers), ...
7823 					 */
7824 					assert(m->vmp_wire_count > 0);
7825 					m->vmp_wire_count--;
7826 
7827 					if (m->vmp_wire_count == 0) {
7828 						m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7829 						unwired_count++;
7830 					}
7831 				}
7832 				if (m->vmp_wire_count == 0) {
7833 					assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7834 
7835 					if (last_local == VM_PAGE_NULL) {
7836 						assert(first_local == VM_PAGE_NULL);
7837 
7838 						last_local = m;
7839 						first_local = m;
7840 					} else {
7841 						assert(first_local != VM_PAGE_NULL);
7842 
7843 						m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7844 						first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7845 						first_local = m;
7846 					}
7847 					local_queue_count++;
7848 
7849 					if (throttle_page) {
7850 						m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7851 					} else {
7852 						if (flags & UPL_COMMIT_INACTIVATE) {
7853 							if (shadow_object->internal) {
7854 								m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7855 							} else {
7856 								m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7857 							}
7858 						} else {
7859 							m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7860 						}
7861 					}
7862 				}
7863 			} else {
7864 				if (flags & UPL_COMMIT_INACTIVATE) {
7865 					dwp->dw_mask |= DW_vm_page_deactivate_internal;
7866 					clear_refmod |= VM_MEM_REFERENCED;
7867 				}
7868 				if (m->vmp_absent) {
7869 					if (flags & UPL_COMMIT_FREE_ABSENT) {
7870 						dwp->dw_mask |= DW_vm_page_free;
7871 					} else {
7872 						m->vmp_absent = FALSE;
7873 						dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7874 
7875 						if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7876 							dwp->dw_mask |= DW_vm_page_activate;
7877 						}
7878 					}
7879 				} else {
7880 					dwp->dw_mask |= DW_vm_page_unwire;
7881 				}
7882 			}
7883 			goto commit_next_page;
7884 		}
7885 		assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7886 
7887 		if (page_list) {
7888 			page_list[entry].phys_addr = 0;
7889 		}
7890 
7891 		/*
7892 		 * make sure to clear the hardware
7893 		 * modify or reference bits before
7894 		 * releasing the BUSY bit on this page
7895 		 * otherwise we risk losing a legitimate
7896 		 * change of state
7897 		 */
7898 		if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7899 			m->vmp_dirty = FALSE;
7900 
7901 			clear_refmod |= VM_MEM_MODIFIED;
7902 		}
7903 		if (m->vmp_laundry) {
7904 			dwp->dw_mask |= DW_vm_pageout_throttle_up;
7905 		}
7906 
7907 		if (VM_PAGE_WIRED(m)) {
7908 			m->vmp_free_when_done = FALSE;
7909 		}
7910 
7911 		if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7912 		    m->vmp_cs_validated &&
7913 		    m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7914 			/*
7915 			 * CODE SIGNING:
7916 			 * This page is no longer dirty
7917 			 * but could have been modified,
7918 			 * so it will need to be
7919 			 * re-validated.
7920 			 */
7921 			m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7922 
7923 			VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7924 
7925 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7926 		}
7927 		if (m->vmp_overwriting) {
7928 			/*
7929 			 * the (COPY_OUT_FROM == FALSE) request_page_list case
7930 			 */
7931 			if (m->vmp_busy) {
7932 #if CONFIG_PHANTOM_CACHE
7933 				if (m->vmp_absent && !m_object->internal) {
7934 					dwp->dw_mask |= DW_vm_phantom_cache_update;
7935 				}
7936 #endif
7937 				m->vmp_absent = FALSE;
7938 
7939 				dwp->dw_mask |= DW_clear_busy;
7940 			} else {
7941 				/*
7942 				 * alternate (COPY_OUT_FROM == FALSE) page_list case
7943 				 * Occurs when the original page was wired
7944 				 * at the time of the list request
7945 				 */
7946 				assert(VM_PAGE_WIRED(m));
7947 
7948 				dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
7949 			}
7950 			m->vmp_overwriting = FALSE;
7951 		}
7952 		m->vmp_cleaning = FALSE;
7953 
7954 		if (m->vmp_free_when_done) {
7955 			/*
7956 			 * With the clean queue enabled, UPL_PAGEOUT should
7957 			 * no longer set the pageout bit. Its pages now go
7958 			 * to the clean queue.
7959 			 *
7960 			 * We don't use the cleaned Q anymore and so this
7961 			 * assert isn't correct. The code for the clean Q
7962 			 * still exists and might be used in the future. If we
7963 			 * go back to the cleaned Q, we will re-enable this
7964 			 * assert.
7965 			 *
7966 			 * assert(!(upl->flags & UPL_PAGEOUT));
7967 			 */
7968 			assert(!m_object->internal);
7969 
7970 			m->vmp_free_when_done = FALSE;
7971 
7972 			if ((flags & UPL_COMMIT_SET_DIRTY) ||
7973 			    (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
7974 				/*
7975 				 * page was re-dirtied after we started
7976 				 * the pageout... reactivate it since
7977 				 * we don't know whether the on-disk
7978 				 * copy matches what is now in memory
7979 				 */
7980 				SET_PAGE_DIRTY(m, FALSE);
7981 
7982 				dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
7983 
7984 				if (upl->flags & UPL_PAGEOUT) {
7985 					counter_inc(&vm_statistics_reactivations);
7986 					DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
7987 				}
7988 			} else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) {
7989 				/*
7990 				 * Someone else might still be handling this
7991 				 * page (vm_fault() for example), so let's not
7992 				 * free it or "un-busy" it!
7993 				 * Put that page in the "speculative" queue
7994 				 * for now (since we would otherwise have freed
7995 				 * it) and let whoever is keeping the page
7996 				 * "busy" move it if needed when they're done
7997 				 * with it.
7998 				 */
7999 				dwp->dw_mask |= DW_vm_page_speculate;
8000 			} else {
8001 				/*
8002 				 * page has been successfully cleaned
8003 				 * go ahead and free it for other use
8004 				 */
8005 				if (m_object->internal) {
8006 					DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
8007 				} else {
8008 					DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
8009 				}
8010 				m->vmp_dirty = FALSE;
8011 				if (!(upl->flags & UPL_HAS_BUSY)) {
8012 					assert(!m->vmp_busy);
8013 				}
8014 				m->vmp_busy = TRUE;
8015 
8016 				dwp->dw_mask |= DW_vm_page_free;
8017 			}
8018 			goto commit_next_page;
8019 		}
8020 		/*
8021 		 * It is a part of the semantic of COPYOUT_FROM
8022 		 * UPLs that a commit implies cache sync
8023 		 * between the vm page and the backing store
8024 		 * this can be used to strip the precious bit
8025 		 * as well as clean
8026 		 */
8027 		if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
8028 			m->vmp_precious = FALSE;
8029 		}
8030 
8031 		if (flags & UPL_COMMIT_SET_DIRTY) {
8032 			SET_PAGE_DIRTY(m, FALSE);
8033 		} else {
8034 			m->vmp_dirty = FALSE;
8035 		}
8036 
8037 		/* with the clean queue on, move *all* cleaned pages to the clean queue */
8038 		if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
8039 			pgpgout_count++;
8040 
8041 			counter_inc(&vm_statistics_pageouts);
8042 			DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
8043 
8044 			dwp->dw_mask |= DW_enqueue_cleaned;
8045 		} else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
8046 			/*
8047 			 * page coming back in from being 'frozen'...
8048 			 * it was dirty before it was frozen, so keep it so
8049 			 * the vm_page_activate will notice that it really belongs
8050 			 * on the throttle queue and put it there
8051 			 */
8052 			SET_PAGE_DIRTY(m, FALSE);
8053 			dwp->dw_mask |= DW_vm_page_activate;
8054 		} else {
8055 			if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
8056 				dwp->dw_mask |= DW_vm_page_deactivate_internal;
8057 				clear_refmod |= VM_MEM_REFERENCED;
8058 			} else if (!VM_PAGE_PAGEABLE(m)) {
8059 				if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
8060 					dwp->dw_mask |= DW_vm_page_speculate;
8061 				} else if (m->vmp_reference) {
8062 					dwp->dw_mask |= DW_vm_page_activate;
8063 				} else {
8064 					dwp->dw_mask |= DW_vm_page_deactivate_internal;
8065 					clear_refmod |= VM_MEM_REFERENCED;
8066 				}
8067 			}
8068 		}
8069 		if (upl->flags & UPL_ACCESS_BLOCKED) {
8070 			/*
8071 			 * We blocked access to the pages in this URL.
8072 			 * Clear the "busy" bit on this page before we
8073 			 * wake up any waiter.
8074 			 */
8075 			dwp->dw_mask |= DW_clear_busy;
8076 		}
8077 		/*
8078 		 * Wakeup any thread waiting for the page to be un-cleaning.
8079 		 */
8080 		dwp->dw_mask |= DW_PAGE_WAKEUP;
8081 
8082 commit_next_page:
8083 		if (clear_refmod) {
8084 			pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
8085 		}
8086 
8087 		target_offset += PAGE_SIZE_64;
8088 		xfer_size -= PAGE_SIZE;
8089 		entry++;
8090 
8091 		if (dwp->dw_mask) {
8092 			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8093 				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8094 
8095 				if (dw_count >= dw_limit) {
8096 					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8097 
8098 					dwp = dwp_start;
8099 					dw_count = 0;
8100 				}
8101 			} else {
8102 				if (dwp->dw_mask & DW_clear_busy) {
8103 					m->vmp_busy = FALSE;
8104 				}
8105 
8106 				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8107 					PAGE_WAKEUP(m);
8108 				}
8109 			}
8110 		}
8111 	}
8112 	if (dw_count) {
8113 		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8114 		dwp = dwp_start;
8115 		dw_count = 0;
8116 	}
8117 
8118 	if (fast_path_possible) {
8119 		assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
8120 		assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
8121 
8122 		if (local_queue_count || unwired_count) {
8123 			if (local_queue_count) {
8124 				vm_page_t       first_target;
8125 				vm_page_queue_head_t    *target_queue;
8126 
8127 				if (throttle_page) {
8128 					target_queue = &vm_page_queue_throttled;
8129 				} else {
8130 					if (flags & UPL_COMMIT_INACTIVATE) {
8131 						if (shadow_object->internal) {
8132 							target_queue = &vm_page_queue_anonymous;
8133 						} else {
8134 							target_queue = &vm_page_queue_inactive;
8135 						}
8136 					} else {
8137 						target_queue = &vm_page_queue_active;
8138 					}
8139 				}
8140 				/*
8141 				 * Transfer the entire local queue to a regular LRU page queues.
8142 				 */
8143 				vm_page_lockspin_queues();
8144 
8145 				first_target = (vm_page_t) vm_page_queue_first(target_queue);
8146 
8147 				if (vm_page_queue_empty(target_queue)) {
8148 					target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8149 				} else {
8150 					first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8151 				}
8152 
8153 				target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
8154 				first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
8155 				last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
8156 
8157 				/*
8158 				 * Adjust the global page counts.
8159 				 */
8160 				if (throttle_page) {
8161 					vm_page_throttled_count += local_queue_count;
8162 				} else {
8163 					if (flags & UPL_COMMIT_INACTIVATE) {
8164 						if (shadow_object->internal) {
8165 							vm_page_anonymous_count += local_queue_count;
8166 						}
8167 						vm_page_inactive_count += local_queue_count;
8168 
8169 						token_new_pagecount += local_queue_count;
8170 					} else {
8171 						vm_page_active_count += local_queue_count;
8172 					}
8173 
8174 					if (shadow_object->internal) {
8175 						vm_page_pageable_internal_count += local_queue_count;
8176 					} else {
8177 						vm_page_pageable_external_count += local_queue_count;
8178 					}
8179 				}
8180 			} else {
8181 				vm_page_lockspin_queues();
8182 			}
8183 			if (unwired_count) {
8184 				vm_page_wire_count -= unwired_count;
8185 				VM_CHECK_MEMORYSTATUS;
8186 			}
8187 			vm_page_unlock_queues();
8188 
8189 			VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
8190 		}
8191 	}
8192 	occupied = 1;
8193 
8194 	if (upl->flags & UPL_DEVICE_MEMORY) {
8195 		occupied = 0;
8196 	} else if (upl->flags & UPL_LITE) {
8197 		int     pg_num;
8198 		int     i;
8199 
8200 		occupied = 0;
8201 
8202 		if (!fast_path_full_commit) {
8203 			pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8204 			pg_num = (pg_num + 31) >> 5;
8205 
8206 			for (i = 0; i < pg_num; i++) {
8207 				if (lite_list[i] != 0) {
8208 					occupied = 1;
8209 					break;
8210 				}
8211 			}
8212 		}
8213 	} else {
8214 		if (vm_page_queue_empty(&upl->map_object->memq)) {
8215 			occupied = 0;
8216 		}
8217 	}
8218 	if (occupied == 0) {
8219 		/*
8220 		 * If this UPL element belongs to a Vector UPL and is
8221 		 * empty, then this is the right function to deallocate
8222 		 * it. So go ahead set the *empty variable. The flag
8223 		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8224 		 * should be considered relevant for the Vector UPL and not
8225 		 * the internal UPLs.
8226 		 */
8227 		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8228 			*empty = TRUE;
8229 		}
8230 
8231 		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8232 			/*
8233 			 * this is not a paging object
8234 			 * so we need to drop the paging reference
8235 			 * that was taken when we created the UPL
8236 			 * against this object
8237 			 */
8238 			vm_object_activity_end(shadow_object);
8239 			vm_object_collapse(shadow_object, 0, TRUE);
8240 		} else {
8241 			/*
8242 			 * we dontated the paging reference to
8243 			 * the map object... vm_pageout_object_terminate
8244 			 * will drop this reference
8245 			 */
8246 		}
8247 	}
8248 	VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
8249 	vm_object_unlock(shadow_object);
8250 	if (object != shadow_object) {
8251 		vm_object_unlock(object);
8252 	}
8253 
8254 	if (!isVectorUPL) {
8255 		upl_unlock(upl);
8256 	} else {
8257 		/*
8258 		 * If we completed our operations on an UPL that is
8259 		 * part of a Vectored UPL and if empty is TRUE, then
8260 		 * we should go ahead and deallocate this UPL element.
8261 		 * Then we check if this was the last of the UPL elements
8262 		 * within that Vectored UPL. If so, set empty to TRUE
8263 		 * so that in ubc_upl_commit_range or ubc_upl_commit, we
8264 		 * can go ahead and deallocate the Vector UPL too.
8265 		 */
8266 		if (*empty == TRUE) {
8267 			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
8268 			upl_deallocate(upl);
8269 		}
8270 		goto process_upl_to_commit;
8271 	}
8272 	if (pgpgout_count) {
8273 		DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
8274 	}
8275 
8276 	kr = KERN_SUCCESS;
8277 done:
8278 	if (dwp_start && dwp_finish_ctx) {
8279 		vm_page_delayed_work_finish_ctx(dwp_start);
8280 		dwp_start = dwp = NULL;
8281 	}
8282 
8283 	return kr;
8284 }
8285 
/*
 * upl_abort_range:
 *
 *	Abort the portion [offset, offset + size) of "upl", disposing of
 *	each covered page according to the UPL_ABORT_* bits in "error"
 *	(RESTART / UNAVAILABLE / ERROR for absent pages, DUMP_PAGES to
 *	free pages outright, REFERENCE to keep them on the LRU).
 *
 *	On return, *empty is TRUE when the UPL no longer covers any pages
 *	(for a vector UPL, only once the last sub-UPL drains).
 *
 *	Vector UPLs are processed one sub-UPL at a time via the
 *	process_upl_to_abort loop below.
 */
kern_return_t
upl_abort_range(
	upl_t                   upl,
	upl_offset_t            offset,
	upl_size_t              size,
	int                     error,
	boolean_t               *empty)
{
	upl_page_info_t         *user_page_list = NULL;
	upl_size_t              xfer_size, subupl_size;
	vm_object_t             shadow_object;
	vm_object_t             object;
	vm_object_offset_t      target_offset;
	upl_offset_t            subupl_offset = offset;
	int                     entry;
	wpl_array_t             lite_list;
	int                     occupied;
	struct  vm_page_delayed_work    dw_array;
	struct  vm_page_delayed_work    *dwp, *dwp_start;
	bool                    dwp_finish_ctx = TRUE;
	int                     dw_count;
	int                     dw_limit;
	int                     isVectorUPL = 0;
	upl_t                   vector_upl = NULL;
	vm_object_offset_t      obj_start, obj_end, obj_offset;
	kern_return_t           kr = KERN_SUCCESS;

//	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);

	dwp_start = dwp = NULL;

	subupl_size = size;
	*empty = FALSE;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * An I/O-wired UPL with no request to dump its pages is "aborted"
	 * by committing it instead, freeing any still-absent pages.
	 */
	if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
		return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
	}

	/*
	 * Set up the delayed-work batch used to defer page-queue updates;
	 * fall back to a single on-stack entry if no context is available.
	 */
	dw_count = 0;
	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
	dwp_start = vm_page_delayed_work_get_ctx();
	if (dwp_start == NULL) {
		dwp_start = &dw_array;
		dw_limit = 1;
		dwp_finish_ctx = FALSE;
	}

	dwp = dwp_start;

	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		vector_upl = upl;
		upl_lock(vector_upl);
	} else {
		upl_lock(upl);
	}

process_upl_to_abort:
	if (isVectorUPL) {
		/* carve the next sub-UPL out of the remaining range */
		size = subupl_size;
		offset = subupl_offset;
		if (size == 0) {
			/* all sub-UPLs have been processed */
			upl_unlock(vector_upl);
			kr = KERN_SUCCESS;
			goto done;
		}
		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
		if (upl == NULL) {
			upl_unlock(vector_upl);
			kr = KERN_FAILURE;
			goto done;
		}
		subupl_size -= size;
		subupl_offset += size;
	}

	*empty = FALSE;

#if UPL_DEBUG
	/* record this abort (range + backtrace) in the UPL's history */
	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
		(void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);

		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
		upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;

		upl->upl_commit_index++;
	}
#endif
	if (upl->flags & UPL_DEVICE_MEMORY) {
		xfer_size = 0;
	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
		xfer_size = size;
	} else {
		/* requested range extends past the end of this UPL */
		if (!isVectorUPL) {
			upl_unlock(upl);
		} else {
			upl_unlock(vector_upl);
		}
		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
		kr = KERN_FAILURE;
		goto done;
	}
	/*
	 * For internal UPLs the lite bitmap follows the upl_page_info_t
	 * array; otherwise it sits immediately after the upl structure.
	 */
	if (upl->flags & UPL_INTERNAL) {
		lite_list = (wpl_array_t)
		    ((((uintptr_t)upl) + sizeof(struct upl))
		    + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));

		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
	} else {
		lite_list = (wpl_array_t)
		    (((uintptr_t)upl) + sizeof(struct upl));
	}
	object = upl->map_object;

	if (upl->flags & UPL_SHADOWED) {
		vm_object_lock(object);
		shadow_object = object->shadow;
	} else {
		shadow_object = object;
	}

	entry = offset / PAGE_SIZE;
	target_offset = (vm_object_offset_t)offset;

	if (upl->flags & UPL_KERNEL_OBJECT) {
		vm_object_lock_shared(shadow_object);
	} else {
		vm_object_lock(shadow_object);
	}

	if (upl->flags & UPL_ACCESS_BLOCKED) {
		/* unblock access and wake any thread waiting on the object */
		assert(shadow_object->blocked_access);
		shadow_object->blocked_access = FALSE;
		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
	}

	if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
		panic("upl_abort_range: kernel_object being DUMPED");
	}

	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
	obj_end = obj_start + xfer_size;
	obj_start = vm_object_trunc_page(obj_start);
	obj_end = vm_object_round_page(obj_end);
	/*
	 * Walk the range one page at a time, clearing each page's lite
	 * bit and disposing of the page per the abort flags.
	 */
	for (obj_offset = obj_start;
	    obj_offset < obj_end;
	    obj_offset += PAGE_SIZE) {
		vm_page_t       t, m;
		unsigned int    pg_num;
		boolean_t       needed;

		pg_num = (unsigned int) (target_offset / PAGE_SIZE);
		assert(pg_num == target_offset / PAGE_SIZE);

		needed = FALSE;

		if (user_page_list) {
			needed = user_page_list[pg_num].needed;
		}

		dwp->dw_mask = 0;
		m = VM_PAGE_NULL;

		if (upl->flags & UPL_LITE) {
			if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
				lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));

				if (!(upl->flags & UPL_KERNEL_OBJECT)) {
					m = vm_page_lookup(shadow_object, obj_offset);
				}
			}
		}
		if (upl->flags & UPL_SHADOWED) {
			/* free the placeholder page held in the shadow (map) object */
			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
				t->vmp_free_when_done = FALSE;

				VM_PAGE_FREE(t);

				if (m == VM_PAGE_NULL) {
					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
				}
			}
		}
		if ((upl->flags & UPL_KERNEL_OBJECT)) {
			/* kernel-object UPLs only needed their lite bits cleared */
			goto abort_next_page;
		}

		if (m != VM_PAGE_NULL) {
			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);

			if (m->vmp_absent) {
				boolean_t must_free = TRUE;

				/*
				 * COPYOUT = FALSE case
				 * check for error conditions which must
				 * be passed back to the pages customer
				 */
				if (error & UPL_ABORT_RESTART) {
					m->vmp_restart = TRUE;
					m->vmp_absent = FALSE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				} else if (error & UPL_ABORT_UNAVAILABLE) {
					m->vmp_restart = FALSE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				} else if (error & UPL_ABORT_ERROR) {
					m->vmp_restart = FALSE;
					m->vmp_absent = FALSE;
					m->vmp_error = TRUE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				}
				if (m->vmp_clustered && needed == FALSE) {
					/*
					 * This page was a part of a speculative
					 * read-ahead initiated by the kernel
					 * itself.  No one is expecting this
					 * page and no one will clean up its
					 * error state if it ever becomes valid
					 * in the future.
					 * We have to free it here.
					 */
					must_free = TRUE;
				}
				m->vmp_cleaning = FALSE;

				if (m->vmp_overwriting && !m->vmp_busy) {
					/*
					 * this shouldn't happen since
					 * this is an 'absent' page, but
					 * it doesn't hurt to check for
					 * the 'alternate' method of
					 * stabilizing the page...
					 * we will mark 'busy' to be cleared
					 * in the following code which will
					 * take care of the primary stabilzation
					 * method (i.e. setting 'busy' to TRUE)
					 */
					dwp->dw_mask |= DW_vm_page_unwire;
				}
				m->vmp_overwriting = FALSE;

				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);

				if (must_free == TRUE) {
					dwp->dw_mask |= DW_vm_page_free;
				} else {
					dwp->dw_mask |= DW_vm_page_activate;
				}
			} else {
				/*
				 * Handle the trusted pager throttle.
				 */
				if (m->vmp_laundry) {
					dwp->dw_mask |= DW_vm_pageout_throttle_up;
				}

				if (upl->flags & UPL_ACCESS_BLOCKED) {
					/*
					 * We blocked access to the pages in this UPL.
					 * Clear the "busy" bit and wake up any waiter
					 * for this page.
					 */
					dwp->dw_mask |= DW_clear_busy;
				}
				if (m->vmp_overwriting) {
					if (m->vmp_busy) {
						dwp->dw_mask |= DW_clear_busy;
					} else {
						/*
						 * deal with the 'alternate' method
						 * of stabilizing the page...
						 * we will either free the page
						 * or mark 'busy' to be cleared
						 * in the following code which will
						 * take care of the primary stabilzation
						 * method (i.e. setting 'busy' to TRUE)
						 */
						dwp->dw_mask |= DW_vm_page_unwire;
					}
					m->vmp_overwriting = FALSE;
				}
				m->vmp_free_when_done = FALSE;
				m->vmp_cleaning = FALSE;

				if (error & UPL_ABORT_DUMP_PAGES) {
					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

					dwp->dw_mask |= DW_vm_page_free;
				} else {
					if (!(dwp->dw_mask & DW_vm_page_unwire)) {
						if (error & UPL_ABORT_REFERENCE) {
							/*
							 * we've been told to explictly
							 * reference this page... for
							 * file I/O, this is done by
							 * implementing an LRU on the inactive q
							 */
							dwp->dw_mask |= DW_vm_page_lru;
						} else if (!VM_PAGE_PAGEABLE(m)) {
							dwp->dw_mask |= DW_vm_page_deactivate_internal;
						}
					}
					dwp->dw_mask |= DW_PAGE_WAKEUP;
				}
			}
		}
abort_next_page:
		target_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;
		entry++;

		if (dwp->dw_mask) {
			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
				/* queue the work; flush the batch when it fills up */
				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);

				if (dw_count >= dw_limit) {
					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);

					dwp = dwp_start;
					dw_count = 0;
				}
			} else {
				/* busy-clear/wakeup only: do it inline, no batching needed */
				if (dwp->dw_mask & DW_clear_busy) {
					m->vmp_busy = FALSE;
				}

				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
					PAGE_WAKEUP(m);
				}
			}
		}
	}
	/* flush any remaining batched page work */
	if (dw_count) {
		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}

	/*
	 * Determine whether the UPL still covers any pages: for lite UPLs
	 * scan the bitmap, otherwise check the map object's page list.
	 */
	occupied = 1;

	if (upl->flags & UPL_DEVICE_MEMORY) {
		occupied = 0;
	} else if (upl->flags & UPL_LITE) {
		int     pg_num;
		int     i;

		pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
		pg_num = (pg_num + 31) >> 5;
		occupied = 0;

		for (i = 0; i < pg_num; i++) {
			if (lite_list[i] != 0) {
				occupied = 1;
				break;
			}
		}
	} else {
		if (vm_page_queue_empty(&upl->map_object->memq)) {
			occupied = 0;
		}
	}
	if (occupied == 0) {
		/*
		 * If this UPL element belongs to a Vector UPL and is
		 * empty, then this is the right function to deallocate
		 * it. So go ahead set the *empty variable. The flag
		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
		 * should be considered relevant for the Vector UPL and
		 * not the internal UPLs.
		 */
		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
			*empty = TRUE;
		}

		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
			/*
			 * this is not a paging object
			 * so we need to drop the paging reference
			 * that was taken when we created the UPL
			 * against this object
			 */
			vm_object_activity_end(shadow_object);
			vm_object_collapse(shadow_object, 0, TRUE);
		} else {
			/*
			 * we dontated the paging reference to
			 * the map object... vm_pageout_object_terminate
			 * will drop this reference
			 */
		}
	}
	vm_object_unlock(shadow_object);
	if (object != shadow_object) {
		vm_object_unlock(object);
	}

	if (!isVectorUPL) {
		upl_unlock(upl);
	} else {
		/*
		 * If we completed our operations on an UPL that is
		 * part of a Vectored UPL and if empty is TRUE, then
		 * we should go ahead and deallocate this UPL element.
		 * Then we check if this was the last of the UPL elements
		 * within that Vectored UPL. If so, set empty to TRUE
		 * so that in ubc_upl_abort_range or ubc_upl_abort, we
		 * can go ahead and deallocate the Vector UPL too.
		 */
		if (*empty == TRUE) {
			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
			upl_deallocate(upl);
		}
		goto process_upl_to_abort;
	}

	kr = KERN_SUCCESS;

done:
	/* release the delayed-work context if we allocated one */
	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return kr;
}
8718 
8719 
8720 kern_return_t
upl_abort(upl_t upl,int error)8721 upl_abort(
8722 	upl_t   upl,
8723 	int     error)
8724 {
8725 	boolean_t       empty;
8726 
8727 	if (upl == UPL_NULL) {
8728 		return KERN_INVALID_ARGUMENT;
8729 	}
8730 
8731 	return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8732 }
8733 
8734 
8735 /* an option on commit should be wire */
8736 kern_return_t
upl_commit(upl_t upl,upl_page_info_t * page_list,mach_msg_type_number_t count)8737 upl_commit(
8738 	upl_t                   upl,
8739 	upl_page_info_t         *page_list,
8740 	mach_msg_type_number_t  count)
8741 {
8742 	boolean_t       empty;
8743 
8744 	if (upl == UPL_NULL) {
8745 		return KERN_INVALID_ARGUMENT;
8746 	}
8747 
8748 	return upl_commit_range(upl, 0, upl->u_size, 0,
8749 	           page_list, count, &empty);
8750 }
8751 
8752 
/*
 * iopl_valid_data:
 *
 *	Walk the pages covered by an I/O-wire UPL and, for each page
 *	still marked busy+absent, mark it valid (absent cleared, dirty
 *	set) and wire it, accounting the newly wired pages against the
 *	object and the global wire count under "tag".
 *
 *	Only plain UPL_IO_WIRE UPLs are supported; anything else
 *	(device memory, shadowed, access-blocked, internal, vector)
 *	panics, as do purgeable-volatile/empty or kernel/compressor
 *	backing objects.
 */
void
iopl_valid_data(
	upl_t    upl,
	vm_tag_t tag)
{
	vm_object_t     object;
	vm_offset_t     offset;
	vm_page_t       m, nxt_page = VM_PAGE_NULL;
	upl_size_t      size;
	int             wired_count = 0;

	if (upl == NULL) {
		panic("iopl_valid_data: NULL upl");
	}
	if (vector_upl_is_valid(upl)) {
		panic("iopl_valid_data: vector upl");
	}
	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
		panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
	}

	object = upl->map_object;

	if (object == kernel_object || object == compressor_object) {
		panic("iopl_valid_data: object == kernel or compressor");
	}

	if (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) {
		panic("iopl_valid_data: object %p purgable %d",
		    object, object->purgable);
	}

	size = upl_adjusted_size(upl, PAGE_MASK);

	vm_object_lock(object);
	VM_OBJECT_WIRED_PAGE_UPDATE_START(object);

	bool whole_object;

	/*
	 * If the UPL spans the entire object and every page is resident,
	 * iterate the object's page list directly; otherwise look each
	 * page up by offset.
	 */
	if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
		nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
		whole_object = true;
	} else {
		offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
		whole_object = false;
	}

	while (size) {
		if (whole_object) {
			if (nxt_page != VM_PAGE_NULL) {
				m = nxt_page;
				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
			}
		} else {
			m = vm_page_lookup(object, offset);
			offset += PAGE_SIZE;

			if (m == VM_PAGE_NULL) {
				panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
			}
		}
		if (m->vmp_busy) {
			/* a busy page here must be one we left absent */
			if (!m->vmp_absent) {
				panic("iopl_valid_data: busy page w/o absent");
			}

			if (m->vmp_pageq.next || m->vmp_pageq.prev) {
				panic("iopl_valid_data: busy+absent page on page queue");
			}
			if (m->vmp_reusable) {
				panic("iopl_valid_data: %p is reusable", m);
			}

			/* page now holds valid data: clear absent, dirty it, wire it */
			m->vmp_absent = FALSE;
			m->vmp_dirty = TRUE;
			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
			assert(m->vmp_wire_count == 0);
			m->vmp_wire_count++;
			assert(m->vmp_wire_count);
			if (m->vmp_wire_count == 1) {
				m->vmp_q_state = VM_PAGE_IS_WIRED;
				wired_count++;
			} else {
				panic("iopl_valid_data: %p already wired", m);
			}

			PAGE_WAKEUP_DONE(m);
		}
		size -= PAGE_SIZE;
	}
	if (wired_count) {
		VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
		assert(object->resident_page_count >= object->wired_page_count);

		/* no need to adjust purgeable accounting for this object: */
		assert(object->purgable != VM_PURGABLE_VOLATILE);
		assert(object->purgable != VM_PURGABLE_EMPTY);

		vm_page_lockspin_queues();
		vm_page_wire_count += wired_count;
		vm_page_unlock_queues();
	}
	VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
	vm_object_unlock(object);
}
8859 
8860 
8861 void
vm_object_set_pmap_cache_attr(vm_object_t object,upl_page_info_array_t user_page_list,unsigned int num_pages,boolean_t batch_pmap_op)8862 vm_object_set_pmap_cache_attr(
8863 	vm_object_t             object,
8864 	upl_page_info_array_t   user_page_list,
8865 	unsigned int            num_pages,
8866 	boolean_t               batch_pmap_op)
8867 {
8868 	unsigned int    cache_attr = 0;
8869 
8870 	cache_attr = object->wimg_bits & VM_WIMG_MASK;
8871 	assert(user_page_list);
8872 	if (cache_attr != VM_WIMG_USE_DEFAULT) {
8873 		PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8874 	}
8875 }
8876 
8877 
8878 boolean_t       vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t);
8879 kern_return_t   vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t, vm_object_offset_t *, int, int*);
8880 
8881 
8882 
/*
 * vm_object_iopl_wire_full:
 *
 *	IOPL fast path for an object whose pages are all resident:
 *	walk the object's page list and wire every page under "tag",
 *	setting the corresponding lite-list bits and (optionally)
 *	filling in user_page_list entries.
 *
 *	Returns FALSE (bailing out mid-scan) if any page is in a state
 *	that disqualifies the fast path (busy, fictitious, absent,
 *	error, cleaning, restart, laundry, or kernel-written when
 *	UPL_REQUEST_FORCE_COHERENCY is set); the caller is then
 *	expected to fall back to the slow path.
 *
 *	Called with the object locked exclusively; takes the page-queue
 *	lock for the duration of the scan, yielding it periodically.
 */
boolean_t
vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
    wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag)
{
	vm_page_t       dst_page;
	unsigned int    entry;
	int             page_count;
	int             delayed_unlock = 0;
	boolean_t       retval = TRUE;
	ppnum_t         phys_page;

	vm_object_lock_assert_exclusive(object);
	assert(object->purgable != VM_PURGABLE_VOLATILE);
	assert(object->purgable != VM_PURGABLE_EMPTY);
	assert(object->pager == NULL);
	assert(object->copy == NULL);
	assert(object->shadow == NULL);

	page_count = object->resident_page_count;
	dst_page = (vm_page_t)vm_page_queue_first(&object->memq);

	vm_page_lock_queues();

	while (page_count--) {
		/* any page in one of these transient states kills the fast path */
		if (dst_page->vmp_busy ||
		    dst_page->vmp_fictitious ||
		    dst_page->vmp_absent ||
		    VMP_ERROR_GET(dst_page) ||
		    dst_page->vmp_cleaning ||
		    dst_page->vmp_restart ||
		    dst_page->vmp_laundry) {
			retval = FALSE;
			goto done;
		}
		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
			retval = FALSE;
			goto done;
		}
		dst_page->vmp_reference = TRUE;

		vm_page_wire(dst_page, tag, FALSE);

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			SET_PAGE_DIRTY(dst_page, FALSE);
		}
		/* mark this page's slot in the UPL's lite bitmap */
		entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
		assert(entry >= 0 && entry < object->resident_page_count);
		lite_list[entry >> 5] |= 1U << (entry & 31);

		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].absent    = dst_page->vmp_absent;
			user_page_list[entry].dirty     = dst_page->vmp_dirty;
			user_page_list[entry].free_when_done   = dst_page->vmp_free_when_done;
			user_page_list[entry].precious  = dst_page->vmp_precious;
			user_page_list[entry].device    = FALSE;
			user_page_list[entry].speculative = FALSE;
			user_page_list[entry].cs_validated = FALSE;
			user_page_list[entry].cs_tainted = FALSE;
			user_page_list[entry].cs_nx     = FALSE;
			user_page_list[entry].needed    = FALSE;
			user_page_list[entry].mark      = FALSE;
		}
		/* periodically yield the page-queue lock to reduce contention */
		if (delayed_unlock++ > 256) {
			delayed_unlock = 0;
			lck_mtx_yield(&vm_page_queue_lock);

			VM_CHECK_MEMORYSTATUS;
		}
		dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
	}
done:
	vm_page_unlock_queues();

	VM_CHECK_MEMORYSTATUS;

	return retval;
}
8967 
8968 
/*
 * vm_object_iopl_wire_empty:
 *
 *	IOPL fast path for an object with no resident pages in the
 *	target range: grab "page_count" fresh pages, zero-fill them
 *	(or mark them absent when UPL_NOZEROFILL/UPL_NOZEROFILLIO is
 *	set), insert them into the object starting at *dst_offset,
 *	wire the non-absent ones under "tag", and record each page in
 *	the lite list and (optionally) user_page_list.
 *
 *	*dst_offset is advanced past the inserted pages.  The number
 *	of pages grabbed is returned through *page_grab_count.
 *
 *	Returns KERN_SUCCESS, or MACH_SEND_INTERRUPTED if the wait for
 *	free pages was interrupted (UPL_SET_INTERRUPTIBLE).
 *
 *	Called with the object locked exclusively; global wire-count
 *	and ledger updates are batched until the end.
 */
kern_return_t
vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
    wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag, vm_object_offset_t *dst_offset,
    int page_count, int* page_grab_count)
{
	vm_page_t       dst_page;
	boolean_t       no_zero_fill = FALSE;
	int             interruptible;
	int             pages_wired = 0;
	int             pages_inserted = 0;
	int             entry = 0;
	uint64_t        delayed_ledger_update = 0;
	kern_return_t   ret = KERN_SUCCESS;
	int             grab_options;
	ppnum_t         phys_page;

	vm_object_lock_assert_exclusive(object);
	assert(object->purgable != VM_PURGABLE_VOLATILE);
	assert(object->purgable != VM_PURGABLE_EMPTY);
	assert(object->pager == NULL);
	assert(object->copy == NULL);
	assert(object->shadow == NULL);

	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
		interruptible = THREAD_ABORTSAFE;
	} else {
		interruptible = THREAD_UNINT;
	}

	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
		no_zero_fill = TRUE;
	}

	grab_options = 0;
#if CONFIG_SECLUDED_MEMORY
	if (object->can_grab_secluded) {
		grab_options |= VM_PAGE_GRAB_SECLUDED;
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	while (page_count--) {
		/* block (interruptibly if requested) until a free page is available */
		while ((dst_page = vm_page_grab_options(grab_options))
		    == VM_PAGE_NULL) {
			OSAddAtomic(page_count, &vm_upl_wait_for_pages);

			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

			if (vm_page_wait(interruptible) == FALSE) {
				/*
				 * interrupted case
				 */
				OSAddAtomic(-page_count, &vm_upl_wait_for_pages);

				VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);

				ret = MACH_SEND_INTERRUPTED;
				goto done;
			}
			OSAddAtomic(-page_count, &vm_upl_wait_for_pages);

			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
		}
		if (no_zero_fill == FALSE) {
			vm_page_zero_fill(dst_page);
		} else {
			dst_page->vmp_absent = TRUE;
		}

		dst_page->vmp_reference = TRUE;

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			SET_PAGE_DIRTY(dst_page, FALSE);
		}
		/* absent pages are left unwired; everything else gets wired here */
		if (dst_page->vmp_absent == FALSE) {
			assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
			assert(dst_page->vmp_wire_count == 0);
			dst_page->vmp_wire_count++;
			dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
			assert(dst_page->vmp_wire_count);
			pages_wired++;
			PAGE_WAKEUP_DONE(dst_page);
		}
		pages_inserted++;

		vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);

		lite_list[entry >> 5] |= 1U << (entry & 31);

		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].absent    = dst_page->vmp_absent;
			user_page_list[entry].dirty     = dst_page->vmp_dirty;
			user_page_list[entry].free_when_done    = FALSE;
			user_page_list[entry].precious  = FALSE;
			user_page_list[entry].device    = FALSE;
			user_page_list[entry].speculative = FALSE;
			user_page_list[entry].cs_validated = FALSE;
			user_page_list[entry].cs_tainted = FALSE;
			user_page_list[entry].cs_nx     = FALSE;
			user_page_list[entry].needed    = FALSE;
			user_page_list[entry].mark      = FALSE;
		}
		entry++;
		*dst_offset += PAGE_SIZE_64;
	}
done:
	/* fold the batched wire count into the global counter */
	if (pages_wired) {
		vm_page_lockspin_queues();
		vm_page_wire_count += pages_wired;
		vm_page_unlock_queues();
	}
	if (pages_inserted) {
		if (object->internal) {
			OSAddAtomic(pages_inserted, &vm_page_internal_count);
		} else {
			OSAddAtomic(pages_inserted, &vm_page_external_count);
		}
	}
	/* apply the ledger credit deferred by vm_page_insert_internal() */
	if (delayed_ledger_update) {
		task_t          owner;
		int             ledger_idx_volatile;
		int             ledger_idx_nonvolatile;
		int             ledger_idx_volatile_compressed;
		int             ledger_idx_nonvolatile_compressed;
		boolean_t       do_footprint;

		owner = VM_OBJECT_OWNER(object);
		assert(owner);

		vm_object_ledger_tag_ledgers(object,
		    &ledger_idx_volatile,
		    &ledger_idx_nonvolatile,
		    &ledger_idx_volatile_compressed,
		    &ledger_idx_nonvolatile_compressed,
		    &do_footprint);

		/* more non-volatile bytes */
		ledger_credit(owner->ledger,
		    ledger_idx_nonvolatile,
		    delayed_ledger_update);
		if (do_footprint) {
			/* more footprint */
			ledger_credit(owner->ledger,
			    task_ledgers.phys_footprint,
			    delayed_ledger_update);
		}
	}

	assert(page_grab_count);
	*page_grab_count = pages_inserted;

	return ret;
}
9128 
9129 
9130 
/*
 * vm_object_iopl_request:
 *
 * Build an I/O UPL ("IOPL") describing the range [offset, offset+size)
 * of "object": each page in the range is looked up (or faulted in),
 * wired via the delayed-work mechanism, and its physical page number is
 * recorded in the UPL's lite list (and optional "user_page_list") so the
 * caller can perform DMA/I/O directly against the pages.
 *
 * Parameters:
 *	object		the backing VM object (must not be terminating)
 *	offset/size	byte range within the object; rounded here to
 *			page boundaries (original values kept in the UPL)
 *	upl_ptr		OUT: the newly created UPL
 *	user_page_list	optional per-page info array to fill in
 *	page_list_count	IN/OUT: capacity/valid entries of user_page_list
 *	cntrl_flags	UPL_* control flags (validated against UPL_VALID_FLAGS)
 *	tag		VM tag used when wiring the pages
 *
 * Returns KERN_SUCCESS, or an error after unwinding any pages that were
 * already processed (see "return_err" below).
 */
kern_return_t
vm_object_iopl_request(
	vm_object_t             object,
	vm_object_offset_t      offset,
	upl_size_t              size,
	upl_t                   *upl_ptr,
	upl_page_info_array_t   user_page_list,
	unsigned int            *page_list_count,
	upl_control_flags_t     cntrl_flags,
	vm_tag_t                tag)
{
	vm_page_t               dst_page;
	vm_object_offset_t      dst_offset;
	upl_size_t              xfer_size;
	upl_t                   upl = NULL;
	unsigned int            entry;
	wpl_array_t             lite_list = NULL;
	int                     no_zero_fill = FALSE;
	unsigned int            size_in_pages;
	int                     page_grab_count = 0;
	u_int32_t               psize;
	kern_return_t           ret;
	vm_prot_t               prot;
	struct vm_object_fault_info fault_info = {};
	struct  vm_page_delayed_work    dw_array;
	struct  vm_page_delayed_work    *dwp, *dwp_start;
	bool                    dwp_finish_ctx = TRUE;
	int                     dw_count;
	int                     dw_limit;
	int                     dw_index;
	boolean_t               caller_lookup;
	int                     io_tracking_flag = 0;
	int                     interruptible;
	ppnum_t                 phys_page;

	boolean_t               set_cache_attr_needed = FALSE;
	boolean_t               free_wired_pages = FALSE;
	boolean_t               fast_path_empty_req = FALSE;
	boolean_t               fast_path_full_req = FALSE;

#if DEVELOPMENT || DEBUG
	task_t                  task = current_task();
#endif /* DEVELOPMENT || DEBUG */

	dwp_start = dwp = NULL;

	vm_object_offset_t original_offset = offset;
	upl_size_t original_size = size;

//	DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);

	/*
	 * Page-align the request; the caller's original offset/size are
	 * preserved so the UPL can report them back unmodified.
	 */
	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
	offset = vm_object_trunc_page(offset);
	if (size != original_size || offset != original_offset) {
		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
	}

	if (cntrl_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		return KERN_INVALID_VALUE;
	}
	if (vm_lopage_needed == FALSE) {
		/* no low-memory pool configured: 32-bit addressing is moot */
		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
	}

	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
			return KERN_INVALID_VALUE;
		}

		if (object->phys_contiguous) {
			/* contiguous device memory cannot be substituted; reject ranges above the DMA limit */
			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
				return KERN_INVALID_ADDRESS;
			}

			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
				return KERN_INVALID_ADDRESS;
			}
		}
	}
	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
		no_zero_fill = TRUE;
	}

	if (cntrl_flags & UPL_COPYOUT_FROM) {
		prot = VM_PROT_READ;
	} else {
		prot = VM_PROT_READ | VM_PROT_WRITE;
	}

	if ((!object->internal) && (object->paging_offset != 0)) {
		panic("vm_object_iopl_request: external object with non-zero paging offset");
	}


	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);

#if CONFIG_IOSCHED || UPL_DEBUG
	if ((object->io_tracking && object != kernel_object) || upl_debug_enabled) {
		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
	}
#endif

#if CONFIG_IOSCHED
	if (object->io_tracking) {
		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
		if (object != kernel_object) {
			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
		}
	}
#endif

	if (object->phys_contiguous) {
		/* device memory is described by a single entry, no per-page state needed */
		psize = PAGE_SIZE;
	} else {
		psize = size;

		/*
		 * Set up the delayed-work context used to batch page wiring;
		 * fall back to a single on-stack entry if none is available.
		 */
		dw_count = 0;
		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
		dwp_start = vm_page_delayed_work_get_ctx();
		if (dwp_start == NULL) {
			dwp_start = &dw_array;
			dw_limit = 1;
			dwp_finish_ctx = FALSE;
		}

		dwp = dwp_start;
	}

	/*
	 * Create the UPL.  For UPL_SET_INTERNAL the page-info array and the
	 * lite bitmap live directly after the upl structure itself.
	 */
	if (cntrl_flags & UPL_SET_INTERNAL) {
		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);

		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
		lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
		    ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
		if (size == 0) {
			user_page_list = NULL;
			lite_list = NULL;
		}
	} else {
		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);

		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
		if (size == 0) {
			lite_list = NULL;
		}
	}
	if (user_page_list) {
		user_page_list[0].device = FALSE;
	}
	*upl_ptr = upl;

	if (cntrl_flags & UPL_NOZEROFILLIO) {
		DTRACE_VM4(upl_nozerofillio,
		    vm_object_t, object,
		    vm_object_offset_t, offset,
		    upl_size_t, size,
		    upl_t, upl);
	}

	upl->map_object = object;
	upl->u_offset = original_offset;
	upl->u_size = original_size;

	size_in_pages = size / PAGE_SIZE;

	if (object == kernel_object &&
	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
		/*
		 * kernel-object pages are never manipulated here, so a
		 * shared lock suffices (unless UPL_DEBUG needs the uplq).
		 */
		upl->flags |= UPL_KERNEL_OBJECT;
#if UPL_DEBUG
		vm_object_lock(object);
#else
		vm_object_lock_shared(object);
#endif
	} else {
		vm_object_lock(object);
		vm_object_activity_begin(object);
	}
	/*
	 * paging in progress also protects the paging_offset
	 */
	upl->u_offset = original_offset + object->paging_offset;

	if (cntrl_flags & UPL_BLOCK_ACCESS) {
		/*
		 * The user requested that access to the pages in this UPL
		 * be blocked until the UPL is committed or aborted.
		 */
		upl->flags |= UPL_ACCESS_BLOCKED;
	}

#if CONFIG_IOSCHED || UPL_DEBUG
	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
		vm_object_activity_begin(object);
		queue_enter(&object->uplq, upl, upl_t, uplq);
	}
#endif

	if (object->phys_contiguous) {
		/*
		 * Device-memory fast path: no individual pages exist,
		 * so just record the physical range and return.
		 */
		if (upl->flags & UPL_ACCESS_BLOCKED) {
			assert(!object->blocked_access);
			object->blocked_access = TRUE;
		}

		vm_object_unlock(object);

		/*
		 * don't need any shadow mappings for this one
		 * since it is already I/O memory
		 */
		upl->flags |= UPL_DEVICE_MEMORY;

		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);

		if (user_page_list) {
			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
			user_page_list[0].device = TRUE;
		}
		if (page_list_count != NULL) {
			if (upl->flags & UPL_INTERNAL) {
				*page_list_count = 0;
			} else {
				*page_list_count = 1;
			}
		}

		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
		if (task != NULL) {
			ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
		}
#endif /* DEVELOPMENT || DEBUG */
		return KERN_SUCCESS;
	}
	if (object != kernel_object && object != compressor_object) {
		/*
		 * Protect user space from future COW operations
		 */
#if VM_OBJECT_TRACKING_OP_TRUESHARE
		if (!object->true_share &&
		    vm_object_tracking_btlog) {
			btlog_record(vm_object_tracking_btlog, object,
			    VM_OBJECT_TRACKING_OP_TRUESHARE,
			    btref_get(__builtin_frame_address(0), 0));
		}
#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */

		vm_object_lock_assert_exclusive(object);
		object->true_share = TRUE;

		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
	}

	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
	    object->copy != VM_OBJECT_NULL) {
		/*
		 * Honor copy-on-write obligations
		 *
		 * The caller is gathering these pages and
		 * might modify their contents.  We need to
		 * make sure that the copy object has its own
		 * private copies of these pages before we let
		 * the caller modify them.
		 *
		 * NOTE: someone else could map the original object
		 * after we've done this copy-on-write here, and they
		 * could then see an inconsistent picture of the memory
		 * while it's being modified via the UPL.  To prevent this,
		 * we would have to block access to these pages until the
		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
		 * code path for that...
		 */
		vm_object_update(object,
		    offset,
		    size,
		    NULL,
		    NULL,
		    FALSE,              /* should_return */
		    MEMORY_OBJECT_COPY_SYNC,
		    VM_PROT_NO_CHANGE);
		VM_PAGEOUT_DEBUG(iopl_cow, 1);
		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
	}
	/*
	 * Fast-path eligibility: a plain, pager-less, non-purgeable object
	 * whose entire size is being requested.  "full" means every page is
	 * already resident; "empty" means none are and all must be grabbed.
	 */
	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
	    object->purgable != VM_PURGABLE_VOLATILE &&
	    object->purgable != VM_PURGABLE_EMPTY &&
	    object->copy == NULL &&
	    size == object->vo_size &&
	    offset == 0 &&
	    object->shadow == NULL &&
	    object->pager == NULL) {
		if (object->resident_page_count == size_in_pages) {
			assert(object != compressor_object);
			assert(object != kernel_object);
			fast_path_full_req = TRUE;
		} else if (object->resident_page_count == 0) {
			assert(object != compressor_object);
			assert(object != kernel_object);
			fast_path_empty_req = TRUE;
			set_cache_attr_needed = TRUE;
		}
	}

	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
		interruptible = THREAD_ABORTSAFE;
	} else {
		interruptible = THREAD_UNINT;
	}

	entry = 0;

	xfer_size = size;
	dst_offset = offset;

	if (fast_path_full_req) {
		if (vm_object_iopl_wire_full(object, upl, user_page_list, lite_list, cntrl_flags, tag) == TRUE) {
			goto finish;
		}
		/*
		 * we couldn't complete the processing of this request on the fast path
		 * so fall through to the slow path and finish up
		 */
	} else if (fast_path_empty_req) {
		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
			ret = KERN_MEMORY_ERROR;
			goto return_err;
		}
		ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);

		if (ret) {
			free_wired_pages = TRUE;
			goto return_err;
		}
		goto finish;
	}

	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.lo_offset = offset;
	fault_info.hi_offset = offset + xfer_size;
	fault_info.mark_zf_absent = TRUE;
	fault_info.interruptible = interruptible;
	fault_info.batch_pmap_op = TRUE;

	/*
	 * Slow path: walk the range one page at a time, faulting in
	 * whatever isn't resident and queueing wire/reference work.
	 */
	while (xfer_size) {
		vm_fault_return_t       result;

		dwp->dw_mask = 0;

		if (fast_path_full_req) {
			/*
			 * if we get here, it means that we ran into a page
			 * state we couldn't handle in the fast path and
			 * bailed out to the slow path... since the order
			 * we look at pages is different between the 2 paths,
			 * the following check is needed to determine whether
			 * this page was already processed in the fast path
			 */
			if (lite_list[entry >> 5] & (1 << (entry & 31))) {
				goto skip_page;
			}
		}
		dst_page = vm_page_lookup(object, dst_offset);

		if (dst_page == VM_PAGE_NULL ||
		    dst_page->vmp_busy ||
		    VMP_ERROR_GET(dst_page) ||
		    dst_page->vmp_restart ||
		    dst_page->vmp_absent ||
		    dst_page->vmp_fictitious) {
			if (object == kernel_object) {
				panic("vm_object_iopl_request: missing/bad page in kernel object");
			}
			if (object == compressor_object) {
				panic("vm_object_iopl_request: missing/bad page in compressor object");
			}

			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
				ret = KERN_MEMORY_ERROR;
				goto return_err;
			}
			set_cache_attr_needed = TRUE;

			/*
			 * We just looked up the page and the result remains valid
			 * until the object lock is release, so send it to
			 * vm_fault_page() (as "dst_page"), to avoid having to
			 * look it up again there.
			 */
			caller_lookup = TRUE;

			do {
				vm_page_t       top_page;
				kern_return_t   error_code;

				fault_info.cluster_size = xfer_size;

				vm_object_paging_begin(object);

				result = vm_fault_page(object, dst_offset,
				    prot | VM_PROT_WRITE, FALSE,
				    caller_lookup,
				    &prot, &dst_page, &top_page,
				    (int *)0,
				    &error_code, no_zero_fill,
				    &fault_info);

				/* our lookup is no longer valid at this point */
				caller_lookup = FALSE;

				switch (result) {
				case VM_FAULT_SUCCESS:
					page_grab_count++;

					if (!dst_page->vmp_absent) {
						PAGE_WAKEUP_DONE(dst_page);
					} else {
						/*
						 * we only get back an absent page if we
						 * requested that it not be zero-filled
						 * because we are about to fill it via I/O
						 *
						 * absent pages should be left BUSY
						 * to prevent them from being faulted
						 * into an address space before we've
						 * had a chance to complete the I/O on
						 * them since they may contain info that
						 * shouldn't be seen by the faulting task
						 */
					}
					/*
					 *	Release paging references and
					 *	top-level placeholder page, if any.
					 */
					if (top_page != VM_PAGE_NULL) {
						vm_object_t local_object;

						local_object = VM_PAGE_OBJECT(top_page);

						/*
						 * comparing 2 packed pointers
						 */
						if (top_page->vmp_object != dst_page->vmp_object) {
							vm_object_lock(local_object);
							VM_PAGE_FREE(top_page);
							vm_object_paging_end(local_object);
							vm_object_unlock(local_object);
						} else {
							VM_PAGE_FREE(top_page);
							vm_object_paging_end(local_object);
						}
					}
					vm_object_paging_end(object);
					break;

				case VM_FAULT_RETRY:
					vm_object_lock(object);
					break;

				case VM_FAULT_MEMORY_SHORTAGE:
					/* advertise how many pages we're waiting for, then block for free pages */
					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

					if (vm_page_wait(interruptible)) {
						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);

						VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
						vm_object_lock(object);

						break;
					}
					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
					OS_FALLTHROUGH;

				case VM_FAULT_INTERRUPTED:
					error_code = MACH_SEND_INTERRUPTED;
					OS_FALLTHROUGH;
				case VM_FAULT_MEMORY_ERROR:
memory_error:
					ret = (error_code ? error_code: KERN_MEMORY_ERROR);

					vm_object_lock(object);
					goto return_err;

				case VM_FAULT_SUCCESS_NO_VM_PAGE:
					/* success but no page: fail */
					vm_object_paging_end(object);
					vm_object_unlock(object);
					goto memory_error;

				default:
					panic("vm_object_iopl_request: unexpected error"
					    " 0x%x from vm_fault_page()\n", result);
				}
			} while (result != VM_FAULT_SUCCESS);
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		if (upl->flags & UPL_KERNEL_OBJECT) {
			goto record_phys_addr;
		}

		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
			dst_page->vmp_busy = TRUE;
			goto record_phys_addr;
		}

		if (dst_page->vmp_cleaning) {
			/*
			 * Someone else is cleaning this page in place.
			 * In theory, we should be able to  proceed and use this
			 * page but they'll probably end up clearing the "busy"
			 * bit on it in upl_commit_range() but they didn't set
			 * it, so they would clear our "busy" bit and open
			 * us to race conditions.
			 * We'd better wait for the cleaning to complete and
			 * then try again.
			 */
			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
			PAGE_SLEEP(object, dst_page, THREAD_UNINT);
			continue;
		}
		if (dst_page->vmp_laundry) {
			vm_pageout_steal_laundry(dst_page, FALSE);
		}

		if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
		    phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
			vm_page_t       low_page;
			int             refmod;

			/*
			 * support devices that can't DMA above 32 bits
			 * by substituting pages from a pool of low address
			 * memory for any pages we find above the 4G mark
			 * can't substitute if the page is already wired because
			 * we don't know whether that physical address has been
			 * handed out to some other 64 bit capable DMA device to use
			 */
			if (VM_PAGE_WIRED(dst_page)) {
				ret = KERN_PROTECTION_FAILURE;
				goto return_err;
			}
			low_page = vm_page_grablo();

			if (low_page == VM_PAGE_NULL) {
				ret = KERN_RESOURCE_SHORTAGE;
				goto return_err;
			}
			/*
			 * from here until the vm_page_replace completes
			 * we mustn't drop the object lock... we don't
			 * want anyone refaulting this page in and using
			 * it after we disconnect it... we want the fault
			 * to find the new page being substituted.
			 */
			if (dst_page->vmp_pmapped) {
				refmod = pmap_disconnect(phys_page);
			} else {
				refmod = 0;
			}

			if (!dst_page->vmp_absent) {
				vm_page_copy(dst_page, low_page);
			}

			/* carry the old page's state (including pmap ref/mod) over to the substitute */
			low_page->vmp_reference = dst_page->vmp_reference;
			low_page->vmp_dirty     = dst_page->vmp_dirty;
			low_page->vmp_absent    = dst_page->vmp_absent;

			if (refmod & VM_MEM_REFERENCED) {
				low_page->vmp_reference = TRUE;
			}
			if (refmod & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(low_page, FALSE);
			}

			vm_page_replace(low_page, object, dst_offset);

			dst_page = low_page;
			/*
			 * vm_page_grablo returned the page marked
			 * BUSY... we don't need a PAGE_WAKEUP_DONE
			 * here, because we've never dropped the object lock
			 */
			if (!dst_page->vmp_absent) {
				dst_page->vmp_busy = FALSE;
			}

			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
		}
		if (!dst_page->vmp_busy) {
			dwp->dw_mask |= DW_vm_page_wire;
		}

		if (cntrl_flags & UPL_BLOCK_ACCESS) {
			/*
			 * Mark the page "busy" to block any future page fault
			 * on this page in addition to wiring it.
			 * We'll also remove the mapping
			 * of all these pages before leaving this routine.
			 */
			assert(!dst_page->vmp_fictitious);
			dst_page->vmp_busy = TRUE;
		}
		/*
		 * expect the page to be used
		 * page queues lock must be held to set 'reference'
		 */
		dwp->dw_mask |= DW_set_reference;

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			SET_PAGE_DIRTY(dst_page, TRUE);
			/*
			 * Page belonging to a code-signed object is about to
			 * be written. Mark it tainted and disconnect it from
			 * all pmaps so processes have to fault it back in and
			 * deal with the tainted bit.
			 */
			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
				vm_page_iopl_tainted++;
				if (dst_page->vmp_pmapped) {
					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
					if (refmod & VM_MEM_REFERENCED) {
						dst_page->vmp_reference = TRUE;
					}
				}
			}
		}
		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
			pmap_sync_page_attributes_phys(phys_page);
			dst_page->vmp_written_by_kernel = FALSE;
		}

record_phys_addr:
		if (dst_page->vmp_busy) {
			upl->flags |= UPL_HAS_BUSY;
		}

		/* mark this page as covered by the UPL's lite bitmap */
		lite_list[entry >> 5] |= 1U << (entry & 31);

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
			user_page_list[entry].absent    = dst_page->vmp_absent;
			user_page_list[entry].dirty     = dst_page->vmp_dirty;
			user_page_list[entry].precious  = dst_page->vmp_precious;
			user_page_list[entry].device    = FALSE;
			user_page_list[entry].needed    = FALSE;
			if (dst_page->vmp_clustered == TRUE) {
				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
			} else {
				user_page_list[entry].speculative = FALSE;
			}
			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
			user_page_list[entry].mark      = FALSE;
		}
		if (object != kernel_object && object != compressor_object) {
			/*
			 * someone is explicitly grabbing this page...
			 * update clustered and speculative state
			 *
			 */
			if (dst_page->vmp_clustered) {
				VM_PAGE_CONSUME_CLUSTERED(dst_page);
			}
		}
skip_page:
		entry++;
		dst_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;

		if (dwp->dw_mask) {
			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);

			if (dw_count >= dw_limit) {
				/* batch is full: apply the queued wire/reference work now */
				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);

				dwp = dwp_start;
				dw_count = 0;
			}
		}
	}
	assert(entry == size_in_pages);

	if (dw_count) {
		/* flush any remaining queued page work */
		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}
finish:
	if (user_page_list && set_cache_attr_needed == TRUE) {
		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
	}

	if (page_list_count != NULL) {
		if (upl->flags & UPL_INTERNAL) {
			*page_list_count = 0;
		} else if (*page_list_count > size_in_pages) {
			*page_list_count = size_in_pages;
		}
	}
	vm_object_unlock(object);

	if (cntrl_flags & UPL_BLOCK_ACCESS) {
		/*
		 * We've marked all the pages "busy" so that future
		 * page faults will block.
		 * Now remove the mapping for these pages, so that they
		 * can't be accessed without causing a page fault.
		 */
		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
		    PMAP_NULL,
		    PAGE_SIZE,
		    0, VM_PROT_NONE);
		assert(!object->blocked_access);
		object->blocked_access = TRUE;
	}

	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return KERN_SUCCESS;

return_err:
	/*
	 * Error unwind: walk back over the pages processed so far and
	 * undo the wiring (or free them if they were grabbed for us).
	 */
	dw_index = 0;

	for (; offset < dst_offset; offset += PAGE_SIZE) {
		boolean_t need_unwire;

		dst_page = vm_page_lookup(object, offset);

		if (dst_page == VM_PAGE_NULL) {
			panic("vm_object_iopl_request: Wired page missing.");
		}

		/*
		 * if we've already processed this page in an earlier
		 * dw_do_work, we need to undo the wiring... we will
		 * leave the dirty and reference bits on if they
		 * were set, since we don't have a good way of knowing
		 * what the previous state was and we won't get here
		 * under any normal circumstances...  we will always
		 * clear BUSY and wakeup any waiters via vm_page_free
		 * or PAGE_WAKEUP_DONE
		 */
		need_unwire = TRUE;

		if (dw_count) {
			if ((dwp_start)[dw_index].dw_m == dst_page) {
				/*
				 * still in the deferred work list
				 * which means we haven't yet called
				 * vm_page_wire on this page
				 */
				need_unwire = FALSE;

				dw_index++;
				dw_count--;
			}
		}
		vm_page_lock_queues();

		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
			vm_page_free(dst_page);

			need_unwire = FALSE;
		} else {
			if (need_unwire == TRUE) {
				vm_page_unwire(dst_page, TRUE);
			}

			PAGE_WAKEUP_DONE(dst_page);
		}
		vm_page_unlock_queues();

		if (need_unwire == TRUE) {
			counter_inc(&vm_statistics_reactivations);
		}
	}
#if UPL_DEBUG
	upl->upl_state = 2;
#endif
	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
	}
	vm_object_unlock(object);
	upl_destroy(upl);

	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}
	return ret;
}
9958 
9959 kern_return_t
upl_transpose(upl_t upl1,upl_t upl2)9960 upl_transpose(
9961 	upl_t           upl1,
9962 	upl_t           upl2)
9963 {
9964 	kern_return_t           retval;
9965 	boolean_t               upls_locked;
9966 	vm_object_t             object1, object2;
9967 
9968 	/* LD: Should mapped UPLs be eligible for a transpose? */
9969 	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
9970 		return KERN_INVALID_ARGUMENT;
9971 	}
9972 
9973 	upls_locked = FALSE;
9974 
9975 	/*
9976 	 * Since we need to lock both UPLs at the same time,
9977 	 * avoid deadlocks by always taking locks in the same order.
9978 	 */
9979 	if (upl1 < upl2) {
9980 		upl_lock(upl1);
9981 		upl_lock(upl2);
9982 	} else {
9983 		upl_lock(upl2);
9984 		upl_lock(upl1);
9985 	}
9986 	upls_locked = TRUE;     /* the UPLs will need to be unlocked */
9987 
9988 	object1 = upl1->map_object;
9989 	object2 = upl2->map_object;
9990 
9991 	if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
9992 	    upl1->u_size != upl2->u_size) {
9993 		/*
9994 		 * We deal only with full objects, not subsets.
9995 		 * That's because we exchange the entire backing store info
9996 		 * for the objects: pager, resident pages, etc...  We can't do
9997 		 * only part of it.
9998 		 */
9999 		retval = KERN_INVALID_VALUE;
10000 		goto done;
10001 	}
10002 
10003 	/*
10004 	 * Tranpose the VM objects' backing store.
10005 	 */
10006 	retval = vm_object_transpose(object1, object2,
10007 	    upl_adjusted_size(upl1, PAGE_MASK));
10008 
10009 	if (retval == KERN_SUCCESS) {
10010 		/*
10011 		 * Make each UPL point to the correct VM object, i.e. the
10012 		 * object holding the pages that the UPL refers to...
10013 		 */
10014 #if CONFIG_IOSCHED || UPL_DEBUG
10015 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10016 			vm_object_lock(object1);
10017 			vm_object_lock(object2);
10018 		}
10019 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10020 			queue_remove(&object1->uplq, upl1, upl_t, uplq);
10021 		}
10022 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10023 			queue_remove(&object2->uplq, upl2, upl_t, uplq);
10024 		}
10025 #endif
10026 		upl1->map_object = object2;
10027 		upl2->map_object = object1;
10028 
10029 #if CONFIG_IOSCHED || UPL_DEBUG
10030 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10031 			queue_enter(&object2->uplq, upl1, upl_t, uplq);
10032 		}
10033 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10034 			queue_enter(&object1->uplq, upl2, upl_t, uplq);
10035 		}
10036 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10037 			vm_object_unlock(object2);
10038 			vm_object_unlock(object1);
10039 		}
10040 #endif
10041 	}
10042 
10043 done:
10044 	/*
10045 	 * Cleanup.
10046 	 */
10047 	if (upls_locked) {
10048 		upl_unlock(upl1);
10049 		upl_unlock(upl2);
10050 		upls_locked = FALSE;
10051 	}
10052 
10053 	return retval;
10054 }
10055 
10056 void
upl_range_needed(upl_t upl,int index,int count)10057 upl_range_needed(
10058 	upl_t           upl,
10059 	int             index,
10060 	int             count)
10061 {
10062 	upl_page_info_t *user_page_list;
10063 	int             size_in_pages;
10064 
10065 	if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
10066 		return;
10067 	}
10068 
10069 	size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
10070 
10071 	user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
10072 
10073 	while (count-- && index < size_in_pages) {
10074 		user_page_list[index++].needed = TRUE;
10075 	}
10076 }
10077 
10078 
10079 /*
10080  * Reserve of virtual addresses in the kernel address space.
10081  * We need to map the physical pages in the kernel, so that we
10082  * can call the code-signing or slide routines with a kernel
10083  * virtual address.  We keep this pool of pre-allocated kernel
10084  * virtual addresses so that we don't have to scan the kernel's
 * virtual address space each time we need to work with
10086  * a physical page.
10087  */
/* Lock protecting the paging-window pool state below. */
SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
/* Number of kernel virtual pages reserved for the pool. */
#define VM_PAGING_NUM_PAGES     64
/* Base kernel VA of the reserved window; set once at startup. */
SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
/* Per-slot in-use flags for the reserved pages. */
bool            vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
/* High-water mark of slots ever handed out. */
int             vm_paging_max_index = 0;
/* Threads currently waiting for a free slot / total waits observed. */
int             vm_paging_page_waiter = 0;
int             vm_paging_page_waiter_total = 0;

/* Statistics: pool misses and fast/slow mapping counts. */
unsigned long   vm_paging_no_kernel_page = 0;
unsigned long   vm_paging_objects_mapped = 0;
unsigned long   vm_paging_pages_mapped = 0;
unsigned long   vm_paging_objects_mapped_slow = 0;
unsigned long   vm_paging_pages_mapped_slow = 0;
10101 
__startup_func
static void
vm_paging_map_init(void)
{
	/*
	 * Carve out the pageable kernel VA reserve used by
	 * vm_paging_map_object(); KMA_NOFAIL means this cannot fail.
	 */
	kmem_alloc(kernel_map, &vm_paging_base_address,
	    ptoa(VM_PAGING_NUM_PAGES),
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
	    VM_KERN_MEMORY_NONE);
}
STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
10112 
/*
 * vm_paging_map_object:
 *	Maps part of a VM object's pages in the kernel
 *      virtual address space, using the pre-allocated
 *	kernel virtual addresses, if possible.
 * Context:
 *      The VM object is locked.  This lock will get
 *      dropped and re-acquired though, so the caller
 *      must make sure the VM object is kept alive
 *	(by holding a VM map that has a reference
 *      on it, for example, or taking an extra reference).
 *      The page should also be kept busy to prevent
 *	it from being reclaimed.
 *
 * Returns KERN_SUCCESS with *address, *size and *need_unmap filled in,
 * KERN_NOT_SUPPORTED when a mapping would require unlocking the object
 * and can_unlock_object is FALSE, or the vm_map_enter() error code.
 */
kern_return_t
vm_paging_map_object(
	vm_page_t               page,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection,
	boolean_t               can_unlock_object,
	vm_map_size_t           *size,          /* IN/OUT */
	vm_map_offset_t         *address,       /* OUT */
	boolean_t               *need_unmap)    /* OUT */
{
	kern_return_t           kr;
	vm_map_offset_t         page_map_offset;
	vm_map_size_t           map_size;
	vm_object_offset_t      object_offset;
	int                     i;

	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
		/* use permanent 1-to-1 kernel mapping of physical memory ? */
		*address = (vm_map_offset_t)
		    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
		*need_unmap = FALSE;
		return KERN_SUCCESS;

		/*
		 * NOTE(review): everything below in this block is unreachable
		 * after the early return above.  It looks like the fallback
		 * (pre-allocated VA pool) path for configurations without a
		 * physmap, retained via conditional compilation -- confirm
		 * against the unpreprocessed source before touching it.
		 */
		assert(page->vmp_busy);
		/*
		 * Use one of the pre-allocated kernel virtual addresses
		 * and just enter the VM page in the kernel address space
		 * at that virtual address.
		 */
		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);

		/*
		 * Try and find an available kernel virtual address
		 * from our pre-allocated pool.
		 */
		page_map_offset = 0;
		for (;;) {
			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
				if (vm_paging_page_inuse[i] == FALSE) {
					page_map_offset =
					    vm_paging_base_address +
					    (i * PAGE_SIZE);
					break;
				}
			}
			if (page_map_offset != 0) {
				/* found a space to map our page ! */
				break;
			}

			if (can_unlock_object) {
				/*
				 * If we can afford to unlock the VM object,
				 * let's take the slow path now...
				 */
				break;
			}
			/*
			 * We can't afford to unlock the VM object, so
			 * let's wait for a space to become available...
			 */
			vm_paging_page_waiter_total++;
			vm_paging_page_waiter++;
			kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
			if (kr == THREAD_WAITING) {
				simple_unlock(&vm_paging_lock);
				kr = thread_block(THREAD_CONTINUE_NULL);
				simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
			}
			vm_paging_page_waiter--;
			/* ... and try again */
		}

		if (page_map_offset != 0) {
			/*
			 * We found a kernel virtual address;
			 * map the physical page to that virtual address.
			 */
			if (i > vm_paging_max_index) {
				vm_paging_max_index = i;
			}
			vm_paging_page_inuse[i] = TRUE;
			simple_unlock(&vm_paging_lock);

			page->vmp_pmapped = TRUE;

			/*
			 * Keep the VM object locked over the PMAP_ENTER
			 * and the actual use of the page by the kernel,
			 * or this pmap mapping might get undone by a
			 * vm_object_pmap_protect() call...
			 */
			PMAP_ENTER(kernel_pmap,
			    page_map_offset,
			    page,
			    protection,
			    VM_PROT_NONE,
			    0,
			    TRUE,
			    kr);
			assert(kr == KERN_SUCCESS);
			vm_paging_objects_mapped++;
			vm_paging_pages_mapped++;
			*address = page_map_offset;
			*need_unmap = TRUE;

#if KASAN
			kasan_notify_address(page_map_offset, PAGE_SIZE);
#endif

			/* all done and mapped, ready to use ! */
			return KERN_SUCCESS;
		}

		/*
		 * We ran out of pre-allocated kernel virtual
		 * addresses.  Just map the page in the kernel
		 * the slow and regular way.
		 */
		vm_paging_no_kernel_page++;
		simple_unlock(&vm_paging_lock);
	}

	/* slow path needs to drop the object lock across vm_map_enter() */
	if (!can_unlock_object) {
		*address = 0;
		*size = 0;
		*need_unmap = FALSE;
		return KERN_NOT_SUPPORTED;
	}

	object_offset = vm_object_trunc_page(offset);
	map_size = vm_map_round_page(*size,
	    VM_MAP_PAGE_MASK(kernel_map));

	/*
	 * Try and map the required range of the object
	 * in the kernel_map. Given that allocation is
	 * for pageable memory, it shouldn't contain
	 * pointers and is mapped into the data range.
	 */

	vm_object_reference_locked(object);     /* for the map entry */
	vm_object_unlock(object);

	kr = vm_map_enter(kernel_map,
	    address,
	    map_size,
	    0,
	    VM_FLAGS_ANYWHERE,
	    VM_MAP_KERNEL_FLAGS_DATA,
	    VM_KERN_MEMORY_NONE,
	    object,
	    object_offset,
	    FALSE,
	    protection,
	    VM_PROT_ALL,
	    VM_INHERIT_NONE);
	if (kr != KERN_SUCCESS) {
		*address = 0;
		*size = 0;
		*need_unmap = FALSE;
		vm_object_deallocate(object);   /* for the map entry */
		vm_object_lock(object);
		return kr;
	}

	*size = map_size;

	/*
	 * Enter the mapped pages in the page table now.
	 */
	vm_object_lock(object);
	/*
	 * VM object must be kept locked from before PMAP_ENTER()
	 * until after the kernel is done accessing the page(s).
	 * Otherwise, the pmap mappings in the kernel could be
	 * undone by a call to vm_object_pmap_protect().
	 */

	for (page_map_offset = 0;
	    map_size != 0;
	    map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
		page = vm_page_lookup(object, offset + page_map_offset);
		if (page == VM_PAGE_NULL) {
			/* a page vanished: tear down the whole mapping */
			printf("vm_paging_map_object: no page !?");
			vm_object_unlock(object);
			vm_map_remove(kernel_map, *address, *size);
			*address = 0;
			*size = 0;
			*need_unmap = FALSE;
			vm_object_lock(object);
			return KERN_MEMORY_ERROR;
		}
		page->vmp_pmapped = TRUE;

		PMAP_ENTER(kernel_pmap,
		    *address + page_map_offset,
		    page,
		    protection,
		    VM_PROT_NONE,
		    0,
		    TRUE,
		    kr);
		assert(kr == KERN_SUCCESS);
#if KASAN
		kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
#endif
	}

	vm_paging_objects_mapped_slow++;
	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);

	*need_unmap = TRUE;

	return KERN_SUCCESS;
}
10344 
10345 /*
10346  * vm_paging_unmap_object:
10347  *	Unmaps part of a VM object's pages from the kernel
10348  *      virtual address space.
10349  * Context:
10350  *      The VM object is locked.  This lock will get
10351  *      dropped and re-acquired though.
10352  */
10353 void
vm_paging_unmap_object(vm_object_t object,vm_map_offset_t start,vm_map_offset_t end)10354 vm_paging_unmap_object(
10355 	vm_object_t     object,
10356 	vm_map_offset_t start,
10357 	vm_map_offset_t end)
10358 {
10359 	int             i;
10360 
10361 	if ((vm_paging_base_address == 0) ||
10362 	    (start < vm_paging_base_address) ||
10363 	    (end > (vm_paging_base_address
10364 	    + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
10365 		/*
10366 		 * We didn't use our pre-allocated pool of
10367 		 * kernel virtual address.  Deallocate the
10368 		 * virtual memory.
10369 		 */
10370 		if (object != VM_OBJECT_NULL) {
10371 			vm_object_unlock(object);
10372 		}
10373 		vm_map_remove(kernel_map, start, end);
10374 		if (object != VM_OBJECT_NULL) {
10375 			vm_object_lock(object);
10376 		}
10377 	} else {
10378 		/*
10379 		 * We used a kernel virtual address from our
10380 		 * pre-allocated pool.  Put it back in the pool
10381 		 * for next time.
10382 		 */
10383 		assert(end - start == PAGE_SIZE);
10384 		i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
10385 		assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
10386 
10387 		/* undo the pmap mapping */
10388 		pmap_remove(kernel_pmap, start, end);
10389 
10390 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10391 		vm_paging_page_inuse[i] = FALSE;
10392 		if (vm_paging_page_waiter) {
10393 			thread_wakeup(&vm_paging_page_waiter);
10394 		}
10395 		simple_unlock(&vm_paging_lock);
10396 	}
10397 }
10398 
10399 
10400 /*
10401  * page->vmp_object must be locked
10402  */
10403 void
vm_pageout_steal_laundry(vm_page_t page,boolean_t queues_locked)10404 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10405 {
10406 	if (!queues_locked) {
10407 		vm_page_lockspin_queues();
10408 	}
10409 
10410 	page->vmp_free_when_done = FALSE;
10411 	/*
10412 	 * need to drop the laundry count...
10413 	 * we may also need to remove it
10414 	 * from the I/O paging queue...
10415 	 * vm_pageout_throttle_up handles both cases
10416 	 *
10417 	 * the laundry and pageout_queue flags are cleared...
10418 	 */
10419 	vm_pageout_throttle_up(page);
10420 
10421 	if (!queues_locked) {
10422 		vm_page_unlock_queues();
10423 	}
10424 }
10425 
10426 upl_t
vector_upl_create(vm_offset_t upl_offset)10427 vector_upl_create(vm_offset_t upl_offset)
10428 {
10429 	int i = 0;
10430 	upl_t   upl;
10431 	vector_upl_t vector_upl = kalloc_type(struct _vector_upl, Z_WAITOK);
10432 
10433 	upl = upl_create(0, UPL_VECTOR, 0);
10434 	upl->vector_upl = vector_upl;
10435 	upl->u_offset = upl_offset;
10436 	vector_upl->size = 0;
10437 	vector_upl->offset = upl_offset;
10438 	vector_upl->invalid_upls = 0;
10439 	vector_upl->num_upls = 0;
10440 	vector_upl->pagelist = NULL;
10441 
10442 	for (i = 0; i < MAX_VECTOR_UPL_ELEMENTS; i++) {
10443 		vector_upl->upl_iostates[i].size = 0;
10444 		vector_upl->upl_iostates[i].offset = 0;
10445 	}
10446 	return upl;
10447 }
10448 
10449 void
vector_upl_deallocate(upl_t upl)10450 vector_upl_deallocate(upl_t upl)
10451 {
10452 	if (upl) {
10453 		vector_upl_t vector_upl = upl->vector_upl;
10454 		if (vector_upl) {
10455 			if (vector_upl->invalid_upls != vector_upl->num_upls) {
10456 				panic("Deallocating non-empty Vectored UPL");
10457 			}
10458 			kfree_data(vector_upl->pagelist, sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE));
10459 			vector_upl->invalid_upls = 0;
10460 			vector_upl->num_upls = 0;
10461 			vector_upl->pagelist = NULL;
10462 			vector_upl->size = 0;
10463 			vector_upl->offset = 0;
10464 			kfree_type(struct _vector_upl, vector_upl);
10465 			vector_upl = (vector_upl_t)0xfeedfeed;
10466 		} else {
10467 			panic("vector_upl_deallocate was passed a non-vectored upl");
10468 		}
10469 	} else {
10470 		panic("vector_upl_deallocate was passed a NULL upl");
10471 	}
10472 }
10473 
10474 boolean_t
vector_upl_is_valid(upl_t upl)10475 vector_upl_is_valid(upl_t upl)
10476 {
10477 	if (upl && ((upl->flags & UPL_VECTOR) == UPL_VECTOR)) {
10478 		vector_upl_t vector_upl = upl->vector_upl;
10479 		if (vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef) {
10480 			return FALSE;
10481 		} else {
10482 			return TRUE;
10483 		}
10484 	}
10485 	return FALSE;
10486 }
10487 
10488 boolean_t
vector_upl_set_subupl(upl_t upl,upl_t subupl,uint32_t io_size)10489 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
10490 {
10491 	if (vector_upl_is_valid(upl)) {
10492 		vector_upl_t vector_upl = upl->vector_upl;
10493 
10494 		if (vector_upl) {
10495 			if (subupl) {
10496 				if (io_size) {
10497 					if (io_size < PAGE_SIZE) {
10498 						io_size = PAGE_SIZE;
10499 					}
10500 					subupl->vector_upl = (void*)vector_upl;
10501 					vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
10502 					vector_upl->size += io_size;
10503 					upl->u_size += io_size;
10504 				} else {
10505 					uint32_t i = 0, invalid_upls = 0;
10506 					for (i = 0; i < vector_upl->num_upls; i++) {
10507 						if (vector_upl->upl_elems[i] == subupl) {
10508 							break;
10509 						}
10510 					}
10511 					if (i == vector_upl->num_upls) {
10512 						panic("Trying to remove sub-upl when none exists");
10513 					}
10514 
10515 					vector_upl->upl_elems[i] = NULL;
10516 					invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
10517 					    relaxed);
10518 					if (invalid_upls == vector_upl->num_upls) {
10519 						return TRUE;
10520 					} else {
10521 						return FALSE;
10522 					}
10523 				}
10524 			} else {
10525 				panic("vector_upl_set_subupl was passed a NULL upl element");
10526 			}
10527 		} else {
10528 			panic("vector_upl_set_subupl was passed a non-vectored upl");
10529 		}
10530 	} else {
10531 		panic("vector_upl_set_subupl was passed a NULL upl");
10532 	}
10533 
10534 	return FALSE;
10535 }
10536 
10537 void
vector_upl_set_pagelist(upl_t upl)10538 vector_upl_set_pagelist(upl_t upl)
10539 {
10540 	if (vector_upl_is_valid(upl)) {
10541 		uint32_t i = 0;
10542 		vector_upl_t vector_upl = upl->vector_upl;
10543 
10544 		if (vector_upl) {
10545 			vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
10546 
10547 			vector_upl->pagelist = kalloc_data(sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE), Z_WAITOK);
10548 
10549 			for (i = 0; i < vector_upl->num_upls; i++) {
10550 				cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upl_elems[i], PAGE_MASK) / PAGE_SIZE;
10551 				bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10552 				pagelist_size += cur_upl_pagelist_size;
10553 				if (vector_upl->upl_elems[i]->highest_page > upl->highest_page) {
10554 					upl->highest_page = vector_upl->upl_elems[i]->highest_page;
10555 				}
10556 			}
10557 			assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10558 		} else {
10559 			panic("vector_upl_set_pagelist was passed a non-vectored upl");
10560 		}
10561 	} else {
10562 		panic("vector_upl_set_pagelist was passed a NULL upl");
10563 	}
10564 }
10565 
10566 upl_t
vector_upl_subupl_byindex(upl_t upl,uint32_t index)10567 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10568 {
10569 	if (vector_upl_is_valid(upl)) {
10570 		vector_upl_t vector_upl = upl->vector_upl;
10571 		if (vector_upl) {
10572 			if (index < vector_upl->num_upls) {
10573 				return vector_upl->upl_elems[index];
10574 			}
10575 		} else {
10576 			panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10577 		}
10578 	}
10579 	return NULL;
10580 }
10581 
/*
 * Given an offset/size pair within a vectored UPL, find the sub-UPL
 * whose recorded iostate covers the offset and translate *upl_offset /
 * *upl_size into that sub-UPL's coordinate space.  Returns NULL when
 * the matching element was already committed/aborted or no element
 * matches.
 */
upl_t
vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
{
	if (vector_upl_is_valid(upl)) {
		uint32_t i = 0;
		vector_upl_t vector_upl = upl->vector_upl;

		if (vector_upl) {
			upl_t subupl = NULL;
			vector_upl_iostates_t subupl_state;

			for (i = 0; i < vector_upl->num_upls; i++) {
				subupl = vector_upl->upl_elems[i];
				subupl_state = vector_upl->upl_iostates[i];
				if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
					/* We could have been passed an offset/size pair that belongs
					 * to an UPL element that has already been committed/aborted.
					 * If so, return NULL.
					 */
					if (subupl == NULL) {
						return NULL;
					}
					/* clip *upl_size so it does not run past this element's iostate */
					if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
						if (*upl_size > subupl_state.size) {
							*upl_size = subupl_state.size;
						}
					}
					/* rebase *upl_offset relative to this element's start */
					if (*upl_offset >= subupl_state.offset) {
						*upl_offset -= subupl_state.offset;
					} else if (i) {
						panic("Vector UPL offset miscalculation");
					}
					return subupl;
				}
			}
		} else {
			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
		}
	}
	return NULL;
}
10624 
10625 void
vector_upl_get_submap(upl_t upl,vm_map_t * v_upl_submap,vm_offset_t * submap_dst_addr)10626 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10627 {
10628 	*v_upl_submap = NULL;
10629 
10630 	if (vector_upl_is_valid(upl)) {
10631 		vector_upl_t vector_upl = upl->vector_upl;
10632 		if (vector_upl) {
10633 			*v_upl_submap = vector_upl->submap;
10634 			*submap_dst_addr = vector_upl->submap_dst_addr;
10635 		} else {
10636 			panic("vector_upl_get_submap was passed a non-vectored UPL");
10637 		}
10638 	} else {
10639 		panic("vector_upl_get_submap was passed a null UPL");
10640 	}
10641 }
10642 
10643 void
vector_upl_set_submap(upl_t upl,vm_map_t submap,vm_offset_t submap_dst_addr)10644 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10645 {
10646 	if (vector_upl_is_valid(upl)) {
10647 		vector_upl_t vector_upl = upl->vector_upl;
10648 		if (vector_upl) {
10649 			vector_upl->submap = submap;
10650 			vector_upl->submap_dst_addr = submap_dst_addr;
10651 		} else {
10652 			panic("vector_upl_get_submap was passed a non-vectored UPL");
10653 		}
10654 	} else {
10655 		panic("vector_upl_get_submap was passed a NULL UPL");
10656 	}
10657 }
10658 
10659 void
vector_upl_set_iostate(upl_t upl,upl_t subupl,upl_offset_t offset,upl_size_t size)10660 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10661 {
10662 	if (vector_upl_is_valid(upl)) {
10663 		uint32_t i = 0;
10664 		vector_upl_t vector_upl = upl->vector_upl;
10665 
10666 		if (vector_upl) {
10667 			for (i = 0; i < vector_upl->num_upls; i++) {
10668 				if (vector_upl->upl_elems[i] == subupl) {
10669 					break;
10670 				}
10671 			}
10672 
10673 			if (i == vector_upl->num_upls) {
10674 				panic("setting sub-upl iostate when none exists");
10675 			}
10676 
10677 			vector_upl->upl_iostates[i].offset = offset;
10678 			if (size < PAGE_SIZE) {
10679 				size = PAGE_SIZE;
10680 			}
10681 			vector_upl->upl_iostates[i].size = size;
10682 		} else {
10683 			panic("vector_upl_set_iostate was passed a non-vectored UPL");
10684 		}
10685 	} else {
10686 		panic("vector_upl_set_iostate was passed a NULL UPL");
10687 	}
10688 }
10689 
10690 void
vector_upl_get_iostate(upl_t upl,upl_t subupl,upl_offset_t * offset,upl_size_t * size)10691 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10692 {
10693 	if (vector_upl_is_valid(upl)) {
10694 		uint32_t i = 0;
10695 		vector_upl_t vector_upl = upl->vector_upl;
10696 
10697 		if (vector_upl) {
10698 			for (i = 0; i < vector_upl->num_upls; i++) {
10699 				if (vector_upl->upl_elems[i] == subupl) {
10700 					break;
10701 				}
10702 			}
10703 
10704 			if (i == vector_upl->num_upls) {
10705 				panic("getting sub-upl iostate when none exists");
10706 			}
10707 
10708 			*offset = vector_upl->upl_iostates[i].offset;
10709 			*size = vector_upl->upl_iostates[i].size;
10710 		} else {
10711 			panic("vector_upl_get_iostate was passed a non-vectored UPL");
10712 		}
10713 	} else {
10714 		panic("vector_upl_get_iostate was passed a NULL UPL");
10715 	}
10716 }
10717 
10718 void
vector_upl_get_iostate_byindex(upl_t upl,uint32_t index,upl_offset_t * offset,upl_size_t * size)10719 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10720 {
10721 	if (vector_upl_is_valid(upl)) {
10722 		vector_upl_t vector_upl = upl->vector_upl;
10723 		if (vector_upl) {
10724 			if (index < vector_upl->num_upls) {
10725 				*offset = vector_upl->upl_iostates[index].offset;
10726 				*size = vector_upl->upl_iostates[index].size;
10727 			} else {
10728 				*offset = *size = 0;
10729 			}
10730 		} else {
10731 			panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
10732 		}
10733 	} else {
10734 		panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10735 	}
10736 }
10737 
/* Return the aggregated page list built by vector_upl_set_pagelist(). */
upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)
{
	return ((vector_upl_t)(upl->vector_upl))->pagelist;
}
10743 
/* Return the UPL's raw vector_upl descriptor pointer (unvalidated). */
void *
upl_get_internal_vectorupl(upl_t upl)
{
	return upl->vector_upl;
}
10749 
/*
 * Byte offset from the start of a struct upl to its internal page list
 * (the list is laid out immediately after the upl structure).
 */
vm_size_t
upl_get_internal_pagelist_offset(void)
{
	return sizeof(struct upl);
}
10755 
10756 void
upl_clear_dirty(upl_t upl,boolean_t value)10757 upl_clear_dirty(
10758 	upl_t           upl,
10759 	boolean_t       value)
10760 {
10761 	if (value) {
10762 		upl->flags |= UPL_CLEAR_DIRTY;
10763 	} else {
10764 		upl->flags &= ~UPL_CLEAR_DIRTY;
10765 	}
10766 }
10767 
10768 void
upl_set_referenced(upl_t upl,boolean_t value)10769 upl_set_referenced(
10770 	upl_t           upl,
10771 	boolean_t       value)
10772 {
10773 	upl_lock(upl);
10774 	if (value) {
10775 		upl->ext_ref_count++;
10776 	} else {
10777 		if (!upl->ext_ref_count) {
10778 			panic("upl_set_referenced not %p", upl);
10779 		}
10780 		upl->ext_ref_count--;
10781 	}
10782 	upl_unlock(upl);
10783 }
10784 
#if CONFIG_IOSCHED
/*
 * Record the block number and I/O size for every page of a UPL
 * sub-range (via UPL_SET_REPRIO_INFO).  No-op unless the UPL was
 * created with expedite support.
 */
void
upl_set_blkno(
	upl_t           upl,
	vm_offset_t     upl_offset,
	int             io_size,
	int64_t         blkno)
{
	int i, j;
	if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
		return;
	}

	assert(upl->upl_reprio_info != 0);
	/* i: page index within the UPL; j: bytes covered so far */
	for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
		UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
	}
}
#endif
10804 
10805 void inline
memoryshot(unsigned int event,unsigned int control)10806 memoryshot(unsigned int event, unsigned int control)
10807 {
10808 	if (vm_debug_events) {
10809 		KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10810 		    vm_page_active_count, vm_page_inactive_count,
10811 		    vm_page_free_count, vm_page_speculative_count,
10812 		    vm_page_throttled_count);
10813 	} else {
10814 		(void) event;
10815 		(void) control;
10816 	}
10817 }
10818 
10819 #ifdef MACH_BSD
10820 
/* Wrapper: TRUE iff the page list describes device memory (UPL_DEVICE_PAGE). */
boolean_t
upl_device_page(upl_page_info_t *upl)
{
	return UPL_DEVICE_PAGE(upl);
}
/* Wrapper: TRUE iff page "index" is present (UPL_PAGE_PRESENT). */
boolean_t
upl_page_present(upl_page_info_t *upl, int index)
{
	return UPL_PAGE_PRESENT(upl, index);
}
/* Wrapper: TRUE iff page "index" is speculative (UPL_SPECULATIVE_PAGE). */
boolean_t
upl_speculative_page(upl_page_info_t *upl, int index)
{
	return UPL_SPECULATIVE_PAGE(upl, index);
}
/* Wrapper: TRUE iff page "index" is dirty (UPL_DIRTY_PAGE). */
boolean_t
upl_dirty_page(upl_page_info_t *upl, int index)
{
	return UPL_DIRTY_PAGE(upl, index);
}
/* Wrapper: TRUE iff page "index" is valid (UPL_VALID_PAGE). */
boolean_t
upl_valid_page(upl_page_info_t *upl, int index)
{
	return UPL_VALID_PAGE(upl, index);
}
/* Wrapper: physical page number of page "index" (UPL_PHYS_PAGE). */
ppnum_t
upl_phys_page(upl_page_info_t *upl, int index)
{
	return UPL_PHYS_PAGE(upl, index);
}
10851 
/* Set the "mark" bit of page "index" in the page list to v. */
void
upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
{
	upl[index].mark = v;
}
10857 
/* Read the "mark" bit of page "index" in the page list. */
boolean_t
upl_page_get_mark(upl_page_info_t *upl, int index)
{
	return upl[index].mark;
}
10863 
10864 void
vm_countdirtypages(void)10865 vm_countdirtypages(void)
10866 {
10867 	vm_page_t m;
10868 	int dpages;
10869 	int pgopages;
10870 	int precpages;
10871 
10872 
10873 	dpages = 0;
10874 	pgopages = 0;
10875 	precpages = 0;
10876 
10877 	vm_page_lock_queues();
10878 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10879 	do {
10880 		if (m == (vm_page_t)0) {
10881 			break;
10882 		}
10883 
10884 		if (m->vmp_dirty) {
10885 			dpages++;
10886 		}
10887 		if (m->vmp_free_when_done) {
10888 			pgopages++;
10889 		}
10890 		if (m->vmp_precious) {
10891 			precpages++;
10892 		}
10893 
10894 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10895 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10896 		if (m == (vm_page_t)0) {
10897 			break;
10898 		}
10899 	} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
10900 	vm_page_unlock_queues();
10901 
10902 	vm_page_lock_queues();
10903 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
10904 	do {
10905 		if (m == (vm_page_t)0) {
10906 			break;
10907 		}
10908 
10909 		dpages++;
10910 		assert(m->vmp_dirty);
10911 		assert(!m->vmp_free_when_done);
10912 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10913 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10914 		if (m == (vm_page_t)0) {
10915 			break;
10916 		}
10917 	} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
10918 	vm_page_unlock_queues();
10919 
10920 	vm_page_lock_queues();
10921 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10922 	do {
10923 		if (m == (vm_page_t)0) {
10924 			break;
10925 		}
10926 
10927 		if (m->vmp_dirty) {
10928 			dpages++;
10929 		}
10930 		if (m->vmp_free_when_done) {
10931 			pgopages++;
10932 		}
10933 		if (m->vmp_precious) {
10934 			precpages++;
10935 		}
10936 
10937 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10938 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10939 		if (m == (vm_page_t)0) {
10940 			break;
10941 		}
10942 	} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
10943 	vm_page_unlock_queues();
10944 
10945 	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10946 
10947 	dpages = 0;
10948 	pgopages = 0;
10949 	precpages = 0;
10950 
10951 	vm_page_lock_queues();
10952 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10953 
10954 	do {
10955 		if (m == (vm_page_t)0) {
10956 			break;
10957 		}
10958 		if (m->vmp_dirty) {
10959 			dpages++;
10960 		}
10961 		if (m->vmp_free_when_done) {
10962 			pgopages++;
10963 		}
10964 		if (m->vmp_precious) {
10965 			precpages++;
10966 		}
10967 
10968 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10969 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10970 		if (m == (vm_page_t)0) {
10971 			break;
10972 		}
10973 	} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
10974 	vm_page_unlock_queues();
10975 
10976 	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
10977 }
10978 #endif /* MACH_BSD */
10979 
10980 
#if CONFIG_IOSCHED
/*
 * Return the I/O priority cached on this UPL, or -1 when the UPL is
 * not tracked by its object.
 */
int
upl_get_cached_tier(upl_t  upl)
{
	assert(upl);
	return (upl->flags & UPL_TRACKED_BY_OBJECT) ? upl->upl_priority : -1;
}
#endif /* CONFIG_IOSCHED */
10992 
10993 
10994 void
upl_callout_iodone(upl_t upl)10995 upl_callout_iodone(upl_t upl)
10996 {
10997 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
10998 
10999 	if (upl_ctx) {
11000 		void    (*iodone_func)(void *, int) = upl_ctx->io_done;
11001 
11002 		assert(upl_ctx->io_done);
11003 
11004 		(*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
11005 	}
11006 }
11007 
/* Attach an I/O completion context to the UPL. */
void
upl_set_iodone(upl_t upl, void *upl_iodone)
{
	upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
}
11013 
/* Record an error code in the UPL's I/O completion context, if one is set. */
void
upl_set_iodone_error(upl_t upl, int error)
{
	struct upl_io_completion *upl_ctx = upl->upl_iodone;

	if (upl_ctx) {
		upl_ctx->io_error = error;
	}
}
11023 
11024 
/* Return the highest physical page number recorded for this UPL. */
ppnum_t
upl_get_highest_page(
	upl_t                      upl)
{
	return upl->highest_page;
}
11031 
/* Return the UPL's size adjusted to the system page size. */
upl_size_t
upl_get_size(
	upl_t                      upl)
{
	return upl_adjusted_size(upl, PAGE_MASK);
}
11038 
11039 upl_size_t
upl_adjusted_size(upl_t upl,vm_map_offset_t pgmask)11040 upl_adjusted_size(
11041 	upl_t upl,
11042 	vm_map_offset_t pgmask)
11043 {
11044 	vm_object_offset_t start_offset, end_offset;
11045 
11046 	start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
11047 	end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
11048 
11049 	return (upl_size_t)(end_offset - start_offset);
11050 }
11051 
/* Return the UPL's start offset truncated down to the given page mask. */
vm_object_offset_t
upl_adjusted_offset(
	upl_t upl,
	vm_map_offset_t pgmask)
{
	return trunc_page_mask_64(upl->u_offset, pgmask);
}
11059 
/* Offset of the UPL's data within its first (page-aligned) page. */
vm_object_offset_t
upl_get_data_offset(
	upl_t upl)
{
	return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
}
11066 
/* Return the UPL associated with this one (NULL if none was set). */
upl_t
upl_associated_upl(upl_t upl)
{
	return upl->associated_upl;
}
11072 
/* Associate another UPL with this one. */
void
upl_set_associated_upl(upl_t upl, upl_t associated_upl)
{
	upl->associated_upl = associated_upl;
}
11078 
11079 struct vnode *
upl_lookup_vnode(upl_t upl)11080 upl_lookup_vnode(upl_t upl)
11081 {
11082 	if (!upl->map_object->internal) {
11083 		return vnode_pager_lookup_vnode(upl->map_object->pager);
11084 	} else {
11085 		return NULL;
11086 	}
11087 }
11088 
11089 #if UPL_DEBUG
/* Record the two UBC debugging alias values on the UPL. */
kern_return_t
upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
{
	upl->ubc_alias1 = alias1;
	upl->ubc_alias2 = alias2;
	return KERN_SUCCESS;
}
/* Retrieve the UBC debugging aliases; either out-pointer may be NULL. */
int
upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
{
	if (al) {
		*al = upl->ubc_alias1;
	}
	if (al2) {
		*al2 = upl->ubc_alias2;
	}
	return KERN_SUCCESS;
}
11108 #endif /* UPL_DEBUG */
11109 
11110 #if VM_PRESSURE_EVENTS
11111 /*
11112  * Upward trajectory.
11113  */
11114 extern boolean_t vm_compressor_low_on_space(void);
11115 
11116 boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)11117 VM_PRESSURE_NORMAL_TO_WARNING(void)
11118 {
11119 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11120 		/* Available pages below our threshold */
11121 		if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11122 			/* No frozen processes to kill */
11123 			if (memorystatus_frozen_count == 0) {
11124 				/* Not enough suspended processes available. */
11125 				if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11126 					return TRUE;
11127 				}
11128 			}
11129 		}
11130 		return FALSE;
11131 	} else {
11132 		return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
11133 	}
11134 }
11135 
11136 boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)11137 VM_PRESSURE_WARNING_TO_CRITICAL(void)
11138 {
11139 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11140 		/* Available pages below our threshold */
11141 		if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11142 			return TRUE;
11143 		}
11144 		return FALSE;
11145 	} else {
11146 		return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11147 	}
11148 }
11149 
11150 /*
11151  * Downward trajectory.
11152  */
11153 boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)11154 VM_PRESSURE_WARNING_TO_NORMAL(void)
11155 {
11156 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11157 		/* Available pages above our threshold */
11158 		unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
11159 		if (memorystatus_available_pages > target_threshold) {
11160 			return TRUE;
11161 		}
11162 		return FALSE;
11163 	} else {
11164 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
11165 	}
11166 }
11167 
11168 boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)11169 VM_PRESSURE_CRITICAL_TO_WARNING(void)
11170 {
11171 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11172 		/* Available pages above our threshold */
11173 		unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
11174 		if (memorystatus_available_pages > target_threshold) {
11175 			return TRUE;
11176 		}
11177 		return FALSE;
11178 	} else {
11179 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11180 	}
11181 }
11182 #endif /* VM_PRESSURE_EVENTS */
11183 
11184 #if DEVELOPMENT || DEBUG
11185 bool compressor_running_perf_test;
11186 uint64_t compressor_perf_test_pages_processed;
11187 
11188 kern_return_t
11189 run_compressor_perf_test(
11190 	user_addr_t buf,
11191 	size_t buffer_size,
11192 	uint64_t *time,
11193 	uint64_t *bytes_compressed,
11194 	uint64_t *compressor_growth);
11195 
/*
 * move_pages_to_queue:
 *   Gather the resident pages backing [start_addr, start_addr + buffer_size)
 *   in 'map', pull them off the paging queues, and chain them onto the
 *   caller-supplied local queue so the compressor perf test can process them.
 *
 *   map         - target map; must use the kernel page size.
 *   start_addr  - user address of the benchmark buffer.
 *   buffer_size - size of the buffer in bytes.
 *   queue       - caller-initialized queue that receives the pages.
 *   pages_moved - out: number of resident pages actually moved.
 *
 *   Returns KERN_SUCCESS, or KERN_INVALID_ARGUMENT if the map's page size
 *   differs from the kernel's, the range overflows or is not fully mapped,
 *   or any entry is not plain unwired top-level anonymous memory.
 */
static kern_return_t
move_pages_to_queue(
	vm_map_t map,
	user_addr_t start_addr,
	size_t buffer_size,
	vm_page_queue_head_t *queue,
	size_t *pages_moved)
{
	kern_return_t err = KERN_SUCCESS;
	vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
	boolean_t addr_in_map = FALSE;
	user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
	vm_object_t curr_object = VM_OBJECT_NULL;
	*pages_moved = 0;


	if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
		/*
		 * We don't currently support benchmarking maps with a different page size
		 * than the kernel.
		 */
		return KERN_INVALID_ARGUMENT;
	}

	/* Overflow check only; end_addr is recomputed (page-rounded) below. */
	if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);
	curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
	end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));


	while (curr_addr < end_addr) {
		addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
		if (!addr_in_map) {
			err = KERN_INVALID_ARGUMENT;
			break;
		}
		curr_object = VME_OBJECT(curr_entry);
		/*
		 * NOTE(review): if VME_OBJECT() is NULL, curr_addr is never
		 * advanced inside this iteration, so the outer loop would spin
		 * forever -- confirm entries without an object can't appear in
		 * the benchmark range.
		 */
		if (curr_object) {
			vm_object_lock(curr_object);
			/* We really only want anonymous memory that's in the top level map and object here. */
			if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
			    curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
				err = KERN_INVALID_ARGUMENT;
				vm_object_unlock(curr_object);
				break;
			}
			vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
			/*
			 * NOTE(review): unlike start_offset above, the
			 * parenthesization below subtracts VME_OFFSET() rather
			 * than adding it; this is only equivalent when the
			 * entry's offset is 0 -- verify for the anonymous
			 * mappings this test targets.
			 */
			vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
			    (curr_entry->vme_start + VME_OFFSET(curr_entry));
			vm_map_offset_t curr_offset = start_offset;
			vm_page_t curr_page;
			while (curr_offset < end_offset) {
				curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
				if (curr_page != VM_PAGE_NULL) {
					vm_page_lock_queues();
					if (curr_page->vmp_laundry) {
						vm_pageout_steal_laundry(curr_page, TRUE);
					}
					/*
					 * we've already factored out pages in the laundry which
					 * means this page can't be on the pageout queue so it's
					 * safe to do the vm_page_queues_remove
					 */
					bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
					vm_page_queues_remove(curr_page, TRUE);
					if (donate) {
						/*
						 * The compressor needs to see this bit to know
						 * where this page needs to land. Also if stolen,
						 * this bit helps put the page back in the right
						 * special queue where it belongs.
						 */
						curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
					}
					// Clear the referenced bit so we ensure this gets paged out
					curr_page->vmp_reference = false;
					if (curr_page->vmp_pmapped) {
						pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
						    VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
					}
					vm_page_queue_enter(queue, curr_page, vmp_pageq);
					vm_page_unlock_queues();
					*pages_moved += 1;
				}
				curr_offset += PAGE_SIZE_64;
				curr_addr += PAGE_SIZE_64;
			}
		}
		/*
		 * NOTE(review): reached with curr_object == NULL when the entry
		 * had no object -- confirm vm_object_unlock(NULL) is safe or
		 * unreachable here.
		 */
		vm_object_unlock(curr_object);
	}
	vm_map_unlock_read(map);
	return err;
}
11292 
11293 /*
11294  * Local queue for processing benchmark pages.
11295  * Can't be allocated on the stack because the pointer has to
11296  * be packable.
11297  */
11298 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
/*
 * run_compressor_perf_test:
 *   Benchmark the compressor by pulling the pages of the caller's buffer
 *   onto a private queue, handing them to vm_pageout_page_queue() for
 *   compression, and waiting until the compressor threads report having
 *   processed them all.
 *
 *   buf               - user address of the buffer to compress.
 *   buffer_size       - size of the buffer in bytes.
 *   time              - out: elapsed wall time in nanoseconds.
 *   bytes_compressed  - out: page_count * PAGE_SIZE_64 (input bytes fed in).
 *   compressor_growth - out: growth of c_segment_compressed_bytes during
 *                       the run (compressed bytes produced).
 *
 *   Returns KERN_NOT_SUPPORTED if the compressor is inactive,
 *   KERN_INVALID_ARGUMENT for the kernel task or a bad buffer range, and
 *   KERN_RESOURCE_SHORTAGE if a benchmark is already running.
 */
kern_return_t
run_compressor_perf_test(
	user_addr_t buf,
	size_t buffer_size,
	uint64_t *time,
	uint64_t *bytes_compressed,
	uint64_t *compressor_growth)
{
	kern_return_t err = KERN_SUCCESS;
	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		return KERN_NOT_SUPPORTED;
	}
	if (current_task() == kernel_task) {
		return KERN_INVALID_ARGUMENT;
	}
	vm_page_lock_queues();
	if (compressor_running_perf_test) {
		/* Only run one instance of the benchmark at a time. */
		vm_page_unlock_queues();
		return KERN_RESOURCE_SHORTAGE;
	}
	/*
	 * NOTE(review): the flag is tested here but not set until after
	 * move_pages_to_queue() below, so two concurrent callers could both
	 * pass this check -- confirm callers are serialized upstream.
	 */
	vm_page_unlock_queues();
	size_t page_count = 0;
	vm_map_t map;
	vm_page_t p, next;
	uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
	uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
	*bytes_compressed = *compressor_growth = 0;

	/* Stage the buffer's resident pages on the benchmark's private queue. */
	vm_page_queue_init(&compressor_perf_test_queue);
	map = current_task()->map;
	err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
	if (err != KERN_SUCCESS) {
		goto out;
	}

	vm_page_lock_queues();
	compressor_running_perf_test = true;
	compressor_perf_test_pages_processed = 0;
	/*
	 * At this point the compressor threads should only process the benchmark queue
	 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
	 * to determine how many compressed bytes we ended up using.
	 */
	compressed_bytes_start = c_segment_compressed_bytes;
	vm_page_unlock_queues();

	compressor_perf_test_start = mach_absolute_time();
	page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);

	/* Sleep until the compressor threads have worked through our pages. */
	vm_page_lock_queues();
	/*
	 * Depending on when this test is run we could overshoot or be right on the mark
	 * with our page_count. So the comparison is of the _less than_ variety.
	 */
	while (compressor_perf_test_pages_processed < page_count) {
		assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
		vm_page_unlock_queues();
		thread_block(THREAD_CONTINUE_NULL);
		vm_page_lock_queues();
	}
	compressor_perf_test_end = mach_absolute_time();
	compressed_bytes_end = c_segment_compressed_bytes;
	vm_page_unlock_queues();


out:
	/*
	 * If we errored out above, then we could still have some pages
	 * on the local queue. Make sure to put them back on the active queue before
	 * returning so they're not orphaned.
	 */
	vm_page_lock_queues();
	/* On the error path both timestamps are still 0, so *time reports 0. */
	absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
	p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
	while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
		next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);

		vm_page_enqueue_active(p, FALSE);
		p = next;
	}

	compressor_running_perf_test = false;
	vm_page_unlock_queues();
	if (err == KERN_SUCCESS) {
		*bytes_compressed = page_count * PAGE_SIZE_64;
		*compressor_growth = compressed_bytes_end - compressed_bytes_start;
	}

	/*
	 * pageout_scan will consider waking the compactor swapper
	 * before it blocks. Do the same thing here before we return
	 * to ensure that back to back benchmark runs can't overly fragment the
	 * compressor pool.
	 */
	vm_consider_waking_compactor_swapper();
	return err;
}
11397 #endif /* DEVELOPMENT || DEBUG */
11398