xref: /xnu-8792.61.2/osfmk/vm/vm_pageout.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_pageout.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	The proverbial page-out daemon.
64  */
65 
66 #include <stdint.h>
67 #include <ptrauth.h>
68 
69 #include <debug.h>
70 
71 #include <mach/mach_types.h>
72 #include <mach/memory_object.h>
73 #include <mach/mach_host_server.h>
74 #include <mach/upl.h>
75 #include <mach/vm_map.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/sdt.h>
79 
80 #include <kern/kern_types.h>
81 #include <kern/counter.h>
82 #include <kern/host_statistics.h>
83 #include <kern/machine.h>
84 #include <kern/misc_protos.h>
85 #include <kern/sched.h>
86 #include <kern/thread.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/policy_internal.h>
90 #include <kern/thread_group.h>
91 
92 #include <sys/kdebug_triage.h>
93 
94 #include <machine/vm_tuning.h>
95 #include <machine/commpage.h>
96 
97 #include <vm/pmap.h>
98 #include <vm/vm_compressor_pager.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map_internal.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_page.h>
103 #include <vm/vm_pageout.h>
104 #include <vm/vm_protos.h> /* must be last */
105 #include <vm/memory_object.h>
106 #include <vm/vm_purgeable_internal.h>
107 #include <vm/vm_shared_region.h>
108 #include <vm/vm_compressor.h>
109 
110 #include <san/kasan.h>
111 
112 #if CONFIG_PHANTOM_CACHE
113 #include <vm/vm_phantom_cache.h>
114 #endif
115 
116 #if UPL_DEBUG
117 #include <libkern/OSDebug.h>
118 #endif
119 
120 extern int cs_debug;
121 
122 extern void mbuf_drain(boolean_t);
123 
124 #if VM_PRESSURE_EVENTS
125 #if CONFIG_JETSAM
126 extern unsigned int memorystatus_available_pages;
127 extern unsigned int memorystatus_available_pages_pressure;
128 extern unsigned int memorystatus_available_pages_critical;
129 #else /* CONFIG_JETSAM */
130 extern uint64_t memorystatus_available_pages;
131 extern uint64_t memorystatus_available_pages_pressure;
132 extern uint64_t memorystatus_available_pages_critical;
133 #endif /* CONFIG_JETSAM */
134 
135 extern unsigned int memorystatus_frozen_count;
136 extern unsigned int memorystatus_suspended_count;
137 extern vm_pressure_level_t memorystatus_vm_pressure_level;
138 
139 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
140 extern uint32_t memorystatus_jetsam_fg_band_waiters;
141 
142 void vm_pressure_response(void);
143 extern void consider_vm_pressure_events(void);
144 
145 #define MEMORYSTATUS_SUSPENDED_THRESHOLD  4
146 #endif /* VM_PRESSURE_EVENTS */
147 
148 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
149 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
150 boolean_t vps_dynamic_priority_enabled = FALSE;
151 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
152 
153 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
154 #if !XNU_TARGET_OS_OSX
155 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
156 #else /* !XNU_TARGET_OS_OSX */
157 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
158 #endif /* !XNU_TARGET_OS_OSX */
159 #endif
160 
161 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
162 #define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
163 #endif
164 
165 #ifndef VM_PAGE_LAUNDRY_MAX
166 #define VM_PAGE_LAUNDRY_MAX     128UL   /* maximum pageouts on a given pageout queue */
167 #endif  /* VM_PAGEOUT_LAUNDRY_MAX */
168 
169 #ifndef VM_PAGEOUT_BURST_WAIT
170 #define VM_PAGEOUT_BURST_WAIT   1       /* milliseconds */
171 #endif  /* VM_PAGEOUT_BURST_WAIT */
172 
173 #ifndef VM_PAGEOUT_EMPTY_WAIT
174 #define VM_PAGEOUT_EMPTY_WAIT   50      /* milliseconds */
175 #endif  /* VM_PAGEOUT_EMPTY_WAIT */
176 
177 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
178 #define VM_PAGEOUT_DEADLOCK_WAIT 100    /* milliseconds */
179 #endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
180 
181 #ifndef VM_PAGEOUT_IDLE_WAIT
182 #define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
183 #endif  /* VM_PAGEOUT_IDLE_WAIT */
184 
185 #ifndef VM_PAGEOUT_SWAP_WAIT
186 #define VM_PAGEOUT_SWAP_WAIT    10      /* milliseconds */
187 #endif  /* VM_PAGEOUT_SWAP_WAIT */
188 
189 
190 #ifndef VM_PAGE_SPECULATIVE_TARGET
191 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
192 #endif /* VM_PAGE_SPECULATIVE_TARGET */
193 
194 
195 /*
196  *	To obtain a reasonable LRU approximation, the inactive queue
197  *	needs to be large enough to give pages on it a chance to be
198  *	referenced a second time.  This macro defines the fraction
199  *	of active+inactive pages that should be inactive.
200  *	The pageout daemon uses it to update vm_page_inactive_target.
201  *
202  *	If vm_page_free_count falls below vm_page_free_target and
203  *	vm_page_inactive_count is below vm_page_inactive_target,
204  *	then the pageout daemon starts running.
205  */
206 
207 #ifndef VM_PAGE_INACTIVE_TARGET
208 #define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
209 #endif  /* VM_PAGE_INACTIVE_TARGET */
210 
211 /*
212  *	Once the pageout daemon starts running, it keeps going
213  *	until vm_page_free_count meets or exceeds vm_page_free_target.
214  */
215 
216 #ifndef VM_PAGE_FREE_TARGET
217 #if !XNU_TARGET_OS_OSX
218 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
219 #else /* !XNU_TARGET_OS_OSX */
220 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
221 #endif /* !XNU_TARGET_OS_OSX */
222 #endif  /* VM_PAGE_FREE_TARGET */
223 
224 
225 /*
226  *	The pageout daemon always starts running once vm_page_free_count
227  *	falls below vm_page_free_min.
228  */
229 
230 #ifndef VM_PAGE_FREE_MIN
231 #if !XNU_TARGET_OS_OSX
232 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
233 #else /* !XNU_TARGET_OS_OSX */
234 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
235 #endif /* !XNU_TARGET_OS_OSX */
236 #endif  /* VM_PAGE_FREE_MIN */
237 
238 #if !XNU_TARGET_OS_OSX
239 #define VM_PAGE_FREE_RESERVED_LIMIT     100
240 #define VM_PAGE_FREE_MIN_LIMIT          1500
241 #define VM_PAGE_FREE_TARGET_LIMIT       2000
242 #else /* !XNU_TARGET_OS_OSX */
243 #define VM_PAGE_FREE_RESERVED_LIMIT     1700
244 #define VM_PAGE_FREE_MIN_LIMIT          3500
245 #define VM_PAGE_FREE_TARGET_LIMIT       4000
246 #endif /* !XNU_TARGET_OS_OSX */
247 
248 /*
249  *	When vm_page_free_count falls below vm_page_free_reserved,
250  *	only vm-privileged threads can allocate pages.  vm-privilege
251  *	allows the pageout daemon and default pager (and any other
252  *	associated threads needed for default pageout) to continue
253  *	operation by dipping into the reserved pool of pages.
254  */
255 
256 #ifndef VM_PAGE_FREE_RESERVED
257 #define VM_PAGE_FREE_RESERVED(n)        \
258 	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
259 #endif  /* VM_PAGE_FREE_RESERVED */
260 
261 /*
262  *	When we dequeue pages from the inactive list, they are
263  *	reactivated (ie, put back on the active queue) if referenced.
264  *	However, it is possible to starve the free list if other
265  *	processors are referencing pages faster than we can turn off
266  *	the referenced bit.  So we limit the number of reactivations
267  *	we will make per call of vm_pageout_scan().
268  */
269 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
270 
271 #ifndef VM_PAGE_REACTIVATE_LIMIT
272 #if !XNU_TARGET_OS_OSX
273 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
274 #else /* !XNU_TARGET_OS_OSX */
275 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
276 #endif /* !XNU_TARGET_OS_OSX */
277 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
278 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
279 
280 int vm_pageout_protect_realtime = true;
281 
282 extern boolean_t hibernate_cleaning_in_progress;
283 
284 struct cq ciq[MAX_COMPRESSOR_THREAD_COUNT];
285 
286 #if VM_PRESSURE_EVENTS
287 void vm_pressure_thread(void);
288 
289 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
290 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
291 
292 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
293 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
294 #endif
295 
296 static void vm_pageout_iothread_external(struct cq *, wait_result_t);
297 static void vm_pageout_iothread_internal(struct cq *, wait_result_t);
298 static void vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *, boolean_t);
299 
300 extern void vm_pageout_continue(void);
301 extern void vm_pageout_scan(void);
302 
303 boolean_t vm_pageout_running = FALSE;
304 
305 uint32_t vm_page_upl_tainted = 0;
306 uint32_t vm_page_iopl_tainted = 0;
307 
308 #if XNU_TARGET_OS_OSX
309 static boolean_t vm_pageout_waiter  = FALSE;
310 #endif /* XNU_TARGET_OS_OSX */
311 
312 
313 #if DEVELOPMENT || DEBUG
314 struct vm_pageout_debug vm_pageout_debug;
315 #endif
316 struct vm_pageout_vminfo vm_pageout_vminfo;
317 struct vm_pageout_state  vm_pageout_state;
318 struct vm_config         vm_config;
319 
320 struct  vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
321 struct  vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
322 #if DEVELOPMENT || DEBUG
323 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
324 #endif /* DEVELOPMENT || DEBUG */
325 
326 int         vm_upl_wait_for_pages = 0;
327 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
328 
329 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
330 
331 int     vm_debug_events = 0;
332 
333 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
334 
335 #if CONFIG_MEMORYSTATUS
336 extern boolean_t memorystatus_kill_on_VM_page_shortage(void);
337 
338 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
339 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
340 
341 #endif
342 
343 #if __AMP__
344 int vm_compressor_ebound = 1;
345 int vm_pgo_pbound = 0;
346 extern void thread_bind_cluster_type(thread_t, char, bool);
347 #endif /* __AMP__ */
348 
349 
350 /*
351  *	Routine:	vm_pageout_object_terminate
352  *	Purpose:
353  *		Destroy the pageout_object, and perform all of the
354  *		required cleanup actions.
355  *
356  *	In/Out conditions:
357  *		The object must be locked, and will be returned locked.
358  */
void
vm_pageout_object_terminate(
	vm_object_t     object)
{
	vm_object_t     shadow_object;

	/*
	 * Deal with the deallocation (last reference) of a pageout object
	 * (used for cleaning-in-place) by dropping the paging references/
	 * freeing pages in the original object.
	 */

	assert(object->pageout);
	shadow_object = object->shadow;
	vm_object_lock(shadow_object);

	while (!vm_page_queue_empty(&object->memq)) {
		vm_page_t               p, m;
		vm_object_offset_t      offset;

		/* "p" is the private/fictitious shadow page in the pageout object */
		p = (vm_page_t) vm_page_queue_first(&object->memq);

		assert(p->vmp_private);
		assert(p->vmp_free_when_done);
		p->vmp_free_when_done = FALSE;
		assert(!p->vmp_cleaning);
		assert(!p->vmp_laundry);

		offset = p->vmp_offset;
		VM_PAGE_FREE(p);
		p = VM_PAGE_NULL;

		/* "m" is the real page being cleaned, resident in the shadow object */
		m = vm_page_lookup(shadow_object,
		    offset + object->vo_shadow_offset);

		if (m == VM_PAGE_NULL) {
			continue;
		}

		assert((m->vmp_dirty) || (m->vmp_precious) ||
		    (m->vmp_busy && m->vmp_cleaning));

		/*
		 * Handle the trusted pager throttle.
		 * Also decrement the burst throttle (if external).
		 */
		vm_page_lock_queues();
		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
			vm_pageout_throttle_up(m);
		}

		/*
		 * Handle the "target" page(s). These pages are to be freed if
		 * successfully cleaned. Target pages are always busy, and are
		 * wired exactly once. The initial target pages are not mapped,
		 * (so cannot be referenced or modified) but converted target
		 * pages may have been modified between the selection as an
		 * adjacent page and conversion to a target.
		 */
		if (m->vmp_free_when_done) {
			assert(m->vmp_busy);
			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
			assert(m->vmp_wire_count == 1);
			m->vmp_cleaning = FALSE;
			m->vmp_free_when_done = FALSE;
			/*
			 * Revoke all access to the page. Since the object is
			 * locked, and the page is busy, this prevents the page
			 * from being dirtied after the pmap_disconnect() call
			 * returns.
			 *
			 * Since the page is left "dirty" but "not modified", we
			 * can detect whether the page was redirtied during
			 * pageout by checking the modify state.
			 */
			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			} else {
				m->vmp_dirty = FALSE;
			}

			if (m->vmp_dirty) {
				/* page was redirtied during the I/O: keep it */
				vm_page_unwire(m, TRUE);        /* reactivates */
				counter_inc(&vm_statistics_reactivations);
				PAGE_WAKEUP_DONE(m);
			} else {
				vm_page_free(m);  /* clears busy, etc. */
			}
			vm_page_unlock_queues();
			continue;
		}
		/*
		 * Handle the "adjacent" pages. These pages were cleaned in
		 * place, and should be left alone.
		 * If prep_pin_count is nonzero, then someone is using the
		 * page, so make it active.
		 */
		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
			if (m->vmp_reference) {
				vm_page_activate(m);
			} else {
				vm_page_deactivate(m);
			}
		}
		if (m->vmp_overwriting) {
			/*
			 * the (COPY_OUT_FROM == FALSE) request_page_list case
			 */
			if (m->vmp_busy) {
				/*
				 * We do not re-set m->vmp_dirty !
				 * The page was busy so no extraneous activity
				 * could have occurred. COPY_INTO is a read into the
				 * new pages. CLEAN_IN_PLACE does actually write
				 * out the pages but handling outside of this code
				 * will take care of resetting dirty. We clear the
				 * modify however for the Programmed I/O case.
				 */
				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

				m->vmp_busy = FALSE;
				m->vmp_absent = FALSE;
			} else {
				/*
				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
				 * Occurs when the original page was wired
				 * at the time of the list request
				 */
				assert(VM_PAGE_WIRED(m));
				vm_page_unwire(m, TRUE);        /* reactivates */
			}
			m->vmp_overwriting = FALSE;
		} else {
			m->vmp_dirty = FALSE;
		}
		m->vmp_cleaning = FALSE;

		/*
		 * Wakeup any thread waiting for the page to be un-cleaning.
		 */
		PAGE_WAKEUP(m);
		vm_page_unlock_queues();
	}
	/*
	 * Account for the paging reference taken in vm_paging_object_allocate.
	 */
	vm_object_activity_end(shadow_object);
	vm_object_unlock(shadow_object);

	/* the pageout object itself must now be fully drained */
	assert(object->ref_count == 0);
	assert(object->paging_in_progress == 0);
	assert(object->activity_in_progress == 0);
	assert(object->resident_page_count == 0);
	return;
}
514 
515 /*
516  * Routine:	vm_pageclean_setup
517  *
518  * Purpose:	setup a page to be cleaned (made non-dirty), but not
519  *		necessarily flushed from the VM page cache.
520  *		This is accomplished by cleaning in place.
521  *
522  *		The page must not be busy, and new_object
523  *		must be locked.
524  *
525  */
/*
 * Set up page "m" for cleaning in place: "m" stays resident in its object
 * while the fictitious page "new_m" becomes a private shadow of it in
 * "new_object" at "new_offset".  "m" must not be busy; "new_object" must
 * be locked by the caller.
 */
static void
vm_pageclean_setup(
	vm_page_t               m,
	vm_page_t               new_m,
	vm_object_t             new_object,
	vm_object_offset_t      new_offset)
{
	assert(!m->vmp_busy);
#if 0
	assert(!m->vmp_cleaning);
#endif

	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

	/*
	 * Mark original page as cleaning in place.
	 */
	m->vmp_cleaning = TRUE;
	SET_PAGE_DIRTY(m, FALSE);
	m->vmp_precious = FALSE;

	/*
	 * Convert the fictitious page to a private shadow of
	 * the real page.
	 */
	assert(new_m->vmp_fictitious);
	assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
	new_m->vmp_fictitious = FALSE;
	new_m->vmp_private = TRUE;
	new_m->vmp_free_when_done = TRUE;
	/* alias the real page's physical frame into the shadow page */
	VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));

	vm_page_lockspin_queues();
	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
	vm_page_unlock_queues();

	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
	assert(!new_m->vmp_wanted);
	new_m->vmp_busy = FALSE;
}
566 
567 /*
568  *	Routine:	vm_pageout_initialize_page
569  *	Purpose:
570  *		Causes the specified page to be initialized in
571  *		the appropriate memory object. This routine is used to push
572  *		pages into a copy-object when they are modified in the
573  *		permanent object.
574  *
575  *		The page is moved to a temporary object and paged out.
576  *
577  *	In/out conditions:
578  *		The page in question must not be on any pageout queues.
579  *		The object to which it belongs must be locked.
580  *		The page must be busy, but not hold a paging reference.
581  *
582  *	Implementation:
583  *		Move this page to a completely new object.
584  */
void
vm_pageout_initialize_page(
	vm_page_t       m)
{
	vm_object_t             object;
	vm_object_offset_t      paging_offset;
	memory_object_t         pager;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	object = VM_PAGE_OBJECT(m);

	assert(m->vmp_busy);
	assert(object->internal);

	/*
	 *	Verify that we really want to clean this page
	 */
	assert(!m->vmp_absent);
	assert(m->vmp_dirty);

	/*
	 *	Create a paging reference to let us play with the object.
	 */
	paging_offset = m->vmp_offset + object->paging_offset;

	if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
		panic("reservation without pageout?"); /* alan */
		/* NOTREACHED: panic() does not return; the cleanup below is dead code */

		VM_PAGE_FREE(m);
		vm_object_unlock(object);

		return;
	}

	/*
	 * If there's no pager, then we can't clean the page.  This should
	 * never happen since this should be a copy object and therefore not
	 * an external object, so the pager should always be there.
	 */

	pager = object->pager;

	if (pager == MEMORY_OBJECT_NULL) {
		panic("missing pager for copy object");
		/* NOTREACHED: panic() does not return */

		VM_PAGE_FREE(m);
		return;
	}

	/*
	 * set the page for future call to vm_fault_list_request
	 */
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
	SET_PAGE_DIRTY(m, FALSE);

	/*
	 * keep the object from collapsing or terminating
	 */
	vm_object_paging_begin(object);
	vm_object_unlock(object);

	/*
	 *	Write the data to its pager.
	 *	Note that the data is passed by naming the new object,
	 *	not a virtual address; the pager interface has been
	 *	manipulated to use the "internal memory" data type.
	 *	[The object reference from its allocation is donated
	 *	to the eventual recipient.]
	 */
	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);

	vm_object_lock(object);
	vm_object_paging_end(object);
}
660 
661 
662 /*
663  * vm_pageout_cluster:
664  *
665  * Given a page, queue it to the appropriate I/O thread,
666  * which will page it out and attempt to clean adjacent pages
667  * in the same operation.
668  *
669  * The object and queues must be locked. We will take a
670  * paging reference to prevent deallocation or collapse when we
671  * release the object lock back at the call site.  The I/O thread
672  * is responsible for consuming this reference
673  *
674  * The page must not be on any pageout queue.
675  */
#if DEVELOPMENT || DEBUG
/* Aggregate compressor-thread statistics (development/debug builds only). */
vmct_stats_t vmct_stats;

/* Count of compressor I/O threads currently active. */
int32_t vmct_active = 0;
/* Presumably bracket the current compressor activity epoch — confirm against users. */
uint64_t vm_compressor_epoch_start = 0;
uint64_t vm_compressor_epoch_stop = 0;

/* Per-thread lifecycle state for each compressor I/O thread. */
typedef enum vmct_state_t {
	VMCT_IDLE,      /* waiting for work */
	VMCT_AWAKENED,  /* woken, not yet running */
	VMCT_ACTIVE,    /* processing its queue */
} vmct_state_t;
vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
#endif
690 
691 
692 
/*
 * Queue page "m" on pageout queue "q" for the matching I/O thread to
 * process.  Takes an activity reference on the page's object (consumed
 * by the I/O thread) and accounts the page as laundry.  The object must
 * be locked exclusive and the page queues lock must be held; the page
 * must not already be on any pageout queue.
 */
static void
vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
{
	event_t     wakeup_event;
	vm_object_t object = VM_PAGE_OBJECT(m);

	VM_PAGE_CHECK(m);
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	/*
	 * Make sure it's OK to page this out.
	 */
	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
	assert(!m->vmp_cleaning && !m->vmp_laundry);
	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);

	/*
	 * protect the object from collapse or termination
	 */
	vm_object_activity_begin(object);

	/*
	 * NOTE(review): the wakeup event is selected from the global
	 * internal/external queues based on the object, not from "q" —
	 * presumably intentional so callers passing an alternate queue
	 * (e.g. a benchmark queue) still wake the standard threads;
	 * confirm against the queue consumers.
	 */
	if (object->internal == TRUE) {
		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

		m->vmp_busy = TRUE;
		wakeup_event = (event_t) &(vm_pageout_queue_internal.pgo_pending);
	} else {
		wakeup_event = (event_t) &(vm_pageout_queue_external.pgo_pending);
	}

	/*
	 * pgo_laundry count is tied to the laundry bit
	 */
	m->vmp_laundry = TRUE;
	q->pgo_laundry++;

	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);

	/* wake the I/O thread only if it parked itself as idle */
	if (q->pgo_idle == TRUE) {
		q->pgo_idle = FALSE;
		thread_wakeup(wakeup_event);
	}
	VM_PAGE_CHECK(m);
}
739 
740 void
vm_pageout_cluster(vm_page_t m)741 vm_pageout_cluster(vm_page_t m)
742 {
743 	struct          vm_pageout_queue *q;
744 	vm_object_t     object = VM_PAGE_OBJECT(m);
745 	if (object->internal) {
746 		q = &vm_pageout_queue_internal;
747 	} else {
748 		q = &vm_pageout_queue_external;
749 	}
750 	vm_pageout_cluster_to_queue(m, q);
751 }
752 
753 
754 /*
755  * A page is back from laundry or we are stealing it back from
756  * the laundering state.  See if there are some pages waiting to
757  * go to laundry and if we can let some of them go now.
758  *
759  * Object and page queues must be locked.
760  */
/*
 * A page is back from laundry or is being stolen back from the
 * laundering state: undo its pageout-queue/laundry accounting and
 * wake any thread throttled on, or draining, that queue.
 * The object lock (exclusive) and the page queues lock must be held.
 */
void
vm_pageout_throttle_up(
	vm_page_t       m)
{
	struct vm_pageout_queue *q;
	vm_object_t      m_object;

	m_object = VM_PAGE_OBJECT(m);

	assert(m_object != VM_OBJECT_NULL);
	assert(m_object != kernel_object);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(m_object);

	if (m_object->internal == TRUE) {
		q = &vm_pageout_queue_internal;
	} else {
		q = &vm_pageout_queue_external;
	}

	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
		/* steal the page back off the pageout queue */
		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
		m->vmp_q_state = VM_PAGE_NOT_ON_Q;

		VM_PAGE_ZERO_PAGEQ_ENTRY(m);

		/* drop the activity reference taken when it was queued */
		vm_object_activity_end(m_object);

		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
	}
	if (m->vmp_laundry == TRUE) {
		/* pgo_laundry count is tied to the laundry bit */
		m->vmp_laundry = FALSE;
		q->pgo_laundry--;

		if (q->pgo_throttled == TRUE) {
			q->pgo_throttled = FALSE;
			thread_wakeup((event_t) &q->pgo_laundry);
		}
		/* "&pgo_laundry + 1" is the distinct event used by drain waiters */
		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
			q->pgo_draining = FALSE;
			thread_wakeup((event_t) (&q->pgo_laundry + 1));
		}
		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
	}
}
807 
808 
809 static void
vm_pageout_throttle_up_batch(struct vm_pageout_queue * q,int batch_cnt)810 vm_pageout_throttle_up_batch(
811 	struct vm_pageout_queue *q,
812 	int             batch_cnt)
813 {
814 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
815 
816 	VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
817 
818 	q->pgo_laundry -= batch_cnt;
819 
820 	if (q->pgo_throttled == TRUE) {
821 		q->pgo_throttled = FALSE;
822 		thread_wakeup((event_t) &q->pgo_laundry);
823 	}
824 	if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
825 		q->pgo_draining = FALSE;
826 		thread_wakeup((event_t) (&q->pgo_laundry + 1));
827 	}
828 }
829 
830 
831 
832 /*
833  * VM memory pressure monitoring.
834  *
835  * vm_pageout_scan() keeps track of the number of pages it considers and
836  * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
837  *
838  * compute_memory_pressure() is called every second from compute_averages()
839  * and moves "vm_pageout_stat_now" forward, to start accumulating the number
 * of reclaimed pages in a new vm_pageout_stat[] bucket.
841  *
842  * mach_vm_pressure_monitor() collects past statistics about memory pressure.
843  * The caller provides the number of seconds ("nsecs") worth of statistics
844  * it wants, up to 30 seconds.
845  * It computes the number of pages reclaimed in the past "nsecs" seconds and
846  * also returns the number of pages the system still needs to reclaim at this
847  * moment in time.
848  */
/*
 * Ring buffer of per-sample pageout statistics, advanced once per
 * eighth of a second by record_memory_pressure(): 30 seconds of
 * history on development/debug kernels, 1 second on release kernels
 * (+1 slot so the bucket being filled never aliases the oldest one).
 *
 * The expansions are fully parenthesized so the macro stays correct
 * inside larger expressions (e.g. "x % VM_PAGEOUT_STAT_SIZE" or
 * "2 * VM_PAGEOUT_STAT_SIZE") — CERT PRE02-C.
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif
struct vm_pageout_stat {
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	unsigned int pages_grabbed;
	unsigned int pages_freed;

	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;

	unsigned int vm_page_realtime_count;
	unsigned int forcereclaimed_sharedcache;
	unsigned int forcereclaimed_realtime;
	unsigned int protected_sharedcache;
	unsigned int protected_realtime;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];

/* index of the bucket currently being accumulated into */
unsigned int vm_pageout_stat_now = 0;

/* circular predecessor/successor of a ring index */
#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
919 
920 #if VM_PAGE_BUCKETS_CHECK
921 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
922 #endif /* VM_PAGE_BUCKETS_CHECK */
923 
924 
925 void
926 record_memory_pressure(void);
927 void
record_memory_pressure(void)928 record_memory_pressure(void)
929 {
930 	unsigned int vm_pageout_next;
931 
932 #if VM_PAGE_BUCKETS_CHECK
933 	/* check the consistency of VM page buckets at regular interval */
934 	static int counter = 0;
935 	if ((++counter % vm_page_buckets_check_interval) == 0) {
936 		vm_page_buckets_check();
937 	}
938 #endif /* VM_PAGE_BUCKETS_CHECK */
939 
940 	vm_pageout_state.vm_memory_pressure =
941 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
942 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
943 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
944 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
945 
946 	commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
947 
948 	/* move "now" forward */
949 	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
950 
951 	bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
952 
953 	vm_pageout_stat_now = vm_pageout_next;
954 }
955 
956 
957 /*
958  * IMPORTANT
959  * mach_vm_ctl_page_free_wanted() is called indirectly, via
960  * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
961  * it must be safe in the restricted stackshot context. Locks and/or
962  * blocking are not allowable.
963  */
964 unsigned int
mach_vm_ctl_page_free_wanted(void)965 mach_vm_ctl_page_free_wanted(void)
966 {
967 	unsigned int page_free_target, page_free_count, page_free_wanted;
968 
969 	page_free_target = vm_page_free_target;
970 	page_free_count = vm_page_free_count;
971 	if (page_free_target > page_free_count) {
972 		page_free_wanted = page_free_target - page_free_count;
973 	} else {
974 		page_free_wanted = 0;
975 	}
976 
977 	return page_free_wanted;
978 }
979 
980 
981 /*
982  * IMPORTANT:
983  * mach_vm_pressure_monitor() is called when taking a stackshot, with
984  * wait_for_pressure FALSE, so that code path must remain safe in the
985  * restricted stackshot context. No blocking or locks are allowable.
986  * on that code path.
987  */
988 
/*
 * Report memory-pressure information to a caller, optionally blocking
 * first until pressure occurs.
 *
 * wait_for_pressure:  if TRUE, block (interruptibly) until the free-page
 *                     count drops below the free-page target.
 * nsecs_monitored:    how many seconds' worth of pageout statistics to
 *                     sum for *pages_reclaimed_p.
 * pages_reclaimed_p:  out (may be NULL) - pages reclaimed over the
 *                     monitored interval.
 * pages_wanted_p:     out (may be NULL) - current free-page shortfall.
 *
 * Returns KERN_ABORTED if the wait was interrupted, else KERN_SUCCESS.
 */
kern_return_t
mach_vm_pressure_monitor(
	boolean_t       wait_for_pressure,
	unsigned int    nsecs_monitored,
	unsigned int    *pages_reclaimed_p,
	unsigned int    *pages_wanted_p)
{
	wait_result_t   wr;
	unsigned int    vm_pageout_then, vm_pageout_now;
	unsigned int    pages_reclaimed;
	unsigned int    units_of_monitor;

	/*
	 * Stats slots appear to cover 1/8 second each (intervals elsewhere
	 * in this file are expressed in eighths of a second), hence 8 ring
	 * slots per monitored second.
	 */
	units_of_monitor = 8 * nsecs_monitored;
	/*
	 * We don't take the vm_page_queue_lock here because we don't want
	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
	 * thread when it's trying to reclaim memory.  We don't need fully
	 * accurate monitoring anyway...
	 */

	if (wait_for_pressure) {
		/* wait until there's memory pressure */
		while (vm_page_free_count >= vm_page_free_target) {
			wr = assert_wait((event_t) &vm_page_free_wanted,
			    THREAD_INTERRUPTIBLE);
			if (wr == THREAD_WAITING) {
				wr = thread_block(THREAD_CONTINUE_NULL);
			}
			if (wr == THREAD_INTERRUPTED) {
				return KERN_ABORTED;
			}
			if (wr == THREAD_AWAKENED) {
				/*
				 * The memory pressure might have already
				 * been relieved but let's not block again
				 * and let's report that there was memory
				 * pressure at some point.
				 */
				break;
			}
		}
	}

	/* provide the number of pages the system wants to reclaim */
	if (pages_wanted_p != NULL) {
		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
	}

	if (pages_reclaimed_p == NULL) {
		return KERN_SUCCESS;
	}

	/* provide number of pages reclaimed in the last "nsecs_monitored" */
	vm_pageout_now = vm_pageout_stat_now;
	pages_reclaimed = 0;
	/*
	 * Walk backwards through the stats ring, summing freed-page counts,
	 * stopping either after units_of_monitor slots or when the walk
	 * wraps all the way around to the current slot.
	 */
	for (vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
	    vm_pageout_then != vm_pageout_now &&
	    units_of_monitor-- != 0;
	    vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
	}
	*pages_reclaimed_p = pages_reclaimed;

	return KERN_SUCCESS;
}
1059 
1060 
1061 
1062 #if DEVELOPMENT || DEBUG
1063 
1064 static void
1065 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1066 
1067 /*
1068  * condition variable used to make sure there is
1069  * only a single sweep going on at a time
1070  */
1071 boolean_t       vm_pageout_disconnect_all_pages_active = FALSE;
1072 
1073 
/*
 * Disconnect (pmap-wise) every page on the throttled, anonymous and
 * active queues.  DEVELOPMENT || DEBUG only.  Only one sweep may run
 * at a time; a concurrent caller returns immediately.
 */
void
vm_pageout_disconnect_all_pages()
{
	vm_page_lock_queues();

	/* check-and-set of the "active" flag is done under the queue lock */
	if (vm_pageout_disconnect_all_pages_active == TRUE) {
		vm_page_unlock_queues();
		return;
	}
	vm_pageout_disconnect_all_pages_active = TRUE;
	vm_page_unlock_queues();

	/* each call re-takes and drops the queue lock internally */
	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);

	/* NOTE(review): cleared without the queue lock, unlike the set above */
	vm_pageout_disconnect_all_pages_active = FALSE;
}
1092 
1093 
/*
 * Walk up to 'qcount' pages of queue 'q', pmap-disconnecting each page
 * that is safe to touch.  Pages are rotated head -> tail as they are
 * visited so the walk terminates.  Called with no locks held; takes and
 * releases the page-queue lock and per-object locks internally.
 */
void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t       m;
	vm_object_t     t_object = NULL;        /* last object whose try-lock failed */
	vm_object_t     l_object = NULL;        /* object currently locked, if any */
	vm_object_t     m_object = NULL;        /* object of the candidate page */
	int             delayed_unlock = 0;     /* iterations since the queue lock was last yielded */
	int             try_failed_count = 0;
	int             disconnected_count = 0; /* stats reported in the trace END event */
	int             paused_count = 0;
	int             object_locked_count = 0;

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
	    q, qcount, 0, 0, 0);

	vm_page_lock_queues();

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			/* new object: reset the consecutive-failure counter */
			if (m_object != t_object) {
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've alread got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				/* give up on this page after ~20 failed attempts */
				if (try_failed_count > 20) {
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				paused_count++;

				t_object = m_object;
				continue;
			}
			object_locked_count++;

			l_object = m_object;
		}
		/* skip pages in transient states (I/O, laundry, errors, ...) */
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		/* remove all pmap mappings of this page */
		if (m->vmp_pmapped == TRUE) {
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

			disconnected_count++;
		}
reenter_pg_on_q:
		/* rotate the page from the head to the tail of its queue */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);

		qcount--;
		try_failed_count = 0;

		/* periodically yield the queue lock to avoid starving others */
		if (delayed_unlock++ > 128) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
	    q, disconnected_count, object_locked_count, paused_count, 0);
}
1198 
1199 extern char* proc_best_name(struct proc* proc);
1200 
1201 int
vm_toggle_task_selfdonate_pages(task_t task)1202 vm_toggle_task_selfdonate_pages(task_t task)
1203 {
1204 	int state = 0;
1205 	if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1206 		printf("VM Donation mode is OFF on the system\n");
1207 		return state;
1208 	}
1209 	if (task != kernel_task) {
1210 		task_lock(task);
1211 		if (!task->donates_own_pages) {
1212 			printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1213 			task->donates_own_pages = true;
1214 			state = 1;
1215 		} else if (task->donates_own_pages) {
1216 			printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1217 			task->donates_own_pages = false;
1218 			state = 0;
1219 		}
1220 		task_unlock(task);
1221 	}
1222 	return state;
1223 }
1224 #endif /* DEVELOPMENT || DEBUG */
1225 
/*
 * Set the per-task "donates own pages" flag to 'donate'.
 * Callers must ensure page donation is enabled system-wide and must not
 * pass the kernel task (both enforced by assert).
 */
void
vm_task_set_selfdonate_pages(task_t task, bool donate)
{
	assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
	assert(task != kernel_task);

	task_lock(task);
	task->donates_own_pages = donate;
	task_unlock(task);
}
1236 
1237 
1238 
1239 static size_t
1240 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1241 
1242 /*
1243  * condition variable used to make sure there is
1244  * only a single sweep going on at a time
1245  */
1246 boolean_t       vm_pageout_anonymous_pages_active = FALSE;
1247 
1248 
1249 void
vm_pageout_anonymous_pages()1250 vm_pageout_anonymous_pages()
1251 {
1252 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1253 		vm_page_lock_queues();
1254 
1255 		if (vm_pageout_anonymous_pages_active == TRUE) {
1256 			vm_page_unlock_queues();
1257 			return;
1258 		}
1259 		vm_pageout_anonymous_pages_active = TRUE;
1260 		vm_page_unlock_queues();
1261 
1262 		vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1263 		vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1264 		vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1265 
1266 		if (VM_CONFIG_SWAP_IS_PRESENT) {
1267 			vm_consider_swapping();
1268 		}
1269 
1270 		vm_page_lock_queues();
1271 		vm_pageout_anonymous_pages_active = FALSE;
1272 		vm_page_unlock_queues();
1273 	}
1274 }
1275 
1276 
/*
 * Sweep up to 'qcount' pages of queue 'q' and push clean-able internal
 * pages to the compressor's pageout queue (or, under DEVELOPMENT||DEBUG
 * with perf_test set, to the benchmark queue).
 *
 * Pages that were recently referenced, busy, in laundry, etc. are
 * rotated back onto the tail of 'q'.  Clean, non-precious pages are
 * freed outright.  Returns the number of pages handed to the pageout
 * queue.  Called with no locks held.
 */
size_t
vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
{
	vm_page_t       m;
	vm_object_t     t_object = NULL;        /* last object whose try-lock failed */
	vm_object_t     l_object = NULL;        /* object currently locked, if any */
	vm_object_t     m_object = NULL;        /* object of the candidate page */
	int             delayed_unlock = 0;     /* iterations since the queue lock was last yielded */
	int             try_failed_count = 0;
	int             refmod_state;
	int             pmap_options;
	struct          vm_pageout_queue *iq;   /* destination pageout queue */
	ppnum_t         phys_page;
	size_t          pages_moved = 0;


	iq = &vm_pageout_queue_internal;

	vm_page_lock_queues();

#if DEVELOPMENT || DEBUG
	if (perf_test) {
		iq = &vm_pageout_queue_benchmark;
	}
#endif /* DEVELOPMENT ||DEBUG */

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		/*
		 * If the destination queue is throttled, drop our locks and
		 * wait for it to drain before continuing the sweep.
		 */
		if (VM_PAGE_Q_THROTTLED(iq)) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			iq->pgo_draining = TRUE;

			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
			vm_page_unlock_queues();

			thread_block(THREAD_CONTINUE_NULL);

			vm_page_lock_queues();
			delayed_unlock = 0;
			continue;
		}
		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/* only internal (anonymous) objects are compressed */
			if (!m_object->internal) {
				goto reenter_pg_on_q;
			}

			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've alread got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				t_object = m_object;
				continue;
			}
			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(m);

		/* pull the hardware ref/mod state into the page's flags */
		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(phys_page);

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}
		/* recently referenced: clear the bit and give it another pass */
		if (m->vmp_reference == TRUE) {
			m->vmp_reference = FALSE;
			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
			goto reenter_pg_on_q;
		}
		/* disconnect mappings, directing the pmap toward the compressor */
		if (m->vmp_pmapped == TRUE) {
			if (m->vmp_dirty || m->vmp_precious) {
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		/* clean and not precious: just free it */
		if (!m->vmp_dirty && !m->vmp_precious) {
			vm_page_unlock_queues();
			VM_PAGE_FREE(m);
			vm_page_lock_queues();
			delayed_unlock = 0;

			goto next_pg;
		}
		/* ensure the object has a compressor pager before queueing */
		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
			if (!m_object->pager_initialized) {
				vm_page_unlock_queues();

				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);

				if (!m_object->pager_initialized) {
					vm_object_compressor_pager_create(m_object);
				}

				vm_page_lock_queues();
				delayed_unlock = 0;
			}
			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
				goto reenter_pg_on_q;
			}
			/*
			 * vm_object_compressor_pager_create will drop the object lock
			 * which means 'm' may no longer be valid to use
			 */
			continue;
		}

		if (!perf_test) {
			/*
			 * we've already factored out pages in the laundry which
			 * means this page can't be on the pageout queue so it's
			 * safe to do the vm_page_queues_remove
			 */
			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			vm_page_queues_remove(m, TRUE);
			if (donate) {
				/*
				 * The compressor needs to see this bit to know
				 * where this page needs to land. Also if stolen,
				 * this bit helps put the page back in the right
				 * special queue where it belongs.
				 */
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
			}
		} else {
			vm_page_queue_remove(q, m, vmp_pageq);
		}

		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		/* hand the page to the target pageout queue */
		vm_pageout_cluster_to_queue(m, iq);

		pages_moved++;
		goto next_pg;

reenter_pg_on_q:
		/* rotate the page from the head to the tail of its queue */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);
next_pg:
		qcount--;
		try_failed_count = 0;

		/* periodically yield the queue lock to avoid starving others */
		if (delayed_unlock++ > 128) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();
	return pages_moved;
}
1487 
1488 
1489 
1490 /*
1491  * function in BSD to apply I/O throttle to the pageout thread
1492  */
1493 extern void vm_pageout_io_throttle(void);
1494 
/*
 * Revert the "reusable" hint on a page (or on the whole object when it
 * was marked all_reusable) that has shown up where reusable pages should
 * not be; the page has evidently been re-used.  'm' must belong to 'obj'
 * and 'obj' must be locked by the caller.
 */
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
	MACRO_BEGIN                                                     \
	/* \
	 * If a "reusable" page somehow made it back into \
	 * the active queue, it's been re-used and is not \
	 * quite re-usable. \
	 * If the VM object was "all_reusable", consider it \
	 * as "all re-used" instead of converting it to \
	 * "partially re-used", which could be expensive. \
	 */                                                             \
	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
	if ((m)->vmp_reusable ||                                        \
	    (obj)->all_reusable) {                                      \
	        vm_object_reuse_pages((obj),                            \
	                              (m)->vmp_offset,                  \
	                              (m)->vmp_offset + PAGE_SIZE_64,   \
	                              FALSE);                           \
	}                                                               \
	MACRO_END
1514 
1515 
1516 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT         64
1517 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024
1518 
/* flow-control states for the pageout scan's internal-queue throttling */
#define FCS_IDLE                0       /* no flow-control delay in effect */
#define FCS_DELAYED             1       /* waiting for the queue to drain */
#define FCS_DEADLOCK_DETECTED   2       /* wait timed out; suspected deadlock */

struct flow_control {
	int             state;  /* one of the FCS_* values above */
	mach_timespec_t ts;     /* deadline associated with the current state */
};
1527 
1528 
1529 uint64_t vm_pageout_rejected_bq_internal = 0;
1530 uint64_t vm_pageout_rejected_bq_external = 0;
1531 uint64_t vm_pageout_skipped_bq_internal = 0;
1532 uint64_t vm_pageout_skipped_bq_external = 0;
1533 
1534 #define ANONS_GRABBED_LIMIT     2
1535 
1536 
1537 #if 0
1538 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1539 #endif
1540 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1541 
1542 #define VM_PAGEOUT_PB_NO_ACTION                         0
1543 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1544 #define VM_PAGEOUT_PB_THREAD_YIELD                      2
1545 
1546 
#if 0
/*
 * NOTE: compiled out (#if 0) - retained for reference only.
 * Flushes the caller's local free-page batch (if any) back to the free
 * list, or simply yields the page-queue lock, then resets the caller's
 * delayed_unlock counter.
 */
static void
vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
{
	if (*local_freeq) {
		vm_page_unlock_queues();

		VM_DEBUG_CONSTANT_EVENT(
			vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
			vm_page_free_count, 0, 0, 1);

		vm_page_free_list(*local_freeq, TRUE);

		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
		    vm_page_free_count, *local_freed, 0, 1);

		*local_freeq = NULL;
		*local_freed = 0;

		vm_page_lock_queues();
	} else {
		lck_mtx_yield(&vm_page_queue_lock);
	}
	*delayed_unlock = 1;
}
#endif
1573 
1574 
1575 static void
vm_pageout_prepare_to_block(vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int action)1576 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1577     vm_page_t *local_freeq, int *local_freed, int action)
1578 {
1579 	vm_page_unlock_queues();
1580 
1581 	if (*object != NULL) {
1582 		vm_object_unlock(*object);
1583 		*object = NULL;
1584 	}
1585 	if (*local_freeq) {
1586 		vm_page_free_list(*local_freeq, TRUE);
1587 
1588 		*local_freeq = NULL;
1589 		*local_freed = 0;
1590 	}
1591 	*delayed_unlock = 1;
1592 
1593 	switch (action) {
1594 	case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1595 		vm_consider_waking_compactor_swapper();
1596 		break;
1597 	case VM_PAGEOUT_PB_THREAD_YIELD:
1598 		thread_yield_internal(1);
1599 		break;
1600 	case VM_PAGEOUT_PB_NO_ACTION:
1601 	default:
1602 		break;
1603 	}
1604 	vm_page_lock_queues();
1605 }
1606 
1607 
1608 static struct vm_pageout_vminfo last;
1609 
1610 uint64_t last_vm_page_pages_grabbed = 0;
1611 
1612 extern  uint32_t c_segment_pages_compressed;
1613 
1614 extern uint64_t shared_region_pager_reclaimed;
1615 extern struct memory_object_pager_ops shared_region_pager_ops;
1616 
1617 void
update_vm_info(void)1618 update_vm_info(void)
1619 {
1620 	unsigned long tmp;
1621 	uint64_t tmp64;
1622 
1623 	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1624 	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1625 	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1626 	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1627 
1628 	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1629 	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1630 	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1631 
1632 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1633 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1634 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1635 	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1636 	vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1637 
1638 	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1639 	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1640 	last.vm_pageout_considered_page = tmp;
1641 
1642 	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1643 	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1644 	last.vm_pageout_compressions = tmp64;
1645 
1646 	tmp = vm_pageout_vminfo.vm_compressor_failed;
1647 	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1648 	last.vm_compressor_failed = tmp;
1649 
1650 	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1651 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1652 	last.vm_compressor_pages_grabbed = tmp64;
1653 
1654 	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1655 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1656 	last.vm_phantom_cache_found_ghost = tmp;
1657 
1658 	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1659 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1660 	last.vm_phantom_cache_added_ghost = tmp;
1661 
1662 	tmp64 = counter_load(&vm_page_grab_count);
1663 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1664 	last_vm_page_pages_grabbed = tmp64;
1665 
1666 	tmp = vm_pageout_vminfo.vm_page_pages_freed;
1667 	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1668 	last.vm_page_pages_freed = tmp;
1669 
1670 	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1671 		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1672 		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1673 		last.vm_pageout_pages_evicted = tmp;
1674 
1675 		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1676 		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1677 		last.vm_pageout_pages_purged = tmp;
1678 
1679 		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1680 		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1681 		last.vm_pageout_freed_speculative = tmp;
1682 
1683 		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1684 		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1685 		last.vm_pageout_freed_external = tmp;
1686 
1687 		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1688 		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1689 		last.vm_pageout_inactive_referenced = tmp;
1690 
1691 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1692 		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1693 		last.vm_pageout_scan_inactive_throttled_external = tmp;
1694 
1695 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1696 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1697 		last.vm_pageout_inactive_dirty_external = tmp;
1698 
1699 		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1700 		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1701 		last.vm_pageout_freed_cleaned = tmp;
1702 
1703 		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1704 		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1705 		last.vm_pageout_inactive_nolock = tmp;
1706 
1707 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1708 		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1709 		last.vm_pageout_scan_inactive_throttled_internal = tmp;
1710 
1711 		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1712 		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1713 		last.vm_pageout_skipped_external = tmp;
1714 
1715 		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1716 		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1717 		last.vm_pageout_skipped_internal = tmp;
1718 
1719 		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1720 		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1721 		last.vm_pageout_reactivation_limit_exceeded = tmp;
1722 
1723 		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1724 		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1725 		last.vm_pageout_inactive_force_reclaim = tmp;
1726 
1727 		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1728 		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1729 		last.vm_pageout_freed_internal = tmp;
1730 
1731 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1732 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1733 		last.vm_pageout_considered_bq_internal = tmp;
1734 
1735 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1736 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1737 		last.vm_pageout_considered_bq_external = tmp;
1738 
1739 		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1740 		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1741 		last.vm_pageout_filecache_min_reactivated = tmp;
1742 
1743 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1744 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1745 		last.vm_pageout_inactive_dirty_internal = tmp;
1746 
1747 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1748 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1749 		last.vm_pageout_forcereclaimed_sharedcache = tmp;
1750 
1751 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1752 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1753 		last.vm_pageout_forcereclaimed_realtime = tmp;
1754 
1755 		tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1756 		vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1757 		last.vm_pageout_protected_sharedcache = tmp;
1758 
1759 		tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1760 		vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1761 		last.vm_pageout_protected_realtime = tmp;
1762 	}
1763 
1764 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
1765 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1766 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1767 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1768 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
1769 	    0);
1770 
1771 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
1772 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1773 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1774 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
1775 	    0,
1776 	    0);
1777 
1778 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
1779 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1780 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1781 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1782 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
1783 	    0);
1784 
1785 	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1786 	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1787 	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1788 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
1789 		    vm_pageout_stats[vm_pageout_stat_now].considered,
1790 		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1791 		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
1792 		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
1793 		    0);
1794 
1795 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
1796 		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1797 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1798 		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1799 		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
1800 		    0);
1801 
1802 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
1803 		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1804 		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1805 		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1806 		    vm_pageout_stats[vm_pageout_stat_now].skipped_external,
1807 		    0);
1808 
1809 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
1810 		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1811 		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1812 		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1813 		    vm_pageout_stats[vm_pageout_stat_now].freed_internal,
1814 		    0);
1815 
1816 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
1817 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1818 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1819 		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1820 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
1821 		    0);
1822 
1823 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO10)) | DBG_FUNC_NONE,
1824 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1825 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1826 		    vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1827 		    vm_pageout_stats[vm_pageout_stat_now].protected_realtime,
1828 		    0);
1829 	}
1830 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
1831 	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1832 	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1833 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1834 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
1835 	    0);
1836 
1837 	record_memory_pressure();
1838 }
1839 
1840 extern boolean_t hibernation_vmqueues_inspection;
1841 
1842 /*
1843  * Return values for functions called by vm_pageout_scan
1844  * that control its flow.
1845  *
1846  * PROCEED -- vm_pageout_scan will keep making forward progress.
1847  * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1848  * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1849  */
1850 
1851 #define VM_PAGEOUT_SCAN_PROCEED                 (0)
1852 #define VM_PAGEOUT_SCAN_DONE_RETURN             (1)
1853 #define VM_PAGEOUT_SCAN_NEXT_ITERATION          (2)
1854 
1855 /*
1856  * This function is called only from vm_pageout_scan and
1857  * it moves overflow secluded pages (one-at-a-time) to the
1858  * batched 'local' free Q or active Q.
1859  */
1860 static void
vps_deal_with_secluded_page_overflow(vm_page_t * local_freeq,int * local_freed)1861 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1862 {
1863 #if CONFIG_SECLUDED_MEMORY
1864 	/*
1865 	 * Deal with secluded_q overflow.
1866 	 */
1867 	if (vm_page_secluded_count > vm_page_secluded_target) {
1868 		vm_page_t secluded_page;
1869 
1870 		/*
1871 		 * SECLUDED_AGING_BEFORE_ACTIVE:
1872 		 * Excess secluded pages go to the active queue and
1873 		 * will later go to the inactive queue.
1874 		 */
1875 		assert((vm_page_secluded_count_free +
1876 		    vm_page_secluded_count_inuse) ==
1877 		    vm_page_secluded_count);
1878 		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1879 		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1880 
1881 		vm_page_queues_remove(secluded_page, FALSE);
1882 		assert(!secluded_page->vmp_fictitious);
1883 		assert(!VM_PAGE_WIRED(secluded_page));
1884 
1885 		if (secluded_page->vmp_object == 0) {
1886 			/* transfer to free queue */
1887 			assert(secluded_page->vmp_busy);
1888 			secluded_page->vmp_snext = *local_freeq;
1889 			*local_freeq = secluded_page;
1890 			*local_freed += 1;
1891 		} else {
1892 			/* transfer to head of active queue */
1893 			vm_page_enqueue_active(secluded_page, FALSE);
1894 			secluded_page = VM_PAGE_NULL;
1895 		}
1896 	}
1897 #else /* CONFIG_SECLUDED_MEMORY */
1898 
1899 #pragma unused(local_freeq)
1900 #pragma unused(local_freed)
1901 
1902 	return;
1903 
1904 #endif /* CONFIG_SECLUDED_MEMORY */
1905 }
1906 
1907 /*
1908  * This function is called only from vm_pageout_scan and
1909  * it initializes the loop targets for vm_pageout_scan().
1910  */
1911 static void
vps_init_page_targets(void)1912 vps_init_page_targets(void)
1913 {
1914 	/*
1915 	 * LD TODO: Other page targets should be calculated here too.
1916 	 */
1917 	vm_page_anonymous_min = vm_page_inactive_target / 20;
1918 
1919 	if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1920 		vm_pageout_state.vm_page_speculative_percentage = 50;
1921 	} else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1922 		vm_pageout_state.vm_page_speculative_percentage = 1;
1923 	}
1924 
1925 	vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1926 	    vm_page_inactive_count);
1927 }
1928 
1929 /*
1930  * This function is called only from vm_pageout_scan and
1931  * it purges a single VM object at-a-time and will either
 * make vm_pageout_scan() restart the loop or keep moving forward.
1933  */
static int
vps_purge_object()
{
	int             force_purge;

	assert(available_for_purge >= 0);
	force_purge = 0; /* no force-purging */

#if VM_PRESSURE_EVENTS
	vm_pressure_level_t pressure_level;

	pressure_level = memorystatus_vm_pressure_level;

	if (pressure_level > kVMPressureNormal) {
		/*
		 * Under memory pressure, enable force-purging with a
		 * budget that scales with the severity of the pressure.
		 */
		if (pressure_level >= kVMPressureCritical) {
			force_purge = vm_pageout_state.memorystatus_purge_on_critical;
		} else if (pressure_level >= kVMPressureUrgent) {
			force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
		} else if (pressure_level >= kVMPressureWarning) {
			force_purge = vm_pageout_state.memorystatus_purge_on_warning;
		}
	}
#endif /* VM_PRESSURE_EVENTS */

	if (available_for_purge || force_purge) {
		memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);

		VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
		if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
			/*
			 * We purged an object (potentially freeing pages), so
			 * have vm_pageout_scan re-evaluate memory conditions
			 * from the top of its loop.
			 */
			VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
			VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
			memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);

			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
		}
		/* nothing was purged (C_DONT_BLOCK): fall through, keep scanning */
		VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
		memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
1975 
1976 /*
1977  * This function is called only from vm_pageout_scan and
1978  * it will try to age the next speculative Q if the oldest
1979  * one is empty.
1980  */
static int
vps_age_speculative_queue(boolean_t force_speculative_aging)
{
#define DELAY_SPECULATIVE_AGE   1000

	/*
	 * try to pull pages from the aging bins...
	 * see vm_page.h for an explanation of how
	 * this mechanism works
	 */
	boolean_t                       can_steal = FALSE;
	int                             num_scanned_queues;
	static int                      delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop.*/
	mach_timespec_t                 ts;
	struct vm_speculative_age_q     *aq;
	struct vm_speculative_age_q     *sq;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	aq = &vm_page_queue_speculative[speculative_steal_index];

	/*
	 * Advance speculative_steal_index past empty aging bins
	 * (wrapping around); give up after inspecting every bin once.
	 */
	num_scanned_queues = 0;
	while (vm_page_queue_empty(&aq->age_q) &&
	    num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
		speculative_steal_index++;

		if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
		}

		aq = &vm_page_queue_speculative[speculative_steal_index];
	}

	if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
		/*
		 * XXX We've scanned all the speculative
		 * queues but still haven't found one
		 * that is not empty, even though
		 * vm_page_speculative_count is not 0.
		 */
		if (!vm_page_queue_empty(&sq->age_q)) {
			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
		}
#if DEVELOPMENT || DEBUG
		panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
#endif
		/* readjust... */
		vm_page_speculative_count = 0;
		/* ... and continue */
		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
	}

	/*
	 * Decide whether this bin may be aged into the AGED queue:
	 * immediately when we're over the speculative target (or the
	 * caller forces aging), otherwise only once the bin's timestamp
	 * shows it has outlived the full aging interval.
	 */
	if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
		can_steal = TRUE;
	} else {
		if (!delay_speculative_age) {
			mach_timespec_t ts_fully_aged;

			/* deadline = bin timestamp + (#bins * per-bin age, given in ms) */
			ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
			ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
			    * 1000 * NSEC_PER_USEC;

			ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);

			clock_sec_t sec;
			clock_nsec_t nsec;
			clock_get_system_nanotime(&sec, &nsec);
			ts.tv_sec = (unsigned int) sec;
			ts.tv_nsec = nsec;

			if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
				can_steal = TRUE;
			} else {
				/*
				 * not ripe yet: back off for up to
				 * DELAY_SPECULATIVE_AGE calls before
				 * re-checking the clock
				 */
				delay_speculative_age++;
			}
		} else {
			delay_speculative_age++;
			if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
				delay_speculative_age = 0;
			}
		}
	}
	if (can_steal == TRUE) {
		vm_page_speculate_ageit(aq);
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
2069 
2070 /*
2071  * This function is called only from vm_pageout_scan and
2072  * it evicts a single VM object from the cache.
2073  */
static int inline
vps_object_cache_evict(vm_object_t *object_to_unlock)
{
	/* cooldown: after a fruitless eviction pass, skip this many calls */
	static int                      cache_evict_throttle = 0;
	struct vm_speculative_age_q     *sq;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/*
	 * Only attempt an eviction once the aged speculative queue is
	 * drained and we are not inside a cooldown window.
	 */
	if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
		int     pages_evicted;

		/* drop any object lock we're still holding before the eviction pass */
		if (*object_to_unlock != NULL) {
			vm_object_unlock(*object_to_unlock);
			*object_to_unlock = NULL;
		}
		KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);

		/* evict up to 100 pages; second arg presumably bounds the object scan — confirm against vm_object_cache_evict() */
		pages_evicted = vm_object_cache_evict(100, 10);

		KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);

		if (pages_evicted) {
			vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;

			VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
			    vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
			memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);

			/*
			 * we just freed up to 100 pages,
			 * so go back to the top of the main loop
			 * and re-evaluate the memory situation
			 */
			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
		} else {
			cache_evict_throttle = 1000;
		}
	}
	if (cache_evict_throttle) {
		cache_evict_throttle--;
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
2118 
2119 
2120 /*
2121  * This function is called only from vm_pageout_scan and
2122  * it calculates the filecache min. that needs to be maintained
2123  * as we start to steal pages.
2124  */
2125 static void
vps_calculate_filecache_min(void)2126 vps_calculate_filecache_min(void)
2127 {
2128 	int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2129 
2130 #if CONFIG_JETSAM
2131 	/*
2132 	 * don't let the filecache_min fall below 15% of available memory
2133 	 * on systems with an active compressor that isn't nearing its
2134 	 * limits w/r to accepting new data
2135 	 *
2136 	 * on systems w/o the compressor/swapper, the filecache is always
2137 	 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2138 	 * since most (if not all) of the anonymous pages are in the
2139 	 * throttled queue (which isn't counted as available) which
2140 	 * effectively disables this filter
2141 	 */
2142 	if (vm_compressor_low_on_space() || divisor == 0) {
2143 		vm_pageout_state.vm_page_filecache_min = 0;
2144 	} else {
2145 		vm_pageout_state.vm_page_filecache_min =
2146 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2147 	}
2148 #else
2149 	if (vm_compressor_out_of_space() || divisor == 0) {
2150 		vm_pageout_state.vm_page_filecache_min = 0;
2151 	} else {
2152 		/*
2153 		 * don't let the filecache_min fall below the specified critical level
2154 		 */
2155 		vm_pageout_state.vm_page_filecache_min =
2156 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2157 	}
2158 #endif
2159 	if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2160 		vm_pageout_state.vm_page_filecache_min = 0;
2161 	}
2162 }
2163 
2164 /*
2165  * This function is called only from vm_pageout_scan and
2166  * it updates the flow control time to detect if VM pageoutscan
2167  * isn't making progress.
2168  */
2169 static void
vps_flow_control_reset_deadlock_timer(struct flow_control * flow_control)2170 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2171 {
2172 	mach_timespec_t ts;
2173 	clock_sec_t sec;
2174 	clock_nsec_t nsec;
2175 
2176 	ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2177 	ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2178 	clock_get_system_nanotime(&sec, &nsec);
2179 	flow_control->ts.tv_sec = (unsigned int) sec;
2180 	flow_control->ts.tv_nsec = nsec;
2181 	ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2182 
2183 	flow_control->state = FCS_DELAYED;
2184 
2185 	vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2186 }
2187 
2188 /*
2189  * This function is called only from vm_pageout_scan and
2190  * it is the flow control logic of VM pageout scan which
2191  * controls if it should block and for how long.
2192  * Any blocking of vm_pageout_scan happens ONLY in this function.
2193  */
static int
vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
    vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
{
	boolean_t       exceeded_burst_throttle = FALSE;
	unsigned int    msecs = 0;      /* how long we will block below, in milliseconds */
	uint32_t        inactive_external_count;
	mach_timespec_t ts;
	struct  vm_pageout_queue *iq;
	struct  vm_pageout_queue *eq;
	struct  vm_speculative_age_q *sq;

	iq = &vm_pageout_queue_internal;
	eq = &vm_pageout_queue_external;
	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/*
	 * Sometimes we have to pause:
	 *	1) No inactive pages - nothing to do.
	 *	2) Loop control - no acceptable pages found on the inactive queue
	 *         within the last vm_pageout_burst_inactive_throttle iterations
	 *	3) Flow control - default pageout queue is full
	 */
	if (vm_page_queue_empty(&vm_page_queue_inactive) &&
	    vm_page_queue_empty(&vm_page_queue_anonymous) &&
	    vm_page_queue_empty(&vm_page_queue_cleaned) &&
	    vm_page_queue_empty(&sq->age_q)) {
		/* case 1: no reclaimable pages anywhere */
		VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
		msecs = vm_pageout_state.vm_pageout_empty_wait;
	} else if (inactive_burst_count >=
	    MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
	    (vm_page_inactive_count +
	    vm_page_speculative_count))) {
		/* case 2: too many consecutive unusable inactive pages */
		VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
		msecs = vm_pageout_state.vm_pageout_burst_wait;

		exceeded_burst_throttle = TRUE;
	} else if (VM_PAGE_Q_THROTTLED(iq) &&
	    VM_DYNAMIC_PAGING_ENABLED()) {
		/* case 3: internal (compressor) pageout queue is full */
		clock_sec_t sec;
		clock_nsec_t nsec;

		switch (flow_control->state) {
		case FCS_IDLE:
			if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
			    vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
				/*
				 * since the compressor is running independently of vm_pageout_scan
				 * let's not wait for it just yet... as long as we have a healthy supply
				 * of filecache pages to work with, let's keep stealing those.
				 */
				inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;

				if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
				    (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
					/* steer the scan towards file-backed pages and keep going */
					*anons_grabbed = ANONS_GRABBED_LIMIT;
					VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
					return VM_PAGEOUT_SCAN_PROCEED;
				}
			}

			/* start the deadlock-detection clock (FCS_IDLE -> FCS_DELAYED) */
			vps_flow_control_reset_deadlock_timer(flow_control);
			msecs = vm_pageout_state.vm_pageout_deadlock_wait;

			break;

		case FCS_DELAYED:
			clock_get_system_nanotime(&sec, &nsec);
			ts.tv_sec = (unsigned int) sec;
			ts.tv_nsec = nsec;

			if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
				/*
				 * the pageout thread for the default pager is potentially
				 * deadlocked since the
				 * default pager queue has been throttled for more than the
				 * allowable time... we need to move some clean pages or dirty
				 * pages belonging to the external pagers if they aren't throttled
				 * vm_page_free_wanted represents the number of threads currently
				 * blocked waiting for pages... we'll move one page for each of
				 * these plus a fixed amount to break the logjam... once we're done
				 * moving this number of pages, we'll re-enter the FSC_DELAYED state
				 * with a new timeout target since we have no way of knowing
				 * whether we've broken the deadlock except through observation
				 * of the queue associated with the default pager... we need to
				 * stop moving pages and allow the system to run to see what
				 * state it settles into.
				 */

				*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
				    vm_page_free_wanted + vm_page_free_wanted_privileged;
				VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
				flow_control->state = FCS_DEADLOCK_DETECTED;
				/* kick the garbage-collection thread to help free memory */
				thread_wakeup(VM_PAGEOUT_GC_EVENT);
				return VM_PAGEOUT_SCAN_PROCEED;
			}
			/*
			 * just resniff instead of trying
			 * to compute a new delay time... we're going to be
			 * awakened immediately upon a laundry completion,
			 * so we won't wait any longer than necessary
			 */
			msecs = vm_pageout_state.vm_pageout_idle_wait;
			break;

		case FCS_DEADLOCK_DETECTED:
			if (*vm_pageout_deadlock_target) {
				/* still working off the deadlock-relief quota */
				return VM_PAGEOUT_SCAN_PROCEED;
			}

			/* quota consumed: re-arm the timer and wait again */
			vps_flow_control_reset_deadlock_timer(flow_control);
			msecs = vm_pageout_state.vm_pageout_deadlock_wait;

			break;
		}
	} else {
		/*
		 * No need to pause...
		 */
		return VM_PAGEOUT_SCAN_PROCEED;
	}

	vm_pageout_scan_wants_object = VM_OBJECT_NULL;

	/* flush locally batched free pages / drop the object lock before blocking */
	vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
	    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);

	if (vm_page_free_count >= vm_page_free_target) {
		/*
		 * we're here because
		 *  1) someone else freed up some pages while we had
		 *     the queues unlocked above
		 * and we've hit one of the 3 conditions that
		 * cause us to pause the pageout scan thread
		 *
		 * since we already have enough free pages,
		 * let's avoid stalling and return normally
		 *
		 * before we return, make sure the pageout I/O threads
		 * are running throttled in case there are still requests
		 * in the laundry... since we have enough free pages
		 * we don't need the laundry to be cleaned in a timely
		 * fashion... so let's avoid interfering with foreground
		 * activity
		 *
		 * we don't want to hold vm_page_queue_free_lock when
		 * calling vm_pageout_adjust_eq_iothrottle (since it
		 * may cause other locks to be taken), we do the initial
		 * check outside of the lock.  Once we take the lock,
		 * we recheck the condition since it may have changed.
		 * if it has, no problem, we will make the threads
		 * non-throttled before actually blocking
		 */
		vm_pageout_adjust_eq_iothrottle(eq, TRUE);
	}
	vm_free_page_lock();

	if (vm_page_free_count >= vm_page_free_target &&
	    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
		/*
		 * NOTE(review): this path returns with vm_free_page_lock
		 * still held — the caller (vm_pageout_scan) is expected to
		 * release it on the DONE_RETURN path; confirm there.
		 */
		return VM_PAGEOUT_SCAN_DONE_RETURN;
	}
	vm_free_page_unlock();

	if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
		/*
		 * we're most likely about to block due to one of
		 * the 3 conditions that cause vm_pageout_scan to
		 * not be able to make forward progress w/r
		 * to providing new pages to the free queue,
		 * so unthrottle the I/O threads in case we
		 * have laundry to be cleaned... it needs
		 * to be completed ASAP.
		 *
		 * even if we don't block, we want the io threads
		 * running unthrottled since the sum of free +
		 * clean pages is still under our free target
		 */
		vm_pageout_adjust_eq_iothrottle(eq, FALSE);
	}
	if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
		/*
		 * if we get here we're below our free target and
		 * we're stalling due to a full laundry queue or
		 * we don't have any inactive pages other than
		 * those in the clean queue...
		 * however, we have pages on the clean queue that
		 * can be moved to the free queue, so let's not
		 * stall the pageout scan
		 */
		flow_control->state = FCS_IDLE;
		return VM_PAGEOUT_SCAN_PROCEED;
	}
	if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
		/* the internal queue drained while we were preparing to block */
		flow_control->state = FCS_IDLE;
		return VM_PAGEOUT_SCAN_PROCEED;
	}

	VM_CHECK_MEMORYSTATUS;

	if (flow_control->state != FCS_IDLE) {
		VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
	}

	/*
	 * Block for up to 'msecs' waiting for laundry completion on the
	 * internal pageout queue (a completion wakes us early).
	 */
	iq->pgo_throttled = TRUE;
	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);

	vm_page_unlock_queues();

	assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);

	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);

	thread_block(THREAD_CONTINUE_NULL);

	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);

	vm_page_lock_queues();

	iq->pgo_throttled = FALSE;

	/* conditions changed while we slept: recompute scan targets and restart the loop */
	vps_init_page_targets();

	return VM_PAGEOUT_SCAN_NEXT_ITERATION;
}
2422 
2423 extern boolean_t vm_darkwake_mode;
2424 /*
2425  * This function is called only from vm_pageout_scan and
2426  * it will find and return the most appropriate page to be
2427  * reclaimed.
2428  */
static int
vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
    boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
{
	vm_page_t                       m = NULL;
	vm_object_t                     m_object = VM_OBJECT_NULL;
	uint32_t                        inactive_external_count;
	struct vm_speculative_age_q     *sq;
	struct vm_pageout_queue         *iq;
	int                             retval = VM_PAGEOUT_SCAN_PROCEED;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
	iq = &vm_pageout_queue_internal;

	*is_page_from_bg_q = FALSE;

	m = NULL;
	m_object = VM_OBJECT_NULL;

	if (VM_DYNAMIC_PAGING_ENABLED()) {
		assert(vm_page_throttled_count == 0);
		assert(vm_page_queue_empty(&vm_page_queue_throttled));
	}

	/*
	 * Try for a clean-queue inactive page.
	 * These are pages that vm_pageout_scan tried to steal earlier, but
	 * were dirty and had to be cleaned.  Pick them up now that they are clean.
	 */
	if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);

		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);

		goto found_page;
	}

	/*
	 * The next most eligible pages are ones we paged in speculatively,
	 * but which have not yet been touched and have been aged out.
	 */
	if (!vm_page_queue_empty(&sq->age_q)) {
		m = (vm_page_t) vm_page_queue_first(&sq->age_q);

		assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);

		/* a dirty speculative page is skipped when anonymous pages are being forced */
		if (!m->vmp_dirty || force_anonymous == FALSE) {
			goto found_page;
		} else {
			m = NULL;
		}
	}

#if !CONFIG_JETSAM
	/* next preference: ripe pages from the donation queue */
	if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
		if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
			assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			goto found_page;
		}
	}
#endif /* !CONFIG_JETSAM */

	/* next preference: background-queue pages, once that queue exceeds its target */
	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
		vm_object_t     bg_m_object = NULL;

		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);

		bg_m_object = VM_PAGE_OBJECT(m);

		if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
			/*
			 * This page is on the background queue
			 * but not on a pageable queue OR is busy during
			 * darkwake mode when the target is artificially lowered.
			 * If it is busy during darkwake mode, and we don't skip it,
			 * we will just swing back around and try again with the same
			 * queue and might hit the same page or its neighbor in a
			 * similar state. Both of these are transient states and will
			 * get resolved, but, at this point let's ignore this page.
			 */
			if (vm_darkwake_mode && m->vmp_busy) {
				if (bg_m_object->internal) {
					vm_pageout_skipped_bq_internal++;
				} else {
					vm_pageout_skipped_bq_external++;
				}
			}
		} else if (force_anonymous == FALSE || bg_m_object->internal) {
			if (bg_m_object->internal &&
			    (VM_PAGE_Q_THROTTLED(iq) ||
			    vm_compressor_out_of_space() == TRUE ||
			    vm_page_free_count < (vm_page_free_reserved / 4))) {
				/* compressor can't take this internal page right now */
				vm_pageout_skipped_bq_internal++;
			} else {
				*is_page_from_bg_q = TRUE;

				if (bg_m_object->internal) {
					vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
				} else {
					vm_pageout_vminfo.vm_pageout_considered_bq_external++;
				}
				goto found_page;
			}
		}
	}

	/*
	 * Decide between anonymous (internal) and file-backed (external)
	 * pages based on how healthy the filecache currently is.
	 */
	inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;

	if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
	    (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
		*grab_anonymous = TRUE;
		*anons_grabbed = 0;

		if (VM_CONFIG_SWAP_IS_ACTIVE) {
			vm_pageout_vminfo.vm_pageout_skipped_external++;
		} else {
			if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
				/*
				 * No swap and we are in dangerously low levels of free memory.
				 * If we keep going ahead with anonymous pages, we are going to run into a situation
				 * where the compressor will be stuck waiting for free pages (if it isn't already).
				 *
				 * So, pick a file backed page...
				 */
				*grab_anonymous = FALSE;
				*anons_grabbed = ANONS_GRABBED_LIMIT;
				vm_pageout_vminfo.vm_pageout_skipped_internal++;
			}
		}
		goto want_anonymous;
	}
	*grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);

#if CONFIG_JETSAM
	/* If the file-backed pool has accumulated
	 * significantly more pages than the jetsam
	 * threshold, prefer to reclaim those
	 * inline to minimise compute overhead of reclaiming
	 * anonymous pages.
	 * This calculation does not account for the CPU local
	 * external page queues, as those are expected to be
	 * much smaller relative to the global pools.
	 */

	struct vm_pageout_queue *eq = &vm_pageout_queue_external;

	if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
		if (vm_page_pageable_external_count >
		    vm_pageout_state.vm_page_filecache_min) {
			if ((vm_page_pageable_external_count *
			    vm_pageout_memorystatus_fb_factor_dr) >
			    (memorystatus_available_pages_critical *
			    vm_pageout_memorystatus_fb_factor_nr)) {
				*grab_anonymous = FALSE;

				VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
			}
		}
		if (*grab_anonymous) {
			VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
		}
	}
#endif /* CONFIG_JETSAM */

want_anonymous:
	if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
		if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);

			assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
			*anons_grabbed = 0;

			if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
				if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
					/* reactivate 99 of every 100 pages taken below the filecache floor */
					if ((++(*reactivated_this_call) % 100)) {
						vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;

						vm_page_activate(m);
						counter_inc(&vm_statistics_reactivations);
#if DEVELOPMENT || DEBUG
						/*
						 * NOTE(review): *is_page_from_bg_q is always FALSE
						 * here (the bg-queue path jumps straight to
						 * found_page), so m_object — still VM_OBJECT_NULL —
						 * is never dereferenced; this branch is dead code.
						 */
						if (*is_page_from_bg_q == TRUE) {
							if (m_object->internal) {
								vm_pageout_rejected_bq_internal++;
							} else {
								vm_pageout_rejected_bq_external++;
							}
						}
#endif /* DEVELOPMENT || DEBUG */
						vm_pageout_state.vm_pageout_inactive_used++;

						m = NULL;
						retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;

						goto found_page;
					}

					/*
					 * steal 1 of the file backed pages even if
					 * we are under the limit that has been set
					 * for a healthy filecache
					 */
				}
			}
			goto found_page;
		}
	}
	if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);

		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
		*anons_grabbed += 1;

		goto found_page;
	}

	/* nothing eligible anywhere */
	m = NULL;

found_page:
	*victim_page = m;

	return retval;
}
2652 
2653 /*
2654  * This function is called only from vm_pageout_scan and
2655  * it will put a page back on the active/inactive queue
2656  * if we can't reclaim it for some reason.
2657  */
2658 static void
vps_requeue_page(vm_page_t m,int page_prev_q_state,__unused boolean_t page_from_bg_q)2659 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2660 {
2661 	if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2662 		vm_page_enqueue_inactive(m, FALSE);
2663 	} else {
2664 		vm_page_activate(m);
2665 	}
2666 
2667 #if DEVELOPMENT || DEBUG
2668 	vm_object_t m_object = VM_PAGE_OBJECT(m);
2669 
2670 	if (page_from_bg_q == TRUE) {
2671 		if (m_object->internal) {
2672 			vm_pageout_rejected_bq_internal++;
2673 		} else {
2674 			vm_pageout_rejected_bq_external++;
2675 		}
2676 	}
2677 #endif /* DEVELOPMENT || DEBUG */
2678 }
2679 
2680 /*
2681  * This function is called only from vm_pageout_scan and
2682  * it will try to grab the victim page's VM object (m_object)
2683  * which differs from the previous victim page's object (object).
2684  */
2685 static int
vps_switch_object(vm_page_t m,vm_object_t m_object,vm_object_t * object,int page_prev_q_state,boolean_t avoid_anon_pages,boolean_t page_from_bg_q)2686 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2687 {
2688 	struct vm_speculative_age_q *sq;
2689 
2690 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2691 
2692 	/*
2693 	 * the object associated with candidate page is
2694 	 * different from the one we were just working
2695 	 * with... dump the lock if we still own it
2696 	 */
2697 	if (*object != NULL) {
2698 		vm_object_unlock(*object);
2699 		*object = NULL;
2700 	}
2701 	/*
2702 	 * Try to lock object; since we've alread got the
2703 	 * page queues lock, we can only 'try' for this one.
2704 	 * if the 'try' fails, we need to do a mutex_pause
2705 	 * to allow the owner of the object lock a chance to
2706 	 * run... otherwise, we're likely to trip over this
2707 	 * object in the same state as we work our way through
2708 	 * the queue... clumps of pages associated with the same
2709 	 * object are fairly typical on the inactive and active queues
2710 	 */
2711 	if (!vm_object_lock_try_scan(m_object)) {
2712 		vm_page_t m_want = NULL;
2713 
2714 		vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2715 
2716 		if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2717 			VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2718 		}
2719 
2720 		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2721 
2722 		m->vmp_reference = FALSE;
2723 
2724 		if (!m_object->object_is_shared_cache) {
2725 			/*
2726 			 * don't apply this optimization if this is the shared cache
2727 			 * object, it's too easy to get rid of very hot and important
2728 			 * pages...
2729 			 * m->vmp_object must be stable since we hold the page queues lock...
2730 			 * we can update the scan_collisions field sans the object lock
2731 			 * since it is a separate field and this is the only spot that does
2732 			 * a read-modify-write operation and it is never executed concurrently...
2733 			 * we can asynchronously set this field to 0 when creating a UPL, so it
2734 			 * is possible for the value to be a bit non-determistic, but that's ok
2735 			 * since it's only used as a hint
2736 			 */
2737 			m_object->scan_collisions = 1;
2738 		}
2739 		if (page_from_bg_q) {
2740 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2741 		} else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2742 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2743 		} else if (!vm_page_queue_empty(&sq->age_q)) {
2744 			m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2745 		} else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2746 		    !vm_page_queue_empty(&vm_page_queue_inactive)) {
2747 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2748 		} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2749 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2750 		}
2751 
2752 		/*
2753 		 * this is the next object we're going to be interested in
2754 		 * try to make sure its available after the mutex_pause
2755 		 * returns control
2756 		 */
2757 		if (m_want) {
2758 			vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2759 		}
2760 
2761 		vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2762 
2763 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2764 	} else {
2765 		*object = m_object;
2766 		vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2767 	}
2768 
2769 	return VM_PAGEOUT_SCAN_PROCEED;
2770 }
2771 
2772 /*
2773  * This function is called only from vm_pageout_scan and
2774  * it notices that pageout scan may be rendered ineffective
2775  * due to a FS deadlock and will jetsam a process if possible.
2776  * If jetsam isn't supported, it'll move the page to the active
2777  * queue to try and get some different pages pushed onwards so
2778  * we can try to get out of this scenario.
2779  */
2780 static void
vps_deal_with_throttled_queues(vm_page_t m,vm_object_t * object,uint32_t * vm_pageout_inactive_external_forced_reactivate_limit,int * delayed_unlock,boolean_t * force_anonymous,__unused boolean_t is_page_from_bg_q)2781 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2782     int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2783 {
2784 	struct  vm_pageout_queue *eq;
2785 	vm_object_t cur_object = VM_OBJECT_NULL;
2786 
2787 	cur_object = *object;
2788 
2789 	eq = &vm_pageout_queue_external;
2790 
2791 	if (cur_object->internal == FALSE) {
2792 		/*
2793 		 * we need to break up the following potential deadlock case...
2794 		 *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2795 		 *  b) The thread doing the writing is waiting for pages while holding the truncate lock
2796 		 *  c) Most of the pages in the inactive queue belong to this file.
2797 		 *
2798 		 * we are potentially in this deadlock because...
2799 		 *  a) the external pageout queue is throttled
2800 		 *  b) we're done with the active queue and moved on to the inactive queue
2801 		 *  c) we've got a dirty external page
2802 		 *
2803 		 * since we don't know the reason for the external pageout queue being throttled we
2804 		 * must suspect that we are deadlocked, so move the current page onto the active queue
2805 		 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2806 		 *
2807 		 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2808 		 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2809 		 * pool the next time we select a victim page... if we can make enough new free pages,
2810 		 * the deadlock will break, the external pageout queue will empty and it will no longer
2811 		 * be throttled
2812 		 *
2813 		 * if we have jetsam configured, keep a count of the pages reactivated this way so
2814 		 * that we can try to find clean pages in the active/inactive queues before
2815 		 * deciding to jetsam a process
2816 		 */
2817 		vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2818 
2819 		vm_page_check_pageable_safe(m);
2820 		assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2821 		vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2822 		m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2823 		vm_page_active_count++;
2824 		vm_page_pageable_external_count++;
2825 
2826 		vm_pageout_adjust_eq_iothrottle(eq, FALSE);
2827 
2828 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2829 
2830 #pragma unused(force_anonymous)
2831 
2832 		*vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2833 
2834 		if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2835 			*vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2836 			/*
2837 			 * Possible deadlock scenario so request jetsam action
2838 			 */
2839 
2840 			assert(cur_object);
2841 			vm_object_unlock(cur_object);
2842 
2843 			cur_object = VM_OBJECT_NULL;
2844 
2845 			/*
2846 			 * VM pageout scan needs to know we have dropped this lock and so set the
2847 			 * object variable we got passed in to NULL.
2848 			 */
2849 			*object = VM_OBJECT_NULL;
2850 
2851 			vm_page_unlock_queues();
2852 
2853 			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
2854 			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2855 
2856 			/* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
2857 			if (memorystatus_kill_on_VM_page_shortage() == TRUE) {
2858 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
2859 			}
2860 
2861 			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
2862 			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2863 
2864 			vm_page_lock_queues();
2865 			*delayed_unlock = 1;
2866 		}
2867 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2868 
2869 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2870 #pragma unused(delayed_unlock)
2871 
2872 		*force_anonymous = TRUE;
2873 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2874 	} else {
2875 		vm_page_activate(m);
2876 		counter_inc(&vm_statistics_reactivations);
2877 
2878 #if DEVELOPMENT || DEBUG
2879 		if (is_page_from_bg_q == TRUE) {
2880 			if (cur_object->internal) {
2881 				vm_pageout_rejected_bq_internal++;
2882 			} else {
2883 				vm_pageout_rejected_bq_external++;
2884 			}
2885 		}
2886 #endif /* DEVELOPMENT || DEBUG */
2887 
2888 		vm_pageout_state.vm_pageout_inactive_used++;
2889 	}
2890 }
2891 
2892 
/*
 * Move up to 'max_to_move' pages from the head of the active queue
 * to the inactive queue until the inactive+speculative page count
 * reaches vm_page_inactive_target.  Caller must hold the page
 * queues lock.  Skipped entirely while hibernation is touching
 * these queues.
 */
void
vm_page_balance_inactive(int max_to_move)
{
	vm_page_t m;

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
		/*
		 * It is likely that the hibernation code path is
		 * dealing with these very queues as we are about
		 * to move pages around in/from them and completely
		 * change the linkage of the pages.
		 *
		 * And so we skip the rebalancing of these queues.
		 */
		return;
	}
	/* recompute the target from the current queue populations */
	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
	    vm_page_inactive_count +
	    vm_page_speculative_count);

	while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
		VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);

		/* oldest active page is at the head of the queue */
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);

		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
		assert(!m->vmp_laundry);
		assert(VM_PAGE_OBJECT(m) != kernel_object);
		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);

		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);

		/*
		 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
		 *
		 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
		 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
		 * new reference happens. If no further references happen on the page after that remote TLB flushes
		 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
		 * by pageout_scan, which is just fine since the last reference would have happened quite far
		 * in the past (TLB caches don't hang around for very long), and of course could just as easily
		 * have happened before we moved the page
		 */
		if (m->vmp_pmapped == TRUE) {
			/*
			 * We might be holding the page queue lock as a
			 * spin lock and clearing the "referenced" bit could
			 * take a while if there are lots of mappings of
			 * that page, so make sure we acquire the lock as
			 * a mutex to avoid a spinlock timeout.
			 */
			vm_page_lockconvert_queues();
			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
		}

		/*
		 * The page might be absent or busy,
		 * but vm_page_deactivate can handle that.
		 * FALSE indicates that we don't want a H/W clear reference
		 */
		vm_page_deactivate_internal(m, FALSE);
	}
}
2958 
2959 /*
2960  *	vm_pageout_scan does the dirty work for the pageout daemon.
2961  *	It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2962  *	held and vm_page_free_wanted == 0.
2963  */
2964 void
vm_pageout_scan(void)2965 vm_pageout_scan(void)
2966 {
2967 	unsigned int loop_count = 0;
2968 	unsigned int inactive_burst_count = 0;
2969 	unsigned int reactivated_this_call;
2970 	unsigned int reactivate_limit;
2971 	vm_page_t   local_freeq = NULL;
2972 	int         local_freed = 0;
2973 	int         delayed_unlock;
2974 	int         delayed_unlock_limit = 0;
2975 	int         refmod_state = 0;
2976 	int     vm_pageout_deadlock_target = 0;
2977 	struct  vm_pageout_queue *iq;
2978 	struct  vm_pageout_queue *eq;
2979 	struct  vm_speculative_age_q *sq;
2980 	struct  flow_control    flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
2981 	boolean_t inactive_throttled = FALSE;
2982 	vm_object_t     object = NULL;
2983 	uint32_t        inactive_reclaim_run;
2984 	boolean_t       grab_anonymous = FALSE;
2985 	boolean_t       force_anonymous = FALSE;
2986 	boolean_t       force_speculative_aging = FALSE;
2987 	int             anons_grabbed = 0;
2988 	int             page_prev_q_state = 0;
2989 	boolean_t       page_from_bg_q = FALSE;
2990 	uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
2991 	vm_object_t     m_object = VM_OBJECT_NULL;
2992 	int             retval = 0;
2993 	boolean_t       lock_yield_check = FALSE;
2994 
2995 
2996 	VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
2997 	    vm_pageout_vminfo.vm_pageout_freed_speculative,
2998 	    vm_pageout_state.vm_pageout_inactive_clean,
2999 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3000 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3001 
3002 	flow_control.state = FCS_IDLE;
3003 	iq = &vm_pageout_queue_internal;
3004 	eq = &vm_pageout_queue_external;
3005 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3006 
3007 	/* Ask the pmap layer to return any pages it no longer needs. */
3008 	uint64_t pmap_wired_pages_freed = pmap_release_pages_fast();
3009 
3010 	vm_page_lock_queues();
3011 
3012 	vm_page_wire_count -= pmap_wired_pages_freed;
3013 
3014 	delayed_unlock = 1;
3015 
3016 	/*
3017 	 *	Calculate the max number of referenced pages on the inactive
3018 	 *	queue that we will reactivate.
3019 	 */
3020 	reactivated_this_call = 0;
3021 	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3022 	    vm_page_inactive_count);
3023 	inactive_reclaim_run = 0;
3024 
3025 	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3026 
3027 	/*
3028 	 *	We must limit the rate at which we send pages to the pagers
3029 	 *	so that we don't tie up too many pages in the I/O queues.
3030 	 *	We implement a throttling mechanism using the laundry count
3031 	 *      to limit the number of pages outstanding to the default
3032 	 *	and external pagers.  We can bypass the throttles and look
3033 	 *	for clean pages if the pageout queues don't drain in a timely
3034 	 *	fashion since this may indicate that the pageout paths are
3035 	 *	stalled waiting for memory, which only we can provide.
3036 	 */
3037 
3038 	vps_init_page_targets();
3039 	assert(object == NULL);
3040 	assert(delayed_unlock != 0);
3041 
3042 	for (;;) {
3043 		vm_page_t m;
3044 
3045 		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3046 
3047 		if (lock_yield_check) {
3048 			lock_yield_check = FALSE;
3049 
3050 			if (delayed_unlock++ > delayed_unlock_limit) {
3051 				vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3052 				    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3053 			} else if (vm_pageout_scan_wants_object) {
3054 				vm_page_unlock_queues();
3055 				mutex_pause(0);
3056 				vm_page_lock_queues();
3057 			} else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3058 				VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3059 			}
3060 		}
3061 
3062 		if (vm_upl_wait_for_pages < 0) {
3063 			vm_upl_wait_for_pages = 0;
3064 		}
3065 
3066 		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3067 
3068 		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3069 			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3070 		}
3071 
3072 		vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3073 
3074 		assert(delayed_unlock);
3075 
3076 		/*
3077 		 * maintain our balance
3078 		 */
3079 		vm_page_balance_inactive(1);
3080 
3081 
3082 		/**********************************************************************
3083 		* above this point we're playing with the active and secluded queues
3084 		* below this point we're playing with the throttling mechanisms
3085 		* and the inactive queue
3086 		**********************************************************************/
3087 
3088 		if (vm_page_free_count + local_freed >= vm_page_free_target) {
3089 			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3090 
3091 			vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3092 			    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3093 			/*
3094 			 * make sure the pageout I/O threads are running
3095 			 * throttled in case there are still requests
3096 			 * in the laundry... since we have met our targets
3097 			 * we don't need the laundry to be cleaned in a timely
3098 			 * fashion... so let's avoid interfering with foreground
3099 			 * activity
3100 			 */
3101 			vm_pageout_adjust_eq_iothrottle(eq, TRUE);
3102 
3103 			vm_free_page_lock();
3104 
3105 			if ((vm_page_free_count >= vm_page_free_target) &&
3106 			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3107 				/*
3108 				 * done - we have met our target *and*
3109 				 * there is no one waiting for a page.
3110 				 */
3111 return_from_scan:
3112 				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3113 
3114 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3115 				    vm_pageout_state.vm_pageout_inactive,
3116 				    vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3117 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
3118 				    vm_pageout_vminfo.vm_pageout_freed_speculative,
3119 				    vm_pageout_state.vm_pageout_inactive_clean,
3120 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3121 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3122 
3123 				return;
3124 			}
3125 			vm_free_page_unlock();
3126 		}
3127 
3128 		/*
3129 		 * Before anything, we check if we have any ripe volatile
3130 		 * objects around. If so, try to purge the first object.
3131 		 * If the purge fails, fall through to reclaim a page instead.
3132 		 * If the purge succeeds, go back to the top and reevalute
3133 		 * the new memory situation.
3134 		 */
3135 		retval = vps_purge_object();
3136 
3137 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3138 			/*
3139 			 * Success
3140 			 */
3141 			if (object != NULL) {
3142 				vm_object_unlock(object);
3143 				object = NULL;
3144 			}
3145 
3146 			lock_yield_check = FALSE;
3147 			continue;
3148 		}
3149 
3150 		/*
3151 		 * If our 'aged' queue is empty and we have some speculative pages
3152 		 * in the other queues, let's go through and see if we need to age
3153 		 * them.
3154 		 *
3155 		 * If we succeeded in aging a speculative Q or just that everything
3156 		 * looks normal w.r.t queue age and queue counts, we keep going onward.
3157 		 *
3158 		 * If, for some reason, we seem to have a mismatch between the spec.
3159 		 * page count and the page queues, we reset those variables and
3160 		 * restart the loop (LD TODO: Track this better?).
3161 		 */
3162 		if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3163 			retval = vps_age_speculative_queue(force_speculative_aging);
3164 
3165 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3166 				lock_yield_check = FALSE;
3167 				continue;
3168 			}
3169 		}
3170 		force_speculative_aging = FALSE;
3171 
3172 		/*
3173 		 * Check to see if we need to evict objects from the cache.
3174 		 *
3175 		 * Note: 'object' here doesn't have anything to do with
3176 		 * the eviction part. We just need to make sure we have dropped
3177 		 * any object lock we might be holding if we need to go down
3178 		 * into the eviction logic.
3179 		 */
3180 		retval = vps_object_cache_evict(&object);
3181 
3182 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3183 			lock_yield_check = FALSE;
3184 			continue;
3185 		}
3186 
3187 
3188 		/*
3189 		 * Calculate our filecache_min that will affect the loop
3190 		 * going forward.
3191 		 */
3192 		vps_calculate_filecache_min();
3193 
3194 		/*
3195 		 * LD TODO: Use a structure to hold all state variables for a single
3196 		 * vm_pageout_scan iteration and pass that structure to this function instead.
3197 		 */
3198 		retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3199 		    &delayed_unlock, &local_freeq, &local_freed,
3200 		    &vm_pageout_deadlock_target, inactive_burst_count);
3201 
3202 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3203 			if (loop_count >= vm_page_inactive_count) {
3204 				loop_count = 0;
3205 			}
3206 
3207 			inactive_burst_count = 0;
3208 
3209 			assert(object == NULL);
3210 			assert(delayed_unlock != 0);
3211 
3212 			lock_yield_check = FALSE;
3213 			continue;
3214 		} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3215 			goto return_from_scan;
3216 		}
3217 
3218 		flow_control.state = FCS_IDLE;
3219 
3220 		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3221 		    vm_pageout_inactive_external_forced_reactivate_limit);
3222 		loop_count++;
3223 		inactive_burst_count++;
3224 		vm_pageout_state.vm_pageout_inactive++;
3225 
3226 		/*
3227 		 * Choose a victim.
3228 		 */
3229 
3230 		m = NULL;
3231 		retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3232 
3233 		if (m == NULL) {
3234 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3235 				inactive_burst_count = 0;
3236 
3237 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3238 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3239 				}
3240 
3241 				lock_yield_check = TRUE;
3242 				continue;
3243 			}
3244 
3245 			/*
3246 			 * if we've gotten here, we have no victim page.
3247 			 * check to see if we've not finished balancing the queues
3248 			 * or we have a page on the aged speculative queue that we
3249 			 * skipped due to force_anonymous == TRUE.. or we have
3250 			 * speculative  pages that we can prematurely age... if
3251 			 * one of these cases we'll keep going, else panic
3252 			 */
3253 			force_anonymous = FALSE;
3254 			VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3255 
3256 			if (!vm_page_queue_empty(&sq->age_q)) {
3257 				lock_yield_check = TRUE;
3258 				continue;
3259 			}
3260 
3261 			if (vm_page_speculative_count) {
3262 				force_speculative_aging = TRUE;
3263 				lock_yield_check = TRUE;
3264 				continue;
3265 			}
3266 			panic("vm_pageout: no victim");
3267 
3268 			/* NOTREACHED */
3269 		}
3270 
3271 		assert(VM_PAGE_PAGEABLE(m));
3272 		m_object = VM_PAGE_OBJECT(m);
3273 		force_anonymous = FALSE;
3274 
3275 		page_prev_q_state = m->vmp_q_state;
3276 		/*
3277 		 * we just found this page on one of our queues...
3278 		 * it can't also be on the pageout queue, so safe
3279 		 * to call vm_page_queues_remove
3280 		 */
3281 		bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3282 		vm_page_queues_remove(m, TRUE);
3283 		if (donate) {
3284 			/*
3285 			 * The compressor needs to see this bit to know
3286 			 * where this page needs to land. Also if stolen,
3287 			 * this bit helps put the page back in the right
3288 			 * special queue where it belongs.
3289 			 */
3290 			m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3291 		}
3292 
3293 		assert(!m->vmp_laundry);
3294 		assert(!m->vmp_private);
3295 		assert(!m->vmp_fictitious);
3296 		assert(m_object != kernel_object);
3297 		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
3298 
3299 		vm_pageout_vminfo.vm_pageout_considered_page++;
3300 
3301 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3302 
3303 		/*
3304 		 * check to see if we currently are working
3305 		 * with the same object... if so, we've
3306 		 * already got the lock
3307 		 */
3308 		if (m_object != object) {
3309 			boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3310 
3311 			/*
3312 			 * vps_switch_object() will always drop the 'object' lock first
3313 			 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3314 			 * either 'm_object' or NULL.
3315 			 */
3316 			retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3317 
3318 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3319 				lock_yield_check = TRUE;
3320 				continue;
3321 			}
3322 		}
3323 		assert(m_object == object);
3324 		assert(VM_PAGE_OBJECT(m) == m_object);
3325 
3326 		if (m->vmp_busy) {
3327 			/*
3328 			 *	Somebody is already playing with this page.
3329 			 *	Put it back on the appropriate queue
3330 			 *
3331 			 */
3332 			VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3333 
3334 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3335 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3336 			}
3337 
3338 			vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3339 
3340 			lock_yield_check = TRUE;
3341 			continue;
3342 		}
3343 
3344 		/*
3345 		 *   if (m->vmp_cleaning && !m->vmp_free_when_done)
3346 		 *	If already cleaning this page in place
3347 		 *	just leave if off the paging queues.
3348 		 *	We can leave the page mapped, and upl_commit_range
3349 		 *	will put it on the clean queue.
3350 		 *
3351 		 *   if (m->vmp_free_when_done && !m->vmp_cleaning)
3352 		 *	an msync INVALIDATE is in progress...
3353 		 *	this page has been marked for destruction
3354 		 *      after it has been cleaned,
3355 		 *      but not yet gathered into a UPL
3356 		 *	where 'cleaning' will be set...
3357 		 *	just leave it off the paging queues
3358 		 *
3359 		 *   if (m->vmp_free_when_done && m->vmp_clenaing)
3360 		 *	an msync INVALIDATE is in progress
3361 		 *	and the UPL has already gathered this page...
3362 		 *	just leave it off the paging queues
3363 		 */
3364 		if (m->vmp_free_when_done || m->vmp_cleaning) {
3365 			lock_yield_check = TRUE;
3366 			continue;
3367 		}
3368 
3369 
3370 		/*
3371 		 *	If it's absent, in error or the object is no longer alive,
3372 		 *	we can reclaim the page... in the no longer alive case,
3373 		 *	there are 2 states the page can be in that preclude us
3374 		 *	from reclaiming it - busy or cleaning - that we've already
3375 		 *	dealt with
3376 		 */
3377 		if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3378 		    (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3379 			if (m->vmp_absent) {
3380 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3381 			} else if (!object->alive ||
3382 			    (!object->internal &&
3383 			    object->pager == MEMORY_OBJECT_NULL)) {
3384 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3385 			} else {
3386 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3387 			}
3388 reclaim_page:
3389 			if (vm_pageout_deadlock_target) {
3390 				VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3391 				vm_pageout_deadlock_target--;
3392 			}
3393 
3394 			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3395 
3396 			if (object->internal) {
3397 				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3398 			} else {
3399 				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3400 			}
3401 			assert(!m->vmp_cleaning);
3402 			assert(!m->vmp_laundry);
3403 
3404 			if (!object->internal &&
3405 			    object->pager != NULL &&
3406 			    object->pager->mo_pager_ops == &shared_region_pager_ops) {
3407 				shared_region_pager_reclaimed++;
3408 			}
3409 
3410 			m->vmp_busy = TRUE;
3411 
3412 			/*
3413 			 * remove page from object here since we're already
3414 			 * behind the object lock... defer the rest of the work
3415 			 * we'd normally do in vm_page_free_prepare_object
3416 			 * until 'vm_page_free_list' is called
3417 			 */
3418 			if (m->vmp_tabled) {
3419 				vm_page_remove(m, TRUE);
3420 			}
3421 
3422 			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3423 			m->vmp_snext = local_freeq;
3424 			local_freeq = m;
3425 			local_freed++;
3426 
3427 			if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3428 				vm_pageout_vminfo.vm_pageout_freed_speculative++;
3429 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3430 				vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3431 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3432 				vm_pageout_vminfo.vm_pageout_freed_internal++;
3433 			} else {
3434 				vm_pageout_vminfo.vm_pageout_freed_external++;
3435 			}
3436 
3437 			inactive_burst_count = 0;
3438 
3439 			lock_yield_check = TRUE;
3440 			continue;
3441 		}
3442 		if (object->copy == VM_OBJECT_NULL) {
3443 			/*
3444 			 * No one else can have any interest in this page.
3445 			 * If this is an empty purgable object, the page can be
3446 			 * reclaimed even if dirty.
3447 			 * If the page belongs to a volatile purgable object, we
3448 			 * reactivate it if the compressor isn't active.
3449 			 */
3450 			if (object->purgable == VM_PURGABLE_EMPTY) {
3451 				if (m->vmp_pmapped == TRUE) {
3452 					/* unmap the page */
3453 					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3454 					if (refmod_state & VM_MEM_MODIFIED) {
3455 						SET_PAGE_DIRTY(m, FALSE);
3456 					}
3457 				}
3458 				if (m->vmp_dirty || m->vmp_precious) {
3459 					/* we saved the cost of cleaning this page ! */
3460 					vm_page_purged_count++;
3461 				}
3462 				goto reclaim_page;
3463 			}
3464 
3465 			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3466 				/*
3467 				 * With the VM compressor, the cost of
3468 				 * reclaiming a page is much lower (no I/O),
3469 				 * so if we find a "volatile" page, it's better
3470 				 * to let it get compressed rather than letting
3471 				 * it occupy a full page until it gets purged.
3472 				 * So no need to check for "volatile" here.
3473 				 */
3474 			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
3475 				/*
3476 				 * Avoid cleaning a "volatile" page which might
3477 				 * be purged soon.
3478 				 */
3479 
3480 				/* if it's wired, we can't put it on our queue */
3481 				assert(!VM_PAGE_WIRED(m));
3482 
3483 				/* just stick it back on! */
3484 				reactivated_this_call++;
3485 
3486 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3487 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3488 				}
3489 
3490 				goto reactivate_page;
3491 			}
3492 		}
3493 		/*
3494 		 *	If it's being used, reactivate.
3495 		 *	(Fictitious pages are either busy or absent.)
3496 		 *	First, update the reference and dirty bits
3497 		 *	to make sure the page is unreferenced.
3498 		 */
3499 		refmod_state = -1;
3500 
3501 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3502 			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3503 
3504 			if (refmod_state & VM_MEM_REFERENCED) {
3505 				m->vmp_reference = TRUE;
3506 			}
3507 			if (refmod_state & VM_MEM_MODIFIED) {
3508 				SET_PAGE_DIRTY(m, FALSE);
3509 			}
3510 		}
3511 
3512 		if (m->vmp_reference || m->vmp_dirty) {
3513 			/* deal with a rogue "reusable" page */
3514 			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3515 		}
3516 
3517 		if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3518 			vm_pageout_state.vm_page_xpmapped_min = 0;
3519 		} else {
3520 			vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
3521 		}
3522 
3523 		if (!m->vmp_no_cache &&
3524 		    page_from_bg_q == FALSE &&
3525 		    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3526 		    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3527 			/*
3528 			 * The page we pulled off the inactive list has
3529 			 * been referenced.  It is possible for other
3530 			 * processors to be touching pages faster than we
3531 			 * can clear the referenced bit and traverse the
3532 			 * inactive queue, so we limit the number of
3533 			 * reactivations.
3534 			 */
3535 			if (++reactivated_this_call >= reactivate_limit &&
3536 			    !object->object_is_shared_cache &&
3537 			    !((m->vmp_realtime ||
3538 			    object->for_realtime) &&
3539 			    vm_pageout_protect_realtime)) {
3540 				vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3541 			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3542 				vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3543 				if (object->object_is_shared_cache) {
3544 					vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3545 				} else if (m->vmp_realtime ||
3546 				    object->for_realtime) {
3547 					vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3548 				}
3549 			} else {
3550 				uint32_t isinuse;
3551 
3552 				if (reactivated_this_call >= reactivate_limit) {
3553 					if (object->object_is_shared_cache) {
3554 						vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3555 					} else if ((m->vmp_realtime ||
3556 					    object->for_realtime) &&
3557 					    vm_pageout_protect_realtime) {
3558 						vm_pageout_vminfo.vm_pageout_protected_realtime++;
3559 					}
3560 				}
3561 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3562 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3563 				}
3564 
3565 				vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3566 reactivate_page:
3567 				if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3568 				    vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3569 					/*
					 * no explicit mappings of this object exist
3571 					 * and it's not open via the filesystem
3572 					 */
3573 					vm_page_deactivate(m);
3574 					VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3575 				} else {
3576 					/*
3577 					 * The page was/is being used, so put back on active list.
3578 					 */
3579 					vm_page_activate(m);
3580 					counter_inc(&vm_statistics_reactivations);
3581 					inactive_burst_count = 0;
3582 				}
3583 #if DEVELOPMENT || DEBUG
3584 				if (page_from_bg_q == TRUE) {
3585 					if (m_object->internal) {
3586 						vm_pageout_rejected_bq_internal++;
3587 					} else {
3588 						vm_pageout_rejected_bq_external++;
3589 					}
3590 				}
3591 #endif /* DEVELOPMENT || DEBUG */
3592 
3593 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3594 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3595 				}
3596 				vm_pageout_state.vm_pageout_inactive_used++;
3597 
3598 				lock_yield_check = TRUE;
3599 				continue;
3600 			}
3601 			/*
3602 			 * Make sure we call pmap_get_refmod() if it
3603 			 * wasn't already called just above, to update
3604 			 * the dirty bit.
3605 			 */
3606 			if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3607 				refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3608 				if (refmod_state & VM_MEM_MODIFIED) {
3609 					SET_PAGE_DIRTY(m, FALSE);
3610 				}
3611 			}
3612 		}
3613 
3614 		/*
3615 		 * we've got a candidate page to steal...
3616 		 *
3617 		 * m->vmp_dirty is up to date courtesy of the
3618 		 * preceding check for m->vmp_reference... if
3619 		 * we get here, then m->vmp_reference had to be
3620 		 * FALSE (or possibly "reactivate_limit" was
3621 		 * exceeded), but in either case we called
3622 		 * pmap_get_refmod() and updated both
3623 		 * m->vmp_reference and m->vmp_dirty
3624 		 *
3625 		 * if it's dirty or precious we need to
		 * see if the target queue is throttled;
		 * if it is, we need to skip over it by moving it back
3628 		 * to the end of the inactive queue
3629 		 */
3630 
3631 		inactive_throttled = FALSE;
3632 
3633 		if (m->vmp_dirty || m->vmp_precious) {
3634 			if (object->internal) {
3635 				if (VM_PAGE_Q_THROTTLED(iq)) {
3636 					inactive_throttled = TRUE;
3637 				}
3638 			} else if (VM_PAGE_Q_THROTTLED(eq)) {
3639 				inactive_throttled = TRUE;
3640 			}
3641 		}
3642 throttle_inactive:
3643 		if (!VM_DYNAMIC_PAGING_ENABLED() &&
3644 		    object->internal && m->vmp_dirty &&
3645 		    (object->purgable == VM_PURGABLE_DENY ||
3646 		    object->purgable == VM_PURGABLE_NONVOLATILE ||
3647 		    object->purgable == VM_PURGABLE_VOLATILE)) {
3648 			vm_page_check_pageable_safe(m);
3649 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3650 			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3651 			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3652 			vm_page_throttled_count++;
3653 
3654 			VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3655 
3656 			inactive_burst_count = 0;
3657 
3658 			lock_yield_check = TRUE;
3659 			continue;
3660 		}
3661 		if (inactive_throttled == TRUE) {
3662 			vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3663 			    &delayed_unlock, &force_anonymous, page_from_bg_q);
3664 
3665 			inactive_burst_count = 0;
3666 
3667 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3668 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3669 			}
3670 
3671 			lock_yield_check = TRUE;
3672 			continue;
3673 		}
3674 
3675 		/*
3676 		 * we've got a page that we can steal...
3677 		 * eliminate all mappings and make sure
3678 		 * we have the up-to-date modified state
3679 		 *
3680 		 * if we need to do a pmap_disconnect then we
3681 		 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3682 		 * provides the true state atomically... the
3683 		 * page was still mapped up to the pmap_disconnect
3684 		 * and may have been dirtied at the last microsecond
3685 		 *
3686 		 * Note that if 'pmapped' is FALSE then the page is not
3687 		 * and has not been in any map, so there is no point calling
3688 		 * pmap_disconnect().  m->vmp_dirty could have been set in anticipation
3689 		 * of likely usage of the page.
3690 		 */
3691 		if (m->vmp_pmapped == TRUE) {
3692 			int pmap_options;
3693 
3694 			/*
3695 			 * Don't count this page as going into the compressor
3696 			 * if any of these are true:
3697 			 * 1) compressed pager isn't enabled
3698 			 * 2) Freezer enabled device with compressed pager
3699 			 *    backend (exclusive use) i.e. most of the VM system
3700 			 *    (including vm_pageout_scan) has no knowledge of
3701 			 *    the compressor
3702 			 * 3) This page belongs to a file and hence will not be
3703 			 *    sent into the compressor
3704 			 */
3705 			if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3706 			    object->internal == FALSE) {
3707 				pmap_options = 0;
3708 			} else if (m->vmp_dirty || m->vmp_precious) {
3709 				/*
3710 				 * VM knows that this page is dirty (or
3711 				 * precious) and needs to be compressed
3712 				 * rather than freed.
3713 				 * Tell the pmap layer to count this page
3714 				 * as "compressed".
3715 				 */
3716 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
3717 			} else {
3718 				/*
3719 				 * VM does not know if the page needs to
3720 				 * be preserved but the pmap layer might tell
3721 				 * us if any mapping has "modified" it.
			 * Let the pmap layer count this page
3723 				 * as compressed if and only if it has been
3724 				 * modified.
3725 				 */
3726 				pmap_options =
3727 				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3728 			}
3729 			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3730 			    pmap_options,
3731 			    NULL);
3732 			if (refmod_state & VM_MEM_MODIFIED) {
3733 				SET_PAGE_DIRTY(m, FALSE);
3734 			}
3735 		}
3736 
3737 		/*
3738 		 * reset our count of pages that have been reclaimed
3739 		 * since the last page was 'stolen'
3740 		 */
3741 		inactive_reclaim_run = 0;
3742 
3743 		/*
3744 		 *	If it's clean and not precious, we can free the page.
3745 		 */
3746 		if (!m->vmp_dirty && !m->vmp_precious) {
3747 			vm_pageout_state.vm_pageout_inactive_clean++;
3748 
3749 			/*
3750 			 * OK, at this point we have found a page we are going to free.
3751 			 */
3752 #if CONFIG_PHANTOM_CACHE
3753 			if (!object->internal) {
3754 				vm_phantom_cache_add_ghost(m);
3755 			}
3756 #endif
3757 			goto reclaim_page;
3758 		}
3759 
3760 		/*
3761 		 * The page may have been dirtied since the last check
3762 		 * for a throttled target queue (which may have been skipped
3763 		 * if the page was clean then).  With the dirty page
3764 		 * disconnected here, we can make one final check.
3765 		 */
3766 		if (object->internal) {
3767 			if (VM_PAGE_Q_THROTTLED(iq)) {
3768 				inactive_throttled = TRUE;
3769 			}
3770 		} else if (VM_PAGE_Q_THROTTLED(eq)) {
3771 			inactive_throttled = TRUE;
3772 		}
3773 
3774 		if (inactive_throttled == TRUE) {
3775 			goto throttle_inactive;
3776 		}
3777 
3778 #if VM_PRESSURE_EVENTS
3779 #if CONFIG_JETSAM
3780 
3781 		/*
3782 		 * If Jetsam is enabled, then the sending
3783 		 * of memory pressure notifications is handled
3784 		 * from the same thread that takes care of high-water
3785 		 * and other jetsams i.e. the memorystatus_thread.
3786 		 */
3787 
3788 #else /* CONFIG_JETSAM */
3789 
3790 		vm_pressure_response();
3791 
3792 #endif /* CONFIG_JETSAM */
3793 #endif /* VM_PRESSURE_EVENTS */
3794 
3795 		if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3796 			VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3797 		}
3798 
3799 		if (object->internal) {
3800 			vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3801 		} else {
3802 			vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3803 		}
3804 
3805 		/*
3806 		 * internal pages will go to the compressor...
3807 		 * external pages will go to the appropriate pager to be cleaned
3808 		 * and upon completion will end up on 'vm_page_queue_cleaned' which
3809 		 * is a preferred queue to steal from
3810 		 */
3811 		vm_pageout_cluster(m);
3812 		inactive_burst_count = 0;
3813 
3814 		/*
3815 		 * back to top of pageout scan loop
3816 		 */
3817 	}
3818 }
3819 
3820 
3821 void
vm_page_free_reserve(int pages)3822 vm_page_free_reserve(
3823 	int pages)
3824 {
3825 	int             free_after_reserve;
3826 
3827 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3828 		if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3829 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3830 		} else {
3831 			vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3832 		}
3833 	} else {
3834 		if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3835 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3836 		} else {
3837 			vm_page_free_reserved += pages;
3838 		}
3839 	}
3840 	free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3841 
3842 	vm_page_free_min = vm_page_free_reserved +
3843 	    VM_PAGE_FREE_MIN(free_after_reserve);
3844 
3845 	if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3846 		vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3847 	}
3848 
3849 	vm_page_free_target = vm_page_free_reserved +
3850 	    VM_PAGE_FREE_TARGET(free_after_reserve);
3851 
3852 	if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3853 		vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3854 	}
3855 
3856 	if (vm_page_free_target < vm_page_free_min + 5) {
3857 		vm_page_free_target = vm_page_free_min + 5;
3858 	}
3859 
3860 	vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3861 }
3862 
3863 /*
3864  *	vm_pageout is the high level pageout daemon.
3865  */
3866 
/*
 * vm_pageout_continue:
 *	Body and continuation point of the pageout daemon.  Marks itself
 *	running (so vm_pageout_wait() blocks), runs one full pass of
 *	vm_pageout_scan(), then parks on &vm_page_free_wanted with itself
 *	as the continuation.  Never returns.
 */
void
vm_pageout_continue(void)
{
	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);

	/* flag the daemon as active under the free-page lock */
	vm_free_page_lock();
	vm_pageout_running = TRUE;
	vm_free_page_unlock();

	vm_pageout_scan();
	/*
	 * we hold both the vm_page_queue_free_lock
	 * and the vm_page_queues_lock at this point
	 */
	assert(vm_page_free_wanted == 0);
	assert(vm_page_free_wanted_privileged == 0);
	/* arm the wait before dropping the locks so no wakeup is lost */
	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);

	vm_pageout_running = FALSE;
#if XNU_TARGET_OS_OSX
	/* release any thread blocked in vm_pageout_wait() */
	if (vm_pageout_waiter) {
		vm_pageout_waiter = FALSE;
		thread_wakeup((event_t)&vm_pageout_waiter);
	}
#endif /* XNU_TARGET_OS_OSX */

	vm_free_page_unlock();
	vm_page_unlock_queues();

	/* sleep until free pages are wanted; resume at the top of this function */
	thread_block((thread_continue_t)vm_pageout_continue);
	/*NOTREACHED*/
}
3900 
3901 #if XNU_TARGET_OS_OSX
3902 kern_return_t
vm_pageout_wait(uint64_t deadline)3903 vm_pageout_wait(uint64_t deadline)
3904 {
3905 	kern_return_t kr;
3906 
3907 	vm_free_page_lock();
3908 	for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
3909 		vm_pageout_waiter = TRUE;
3910 		if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3911 			    &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3912 			    (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3913 			kr = KERN_OPERATION_TIMED_OUT;
3914 		}
3915 	}
3916 	vm_free_page_unlock();
3917 
3918 	return kr;
3919 }
3920 #endif /* XNU_TARGET_OS_OSX */
3921 
3922 
/*
 * vm_pageout_iothread_external_continue:
 *	Continuation for the external pageout thread.  Drains the external
 *	pageout queue, handing each laundered file-backed page to its
 *	pager via memory_object_data_return().  Never returns: when the
 *	queue empties, it parks with itself as the continuation.
 */
static void
vm_pageout_iothread_external_continue(struct vm_pageout_queue *q, __unused wait_result_t w)
{
	vm_page_t       m = NULL;
	vm_object_t     object;
	vm_object_offset_t offset;
	memory_object_t pager;

	/* On systems with a compressor, the external IO thread clears its
	 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
	 * creation)
	 */
	if (vm_pageout_state.vm_pageout_internal_iothread != THREAD_NULL) {
		current_thread()->options &= ~TH_OPT_VMPRIV;
	}

	vm_page_lockspin_queues();

	while (!vm_page_queue_empty(&q->pgo_pending)) {
		q->pgo_busy = TRUE;
		vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);

		assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
		VM_PAGE_CHECK(m);
		/*
		 * grab a snapshot of the object and offset this
		 * page is tabled in so that we can relookup this
		 * page after we've taken the object lock - these
		 * fields are stable while we hold the page queues lock
		 * but as soon as we drop it, there is nothing to keep
		 * this page in this object... we hold an activity_in_progress
		 * on this object which will keep it from terminating
		 */
		object = VM_PAGE_OBJECT(m);
		offset = m->vmp_offset;

		m->vmp_q_state = VM_PAGE_NOT_ON_Q;
		VM_PAGE_ZERO_PAGEQ_ENTRY(m);

		vm_page_unlock_queues();

		vm_object_lock(object);

		/* re-look up the page now that we hold the object lock */
		m = vm_page_lookup(object, offset);

		if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
		    !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
			/*
			 * it's either the same page that someone else has
			 * started cleaning (or it's finished cleaning or
			 * been put back on the pageout queue), or
			 * the page has been freed or we have found a
			 * new page at this offset... in all of these cases
			 * we merely need to release the activity_in_progress
			 * we took when we put the page on the pageout queue
			 */
			vm_object_activity_end(object);
			vm_object_unlock(object);

			vm_page_lockspin_queues();
			continue;
		}
		pager = object->pager;

		if (pager == MEMORY_OBJECT_NULL) {
			/*
			 * This pager has been destroyed by either
			 * memory_object_destroy or vm_object_destroy, and
			 * so there is nowhere for the page to go.
			 */
			if (m->vmp_free_when_done) {
				/*
				 * Just free the page... VM_PAGE_FREE takes
				 * care of cleaning up all the state...
				 * including doing the vm_pageout_throttle_up
				 */
				VM_PAGE_FREE(m);
			} else {
				vm_page_lockspin_queues();

				vm_pageout_throttle_up(m);
				vm_page_activate(m);

				vm_page_unlock_queues();

				/*
				 *	And we are done with it.
				 */
			}
			vm_object_activity_end(object);
			vm_object_unlock(object);

			vm_page_lockspin_queues();
			continue;
		}
#if 0
		/*
		 * we don't hold the page queue lock
		 * so this check isn't safe to make
		 */
		VM_PAGE_CHECK(m);
#endif
		/*
		 * give back the activity_in_progress reference we
		 * took when we queued up this page and replace it
		 * with a paging_in_progress reference that will
		 * also hold the paging offset from changing and
		 * prevent the object from terminating
		 */
		vm_object_activity_end(object);
		vm_object_paging_begin(object);
		vm_object_unlock(object);

		/*
		 * Send the data to the pager.
		 * any pageout clustering happens there
		 */
		memory_object_data_return(pager,
		    m->vmp_offset + object->paging_offset,
		    PAGE_SIZE,
		    NULL,
		    NULL,
		    FALSE,
		    FALSE,
		    0);

		vm_object_lock(object);
		vm_object_paging_end(object);
		vm_object_unlock(object);

		/* pace I/O before pulling the next page off the queue */
		vm_pageout_io_throttle();

		vm_page_lockspin_queues();
	}
	q->pgo_busy = FALSE;
	q->pgo_idle = TRUE;

	/* queue drained: park until more pages are enqueued */
	assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
	vm_page_unlock_queues();

	thread_block_parameter((thread_continue_t)vm_pageout_iothread_external_continue, (void *) q);
	/*NOTREACHED*/
}
4066 
4067 
/* max pages a compressor thread accumulates locally before a bulk free */
#define         MAX_FREE_BATCH          32
uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
                                     * this thread.
                                     */


/* forward declaration: continuation for the internal (compressor) pageout threads */
void
vm_pageout_iothread_internal_continue(struct cq *, __unused wait_result_t);
/*
 * vm_pageout_iothread_internal_continue:
 *	Continuation for an internal (compressor) pageout thread.  Pulls
 *	batches of pages off its pageout queue, compresses each page via
 *	vm_pageout_compress_page(), and frees the compressed source pages
 *	in batches of MAX_FREE_BATCH.  Never returns: parks on its queue's
 *	event with itself as the continuation once the queue is drained.
 */
void
vm_pageout_iothread_internal_continue(struct cq *cq, __unused wait_result_t w)
{
	struct vm_pageout_queue *q;
	vm_page_t       m = NULL;
	boolean_t       pgo_draining;
	vm_page_t   local_q;            /* private chain of pages pulled off the queue */
	int         local_cnt;
	vm_page_t   local_freeq = NULL; /* chain of compressed pages awaiting bulk free */
	int         local_freed = 0;
	int         local_batch_size;
#if DEVELOPMENT || DEBUG
	int       ncomps = 0;
	boolean_t marked_active = FALSE;
	int       num_pages_processed = 0;
#endif
	void *chead = NULL;

	KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);

	q = cq->q;
#if DEVELOPMENT || DEBUG
	bool benchmark_accounting = false;
	/*
	 * If we're running the compressor perf test, only process the benchmark pages.
	 * We'll get back to our regular queue once the benchmark is done
	 */
	if (compressor_running_perf_test) {
		q = cq->benchmark_q;
		if (!vm_page_queue_empty(&q->pgo_pending)) {
			benchmark_accounting = true;
		} else {
			q = cq->q;
			benchmark_accounting = false;
		}
	}
#endif /* DEVELOPMENT || DEBUG */

#if __AMP__
	/* bound compressor threads take a larger fixed share of the laundry */
	if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
		local_batch_size = (q->pgo_maxlaundry >> 3);
		local_batch_size = MAX(local_batch_size, 16);
	} else {
		local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
	}
#else
	local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
#endif

#if RECORD_THE_COMPRESSED_DATA
	if (q->pgo_laundry) {
		c_compressed_record_init();
	}
#endif
	while (TRUE) {
		int     pages_left_on_q = 0;

		local_cnt = 0;
		local_q = NULL;

		KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);

		vm_page_lock_queues();
#if DEVELOPMENT || DEBUG
		if (marked_active == FALSE) {
			vmct_active++;
			vmct_state[cq->id] = VMCT_ACTIVE;
			marked_active = TRUE;
			if (vmct_active == 1) {
				vm_compressor_epoch_start = mach_absolute_time();
			}
		}
#endif
		KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);

		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);

		/* transfer up to local_batch_size pages onto our private chain */
		while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
			vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
			assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
			VM_PAGE_CHECK(m);

			m->vmp_q_state = VM_PAGE_NOT_ON_Q;
			VM_PAGE_ZERO_PAGEQ_ENTRY(m);
			m->vmp_laundry = FALSE;

			m->vmp_snext = local_q;
			local_q = m;
			local_cnt++;
		}
		if (local_q == NULL) {
			break;
		}

		q->pgo_busy = TRUE;

		if ((pgo_draining = q->pgo_draining) == FALSE) {
			vm_pageout_throttle_up_batch(q, local_cnt);
			pages_left_on_q = q->pgo_laundry;
		} else {
			pages_left_on_q = q->pgo_laundry - local_cnt;
		}

		vm_page_unlock_queues();

#if !RECORD_THE_COMPRESSED_DATA
		/* plenty of work left: wake the next compressor thread too */
		if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
			thread_wakeup((event_t) ((uintptr_t)&cq->q->pgo_pending + cq->id + 1));
		}
#endif
		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);

		while (local_q) {
			KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);

			m = local_q;
			local_q = m->vmp_snext;
			m->vmp_snext = NULL;

			/*
			 * Technically we need the pageq locks to manipulate this field.
			 * However, this page has been removed from all queues and is only
			 * known to this compressor thread dealing with this local queue.
			 *
			 * TODO LIONEL: Add a second localq that is the early localq and
			 * put special pages like this one on that queue in the block above
			 * under the pageq lock to avoid this 'works but not clean' logic.
			 */
			void *donate_queue_head;
#if XNU_TARGET_OS_OSX
			donate_queue_head = &cq->current_early_swapout_chead;
#else /* XNU_TARGET_OS_OSX */
			donate_queue_head = &cq->current_late_swapout_chead;
#endif /* XNU_TARGET_OS_OSX */
			if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
				chead = donate_queue_head;
			} else {
				chead = &cq->current_regular_swapout_chead;
			}

			if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
#if DEVELOPMENT || DEBUG
				ncomps++;
#endif
				KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);

				/* chain the now-compressed page for batched freeing */
				m->vmp_snext = local_freeq;
				local_freeq = m;
				local_freed++;

				if (local_freed >= MAX_FREE_BATCH) {
					OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

					vm_page_free_list(local_freeq, TRUE);

					local_freeq = NULL;
					local_freed = 0;
				}
			}
#if DEVELOPMENT || DEBUG
			num_pages_processed++;
#endif /* DEVELOPMENT || DEBUG */
#if !CONFIG_JETSAM
			/*
			 * free pages critically low: first return our local batch
			 * to the free list; if that isn't enough, register as a
			 * privileged waiter and block until pages are freed
			 */
			while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
				kern_return_t   wait_result;
				int             need_wakeup = 0;

				if (local_freeq) {
					OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

					vm_page_free_list(local_freeq, TRUE);
					local_freeq = NULL;
					local_freed = 0;

					continue;
				}
				vm_free_page_lock_spin();

				if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
					if (vm_page_free_wanted_privileged++ == 0) {
						need_wakeup = 1;
					}
					wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);

					vm_free_page_unlock();

					if (need_wakeup) {
						thread_wakeup((event_t)&vm_page_free_wanted);
					}

					if (wait_result == THREAD_WAITING) {
						thread_block(THREAD_CONTINUE_NULL);
					}
				} else {
					vm_free_page_unlock();
				}
			}
#endif
		}
		/* flush any remaining compressed pages before re-checking the queue */
		if (local_freeq) {
			OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

			vm_page_free_list(local_freeq, TRUE);
			local_freeq = NULL;
			local_freed = 0;
		}
		if (pgo_draining == TRUE) {
			vm_page_lockspin_queues();
			vm_pageout_throttle_up_batch(q, local_cnt);
			vm_page_unlock_queues();
		}
	}
	KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * queue lock is held and our q is empty
	 */
	q->pgo_busy = FALSE;
	q->pgo_idle = TRUE;

	assert_wait((event_t) ((uintptr_t)&cq->q->pgo_pending + cq->id), THREAD_UNINT);
#if DEVELOPMENT || DEBUG
	if (marked_active == TRUE) {
		vmct_active--;
		vmct_state[cq->id] = VMCT_IDLE;

		if (vmct_active == 0) {
			vm_compressor_epoch_stop = mach_absolute_time();
			assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
			    "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
			    vm_compressor_epoch_start, vm_compressor_epoch_stop);
			/* This interval includes intervals where one or more
			 * compressor threads were pre-empted
			 */
			vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
		}
	}
	if (compressor_running_perf_test && benchmark_accounting) {
		/*
		 * We could turn ON compressor_running_perf_test while still processing
		 * regular non-benchmark pages. We shouldn't count them here else we
		 * could overshoot. We might also still be populating that benchmark Q
		 * and be under pressure. So we will go back to the regular queues. And
		 * benchmark accounting will be off for that case too.
		 */
		compressor_perf_test_pages_processed += num_pages_processed;
		thread_wakeup(&compressor_perf_test_pages_processed);
	}
#endif
	vm_page_unlock_queues();
#if DEVELOPMENT || DEBUG
	if (__improbable(vm_compressor_time_thread)) {
		vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
		vmct_stats.vmct_pages[cq->id] += ncomps;
		vmct_stats.vmct_iterations[cq->id]++;
		if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
			vmct_stats.vmct_maxpages[cq->id] = ncomps;
		}
		if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
			vmct_stats.vmct_minpages[cq->id] = ncomps;
		}
	}
#endif

	KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);

	thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
	/*NOTREACHED*/
}
4346 
4347 
/*
 * vm_pageout_compress_page:
 *	Hand one page to the compressor pager.
 *
 *	current_chead: per-thread compressor segment head, passed through
 *	               to vm_compressor_pager_put().
 *	scratch_buf:   per-thread compression scratch buffer.
 *	m:             page to compress; its object must have an
 *	               activity_in_progress reference (asserted below),
 *	               which this function releases before returning.
 *
 *	Returns KERN_SUCCESS when the page's contents were captured by the
 *	compressor (the page is then removed from its object); otherwise
 *	the page is reactivated and a failure code is returned.
 */
kern_return_t
vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
{
	vm_object_t     object;
	memory_object_t pager;
	int             compressed_count_delta;
	kern_return_t   retval;

	object = VM_PAGE_OBJECT(m);

	assert(!m->vmp_free_when_done);
	assert(!m->vmp_laundry);

	pager = object->pager;

	if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);

		vm_object_lock(object);

		/*
		 * If there is no memory object for the page, create
		 * one and hand it to the compression pager.
		 */

		if (!object->pager_initialized) {
			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
		}
		if (!object->pager_initialized) {
			vm_object_compressor_pager_create(object);
		}

		pager = object->pager;

		if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
			/*
			 * Still no pager for the object,
			 * or the pager has been destroyed.
			 * Reactivate the page.
			 *
			 * Should only happen if there is no
			 * compression pager
			 */
			PAGE_WAKEUP_DONE(m);

			vm_page_lockspin_queues();
			vm_page_activate(m);
			VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
			vm_page_unlock_queues();

			/*
			 *	And we are done with it.
			 */
			vm_object_activity_end(object);
			vm_object_unlock(object);

			return KERN_FAILURE;
		}
		vm_object_unlock(object);

		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
	}
	assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
	assert(object->activity_in_progress > 0);

	/* compress the page's contents into the current compressor segment */
	retval = vm_compressor_pager_put(
		pager,
		m->vmp_offset + object->paging_offset,
		VM_PAGE_GET_PHYS_PAGE(m),
		current_chead,
		scratch_buf,
		&compressed_count_delta);

	vm_object_lock(object);

	assert(object->activity_in_progress > 0);
	assert(VM_PAGE_OBJECT(m) == object);
	assert( !VM_PAGE_WIRED(m));

	/* fold the pager's compressed-page count change into the object */
	vm_compressor_pager_count(pager,
	    compressed_count_delta,
	    FALSE,                       /* shared_lock */
	    object);

	if (retval == KERN_SUCCESS) {
		/*
		 * If the object is purgeable, its owner's
		 * purgeable ledgers will be updated in
		 * vm_page_remove() but the page still
		 * contributes to the owner's memory footprint,
		 * so account for it as such.
		 */
		if ((object->purgable != VM_PURGABLE_DENY ||
		    object->vo_ledger_tag) &&
		    object->vo_owner != NULL) {
			/* one more compressed purgeable/tagged page */
			vm_object_owner_compressed_update(object,
			    +1);
		}
		counter_inc(&vm_statistics_compressions);

		if (m->vmp_tabled) {
			vm_page_remove(m, TRUE);
		}
	} else {
		/* compression failed: put the page back on the active queue */
		PAGE_WAKEUP_DONE(m);

		vm_page_lockspin_queues();

		vm_page_activate(m);
		vm_pageout_vminfo.vm_compressor_failed++;

		vm_page_unlock_queues();
	}
	vm_object_activity_end(object);
	vm_object_unlock(object);

	return retval;
}
4467 
4468 
4469 static void
vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue * eq,boolean_t req_lowpriority)4470 vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *eq, boolean_t req_lowpriority)
4471 {
4472 	uint32_t        policy;
4473 
4474 	if (hibernate_cleaning_in_progress == TRUE) {
4475 		req_lowpriority = FALSE;
4476 	}
4477 
4478 	if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority) {
4479 		vm_page_unlock_queues();
4480 
4481 		if (req_lowpriority == TRUE) {
4482 			policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4483 			DTRACE_VM(laundrythrottle);
4484 		} else {
4485 			policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4486 			DTRACE_VM(laundryunthrottle);
4487 		}
4488 		proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid,
4489 		    TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4490 
4491 		vm_page_lock_queues();
4492 		eq->pgo_lowpriority = req_lowpriority;
4493 	}
4494 }
4495 
4496 
/*
 * Start-up routine for the external (file-backed) pageout I/O thread.
 * Performs one-time thread setup, publishes this thread's identity in
 * vm_pageout_queue_external, then enters the continuation loop.
 * Never returns.
 */
static void
vm_pageout_iothread_external(__unused struct cq *c, __unused wait_result_t w)
{
	thread_t        self = current_thread();

	/* mark this thread VM-privileged */
	self->options |= TH_OPT_VMPRIV;

	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);

	/* start I/O-throttled; vm_pageout_adjust_eq_iothrottle() adjusts later */
	proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
	    TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);

	vm_page_lock_queues();

	/* publish thread id / throttle state in the external pageout queue */
	vm_pageout_queue_external.pgo_tid = self->thread_id;
	vm_pageout_queue_external.pgo_lowpriority = TRUE;
	vm_pageout_queue_external.pgo_inited = TRUE;

	vm_page_unlock_queues();

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	vm_pageout_iothread_external_continue(&vm_pageout_queue_external, 0);

	/*NOTREACHED*/
}
4525 
4526 
/*
 * Start-up routine for an internal (compressor) pageout thread.
 * Publishes the thread's identity in vm_pageout_queue_internal,
 * applies CPU binding policies, then enters the continuation loop.
 * Never returns.
 */
static void
vm_pageout_iothread_internal(struct cq *cq, __unused wait_result_t w)
{
	thread_t        self = current_thread();

	/* mark this thread VM-privileged */
	self->options |= TH_OPT_VMPRIV;

	vm_page_lock_queues();

	/* publish thread id / throttle state in the internal pageout queue */
	vm_pageout_queue_internal.pgo_tid = self->thread_id;
	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
	vm_pageout_queue_internal.pgo_inited = TRUE;

#if DEVELOPMENT || DEBUG
	/* the benchmark queue mirrors the internal queue's thread state */
	vm_pageout_queue_benchmark.pgo_tid = vm_pageout_queue_internal.pgo_tid;
	vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
	vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
	vm_pageout_queue_benchmark.pgo_idle = TRUE;
	vm_pageout_queue_benchmark.pgo_busy = FALSE;
#endif /* DEVELOPMENT || DEBUG */

	vm_page_unlock_queues();

	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
		thread_vm_bind_group_add();
	}

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
	if (vm_compressor_ebound) {
		/*
		 * Use the soft bound option for vm_compressor to allow it to run on
		 * P-cores if E-cluster is unavailable.
		 */
		thread_bind_cluster_type(self, 'E', true);
	}
#endif /* __AMP__ */

	thread_set_thread_name(current_thread(), "VM_compressor");
#if DEVELOPMENT || DEBUG
	/* start the per-thread minimum-pages stat at its maximum */
	vmct_stats.vmct_minpages[cq->id] = INT32_MAX;
#endif
	vm_pageout_iothread_internal_continue(cq, 0);

	/*NOTREACHED*/
}
4576 
4577 kern_return_t
vm_set_buffer_cleanup_callout(boolean_t (* func)(int))4578 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4579 {
4580 	if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4581 		return KERN_SUCCESS;
4582 	} else {
4583 		return KERN_FAILURE; /* Already set */
4584 	}
4585 }
4586 
4587 extern boolean_t        memorystatus_manual_testing_on;
4588 extern unsigned int     memorystatus_level;
4589 
4590 
4591 #if VM_PRESSURE_EVENTS
4592 
4593 boolean_t vm_pressure_events_enabled = FALSE;
4594 
4595 extern uint64_t next_warning_notification_sent_at_ts;
4596 extern uint64_t next_critical_notification_sent_at_ts;
4597 
4598 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS    (30)    /* 30 minutes. */
4599 
4600 /*
4601  * The last time there was change in pressure level OR we forced a check
4602  * because the system is stuck in a non-normal pressure level.
4603  */
4604 uint64_t  vm_pressure_last_level_transition_abs = 0;
4605 
4606 /*
4607  *  This is how the long the system waits 'stuck' in an unchanged non-normal pressure
4608  * level before resending out notifications for that level again.
4609  */
4610 int  vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4611 
/*
 * Recompute the system memory-pressure level from current memory
 * availability and drive the pressure-level state machine.  May wake
 * vm_pressure_thread and waiters on vm_pressure_changed.
 */
void
vm_pressure_response(void)
{
	vm_pressure_level_t     old_level = kVMPressureNormal;
	int                     new_level = -1;     /* -1 == no transition decided */
	unsigned int            total_pages;
	uint64_t                available_memory = 0;
	uint64_t                curr_ts, abs_time_since_level_transition, time_in_ns;
	bool                    force_check = false;
	int                     time_in_mins;


	if (vm_pressure_events_enabled == FALSE) {
		return;
	}

#if !XNU_TARGET_OS_OSX

	available_memory = (uint64_t) memorystatus_available_pages;

#else /* !XNU_TARGET_OS_OSX */

	available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
	memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;

#endif /* !XNU_TARGET_OS_OSX */

	total_pages = (unsigned int) atop_64(max_mem);
#if CONFIG_SECLUDED_MEMORY
	total_pages -= vm_page_secluded_count;
#endif /* CONFIG_SECLUDED_MEMORY */
	/* memorystatus_level is the percentage of memory still available */
	memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);

	if (memorystatus_manual_testing_on) {
		return;
	}

	/*
	 * force_check is set once we have sat at the same level for at
	 * least vm_pressure_level_transition_threshold minutes; it is
	 * used below to re-arm notifications for a stuck level.
	 */
	curr_ts = mach_absolute_time();
	abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;

	absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
	time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
	force_check = (time_in_mins >= vm_pressure_level_transition_threshold);

	old_level = memorystatus_vm_pressure_level;

	/* decide the next level from the current one */
	switch (memorystatus_vm_pressure_level) {
	case kVMPressureNormal:
	{
		if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			new_level = kVMPressureCritical;
		} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
			new_level = kVMPressureWarning;
		}
		break;
	}

	case kVMPressureWarning:
	case kVMPressureUrgent:
	{
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			new_level = kVMPressureNormal;
		} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			new_level = kVMPressureCritical;
		} else if (force_check) {
			/* stuck at warning: re-arm warning notifications */
			new_level = kVMPressureWarning;
			next_warning_notification_sent_at_ts = curr_ts;
		}
		break;
	}

	case kVMPressureCritical:
	{
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			new_level = kVMPressureNormal;
		} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
			new_level = kVMPressureWarning;
		} else if (force_check) {
			/* stuck at critical: re-arm critical notifications */
			new_level = kVMPressureCritical;
			next_critical_notification_sent_at_ts = curr_ts;
		}
		break;
	}

	default:
		return;
	}

	if (new_level != -1 || force_check) {
		if (new_level != -1) {
			memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;

			if (new_level != (int) old_level) {
				VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
				    new_level, old_level, 0, 0);
			}
		} else {
			VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
			    new_level, old_level, force_check, 0);
		}

		if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
			/*
			 * We don't want to schedule a wakeup while hibernation is in progress
			 * because that could collide with checks for non-monotonicity in the scheduler.
			 * We do however do all the updates to memorystatus_vm_pressure_level because
			 * we _might_ want to use that for decisions regarding which pages or how
			 * many pages we want to dump in hibernation.
			 */
			return;
		}

		if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
			/* kick the pressure thread unless it is already running */
			if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
				thread_wakeup(&vm_pressure_thread);
			}

			/* wake threads blocked in mach_vm_pressure_level_monitor() */
			if (old_level != memorystatus_vm_pressure_level) {
				thread_wakeup(&vm_pageout_state.vm_pressure_changed);
			}
			vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
		}
	}
}
4736 #endif /* VM_PRESSURE_EVENTS */
4737 
4738 /*
4739  * Function called by a kernel thread to either get the current pressure level or
4740  * wait until memory pressure changes from a given level.
4741  */
/*
 * Report the current memory-pressure level, or (when wait_for_pressure
 * is TRUE) block until it changes away from *pressure_level.
 * On success *pressure_level holds the level observed.
 * Returns KERN_ABORTED if the wait is interrupted.
 */
kern_return_t
mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
{
#if !VM_PRESSURE_EVENTS

	return KERN_FAILURE;

#else /* VM_PRESSURE_EVENTS */

	wait_result_t       wr = 0;
	vm_pressure_level_t old_level = memorystatus_vm_pressure_level;

	if (pressure_level == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * kVMPressureJetsam may only be waited for, not polled; the waiter
	 * blocks until a foreground-band jetsam wakeup occurs.
	 */
	if (*pressure_level == kVMPressureJetsam) {
		if (!wait_for_pressure) {
			return KERN_INVALID_ARGUMENT;
		}

		lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
		wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
		    THREAD_INTERRUPTIBLE);
		if (wr == THREAD_WAITING) {
			/* register as a waiter (under the lock) before blocking */
			++memorystatus_jetsam_fg_band_waiters;
			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
			wr = thread_block(THREAD_CONTINUE_NULL);
		} else {
			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
		}
		if (wr != THREAD_AWAKENED) {
			return KERN_ABORTED;
		}
		*pressure_level = kVMPressureJetsam;
		return KERN_SUCCESS;
	}

	if (wait_for_pressure == TRUE) {
		/* block until the level differs from the caller-supplied one */
		while (old_level == *pressure_level) {
			wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
			    THREAD_INTERRUPTIBLE);
			if (wr == THREAD_WAITING) {
				wr = thread_block(THREAD_CONTINUE_NULL);
			}
			if (wr == THREAD_INTERRUPTED) {
				return KERN_ABORTED;
			}

			/* re-sample the level after a genuine wakeup */
			if (wr == THREAD_AWAKENED) {
				old_level = memorystatus_vm_pressure_level;
			}
		}
	}

	*pressure_level = old_level;
	return KERN_SUCCESS;
#endif /* VM_PRESSURE_EVENTS */
}
4801 
4802 #if VM_PRESSURE_EVENTS
/*
 * Continuation body of the VM pressure thread.  The first invocation
 * (thread_initialized == FALSE) only performs setup; every subsequent
 * wakeup runs consider_vm_pressure_events() and parks again on
 * &vm_pressure_thread.
 */
void
vm_pressure_thread(void)
{
	static boolean_t thread_initialized = FALSE;

	if (thread_initialized == TRUE) {
		/* flag guards the duplicate-wakeup check in vm_pressure_response() */
		vm_pageout_state.vm_pressure_thread_running = TRUE;
		consider_vm_pressure_events();
		vm_pageout_state.vm_pressure_thread_running = FALSE;
	}

	/*
	 * NOTE(review): the calls below run on every wakeup, not just the
	 * first — presumably harmless/idempotent, but worth confirming.
	 */
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	thread_set_thread_name(current_thread(), "VM_pressure");
	thread_initialized = TRUE;
	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
	thread_block((thread_continue_t)vm_pressure_thread);
}
4823 #endif /* VM_PRESSURE_EVENTS */
4824 
4825 
4826 /*
4827  * called once per-second via "compute_averages"
4828  */
4829 void
compute_pageout_gc_throttle(__unused void * arg)4830 compute_pageout_gc_throttle(__unused void *arg)
4831 {
4832 	if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4833 		vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4834 
4835 		thread_wakeup(VM_PAGEOUT_GC_EVENT);
4836 	}
4837 }
4838 
4839 /*
4840  * vm_pageout_garbage_collect can also be called when the zone allocator needs
4841  * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4842  * jetsams. We need to check if the zone map size is above its jetsam limit to
4843  * decide if this was indeed the case.
4844  *
4845  * We need to do this on a different thread because of the following reasons:
4846  *
4847  * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4848  * itself causing the system to hang. We perform synchronous jetsams if we're
4849  * leaking in the VM map entries zone, so the leaking process could be doing a
4850  * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4851  * jetsam itself. We also need the vm_map lock on the process termination path,
4852  * which would now lead the dying process to deadlock against itself.
4853  *
4854  * 2. The jetsam path might need to allocate zone memory itself. We could try
4855  * using the non-blocking variant of zalloc for this path, but we can still
4856  * end up trying to do a kmem_alloc when the zone maps are almost full.
4857  */
/*
 * Body of the pageout garbage-collection thread.  'step' is
 * VM_PAGEOUT_GC_INIT on the very first invocation (setup only) and
 * VM_PAGEOUT_GC_COLLECT on every continuation wakeup.  Never returns.
 */
__dead2
void
vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
{
	assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);

	if (step == VM_PAGEOUT_GC_INIT) {
		/* first time being called is not about GC */
#if CONFIG_THREAD_GROUPS
		thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */
	} else if (zone_map_nearing_exhaustion()) {
		/*
		 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
		 *
		 * Bail out after calling zone_gc (which triggers the
		 * zone-map-exhaustion jetsams). If we fall through, the subsequent
		 * operations that clear out a bunch of caches might allocate zone
		 * memory themselves (for eg. vm_map operations would need VM map
		 * entries). Since the zone map is almost full at this point, we
		 * could end up with a panic. We just need to quickly jetsam a
		 * process and exit here.
		 *
		 * It could so happen that we were woken up to relieve memory
		 * pressure and the zone map also happened to be near its limit at
		 * the time, in which case we'll skip out early. But that should be
		 * ok; if memory pressure persists, the thread will simply be woken
		 * up again.
		 */
		zone_gc(ZONE_GC_JETSAM);
	} else {
		/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
		boolean_t buf_large_zfree = FALSE;
		boolean_t first_try = TRUE;

		stack_collect();

		consider_machine_collect();
		mbuf_drain(FALSE);

		do {
			if (consider_buffer_cache_collect != NULL) {
				buf_large_zfree = (*consider_buffer_cache_collect)(0);
			}
			if (first_try == TRUE || buf_large_zfree == TRUE) {
				/*
				 * zone_gc should be last, because the other operations
				 * might return memory to zones.
				 */
				zone_gc(ZONE_GC_TRIM);
			}
			first_try = FALSE;
			/* keep trimming while the buffer cache frees memory and free pages are short */
		} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);

		consider_machine_adjust();
	}

	/* park until the next wakeup; continuation re-enters with GC_COLLECT */
	assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);

	thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
	__builtin_unreachable();
}
4920 
4921 
4922 #if VM_PAGE_BUCKETS_CHECK
4923 #if VM_PAGE_FAKE_BUCKETS
4924 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4925 #endif /* VM_PAGE_FAKE_BUCKETS */
4926 #endif /* VM_PAGE_BUCKETS_CHECK */
4927 
4928 
4929 
4930 void
vm_set_restrictions(unsigned int num_cpus)4931 vm_set_restrictions(unsigned int num_cpus)
4932 {
4933 	int vm_restricted_to_single_processor = 0;
4934 
4935 	if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
4936 		kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
4937 		vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
4938 	} else {
4939 		assert(num_cpus > 0);
4940 
4941 		if (num_cpus <= 3) {
4942 			/*
4943 			 * on systems with a limited number of CPUS, bind the
4944 			 * 4 major threads that can free memory and that tend to use
4945 			 * a fair bit of CPU under pressured conditions to a single processor.
4946 			 * This insures that these threads don't hog all of the available CPUs
4947 			 * (important for camera launch), while allowing them to run independently
4948 			 * w/r to locks... the 4 threads are
4949 			 * vm_pageout_scan,  vm_pageout_iothread_internal (compressor),
4950 			 * vm_compressor_swap_trigger_thread (minor and major compactions),
4951 			 * memorystatus_thread (jetsams).
4952 			 *
4953 			 * the first time the thread is run, it is responsible for checking the
4954 			 * state of vm_restricted_to_single_processor, and if TRUE it calls
4955 			 * thread_bind_master...  someday this should be replaced with a group
4956 			 * scheduling mechanism and KPI.
4957 			 */
4958 			vm_pageout_state.vm_restricted_to_single_processor = TRUE;
4959 		} else {
4960 			vm_pageout_state.vm_restricted_to_single_processor = FALSE;
4961 		}
4962 	}
4963 }
4964 
4965 /*
4966  * Set up vm_config based on the vm_compressor_mode.
4967  * Must run BEFORE the pageout thread starts up.
4968  */
/*
 * Translate vm_compressor_mode into the vm_config feature flags
 * (compressor/swap present/active).  Deprecated modes are mapped to
 * their modern equivalents via fallthrough.
 */
__startup_func
void
vm_config_init(void)
{
	bzero(&vm_config, sizeof(vm_config));

	switch (vm_compressor_mode) {
	case VM_PAGER_DEFAULT:
		printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
		OS_FALLTHROUGH;

	case VM_PAGER_COMPRESSOR_WITH_SWAP:
		/* compressor and swap both present and active */
		vm_config.compressor_is_present = TRUE;
		vm_config.swap_is_present = TRUE;
		vm_config.compressor_is_active = TRUE;
		vm_config.swap_is_active = TRUE;
		break;

	case VM_PAGER_COMPRESSOR_NO_SWAP:
		/* compressor active, swap present but not active */
		vm_config.compressor_is_present = TRUE;
		vm_config.swap_is_present = TRUE;
		vm_config.compressor_is_active = TRUE;
		break;

	case VM_PAGER_FREEZER_DEFAULT:
		printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
		OS_FALLTHROUGH;

	case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
		/* compressor and swap present but neither active */
		vm_config.compressor_is_present = TRUE;
		vm_config.swap_is_present = TRUE;
		break;

	case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
		/* compressor active; swapping only for the freezer */
		vm_config.compressor_is_present = TRUE;
		vm_config.swap_is_present = TRUE;
		vm_config.compressor_is_active = TRUE;
		vm_config.freezer_swap_is_active = TRUE;
		break;

	case VM_PAGER_NOT_CONFIGURED:
		break;

	default:
		printf("unknown compressor mode - %x\n", vm_compressor_mode);
		break;
	}
}
5017 
5018 __startup_func
5019 static void
vm_pageout_create_gc_thread(void)5020 vm_pageout_create_gc_thread(void)
5021 {
5022 	thread_t thread;
5023 
5024 	if (kernel_thread_create(vm_pageout_garbage_collect,
5025 	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
5026 		panic("vm_pageout_garbage_collect: create failed");
5027 	}
5028 	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
5029 	if (thread->reserved_stack == 0) {
5030 		assert(thread->kernel_stack);
5031 		thread->reserved_stack = thread->kernel_stack;
5032 	}
5033 
5034 	/* thread is started in vm_pageout() */
5035 	vm_pageout_gc_thread = thread;
5036 }
5037 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5038 
5039 void
vm_pageout(void)5040 vm_pageout(void)
5041 {
5042 	thread_t        self = current_thread();
5043 	thread_t        thread;
5044 	kern_return_t   result;
5045 	spl_t           s;
5046 
5047 	/*
5048 	 * Set thread privileges.
5049 	 */
5050 	s = splsched();
5051 
5052 #if CONFIG_VPS_DYNAMIC_PRIO
5053 
5054 	int             vps_dynprio_bootarg = 0;
5055 
5056 	if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) {
5057 		vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? TRUE : FALSE);
5058 		kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled);
5059 	} else {
5060 		if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
5061 			vps_dynamic_priority_enabled = TRUE;
5062 		} else {
5063 			vps_dynamic_priority_enabled = FALSE;
5064 		}
5065 	}
5066 
5067 	if (vps_dynamic_priority_enabled) {
5068 		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
5069 		thread_set_eager_preempt(self);
5070 	} else {
5071 		sched_set_kernel_thread_priority(self, BASEPRI_VM);
5072 	}
5073 
5074 #else /* CONFIG_VPS_DYNAMIC_PRIO */
5075 
5076 	vps_dynamic_priority_enabled = FALSE;
5077 	sched_set_kernel_thread_priority(self, BASEPRI_VM);
5078 
5079 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
5080 
5081 	thread_lock(self);
5082 	self->options |= TH_OPT_VMPRIV;
5083 	thread_unlock(self);
5084 
5085 	if (!self->reserved_stack) {
5086 		self->reserved_stack = self->kernel_stack;
5087 	}
5088 
5089 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
5090 	    vps_dynamic_priority_enabled == FALSE) {
5091 		thread_vm_bind_group_add();
5092 	}
5093 
5094 
5095 #if CONFIG_THREAD_GROUPS
5096 	thread_group_vm_add();
5097 #endif /* CONFIG_THREAD_GROUPS */
5098 
5099 #if __AMP__
5100 	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
5101 	if (vm_pgo_pbound) {
5102 		/*
5103 		 * Use the soft bound option for vm pageout to allow it to run on
5104 		 * E-cores if P-cluster is unavailable.
5105 		 */
5106 		thread_bind_cluster_type(self, 'P', true);
5107 	}
5108 #endif /* __AMP__ */
5109 
5110 	PE_parse_boot_argn("vmpgo_protect_realtime",
5111 	    &vm_pageout_protect_realtime,
5112 	    sizeof(vm_pageout_protect_realtime));
5113 	splx(s);
5114 
5115 	thread_set_thread_name(current_thread(), "VM_pageout_scan");
5116 
5117 	/*
5118 	 *	Initialize some paging parameters.
5119 	 */
5120 
5121 	vm_pageout_state.vm_pressure_thread_running = FALSE;
5122 	vm_pageout_state.vm_pressure_changed = FALSE;
5123 	vm_pageout_state.memorystatus_purge_on_warning = 2;
5124 	vm_pageout_state.memorystatus_purge_on_urgent = 5;
5125 	vm_pageout_state.memorystatus_purge_on_critical = 8;
5126 	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
5127 	vm_pageout_state.vm_page_speculative_percentage = 5;
5128 	vm_pageout_state.vm_page_speculative_target = 0;
5129 
5130 	vm_pageout_state.vm_pageout_external_iothread = THREAD_NULL;
5131 	vm_pageout_state.vm_pageout_internal_iothread = THREAD_NULL;
5132 
5133 	vm_pageout_state.vm_pageout_swap_wait = 0;
5134 	vm_pageout_state.vm_pageout_idle_wait = 0;
5135 	vm_pageout_state.vm_pageout_empty_wait = 0;
5136 	vm_pageout_state.vm_pageout_burst_wait = 0;
5137 	vm_pageout_state.vm_pageout_deadlock_wait = 0;
5138 	vm_pageout_state.vm_pageout_deadlock_relief = 0;
5139 	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
5140 
5141 	vm_pageout_state.vm_pageout_inactive = 0;
5142 	vm_pageout_state.vm_pageout_inactive_used = 0;
5143 	vm_pageout_state.vm_pageout_inactive_clean = 0;
5144 
5145 	vm_pageout_state.vm_memory_pressure = 0;
5146 	vm_pageout_state.vm_page_filecache_min = 0;
5147 #if CONFIG_JETSAM
5148 	vm_pageout_state.vm_page_filecache_min_divisor = 70;
5149 	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
5150 #else
5151 	vm_pageout_state.vm_page_filecache_min_divisor = 27;
5152 	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
5153 #endif
5154 	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
5155 
5156 	vm_pageout_state.vm_pageout_considered_page_last = 0;
5157 
5158 	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
5159 		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
5160 	}
5161 
5162 	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
5163 		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
5164 	}
5165 
5166 	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
5167 		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
5168 	}
5169 
5170 	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
5171 		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
5172 	}
5173 
5174 	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
5175 		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
5176 	}
5177 
5178 	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
5179 		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
5180 	}
5181 
5182 	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
5183 		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
5184 	}
5185 	/*
5186 	 * even if we've already called vm_page_free_reserve
5187 	 * call it again here to insure that the targets are
5188 	 * accurately calculated (it uses vm_page_free_count_init)
5189 	 * calling it with an arg of 0 will not change the reserve
5190 	 * but will re-calculate free_min and free_target
5191 	 */
5192 	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
5193 		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
5194 	} else {
5195 		vm_page_free_reserve(0);
5196 	}
5197 
5198 	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
5199 	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5200 
5201 	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5202 	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5203 	vm_pageout_queue_external.pgo_tid = -1;
5204 
5205 	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5206 	vm_pageout_queue_internal.pgo_tid = -1;
5207 
5208 #if DEVELOPMENT || DEBUG
5209 	bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
5210 	vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
5211 	vm_pageout_queue_internal.pgo_tid = -1;
5212 #endif /* DEVELOPMENT || DEBUG */
5213 
5214 
5215 	/* internal pageout thread started when default pager registered first time */
5216 	/* external pageout and garbage collection threads started here */
5217 
5218 	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
5219 	    BASEPRI_VM,
5220 	    &vm_pageout_state.vm_pageout_external_iothread);
5221 	if (result != KERN_SUCCESS) {
5222 		panic("vm_pageout_iothread_external: create failed");
5223 	}
5224 	thread_set_thread_name(vm_pageout_state.vm_pageout_external_iothread, "VM_pageout_external_iothread");
5225 	thread_deallocate(vm_pageout_state.vm_pageout_external_iothread);
5226 
5227 	thread_mtx_lock(vm_pageout_gc_thread );
5228 	thread_start(vm_pageout_gc_thread );
5229 	thread_mtx_unlock(vm_pageout_gc_thread);
5230 
5231 #if VM_PRESSURE_EVENTS
5232 	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5233 	    BASEPRI_DEFAULT,
5234 	    &thread);
5235 
5236 	if (result != KERN_SUCCESS) {
5237 		panic("vm_pressure_thread: create failed");
5238 	}
5239 
5240 	thread_deallocate(thread);
5241 #endif
5242 
5243 	vm_object_reaper_init();
5244 
5245 
5246 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5247 		vm_compressor_init();
5248 	}
5249 
5250 #if VM_PRESSURE_EVENTS
5251 	vm_pressure_events_enabled = TRUE;
5252 #endif /* VM_PRESSURE_EVENTS */
5253 
5254 #if CONFIG_PHANTOM_CACHE
5255 	vm_phantom_cache_init();
5256 #endif
5257 #if VM_PAGE_BUCKETS_CHECK
5258 #if VM_PAGE_FAKE_BUCKETS
5259 	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5260 	    (uint64_t) vm_page_fake_buckets_start,
5261 	    (uint64_t) vm_page_fake_buckets_end);
5262 	pmap_protect(kernel_pmap,
5263 	    vm_page_fake_buckets_start,
5264 	    vm_page_fake_buckets_end,
5265 	    VM_PROT_READ);
5266 //	*(char *) vm_page_fake_buckets_start = 'x';	/* panic! */
5267 #endif /* VM_PAGE_FAKE_BUCKETS */
5268 #endif /* VM_PAGE_BUCKETS_CHECK */
5269 
5270 #if VM_OBJECT_TRACKING
5271 	vm_object_tracking_init();
5272 #endif /* VM_OBJECT_TRACKING */
5273 
5274 #if __arm64__
5275 //	vm_tests();
5276 #endif /* __arm64__ */
5277 
5278 	vm_pageout_continue();
5279 
5280 	/*
5281 	 * Unreached code!
5282 	 *
5283 	 * The vm_pageout_continue() call above never returns, so the code below is never
5284 	 * executed.  We take advantage of this to declare several DTrace VM related probe
5285 	 * points that our kernel doesn't have an analog for.  These are probe points that
5286 	 * exist in Solaris and are in the DTrace documentation, so people may have written
5287 	 * scripts that use them.  Declaring the probe points here means their scripts will
5288 	 * compile and execute which we want for portability of the scripts, but since this
5289 	 * section of code is never reached, the probe points will simply never fire.  Yes,
5290 	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
5291 	 * Solaris specific VM events in mind, not portability to different VM implementations.
5292 	 */
5293 
5294 	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5295 	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5296 	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5297 	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5298 	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5299 	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5300 	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5301 	/*NOTREACHED*/
5302 }
5303 
5304 
5305 
/*
 * Start the internal pageout (compressor) I/O threads.
 *
 * Sizes the compressor thread pool from the CPU count (overridable via
 * the "vmcomp_threads" boot-arg, clamped to [1, MAX_COMPRESSOR_THREAD_COUNT]
 * and below the CPU count), sizes the internal pageout queue's laundry
 * limit accordingly, carves one scratch buffer per thread out of a single
 * permanent kernel allocation, and spawns one vm_pageout_iothread_internal
 * thread per slot.
 *
 * Returns KERN_SUCCESS, or the failure code of the first
 * kernel_thread_start_priority() call that fails (threads already
 * started by then are left running).
 */
kern_return_t
vm_pageout_internal_start(void)
{
	kern_return_t   result = KERN_SUCCESS;
	host_basic_info_data_t hinfo;
	vm_offset_t     buf, bufsize;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
#define BSD_HOST 1
	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);

	assert(hinfo.max_cpus > 0);

#if !XNU_TARGET_OS_OSX
	vm_pageout_state.vm_compressor_thread_count = 1;
#else /* !XNU_TARGET_OS_OSX */
	if (hinfo.max_cpus > 4) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	} else {
		vm_pageout_state.vm_compressor_thread_count = 1;
	}
#endif /* !XNU_TARGET_OS_OSX */
	/* boot-arg override of the default thread count */
	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
	    sizeof(vm_pageout_state.vm_compressor_thread_count));

#if     __AMP__
	/* when bound to the efficiency cluster, force exactly 2 threads */
	PE_parse_boot_argn("vmcomp_ecluster", &vm_compressor_ebound, sizeof(vm_compressor_ebound));
	if (vm_compressor_ebound) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	}
#endif
	/* clamp: leave at least one CPU free, stay within [1, MAX] */
	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
	}
	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
		vm_pageout_state.vm_compressor_thread_count = 1;
	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
	}

	vm_pageout_queue_internal.pgo_maxlaundry =
	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;

	/* boot-arg override of the computed laundry limit */
	PE_parse_boot_argn("vmpgoi_maxlaundry",
	    &vm_pageout_queue_internal.pgo_maxlaundry,
	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));

#if DEVELOPMENT || DEBUG
	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
#endif /* DEVELOPMENT || DEBUG */

	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;

	/*
	 * one permanent allocation covering every thread's scratch buffer;
	 * KMA_NOFAIL means this allocation cannot fail
	 */
	kmem_alloc(kernel_map, &buf,
	    bufsize * vm_pageout_state.vm_compressor_thread_count,
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_COMPRESSOR);

	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
		ciq[i].id = i;
		ciq[i].q = &vm_pageout_queue_internal;
		ciq[i].current_early_swapout_chead = NULL;
		ciq[i].current_regular_swapout_chead = NULL;
		ciq[i].current_late_swapout_chead = NULL;
		ciq[i].scratch_buf = (char *)(buf + i * bufsize);
#if DEVELOPMENT || DEBUG
		ciq[i].benchmark_q = &vm_pageout_queue_benchmark;
#endif /* DEVELOPMENT || DEBUG */

		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
		    (void *)&ciq[i], BASEPRI_VM,
		    &vm_pageout_state.vm_pageout_internal_iothread);

		if (result == KERN_SUCCESS) {
			/* the running thread holds its own reference; drop ours */
			thread_deallocate(vm_pageout_state.vm_pageout_internal_iothread);
		} else {
			break;
		}
	}
	return result;
}
5389 
5390 #if CONFIG_IOSCHED
5391 /*
5392  * To support I/O Expedite for compressed files we mark the upls with special flags.
5393  * The way decmpfs works is that we create a big upl which marks all the pages needed to
5394  * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5395  * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5396  * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5397  * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5398  * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
 * by the req upl lock (the reverse link doesn't need synchronization since we never inspect this link
5400  * unless the real I/O upl is being destroyed).
5401  */
5402 
5403 
/*
 * Link a real-I/O UPL to the decmpfs request UPL it is servicing, so
 * that expedites against the request can reach the outstanding I/O.
 * Takes a reference on src_upl; the reference and the forward link are
 * dropped in upl_destroy() when the real-I/O UPL goes away.
 */
static void
upl_set_decmp_info(upl_t upl, upl_t src_upl)
{
	assert((src_upl->flags & UPL_DECMP_REQ) != 0);

	upl_lock(src_upl);
	if (src_upl->decmp_io_upl) {
		/*
		 * If there is already an alive real I/O UPL, ignore this new UPL.
		 * This case should rarely happen and even if it does, it just means
		 * that we might issue a spurious expedite which the driver is expected
		 * to handle.
		 */
		upl_unlock(src_upl);
		return;
	}
	src_upl->decmp_io_upl = (void *)upl;
	src_upl->ref_count++;

	upl->flags |= UPL_DECMP_REAL_IO;
	upl->decmp_io_upl = (void *)src_upl;
	upl_unlock(src_upl);
}
5427 #endif /* CONFIG_IOSCHED */
5428 
/* non-zero when per-UPL debug tracking (creator thread, create backtrace) is on */
#if UPL_DEBUG
int     upl_debug_enabled = 1;
#else
int     upl_debug_enabled = 0;
#endif
5434 
/*
 * Allocate and minimally initialize a new UPL.
 *
 * type:  UPL_CREATE_* bits selecting the layout and features:
 *        UPL_CREATE_LITE     - append a one-bit-per-page bitmap
 *        UPL_CREATE_INTERNAL - append a upl_page_info array
 *        UPL_CREATE_IO_TRACKING / UPL_CREATE_EXPEDITE_SUP - I/O
 *        scheduling support (CONFIG_IOSCHED)
 * flags: initial UPL_* flags merged into upl->flags
 * size:  byte size of the range the UPL will describe; rounded up to
 *        a page boundary
 *
 * The trailing-storage sizing here must stay in sync with the free
 * path in upl_destroy().
 */
static upl_t
upl_create(int type, int flags, upl_size_t size)
{
	upl_t   upl;
	vm_size_t       page_field_size = 0;  /* bytes for the LITE bitmap */
	int     upl_flags = 0;
	vm_size_t       upl_size  = sizeof(struct upl);

	assert(page_aligned(size));

	size = round_page_32(size);

	if (type & UPL_CREATE_LITE) {
		/* one bit per page, rounded up to a 4-byte multiple */
		page_field_size = (atop(size) + 7) >> 3;
		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;

		upl_flags |= UPL_LITE;
	}
	if (type & UPL_CREATE_INTERNAL) {
		/* room for one upl_page_info entry per page */
		upl_size += sizeof(struct upl_page_info) * atop(size);

		upl_flags |= UPL_INTERNAL;
	}
	// rdar://88964158
	/* BEGIN IGNORE CODESTYLE */
	__typed_allocators_ignore_push
	upl = (upl_t)kheap_alloc(KHEAP_DEFAULT, upl_size + page_field_size, Z_WAITOK | Z_ZERO);
	__typed_allocators_ignore_pop
	/* END IGNORE CODESTYLE */

	upl->flags = upl_flags | flags;
	upl->ref_count = 1;
	upl_lock_init(upl);
#if CONFIG_IOSCHED
	if (type & UPL_CREATE_IO_TRACKING) {
		/* record the creator's effective I/O policy for the scheduler */
		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
	}

	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
		/* Only support expedite on internal UPLs */
		thread_t        curthread = current_thread();
		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * atop(size), Z_WAITOK | Z_ZERO);
		upl->flags |= UPL_EXPEDITE_SUPPORTED;
		if (curthread->decmp_upl != NULL) {
			/* creator is servicing a decmpfs request: link this UPL to it */
			upl_set_decmp_info(upl, curthread->decmp_upl);
		}
	}
#endif
#if CONFIG_IOSCHED || UPL_DEBUG
	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
		upl->upl_creator = current_thread();
		upl->flags |= UPL_TRACKED_BY_OBJECT;
	}
#endif

#if UPL_DEBUG
	(void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
#endif /* UPL_DEBUG */

	return upl;
}
5496 
/*
 * Free a UPL whose last reference has been dropped (called from
 * upl_deallocate()).  Unlinks any associated decmpfs request UPL,
 * removes the UPL from its object's tracking queue, and releases the
 * storage using the same size computation as upl_create().
 */
static void
upl_destroy(upl_t upl)
{
	int     page_field_size;  /* bit field in word size buf */
	int     size;

//	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);

	if (upl->ext_ref_count) {
		/* external references must be gone before destruction */
		panic("upl(%p) ext_ref_count", upl);
	}

#if CONFIG_IOSCHED
	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
		/*
		 * this real-I/O UPL is linked to a decmpfs request UPL:
		 * break the link and drop the reference taken in
		 * upl_set_decmp_info()
		 */
		upl_t src_upl;
		src_upl = upl->decmp_io_upl;
		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
		upl_lock(src_upl);
		src_upl->decmp_io_upl = NULL;
		upl_unlock(src_upl);
		upl_deallocate(src_upl);
	}
#endif /* CONFIG_IOSCHED */

#if CONFIG_IOSCHED || UPL_DEBUG
	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
	    !(upl->flags & UPL_VECTOR)) {
		vm_object_t     object;

		if (upl->flags & UPL_SHADOWED) {
			object = upl->map_object->shadow;
		} else {
			object = upl->map_object;
		}

		/* unlink from the object's UPL tracking queue */
		vm_object_lock(object);
		queue_remove(&object->uplq, upl, upl_t, uplq);
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
		vm_object_unlock(object);
	}
#endif
	/*
	 * drop a reference on the map_object whether or
	 * not a pageout object is inserted
	 */
	if (upl->flags & UPL_SHADOWED) {
		vm_object_deallocate(upl->map_object);
	}

	if (upl->flags & UPL_DEVICE_MEMORY) {
		size = PAGE_SIZE;
	} else {
		size = upl_adjusted_size(upl, PAGE_MASK);
	}
	page_field_size = 0;

	if (upl->flags & UPL_LITE) {
		/* must mirror the bitmap sizing in upl_create() */
		page_field_size = ((size / PAGE_SIZE) + 7) >> 3;
		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
	}
	upl_lock_destroy(upl);
	upl->vector_upl = (vector_upl_t) 0xfeedbeef;  /* poison to catch use-after-free */

#if CONFIG_IOSCHED
	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * (size / PAGE_SIZE));
	}
#endif

	// rdar://88964158
	__typed_allocators_ignore_push
	if (upl->flags & UPL_INTERNAL) {
		kheap_free(KHEAP_DEFAULT, upl,
		    sizeof(struct upl) +
		    (sizeof(struct upl_page_info) * (size / PAGE_SIZE))
		    + page_field_size);
	} else {
		kheap_free(KHEAP_DEFAULT, upl, sizeof(struct upl) + page_field_size);
	}
	__typed_allocators_ignore_pop
}
5579 
/*
 * Drop a reference on a UPL.  On the final release: tear down any
 * vector-UPL state, run the registered iodone callout (if any), and
 * free the UPL via upl_destroy().
 */
void
upl_deallocate(upl_t upl)
{
	upl_lock(upl);

	if (--upl->ref_count == 0) {
		if (vector_upl_is_valid(upl)) {
			vector_upl_deallocate(upl);
		}
		/* ref_count hit 0: we hold the only reference, safe to unlock */
		upl_unlock(upl);

		if (upl->upl_iodone) {
			upl_callout_iodone(upl);
		}

		upl_destroy(upl);
	} else {
		upl_unlock(upl);
	}
}
5600 
5601 #if CONFIG_IOSCHED
/*
 * Tag a tracked UPL as a decmpfs request and remember it on its
 * creator thread, so that real-I/O UPLs created while servicing the
 * request can be linked back to it (see upl_create()).
 */
void
upl_mark_decmp(upl_t upl)
{
	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
		upl->flags |= UPL_DECMP_REQ;
		upl->upl_creator->decmp_upl = (void *)upl;
	}
}
5610 
/*
 * Clear the in-flight decmpfs request recorded on the creator thread
 * by upl_mark_decmp().
 */
void
upl_unmark_decmp(upl_t upl)
{
	if (upl && (upl->flags & UPL_DECMP_REQ)) {
		upl->upl_creator->decmp_upl = NULL;
	}
}
5618 
5619 #endif /* CONFIG_IOSCHED */
5620 
/*
 * TRUE when a pageout queue's laundry count has reached 80% of its
 * configured maximum, i.e. the queue is backing up.
 */
#define VM_PAGE_Q_BACKING_UP(q)         \
	((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))

boolean_t must_throttle_writes(void);
5625 
5626 boolean_t
must_throttle_writes()5627 must_throttle_writes()
5628 {
5629 	if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5630 	    vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5631 		return TRUE;
5632 	}
5633 
5634 	return FALSE;
5635 }
5636 
/* count of delayed-work context allocation misses (zone was empty) */
int vm_page_delayed_work_ctx_needed = 0;
/* zone backing struct vm_page_delayed_work_ctx allocations */
KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5639 
5640 __startup_func
5641 static void
vm_page_delayed_work_init_ctx(void)5642 vm_page_delayed_work_init_ctx(void)
5643 {
5644 	uint16_t min_delayed_work_ctx_allocated = 16;
5645 
5646 	/*
5647 	 * try really hard to always keep NCPU elements around in the zone
5648 	 * in order for the UPL code to almost always get an element.
5649 	 */
5650 	if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5651 		min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5652 	}
5653 
5654 	zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5655 }
5656 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5657 
5658 struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)5659 vm_page_delayed_work_get_ctx(void)
5660 {
5661 	struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5662 
5663 	dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5664 
5665 	if (__probable(dw_ctx)) {
5666 		dw_ctx->delayed_owner = current_thread();
5667 	} else {
5668 		vm_page_delayed_work_ctx_needed++;
5669 	}
5670 	return dw_ctx ? dw_ctx->dwp : NULL;
5671 }
5672 
5673 void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work * dwp)5674 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5675 {
5676 	struct  vm_page_delayed_work_ctx *ldw_ctx;
5677 
5678 	ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5679 	ldw_ctx->delayed_owner = NULL;
5680 
5681 	zfree(dw_ctx_zone, ldw_ctx);
5682 }
5683 
5684 /*
5685  *	Routine:	vm_object_upl_request
5686  *	Purpose:
5687  *		Cause the population of a portion of a vm_object.
5688  *		Depending on the nature of the request, the pages
 *		returned may contain valid data or be uninitialized.
5690  *		A page list structure, listing the physical pages
5691  *		will be returned upon request.
5692  *		This function is called by the file system or any other
5693  *		supplier of backing store to a pager.
5694  *		IMPORTANT NOTE: The caller must still respect the relationship
5695  *		between the vm_object and its backing memory object.  The
5696  *		caller MUST NOT substitute changes in the backing file
5697  *		without first doing a memory_object_lock_request on the
 *		target range unless it is known that the pages are not
5699  *		shared with another entity at the pager level.
5700  *		Copy_in_to:
5701  *			if a page list structure is present
5702  *			return the mapped physical pages, where a
5703  *			page is not present, return a non-initialized
5704  *			one.  If the no_sync bit is turned on, don't
5705  *			call the pager unlock to synchronize with other
5706  *			possible copies of the page. Leave pages busy
5707  *			in the original object, if a page list structure
5708  *			was specified.  When a commit of the page list
5709  *			pages is done, the dirty bit will be set for each one.
5710  *		Copy_out_from:
5711  *			If a page list structure is present, return
5712  *			all mapped pages.  Where a page does not exist
5713  *			map a zero filled one. Leave pages busy in
5714  *			the original object.  If a page list structure
5715  *			is not specified, this call is a no-op.
5716  *
5717  *		Note:  access of default pager objects has a rather interesting
5718  *		twist.  The caller of this routine, presumably the file system
5719  *		page cache handling code, will never actually make a request
5720  *		against a default pager backed object.  Only the default
5721  *		pager will make requests on backing store related vm_objects
5722  *		In this way the default pager can maintain the relationship
5723  *		between backing store files (abstract memory objects) and
5724  *		the vm_objects (cache objects), they support.
5725  *
5726  */
5727 
5728 __private_extern__ kern_return_t
vm_object_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)5729 vm_object_upl_request(
5730 	vm_object_t             object,
5731 	vm_object_offset_t      offset,
5732 	upl_size_t              size,
5733 	upl_t                   *upl_ptr,
5734 	upl_page_info_array_t   user_page_list,
5735 	unsigned int            *page_list_count,
5736 	upl_control_flags_t     cntrl_flags,
5737 	vm_tag_t                tag)
5738 {
5739 	vm_page_t               dst_page = VM_PAGE_NULL;
5740 	vm_object_offset_t      dst_offset;
5741 	upl_size_t              xfer_size;
5742 	unsigned int            size_in_pages;
5743 	boolean_t               dirty;
5744 	boolean_t               hw_dirty;
5745 	upl_t                   upl = NULL;
5746 	unsigned int            entry;
5747 	vm_page_t               alias_page = NULL;
5748 	int                     refmod_state = 0;
5749 	wpl_array_t             lite_list = NULL;
5750 	vm_object_t             last_copy_object;
5751 	struct  vm_page_delayed_work    dw_array;
5752 	struct  vm_page_delayed_work    *dwp, *dwp_start;
5753 	bool                    dwp_finish_ctx = TRUE;
5754 	int                     dw_count;
5755 	int                     dw_limit;
5756 	int                     io_tracking_flag = 0;
5757 	int                     grab_options;
5758 	int                     page_grab_count = 0;
5759 	ppnum_t                 phys_page;
5760 	pmap_flush_context      pmap_flush_context_storage;
5761 	boolean_t               pmap_flushes_delayed = FALSE;
5762 #if DEVELOPMENT || DEBUG
5763 	task_t                  task = current_task();
5764 #endif /* DEVELOPMENT || DEBUG */
5765 
5766 	dwp_start = dwp = NULL;
5767 
5768 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
5769 		/*
5770 		 * For forward compatibility's sake,
5771 		 * reject any unknown flag.
5772 		 */
5773 		return KERN_INVALID_VALUE;
5774 	}
5775 	if ((!object->internal) && (object->paging_offset != 0)) {
5776 		panic("vm_object_upl_request: external object with non-zero paging offset");
5777 	}
5778 	if (object->phys_contiguous) {
5779 		panic("vm_object_upl_request: contiguous object specified");
5780 	}
5781 
5782 	assertf(page_aligned(offset) && page_aligned(size),
5783 	    "offset 0x%llx size 0x%x",
5784 	    offset, size);
5785 
5786 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5787 
5788 	dw_count = 0;
5789 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5790 	dwp_start = vm_page_delayed_work_get_ctx();
5791 	if (dwp_start == NULL) {
5792 		dwp_start = &dw_array;
5793 		dw_limit = 1;
5794 		dwp_finish_ctx = FALSE;
5795 	}
5796 
5797 	dwp = dwp_start;
5798 
5799 	if (size > MAX_UPL_SIZE_BYTES) {
5800 		size = MAX_UPL_SIZE_BYTES;
5801 	}
5802 
5803 	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5804 		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5805 	}
5806 
5807 #if CONFIG_IOSCHED || UPL_DEBUG
5808 	if (object->io_tracking || upl_debug_enabled) {
5809 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5810 	}
5811 #endif
5812 #if CONFIG_IOSCHED
5813 	if (object->io_tracking) {
5814 		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5815 	}
5816 #endif
5817 
5818 	if (cntrl_flags & UPL_SET_INTERNAL) {
5819 		if (cntrl_flags & UPL_SET_LITE) {
5820 			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5821 
5822 			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5823 			lite_list = (wpl_array_t)
5824 			    (((uintptr_t)user_page_list) +
5825 			    ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
5826 			if (size == 0) {
5827 				user_page_list = NULL;
5828 				lite_list = NULL;
5829 			}
5830 		} else {
5831 			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5832 
5833 			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5834 			if (size == 0) {
5835 				user_page_list = NULL;
5836 			}
5837 		}
5838 	} else {
5839 		if (cntrl_flags & UPL_SET_LITE) {
5840 			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5841 
5842 			lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5843 			if (size == 0) {
5844 				lite_list = NULL;
5845 			}
5846 		} else {
5847 			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5848 		}
5849 	}
5850 	*upl_ptr = upl;
5851 
5852 	if (user_page_list) {
5853 		user_page_list[0].device = FALSE;
5854 	}
5855 
5856 	if (cntrl_flags & UPL_SET_LITE) {
5857 		upl->map_object = object;
5858 	} else {
5859 		upl->map_object = vm_object_allocate(size);
5860 		/*
		 * No need to lock the new object: nobody else knows
5862 		 * about it yet, so it's all ours so far.
5863 		 */
5864 		upl->map_object->shadow = object;
5865 		upl->map_object->pageout = TRUE;
5866 		upl->map_object->can_persist = FALSE;
5867 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5868 		upl->map_object->vo_shadow_offset = offset;
5869 		upl->map_object->wimg_bits = object->wimg_bits;
5870 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
5871 		    "object %p shadow_offset 0x%llx",
5872 		    upl->map_object, upl->map_object->vo_shadow_offset);
5873 
5874 		alias_page = vm_page_grab_fictitious(TRUE);
5875 
5876 		upl->flags |= UPL_SHADOWED;
5877 	}
5878 	if (cntrl_flags & UPL_FOR_PAGEOUT) {
5879 		upl->flags |= UPL_PAGEOUT;
5880 	}
5881 
5882 	vm_object_lock(object);
5883 	vm_object_activity_begin(object);
5884 
5885 	grab_options = 0;
5886 #if CONFIG_SECLUDED_MEMORY
5887 	if (object->can_grab_secluded) {
5888 		grab_options |= VM_PAGE_GRAB_SECLUDED;
5889 	}
5890 #endif /* CONFIG_SECLUDED_MEMORY */
5891 
5892 	/*
5893 	 * we can lock in the paging_offset once paging_in_progress is set
5894 	 */
5895 	upl->u_size = size;
5896 	upl->u_offset = offset + object->paging_offset;
5897 
5898 #if CONFIG_IOSCHED || UPL_DEBUG
5899 	if (object->io_tracking || upl_debug_enabled) {
5900 		vm_object_activity_begin(object);
5901 		queue_enter(&object->uplq, upl, upl_t, uplq);
5902 	}
5903 #endif
5904 	if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
5905 		/*
5906 		 * Honor copy-on-write obligations
5907 		 *
5908 		 * The caller is gathering these pages and
5909 		 * might modify their contents.  We need to
5910 		 * make sure that the copy object has its own
5911 		 * private copies of these pages before we let
5912 		 * the caller modify them.
5913 		 */
5914 		vm_object_update(object,
5915 		    offset,
5916 		    size,
5917 		    NULL,
5918 		    NULL,
5919 		    FALSE,              /* should_return */
5920 		    MEMORY_OBJECT_COPY_SYNC,
5921 		    VM_PROT_NO_CHANGE);
5922 
5923 		VM_PAGEOUT_DEBUG(upl_cow, 1);
5924 		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
5925 	}
5926 	/*
5927 	 * remember which copy object we synchronized with
5928 	 */
5929 	last_copy_object = object->copy;
5930 	entry = 0;
5931 
5932 	xfer_size = size;
5933 	dst_offset = offset;
5934 	size_in_pages = size / PAGE_SIZE;
5935 
5936 	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5937 	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
5938 		object->scan_collisions = 0;
5939 	}
5940 
5941 	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5942 		boolean_t       isSSD = FALSE;
5943 
5944 #if !XNU_TARGET_OS_OSX
5945 		isSSD = TRUE;
5946 #else /* !XNU_TARGET_OS_OSX */
5947 		vnode_pager_get_isSSD(object->pager, &isSSD);
5948 #endif /* !XNU_TARGET_OS_OSX */
5949 		vm_object_unlock(object);
5950 
5951 		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5952 
5953 		if (isSSD == TRUE) {
5954 			delay(1000 * size_in_pages);
5955 		} else {
5956 			delay(5000 * size_in_pages);
5957 		}
5958 		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5959 
5960 		vm_object_lock(object);
5961 	}
5962 
5963 	while (xfer_size) {
5964 		dwp->dw_mask = 0;
5965 
5966 		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
5967 			vm_object_unlock(object);
5968 			alias_page = vm_page_grab_fictitious(TRUE);
5969 			vm_object_lock(object);
5970 		}
5971 		if (cntrl_flags & UPL_COPYOUT_FROM) {
5972 			upl->flags |= UPL_PAGE_SYNC_DONE;
5973 
5974 			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
5975 			    dst_page->vmp_fictitious ||
5976 			    dst_page->vmp_absent ||
5977 			    VMP_ERROR_GET(dst_page) ||
5978 			    dst_page->vmp_cleaning ||
5979 			    (VM_PAGE_WIRED(dst_page))) {
5980 				if (user_page_list) {
5981 					user_page_list[entry].phys_addr = 0;
5982 				}
5983 
5984 				goto try_next_page;
5985 			}
5986 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
5987 
5988 			/*
5989 			 * grab this up front...
			 * a high percentage of the time we're going to
5991 			 * need the hardware modification state a bit later
5992 			 * anyway... so we can eliminate an extra call into
5993 			 * the pmap layer by grabbing it here and recording it
5994 			 */
5995 			if (dst_page->vmp_pmapped) {
5996 				refmod_state = pmap_get_refmod(phys_page);
5997 			} else {
5998 				refmod_state = 0;
5999 			}
6000 
6001 			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
6002 				/*
6003 				 * page is on inactive list and referenced...
6004 				 * reactivate it now... this gets it out of the
6005 				 * way of vm_pageout_scan which would have to
6006 				 * reactivate it upon tripping over it
6007 				 */
6008 				dwp->dw_mask |= DW_vm_page_activate;
6009 			}
6010 			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
6011 				/*
6012 				 * we're only asking for DIRTY pages to be returned
6013 				 */
6014 				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
6015 					/*
6016 					 * if we were the page stolen by vm_pageout_scan to be
6017 					 * cleaned (as opposed to a buddy being clustered in
6018 					 * or this request is not being driven by a PAGEOUT cluster
6019 					 * then we only need to check for the page being dirty or
6020 					 * precious to decide whether to return it
6021 					 */
6022 					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
6023 						goto check_busy;
6024 					}
6025 					goto dont_return;
6026 				}
6027 				/*
6028 				 * this is a request for a PAGEOUT cluster and this page
6029 				 * is merely along for the ride as a 'buddy'... not only
6030 				 * does it have to be dirty to be returned, but it also
6031 				 * can't have been referenced recently...
6032 				 */
6033 				if ((hibernate_cleaning_in_progress == TRUE ||
6034 				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
6035 				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
6036 				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
6037 					goto check_busy;
6038 				}
6039 dont_return:
6040 				/*
6041 				 * if we reach here, we're not to return
6042 				 * the page... go on to the next one
6043 				 */
6044 				if (dst_page->vmp_laundry == TRUE) {
6045 					/*
6046 					 * if we get here, the page is not 'cleaning' (filtered out above).
6047 					 * since it has been referenced, remove it from the laundry
6048 					 * so we don't pay the cost of an I/O to clean a page
6049 					 * we're just going to take back
6050 					 */
6051 					vm_page_lockspin_queues();
6052 
6053 					vm_pageout_steal_laundry(dst_page, TRUE);
6054 					vm_page_activate(dst_page);
6055 
6056 					vm_page_unlock_queues();
6057 				}
6058 				if (user_page_list) {
6059 					user_page_list[entry].phys_addr = 0;
6060 				}
6061 
6062 				goto try_next_page;
6063 			}
6064 check_busy:
6065 			if (dst_page->vmp_busy) {
6066 				if (cntrl_flags & UPL_NOBLOCK) {
6067 					if (user_page_list) {
6068 						user_page_list[entry].phys_addr = 0;
6069 					}
6070 					dwp->dw_mask = 0;
6071 
6072 					goto try_next_page;
6073 				}
6074 				/*
6075 				 * someone else is playing with the
6076 				 * page.  We will have to wait.
6077 				 */
6078 				PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6079 
6080 				continue;
6081 			}
6082 			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6083 				vm_page_lockspin_queues();
6084 
6085 				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6086 					/*
6087 					 * we've buddied up a page for a clustered pageout
6088 					 * that has already been moved to the pageout
6089 					 * queue by pageout_scan... we need to remove
6090 					 * it from the queue and drop the laundry count
6091 					 * on that queue
6092 					 */
6093 					vm_pageout_throttle_up(dst_page);
6094 				}
6095 				vm_page_unlock_queues();
6096 			}
6097 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6098 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6099 
6100 			if (phys_page > upl->highest_page) {
6101 				upl->highest_page = phys_page;
6102 			}
6103 
6104 			assert(!pmap_is_noencrypt(phys_page));
6105 
6106 			if (cntrl_flags & UPL_SET_LITE) {
6107 				unsigned int    pg_num;
6108 
6109 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6110 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6111 				lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
6112 
6113 				if (hw_dirty) {
6114 					if (pmap_flushes_delayed == FALSE) {
6115 						pmap_flush_context_init(&pmap_flush_context_storage);
6116 						pmap_flushes_delayed = TRUE;
6117 					}
6118 					pmap_clear_refmod_options(phys_page,
6119 					    VM_MEM_MODIFIED,
6120 					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6121 					    &pmap_flush_context_storage);
6122 				}
6123 
6124 				/*
6125 				 * Mark original page as cleaning
6126 				 * in place.
6127 				 */
6128 				dst_page->vmp_cleaning = TRUE;
6129 				dst_page->vmp_precious = FALSE;
6130 			} else {
6131 				/*
6132 				 * use pageclean setup, it is more
6133 				 * convenient even for the pageout
6134 				 * cases here
6135 				 */
6136 				vm_object_lock(upl->map_object);
6137 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6138 				vm_object_unlock(upl->map_object);
6139 
6140 				alias_page->vmp_absent = FALSE;
6141 				alias_page = NULL;
6142 			}
6143 			if (dirty) {
6144 				SET_PAGE_DIRTY(dst_page, FALSE);
6145 			} else {
6146 				dst_page->vmp_dirty = FALSE;
6147 			}
6148 
6149 			if (!dirty) {
6150 				dst_page->vmp_precious = TRUE;
6151 			}
6152 
6153 			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6154 				if (!VM_PAGE_WIRED(dst_page)) {
6155 					dst_page->vmp_free_when_done = TRUE;
6156 				}
6157 			}
6158 		} else {
6159 			if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
6160 				/*
6161 				 * Honor copy-on-write obligations
6162 				 *
6163 				 * The copy object has changed since we
6164 				 * last synchronized for copy-on-write.
6165 				 * Another copy object might have been
6166 				 * inserted while we released the object's
6167 				 * lock.  Since someone could have seen the
6168 				 * original contents of the remaining pages
6169 				 * through that new object, we have to
6170 				 * synchronize with it again for the remaining
6171 				 * pages only.  The previous pages are "busy"
6172 				 * so they can not be seen through the new
6173 				 * mapping.  The new mapping will see our
6174 				 * upcoming changes for those previous pages,
6175 				 * but that's OK since they couldn't see what
6176 				 * was there before.  It's just a race anyway
6177 				 * and there's no guarantee of consistency or
6178 				 * atomicity.  We just don't want new mappings
6179 				 * to see both the *before* and *after* pages.
6180 				 */
6181 				if (object->copy != VM_OBJECT_NULL) {
6182 					vm_object_update(
6183 						object,
6184 						dst_offset,/* current offset */
6185 						xfer_size, /* remaining size */
6186 						NULL,
6187 						NULL,
6188 						FALSE,     /* should_return */
6189 						MEMORY_OBJECT_COPY_SYNC,
6190 						VM_PROT_NO_CHANGE);
6191 
6192 					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6193 					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6194 				}
6195 				/*
6196 				 * remember the copy object we synced with
6197 				 */
6198 				last_copy_object = object->copy;
6199 			}
6200 			dst_page = vm_page_lookup(object, dst_offset);
6201 
6202 			if (dst_page != VM_PAGE_NULL) {
6203 				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6204 					/*
6205 					 * skip over pages already present in the cache
6206 					 */
6207 					if (user_page_list) {
6208 						user_page_list[entry].phys_addr = 0;
6209 					}
6210 
6211 					goto try_next_page;
6212 				}
6213 				if (dst_page->vmp_fictitious) {
6214 					panic("need corner case for fictitious page");
6215 				}
6216 
6217 				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6218 					/*
6219 					 * someone else is playing with the
6220 					 * page.  We will have to wait.
6221 					 */
6222 					PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6223 
6224 					continue;
6225 				}
6226 				if (dst_page->vmp_laundry) {
6227 					vm_pageout_steal_laundry(dst_page, FALSE);
6228 				}
6229 			} else {
6230 				if (object->private) {
6231 					/*
6232 					 * This is a nasty wrinkle for users
6233 					 * of upl who encounter device or
6234 					 * private memory however, it is
6235 					 * unavoidable, only a fault can
6236 					 * resolve the actual backing
6237 					 * physical page by asking the
6238 					 * backing device.
6239 					 */
6240 					if (user_page_list) {
6241 						user_page_list[entry].phys_addr = 0;
6242 					}
6243 
6244 					goto try_next_page;
6245 				}
6246 				if (object->scan_collisions) {
6247 					/*
6248 					 * the pageout_scan thread is trying to steal
6249 					 * pages from this object, but has run into our
6250 					 * lock... grab 2 pages from the head of the object...
6251 					 * the first is freed on behalf of pageout_scan, the
6252 					 * 2nd is for our own use... we use vm_object_page_grab
6253 					 * in both cases to avoid taking pages from the free
6254 					 * list since we are under memory pressure and our
6255 					 * lock on this object is getting in the way of
6256 					 * relieving it
6257 					 */
6258 					dst_page = vm_object_page_grab(object);
6259 
6260 					if (dst_page != VM_PAGE_NULL) {
6261 						vm_page_release(dst_page,
6262 						    FALSE);
6263 					}
6264 
6265 					dst_page = vm_object_page_grab(object);
6266 				}
6267 				if (dst_page == VM_PAGE_NULL) {
6268 					/*
6269 					 * need to allocate a page
6270 					 */
6271 					dst_page = vm_page_grab_options(grab_options);
6272 					if (dst_page != VM_PAGE_NULL) {
6273 						page_grab_count++;
6274 					}
6275 				}
6276 				if (dst_page == VM_PAGE_NULL) {
6277 					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6278 						/*
6279 						 * we don't want to stall waiting for pages to come onto the free list
6280 						 * while we're already holding absent pages in this UPL
6281 						 * the caller will deal with the empty slots
6282 						 */
6283 						if (user_page_list) {
6284 							user_page_list[entry].phys_addr = 0;
6285 						}
6286 
6287 						goto try_next_page;
6288 					}
6289 					/*
6290 					 * no pages available... wait
6291 					 * then try again for the same
6292 					 * offset...
6293 					 */
6294 					vm_object_unlock(object);
6295 
6296 					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6297 
6298 					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6299 
6300 					VM_PAGE_WAIT();
6301 					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6302 
6303 					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6304 
6305 					vm_object_lock(object);
6306 
6307 					continue;
6308 				}
6309 				vm_page_insert(dst_page, object, dst_offset);
6310 
6311 				dst_page->vmp_absent = TRUE;
6312 				dst_page->vmp_busy = FALSE;
6313 
6314 				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6315 					/*
6316 					 * if UPL_RET_ONLY_ABSENT was specified,
6317 					 * than we're definitely setting up a
6318 					 * upl for a clustered read/pagein
6319 					 * operation... mark the pages as clustered
6320 					 * so upl_commit_range can put them on the
6321 					 * speculative list
6322 					 */
6323 					dst_page->vmp_clustered = TRUE;
6324 
6325 					if (!(cntrl_flags & UPL_FILE_IO)) {
6326 						counter_inc(&vm_statistics_pageins);
6327 					}
6328 				}
6329 			}
6330 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6331 
6332 			dst_page->vmp_overwriting = TRUE;
6333 
6334 			if (dst_page->vmp_pmapped) {
6335 				if (!(cntrl_flags & UPL_FILE_IO)) {
6336 					/*
6337 					 * eliminate all mappings from the
6338 					 * original object and its prodigy
6339 					 */
6340 					refmod_state = pmap_disconnect(phys_page);
6341 				} else {
6342 					refmod_state = pmap_get_refmod(phys_page);
6343 				}
6344 			} else {
6345 				refmod_state = 0;
6346 			}
6347 
6348 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6349 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6350 
6351 			if (cntrl_flags & UPL_SET_LITE) {
6352 				unsigned int    pg_num;
6353 
6354 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6355 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6356 				lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
6357 
6358 				if (hw_dirty) {
6359 					pmap_clear_modify(phys_page);
6360 				}
6361 
6362 				/*
6363 				 * Mark original page as cleaning
6364 				 * in place.
6365 				 */
6366 				dst_page->vmp_cleaning = TRUE;
6367 				dst_page->vmp_precious = FALSE;
6368 			} else {
6369 				/*
6370 				 * use pageclean setup, it is more
6371 				 * convenient even for the pageout
6372 				 * cases here
6373 				 */
6374 				vm_object_lock(upl->map_object);
6375 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6376 				vm_object_unlock(upl->map_object);
6377 
6378 				alias_page->vmp_absent = FALSE;
6379 				alias_page = NULL;
6380 			}
6381 
6382 			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6383 				upl->flags &= ~UPL_CLEAR_DIRTY;
6384 				upl->flags |= UPL_SET_DIRTY;
6385 				dirty = TRUE;
6386 				/*
6387 				 * Page belonging to a code-signed object is about to
6388 				 * be written. Mark it tainted and disconnect it from
6389 				 * all pmaps so processes have to fault it back in and
6390 				 * deal with the tainted bit.
6391 				 */
6392 				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6393 					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6394 					vm_page_upl_tainted++;
6395 					if (dst_page->vmp_pmapped) {
6396 						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6397 						if (refmod_state & VM_MEM_REFERENCED) {
6398 							dst_page->vmp_reference = TRUE;
6399 						}
6400 					}
6401 				}
6402 			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6403 				/*
6404 				 * clean in place for read implies
6405 				 * that a write will be done on all
6406 				 * the pages that are dirty before
6407 				 * a upl commit is done.  The caller
6408 				 * is obligated to preserve the
6409 				 * contents of all pages marked dirty
6410 				 */
6411 				upl->flags |= UPL_CLEAR_DIRTY;
6412 			}
6413 			dst_page->vmp_dirty = dirty;
6414 
6415 			if (!dirty) {
6416 				dst_page->vmp_precious = TRUE;
6417 			}
6418 
6419 			if (!VM_PAGE_WIRED(dst_page)) {
6420 				/*
6421 				 * deny access to the target page while
6422 				 * it is being worked on
6423 				 */
6424 				dst_page->vmp_busy = TRUE;
6425 			} else {
6426 				dwp->dw_mask |= DW_vm_page_wire;
6427 			}
6428 
6429 			/*
6430 			 * We might be about to satisfy a fault which has been
6431 			 * requested. So no need for the "restart" bit.
6432 			 */
6433 			dst_page->vmp_restart = FALSE;
6434 			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6435 				/*
6436 				 * expect the page to be used
6437 				 */
6438 				dwp->dw_mask |= DW_set_reference;
6439 			}
6440 			if (cntrl_flags & UPL_PRECIOUS) {
6441 				if (object->internal) {
6442 					SET_PAGE_DIRTY(dst_page, FALSE);
6443 					dst_page->vmp_precious = FALSE;
6444 				} else {
6445 					dst_page->vmp_precious = TRUE;
6446 				}
6447 			} else {
6448 				dst_page->vmp_precious = FALSE;
6449 			}
6450 		}
6451 		if (dst_page->vmp_busy) {
6452 			upl->flags |= UPL_HAS_BUSY;
6453 		}
6454 
6455 		if (phys_page > upl->highest_page) {
6456 			upl->highest_page = phys_page;
6457 		}
6458 		assert(!pmap_is_noencrypt(phys_page));
6459 		if (user_page_list) {
6460 			user_page_list[entry].phys_addr = phys_page;
6461 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
6462 			user_page_list[entry].absent    = dst_page->vmp_absent;
6463 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
6464 			user_page_list[entry].precious  = dst_page->vmp_precious;
6465 			user_page_list[entry].device    = FALSE;
6466 			user_page_list[entry].needed    = FALSE;
6467 			if (dst_page->vmp_clustered == TRUE) {
6468 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6469 			} else {
6470 				user_page_list[entry].speculative = FALSE;
6471 			}
6472 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6473 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6474 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6475 			user_page_list[entry].mark      = FALSE;
6476 		}
6477 		/*
6478 		 * if UPL_RET_ONLY_ABSENT is set, then
6479 		 * we are working with a fresh page and we've
6480 		 * just set the clustered flag on it to
6481 		 * indicate that it was drug in as part of a
6482 		 * speculative cluster... so leave it alone
6483 		 */
6484 		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6485 			/*
6486 			 * someone is explicitly grabbing this page...
6487 			 * update clustered and speculative state
6488 			 *
6489 			 */
6490 			if (dst_page->vmp_clustered) {
6491 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
6492 			}
6493 		}
6494 try_next_page:
6495 		if (dwp->dw_mask) {
6496 			if (dwp->dw_mask & DW_vm_page_activate) {
6497 				counter_inc(&vm_statistics_reactivations);
6498 			}
6499 
6500 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6501 
6502 			if (dw_count >= dw_limit) {
6503 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6504 
6505 				dwp = dwp_start;
6506 				dw_count = 0;
6507 			}
6508 		}
6509 		entry++;
6510 		dst_offset += PAGE_SIZE_64;
6511 		xfer_size -= PAGE_SIZE;
6512 	}
6513 	if (dw_count) {
6514 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6515 		dwp = dwp_start;
6516 		dw_count = 0;
6517 	}
6518 
6519 	if (alias_page != NULL) {
6520 		VM_PAGE_FREE(alias_page);
6521 	}
6522 	if (pmap_flushes_delayed == TRUE) {
6523 		pmap_flush(&pmap_flush_context_storage);
6524 	}
6525 
6526 	if (page_list_count != NULL) {
6527 		if (upl->flags & UPL_INTERNAL) {
6528 			*page_list_count = 0;
6529 		} else if (*page_list_count > entry) {
6530 			*page_list_count = entry;
6531 		}
6532 	}
6533 #if UPL_DEBUG
6534 	upl->upl_state = 1;
6535 #endif
6536 	vm_object_unlock(object);
6537 
6538 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6539 #if DEVELOPMENT || DEBUG
6540 	if (task != NULL) {
6541 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
6542 	}
6543 #endif /* DEVELOPMENT || DEBUG */
6544 
6545 	if (dwp_start && dwp_finish_ctx) {
6546 		vm_page_delayed_work_finish_ctx(dwp_start);
6547 		dwp_start = dwp = NULL;
6548 	}
6549 
6550 	return KERN_SUCCESS;
6551 }
6552 
6553 /*
6554  *	Routine:	vm_object_super_upl_request
6555  *	Purpose:
6556  *		Cause the population of a portion of a vm_object
6557  *		in much the same way as memory_object_upl_request.
6558  *		Depending on the nature of the request, the pages
 *		returned may contain valid data or be uninitialized.
6560  *		However, the region may be expanded up to the super
6561  *		cluster size provided.
6562  */
6563 
__private_extern__ kern_return_t
vm_object_super_upl_request(
	vm_object_t object,
	vm_object_offset_t      offset,
	upl_size_t              size,
	upl_size_t              super_cluster,
	upl_t                   *upl,
	upl_page_info_t         *user_page_list,
	unsigned int            *page_list_count,
	upl_control_flags_t     cntrl_flags,
	vm_tag_t                tag)
{
	/*
	 * Reject offsets that precede the object's paging offset, and
	 * vector UPLs, which this expansion path does not handle.
	 */
	if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
		return KERN_FAILURE;
	}

	assert(object->paging_in_progress);
	/* convert from memory-object offset to object-internal offset */
	offset = offset - object->paging_offset;

	if (super_cluster > size) {
		vm_object_offset_t      base_offset;
		upl_size_t              super_size;
		vm_object_size_t        super_size_64;

		/*
		 * Expand the request up to the super-cluster size:
		 * truncate to a super_cluster boundary (mask arithmetic
		 * assumes super_cluster is a power of 2 — consistent with
		 * the alignment check style used here; TODO confirm at
		 * callers), and double the cluster if the original request
		 * spills past the first super-cluster.
		 */
		base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
		super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
		/* clamp the expanded range so it doesn't run past the object */
		super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
		super_size = (upl_size_t) super_size_64;
		assert(super_size == super_size_64);

		if (offset > (base_offset + super_size)) {
			panic("vm_object_super_upl_request: Missed target pageout"
			    " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
			    offset, base_offset, super_size, super_cluster,
			    size, object->paging_offset);
		}
		/*
		 * apparently there is a case where the vm requests a
		 * page to be written out whose offset is beyond the
		 * object size
		 */
		if ((offset + size) > (base_offset + super_size)) {
			super_size_64 = (offset + size) - base_offset;
			super_size = (upl_size_t) super_size_64;
			assert(super_size == super_size_64);
		}

		/* hand the (possibly) widened range to the regular UPL path */
		offset = base_offset;
		size = super_size;
	}
	return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
}
6616 
/*
 * Count of UPLs created over executable mappings; bumped in
 * vm_map_create_upl() when the target entry has VM_PROT_EXECUTE.
 */
int cs_executable_create_upl = 0;
/* BSD-layer helpers used for diagnostics in vm_map_create_upl() */
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
6620 
/*
 * vm_map_create_upl:
 *
 * Resolve the VM object backing [offset, offset + *upl_size) in "map" —
 * descending through submaps, honoring copy-on-write, and optionally
 * syncing COW/shadow data — then create a UPL on that object via
 * vm_object_iopl_request().
 *
 * On success, *upl/*page_list/*count describe the new UPL; *upl_size may
 * be trimmed to what could actually be covered, and *flags is rewritten
 * to report the object type (UPL_DEV_MEMORY / UPL_PHYS_CONTIG).
 * Returns a kern_return_t error and leaves the outputs untouched on
 * failure paths before the iopl request.
 */
kern_return_t
vm_map_create_upl(
	vm_map_t                map,
	vm_map_address_t        offset,
	upl_size_t              *upl_size,
	upl_t                   *upl,
	upl_page_info_array_t   page_list,
	unsigned int            *count,
	upl_control_flags_t     *flags,
	vm_tag_t                tag)
{
	vm_map_entry_t          entry;
	upl_control_flags_t     caller_flags;
	int                     force_data_sync;
	int                     sync_cow_data;
	vm_object_t             local_object;
	vm_map_offset_t         local_offset;
	vm_map_offset_t         local_start;
	kern_return_t           ret;
	vm_map_address_t        original_offset;
	vm_map_size_t           original_size, adjusted_size;
	vm_map_offset_t         local_entry_start;
	vm_object_offset_t      local_entry_offset;
	vm_object_offset_t      offset_in_mapped_page;
	boolean_t               release_map = FALSE;   /* TRUE once "map" is a submap we referenced */

start_with_map:
	/*
	 * Snapshot the caller's request; the submap path below re-enters
	 * here after swapping "map"/"offset" for the submap equivalents.
	 */

	original_offset = offset;
	original_size = *upl_size;
	adjusted_size = original_size;

	caller_flags = *flags;

	if (caller_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		ret = KERN_INVALID_VALUE;
		goto done;
	}
	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);

	if (upl == NULL) {
		ret = KERN_INVALID_ARGUMENT;
		goto done;
	}

REDISCOVER_ENTRY:
	/*
	 * (Re)locate the map entry under the map read lock.  Every path
	 * that drops the lock and needs to retry jumps back here.
	 */
	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, offset, &entry)) {
		vm_map_unlock_read(map);
		ret = KERN_FAILURE;
		goto done;
	}

	local_entry_start = entry->vme_start;
	local_entry_offset = VME_OFFSET(entry);

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
	}

	/* never span past the end of this map entry */
	if (entry->vme_end - original_offset < adjusted_size) {
		adjusted_size = entry->vme_end - original_offset;
		assert(adjusted_size > 0);
		*upl_size = (upl_size_t) adjusted_size;
		assert(*upl_size == adjusted_size);
	}

	if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
		/* caller only wants to know the object type, not a UPL */
		*flags = 0;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) != VM_OBJECT_NULL) {
			if (VME_OBJECT(entry)->private) {
				*flags = UPL_DEV_MEMORY;
			}

			if (VME_OBJECT(entry)->phys_contiguous) {
				*flags |= UPL_PHYS_CONTIG;
			}
		}
		vm_map_unlock_read(map);
		ret = KERN_SUCCESS;
		goto done;
	}

	offset_in_mapped_page = 0;
	if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
		/*
		 * Map uses pages smaller than the kernel's: align the
		 * request to the map's page boundaries and remember the
		 * sub-page offset so it can be re-applied before the
		 * iopl request.
		 */
		offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
		*upl_size = (upl_size_t)
		    (vm_map_round_page(original_offset + adjusted_size,
		    VM_MAP_PAGE_MASK(map))
		    - offset);

		offset_in_mapped_page = original_offset - offset;
		assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));

		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
	}

	if (!entry->is_sub_map) {
		/* cap the UPL size unless the object is physically contiguous */
		if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
		    !VME_OBJECT(entry)->phys_contiguous) {
			if (*upl_size > MAX_UPL_SIZE_BYTES) {
				*upl_size = MAX_UPL_SIZE_BYTES;
			}
		}

		/*
		 *      Create an object if necessary.
		 */
		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			/* need the map lock exclusive to install the object */
			if (vm_map_lock_read_to_write(map)) {
				goto REDISCOVER_ENTRY;
			}

			VME_OBJECT_SET(entry,
			    vm_object_allocate((vm_size_t)
			    vm_object_round_page((entry->vme_end - entry->vme_start))),
			    false, 0);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);

			vm_map_lock_write_to_read(map);
		}

		/* a UPL the caller will write into requires a writable mapping */
		if (!(caller_flags & UPL_COPYOUT_FROM) &&
		    !(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock_read(map);
			ret = KERN_PROTECTION_FAILURE;
			goto done;
		}
	}

#if !XNU_TARGET_OS_OSX
	if (map->pmap != kernel_pmap &&
	    (caller_flags & UPL_COPYOUT_FROM) &&
	    (entry->protection & VM_PROT_EXECUTE) &&
	    !(entry->protection & VM_PROT_WRITE)) {
		vm_offset_t     kaddr;
		vm_size_t       ksize;

		/*
		 * We're about to create a read-only UPL backed by
		 * memory from an executable mapping.
		 * Wiring the pages would result in the pages being copied
		 * (due to the "MAP_PRIVATE" mapping) and no longer
		 * code-signed, so no longer eligible for execution.
		 * Instead, let's copy the data into a kernel buffer and
		 * create the UPL from this kernel buffer.
		 * The kernel buffer is then freed, leaving the UPL holding
		 * the last reference on the VM object, so the memory will
		 * be released when the UPL is committed.
		 */

		vm_map_unlock_read(map);
		entry = VM_MAP_ENTRY_NULL;
		/* allocate kernel buffer */
		ksize = round_page(*upl_size);
		kaddr = 0;
		ret = kmem_alloc(kernel_map, &kaddr, ksize,
		    KMA_PAGEABLE | KMA_DATA, tag);
		if (ret == KERN_SUCCESS) {
			/* copyin the user data */
			ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
		}
		if (ret == KERN_SUCCESS) {
			if (ksize > *upl_size) {
				/* zero out the extra space in kernel buffer */
				memset((void *)(kaddr + *upl_size),
				    0,
				    ksize - *upl_size);
			}
			/* create the UPL from the kernel buffer */
			vm_object_offset_t      offset_in_object;
			vm_object_offset_t      offset_in_object_page;

			offset_in_object = offset - local_entry_start + local_entry_offset;
			offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
			assert(offset_in_object_page < PAGE_SIZE);
			assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
			*upl_size -= offset_in_object_page + offset_in_mapped_page;
			/* recurse on the kernel_map copy of the data */
			ret = vm_map_create_upl(kernel_map,
			    (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
			    upl_size, upl, page_list, count, flags, tag);
		}
		if (kaddr != 0) {
			/* free the kernel buffer */
			kmem_free(kernel_map, kaddr, ksize);
			kaddr = 0;
			ksize = 0;
		}
#if DEVELOPMENT || DEBUG
		DTRACE_VM4(create_upl_from_executable,
		    vm_map_t, map,
		    vm_map_address_t, offset,
		    upl_size_t, *upl_size,
		    kern_return_t, ret);
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}
#endif /* !XNU_TARGET_OS_OSX */

	if (!entry->is_sub_map) {
		local_object = VME_OBJECT(entry);
		assert(local_object != VM_OBJECT_NULL);
	}

	if (!entry->is_sub_map &&
	    !entry->needs_copy &&
	    *upl_size != 0 &&
	    local_object->vo_size > *upl_size && /* partial UPL */
	    entry->wired_count == 0 && /* No COW for entries that are wired */
	    (map->pmap != kernel_pmap) && /* alias checks */
	    (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
	    ||
	    ( /* case 2 */
		    local_object->internal &&
		    (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
		    local_object->ref_count > 1))) {
		vm_prot_t       prot;

		/*
		 * Case 1:
		 * Set up the targeted range for copy-on-write to avoid
		 * applying true_share/copy_delay to the entire object.
		 *
		 * Case 2:
		 * This map entry covers only part of an internal
		 * object.  There could be other map entries covering
		 * other areas of this object and some of these map
		 * entries could be marked as "needs_copy", which
		 * assumes that the object is COPY_SYMMETRIC.
		 * To avoid marking this object as COPY_DELAY and
		 * "true_share", let's shadow it and mark the new
		 * (smaller) object as "true_share" and COPY_DELAY.
		 */

		if (vm_map_lock_read_to_write(map)) {
			goto REDISCOVER_ENTRY;
		}
		vm_map_lock_assert_exclusive(map);
		assert(VME_OBJECT(entry) == local_object);

		/* narrow the entry to just the range being targeted */
		vm_map_clip_start(map,
		    entry,
		    vm_map_trunc_page(offset,
		    VM_MAP_PAGE_MASK(map)));
		vm_map_clip_end(map,
		    entry,
		    vm_map_round_page(offset + *upl_size,
		    VM_MAP_PAGE_MASK(map)));
		if ((entry->vme_end - offset) < *upl_size) {
			*upl_size = (upl_size_t) (entry->vme_end - offset);
			assert(*upl_size == entry->vme_end - offset);
		}

		/* revoke write access so future writes fault and COW */
		prot = entry->protection & ~VM_PROT_WRITE;
		if (override_nx(map, VME_ALIAS(entry)) && prot) {
			prot |= VM_PROT_EXECUTE;
		}
		vm_object_pmap_protect(local_object,
		    VME_OFFSET(entry),
		    entry->vme_end - entry->vme_start,
		    ((entry->is_shared ||
		    map->mapped_in_other_pmaps)
		    ? PMAP_NULL
		    : map->pmap),
		    VM_MAP_PAGE_SIZE(map),
		    entry->vme_start,
		    prot);

		assert(entry->wired_count == 0);

		/*
		 * Lock the VM object and re-check its status: if it's mapped
		 * in another address space, we could still be racing with
		 * another thread holding that other VM map exclusively.
		 */
		vm_object_lock(local_object);
		if (local_object->true_share) {
			/* object is already in proper state: no COW needed */
			assert(local_object->copy_strategy !=
			    MEMORY_OBJECT_COPY_SYMMETRIC);
		} else {
			/* not true_share: ask for copy-on-write below */
			assert(local_object->copy_strategy ==
			    MEMORY_OBJECT_COPY_SYMMETRIC);
			entry->needs_copy = TRUE;
		}
		vm_object_unlock(local_object);

		vm_map_lock_write_to_read(map);
	}

	if (entry->needs_copy) {
		/*
		 * Honor copy-on-write for COPY_SYMMETRIC
		 * strategy.
		 */
		vm_map_t                local_map;
		vm_object_t             object;
		vm_object_offset_t      new_offset;
		vm_prot_t               prot;
		boolean_t               wired;
		vm_map_version_t        version;
		vm_map_t                real_map;
		vm_prot_t               fault_type;

		local_map = map;

		if (caller_flags & UPL_COPYOUT_FROM) {
			fault_type = VM_PROT_READ | VM_PROT_COPY;
			vm_counters.create_upl_extra_cow++;
			vm_counters.create_upl_extra_cow_pages +=
			    (entry->vme_end - entry->vme_start) / PAGE_SIZE;
		} else {
			fault_type = VM_PROT_WRITE;
		}
		/*
		 * Fault the range in (with COW resolution); we only need
		 * the side effects, so the resulting object is unlocked
		 * and we retry the entry lookup.
		 */
		if (vm_map_lookup_and_lock_object(&local_map,
		    offset, fault_type,
		    OBJECT_LOCK_EXCLUSIVE,
		    &version, &object,
		    &new_offset, &prot, &wired,
		    NULL,
		    &real_map, NULL) != KERN_SUCCESS) {
			if (fault_type == VM_PROT_WRITE) {
				vm_counters.create_upl_lookup_failure_write++;
			} else {
				vm_counters.create_upl_lookup_failure_copy++;
			}
			vm_map_unlock_read(local_map);
			ret = KERN_FAILURE;
			goto done;
		}
		if (real_map != local_map) {
			vm_map_unlock(real_map);
		}
		vm_map_unlock_read(local_map);

		vm_object_unlock(object);

		goto REDISCOVER_ENTRY;
	}

	if (entry->is_sub_map) {
		vm_map_t        submap;

		/*
		 * Descend into the submap: take a reference on it, make it
		 * the current map, translate the offset, and restart.
		 */
		submap = VME_SUBMAP(entry);
		local_start = entry->vme_start;
		local_offset = (vm_map_offset_t)VME_OFFSET(entry);

		vm_map_reference(submap);
		vm_map_unlock_read(map);

		DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
		offset += offset_in_mapped_page;
		*upl_size -= offset_in_mapped_page;

		if (release_map) {
			/* drop the reference we took on the previous submap */
			vm_map_deallocate(map);
		}
		map = submap;
		release_map = TRUE;
		offset = local_offset + (offset - local_start);
		goto start_with_map;
	}

	if (sync_cow_data &&
	    (VME_OBJECT(entry)->shadow ||
	    VME_OBJECT(entry)->copy)) {
		/*
		 * Caller intends to read (no UPL_COPYOUT_FROM): sync the
		 * shadow object's data for this range first, then retry.
		 * Done at most once (sync_cow_data is cleared below).
		 */
		local_object = VME_OBJECT(entry);
		local_start = entry->vme_start;
		local_offset = (vm_map_offset_t)VME_OFFSET(entry);

		vm_object_reference(local_object);
		vm_map_unlock_read(map);

		if (local_object->shadow && local_object->copy) {
			vm_object_lock_request(local_object->shadow,
			    ((vm_object_offset_t)
			    ((offset - local_start) +
			    local_offset) +
			    local_object->vo_shadow_offset),
			    *upl_size, FALSE,
			    MEMORY_OBJECT_DATA_SYNC,
			    VM_PROT_NO_CHANGE);
		}
		sync_cow_data = FALSE;
		vm_object_deallocate(local_object);

		goto REDISCOVER_ENTRY;
	}
	if (force_data_sync) {
		/*
		 * UPL_FORCE_DATA_SYNC: sync the object itself for this
		 * range, then retry.  Also done at most once.
		 */
		local_object = VME_OBJECT(entry);
		local_start = entry->vme_start;
		local_offset = (vm_map_offset_t)VME_OFFSET(entry);

		vm_object_reference(local_object);
		vm_map_unlock_read(map);

		vm_object_lock_request(local_object,
		    ((vm_object_offset_t)
		    ((offset - local_start) +
		    local_offset)),
		    (vm_object_size_t)*upl_size,
		    FALSE,
		    MEMORY_OBJECT_DATA_SYNC,
		    VM_PROT_NO_CHANGE);

		force_data_sync = FALSE;
		vm_object_deallocate(local_object);

		goto REDISCOVER_ENTRY;
	}
	/* report the object type back to the caller */
	if (VME_OBJECT(entry)->private) {
		*flags = UPL_DEV_MEMORY;
	} else {
		*flags = 0;
	}

	if (VME_OBJECT(entry)->phys_contiguous) {
		*flags |= UPL_PHYS_CONTIG;
	}

	local_object = VME_OBJECT(entry);
	local_offset = (vm_map_offset_t)VME_OFFSET(entry);
	local_start = entry->vme_start;

	/*
	 * Wiring will copy the pages to the shadow object.
	 * The shadow object will not be code-signed so
	 * attempting to execute code from these copied pages
	 * would trigger a code-signing violation.
	 */
	if (entry->protection & VM_PROT_EXECUTE) {
#if MACH_ASSERT
		printf("pid %d[%s] create_upl out of executable range from "
		    "0x%llx to 0x%llx: side effects may include "
		    "code-signing violations later on\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    (uint64_t) entry->vme_start,
		    (uint64_t) entry->vme_end);
#endif /* MACH_ASSERT */
		DTRACE_VM2(cs_executable_create_upl,
		    uint64_t, (uint64_t)entry->vme_start,
		    uint64_t, (uint64_t)entry->vme_end);
		cs_executable_create_upl++;
	}

	vm_object_lock(local_object);

	/*
	 * Ensure that this object is "true_share" and "copy_delay" now,
	 * while we're still holding the VM map lock.  After we unlock the map,
	 * anything could happen to that mapping, including some copy-on-write
	 * activity.  We need to make sure that the IOPL will point at the
	 * same memory as the mapping.
	 */
	if (local_object->true_share) {
		assert(local_object->copy_strategy !=
		    MEMORY_OBJECT_COPY_SYMMETRIC);
	} else if (local_object != kernel_object &&
	    local_object != compressor_object &&
	    !local_object->phys_contiguous) {
#if VM_OBJECT_TRACKING_OP_TRUESHARE
		if (!local_object->true_share &&
		    vm_object_tracking_btlog) {
			btlog_record(vm_object_tracking_btlog, local_object,
			    VM_OBJECT_TRACKING_OP_TRUESHARE,
			    btref_get(__builtin_frame_address(0), 0));
		}
#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
		local_object->true_share = TRUE;
		if (local_object->copy_strategy ==
		    MEMORY_OBJECT_COPY_SYMMETRIC) {
			local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
	}

	/* keep the object alive across the unlocked iopl request */
	vm_object_reference_locked(local_object);
	vm_object_unlock(local_object);

	vm_map_unlock_read(map);

	/* re-apply the sub-page offset saved for small-page maps */
	offset += offset_in_mapped_page;
	assert(*upl_size > offset_in_mapped_page);
	*upl_size -= offset_in_mapped_page;

	ret = vm_object_iopl_request(local_object,
	    ((vm_object_offset_t)
	    ((offset - local_start) + local_offset)),
	    *upl_size,
	    upl,
	    page_list,
	    count,
	    caller_flags,
	    tag);
	vm_object_deallocate(local_object);

done:
	if (release_map) {
		/* balance the reference taken when we descended into a submap */
		vm_map_deallocate(map);
	}

	return ret;
}
7136 
7137 /*
7138  * Internal routine to enter a UPL into a VM map.
7139  *
7140  * JMM - This should just be doable through the standard
7141  * vm_map_enter() API.
7142  */
7143 kern_return_t
vm_map_enter_upl_range(vm_map_t map,upl_t upl,vm_object_offset_t offset_to_map,upl_size_t size_to_map,vm_prot_t prot_to_map,vm_map_offset_t * dst_addr)7144 vm_map_enter_upl_range(
7145 	vm_map_t                map,
7146 	upl_t                   upl,
7147 	vm_object_offset_t      offset_to_map,
7148 	upl_size_t              size_to_map,
7149 	vm_prot_t               prot_to_map,
7150 	vm_map_offset_t         *dst_addr)
7151 {
7152 	vm_map_size_t           size;
7153 	vm_object_offset_t      offset;
7154 	vm_map_offset_t         addr;
7155 	vm_page_t               m;
7156 	kern_return_t           kr;
7157 	int                     isVectorUPL = 0, curr_upl = 0;
7158 	upl_t                   vector_upl = NULL;
7159 	mach_vm_offset_t        vector_upl_dst_addr = 0;
7160 	vm_map_t                vector_upl_submap = NULL;
7161 	upl_offset_t            subupl_offset = 0;
7162 	upl_size_t              subupl_size = 0;
7163 
7164 	if (upl == UPL_NULL) {
7165 		return KERN_INVALID_ARGUMENT;
7166 	}
7167 
7168 	DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7169 	assert(map == kernel_map);
7170 
7171 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7172 		int mapped = 0, valid_upls = 0;
7173 		vector_upl = upl;
7174 
7175 		upl_lock(vector_upl);
7176 		for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
7177 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7178 			if (upl == NULL) {
7179 				continue;
7180 			}
7181 			valid_upls++;
7182 			if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7183 				mapped++;
7184 			}
7185 		}
7186 
7187 		if (mapped) {
7188 			if (mapped != valid_upls) {
7189 				panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7190 			} else {
7191 				upl_unlock(vector_upl);
7192 				return KERN_FAILURE;
7193 			}
7194 		}
7195 
7196 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7197 			panic("TODO4K: vector UPL not implemented");
7198 		}
7199 
7200 		vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7201 		    vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7202 		    VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7203 		    VM_KERN_MEMORY_NONE).kmr_submap;
7204 		map = vector_upl_submap;
7205 		vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7206 		curr_upl = 0;
7207 	} else {
7208 		upl_lock(upl);
7209 	}
7210 
7211 process_upl_to_enter:
7212 	if (isVectorUPL) {
7213 		if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
7214 			*dst_addr = vector_upl_dst_addr;
7215 			upl_unlock(vector_upl);
7216 			return KERN_SUCCESS;
7217 		}
7218 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7219 		if (upl == NULL) {
7220 			goto process_upl_to_enter;
7221 		}
7222 
7223 		vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7224 		*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7225 	} else {
7226 		/*
7227 		 * check to see if already mapped
7228 		 */
7229 		if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7230 			upl_unlock(upl);
7231 			return KERN_FAILURE;
7232 		}
7233 	}
7234 
7235 	if ((!(upl->flags & UPL_SHADOWED)) &&
7236 	    ((upl->flags & UPL_HAS_BUSY) ||
7237 	    !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7238 		vm_object_t             object;
7239 		vm_page_t               alias_page;
7240 		vm_object_offset_t      new_offset;
7241 		unsigned int            pg_num;
7242 		wpl_array_t             lite_list;
7243 
7244 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7245 		if (upl->flags & UPL_INTERNAL) {
7246 			lite_list = (wpl_array_t)
7247 			    ((((uintptr_t)upl) + sizeof(struct upl))
7248 			    + ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
7249 		} else {
7250 			lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
7251 		}
7252 		object = upl->map_object;
7253 		upl->map_object = vm_object_allocate(vm_object_round_page(size));
7254 
7255 		vm_object_lock(upl->map_object);
7256 
7257 		upl->map_object->shadow = object;
7258 		upl->map_object->pageout = TRUE;
7259 		upl->map_object->can_persist = FALSE;
7260 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7261 		upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7262 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
7263 		    "object %p shadow_offset 0x%llx",
7264 		    upl->map_object,
7265 		    (uint64_t)upl->map_object->vo_shadow_offset);
7266 		upl->map_object->wimg_bits = object->wimg_bits;
7267 		offset = upl->map_object->vo_shadow_offset;
7268 		new_offset = 0;
7269 
7270 		upl->flags |= UPL_SHADOWED;
7271 
7272 		while (size) {
7273 			pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7274 			assert(pg_num == new_offset / PAGE_SIZE);
7275 
7276 			if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
7277 				alias_page = vm_page_grab_fictitious(TRUE);
7278 
7279 				vm_object_lock(object);
7280 
7281 				m = vm_page_lookup(object, offset);
7282 				if (m == VM_PAGE_NULL) {
7283 					panic("vm_upl_map: page missing");
7284 				}
7285 
7286 				/*
7287 				 * Convert the fictitious page to a private
7288 				 * shadow of the real page.
7289 				 */
7290 				assert(alias_page->vmp_fictitious);
7291 				alias_page->vmp_fictitious = FALSE;
7292 				alias_page->vmp_private = TRUE;
7293 				alias_page->vmp_free_when_done = TRUE;
7294 				/*
7295 				 * since m is a page in the upl it must
7296 				 * already be wired or BUSY, so it's
7297 				 * safe to assign the underlying physical
7298 				 * page to the alias
7299 				 */
7300 				VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7301 
7302 				vm_object_unlock(object);
7303 
7304 				vm_page_lockspin_queues();
7305 				vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7306 				vm_page_unlock_queues();
7307 
7308 				vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7309 
7310 				assert(!alias_page->vmp_wanted);
7311 				alias_page->vmp_busy = FALSE;
7312 				alias_page->vmp_absent = FALSE;
7313 			}
7314 			size -= PAGE_SIZE;
7315 			offset += PAGE_SIZE_64;
7316 			new_offset += PAGE_SIZE_64;
7317 		}
7318 		vm_object_unlock(upl->map_object);
7319 	}
7320 	if (upl->flags & UPL_SHADOWED) {
7321 		if (isVectorUPL) {
7322 			offset = 0;
7323 		} else {
7324 			offset = offset_to_map;
7325 		}
7326 	} else {
7327 		offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7328 		if (!isVectorUPL) {
7329 			offset += offset_to_map;
7330 		}
7331 	}
7332 
7333 	if (isVectorUPL) {
7334 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7335 	} else {
7336 		size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7337 	}
7338 
7339 	vm_object_reference(upl->map_object);
7340 
7341 	if (!isVectorUPL) {
7342 		*dst_addr = 0;
7343 		/*
7344 		 * NEED A UPL_MAP ALIAS
7345 		 */
7346 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7347 		    VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_DATA, VM_KERN_MEMORY_OSFMK,
7348 		    upl->map_object, offset, FALSE,
7349 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7350 
7351 		if (kr != KERN_SUCCESS) {
7352 			vm_object_deallocate(upl->map_object);
7353 			upl_unlock(upl);
7354 			return kr;
7355 		}
7356 	} else {
7357 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7358 		    VM_FLAGS_FIXED, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_OSFMK,
7359 		    upl->map_object, offset, FALSE,
7360 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7361 		if (kr) {
7362 			panic("vm_map_enter failed for a Vector UPL");
7363 		}
7364 	}
7365 	upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7366 	                                        /* this will have to be an increment rather than */
7367 	                                        /* an assignment. */
7368 	vm_object_lock(upl->map_object);
7369 
7370 	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7371 		m = vm_page_lookup(upl->map_object, offset);
7372 
7373 		if (m) {
7374 			m->vmp_pmapped = TRUE;
7375 
7376 			/* CODE SIGNING ENFORCEMENT: page has been wpmapped,
7377 			 * but only in kernel space. If this was on a user map,
7378 			 * we'd have to set the wpmapped bit. */
7379 			/* m->vmp_wpmapped = TRUE; */
7380 			assert(map->pmap == kernel_pmap);
7381 
7382 			PMAP_ENTER(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE, kr);
7383 
7384 			assert(kr == KERN_SUCCESS);
7385 #if KASAN
7386 			kasan_notify_address(addr, PAGE_SIZE_64);
7387 #endif
7388 		}
7389 		offset += PAGE_SIZE_64;
7390 	}
7391 	vm_object_unlock(upl->map_object);
7392 
7393 	/*
7394 	 * hold a reference for the mapping
7395 	 */
7396 	upl->ref_count++;
7397 	upl->flags |= UPL_PAGE_LIST_MAPPED;
7398 	upl->kaddr = (vm_offset_t) *dst_addr;
7399 	assert(upl->kaddr == *dst_addr);
7400 
7401 	if (isVectorUPL) {
7402 		goto process_upl_to_enter;
7403 	}
7404 
7405 	if (!isVectorUPL) {
7406 		vm_map_offset_t addr_adjustment;
7407 
7408 		addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7409 		if (addr_adjustment) {
7410 			assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7411 			DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7412 			*dst_addr += addr_adjustment;
7413 		}
7414 	}
7415 
7416 	upl_unlock(upl);
7417 
7418 	return KERN_SUCCESS;
7419 }
7420 
7421 kern_return_t
vm_map_enter_upl(vm_map_t map,upl_t upl,vm_map_offset_t * dst_addr)7422 vm_map_enter_upl(
7423 	vm_map_t                map,
7424 	upl_t                   upl,
7425 	vm_map_offset_t         *dst_addr)
7426 {
7427 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7428 	return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7429 }
7430 
7431 /*
7432  * Internal routine to remove a UPL mapping from a VM map.
7433  *
7434  * XXX - This should just be doable through a standard
7435  * vm_map_remove() operation.  Otherwise, implicit clean-up
7436  * of the target map won't be able to correctly remove
7437  * these (and release the reference on the UPL).  Having
7438  * to do this means we can't map these into user-space
7439  * maps yet.
7440  */
/*
 * Remove the mapping of "upl" from "map".
 *
 * "offset_to_unmap" and "size_to_unmap" are currently unused: the whole
 * mapped range (upl->kaddr .. upl->kaddr + upl->u_mapped_size) is removed.
 *
 * Returns:
 *	KERN_INVALID_ARGUMENT	upl is NULL
 *	KERN_FAILURE		the UPL (or its sub-UPLs) is not mapped
 *	KERN_SUCCESS		the mapping was removed
 */
kern_return_t
vm_map_remove_upl_range(
	vm_map_t        map,
	upl_t           upl,
	__unused vm_object_offset_t    offset_to_unmap,
	__unused upl_size_t      size_to_unmap)
{
	vm_address_t    addr;
	upl_size_t      size;
	int             isVectorUPL = 0, curr_upl = 0;
	upl_t           vector_upl = NULL;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		int     unmapped = 0, valid_upls = 0;
		vector_upl = upl;
		upl_lock(vector_upl);
		/*
		 * A vector UPL is only unmapped as a whole: count the
		 * sub-UPLs that exist and how many of those are not
		 * currently mapped.
		 */
		for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
			if (upl == NULL) {
				continue;
			}
			valid_upls++;
			if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
				unmapped++;
			}
		}

		if (unmapped) {
			if (unmapped != valid_upls) {
				/* a partially-mapped vector UPL should be impossible */
				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
			} else {
				upl_unlock(vector_upl);
				return KERN_FAILURE;
			}
		}
		curr_upl = 0;
	} else {
		upl_lock(upl);
	}

process_upl_to_remove:
	if (isVectorUPL) {
		if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
			/*
			 * All sub-UPLs have been processed: tear down the
			 * submap that held their mappings and finish.
			 */
			vm_map_t v_upl_submap;
			vm_offset_t v_upl_submap_dst_addr;
			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);

			kmem_free_guard(map, v_upl_submap_dst_addr,
			    vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
			vm_map_deallocate(v_upl_submap);
			upl_unlock(vector_upl);
			return KERN_SUCCESS;
		}

		/* advance to the next sub-UPL, skipping empty slots */
		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
		if (upl == NULL) {
			goto process_upl_to_remove;
		}
	}

	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
		addr = upl->kaddr;
		size = upl->u_mapped_size;

		assert(upl->ref_count > 1);
		upl->ref_count--;               /* removing mapping ref */

		/* mark the UPL unmapped before dropping the lock */
		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
		upl->kaddr = (vm_offset_t) 0;
		upl->u_mapped_size = 0;

		if (isVectorUPL) {
			/*
			 * If it's a Vectored UPL, we'll be removing the entire
			 * submap anyways, so no need to remove individual UPL
			 * element mappings from within the submap
			 */
			goto process_upl_to_remove;
		}

		upl_unlock(upl);

		vm_map_remove(map,
		    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
		    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
		return KERN_SUCCESS;
	}
	upl_unlock(upl);

	return KERN_FAILURE;
}
7536 
7537 kern_return_t
vm_map_remove_upl(vm_map_t map,upl_t upl)7538 vm_map_remove_upl(
7539 	vm_map_t        map,
7540 	upl_t           upl)
7541 {
7542 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7543 	return vm_map_remove_upl_range(map, upl, 0, upl_size);
7544 }
7545 
7546 kern_return_t
upl_commit_range(upl_t upl,upl_offset_t offset,upl_size_t size,int flags,upl_page_info_t * page_list,mach_msg_type_number_t count,boolean_t * empty)7547 upl_commit_range(
7548 	upl_t                   upl,
7549 	upl_offset_t            offset,
7550 	upl_size_t              size,
7551 	int                     flags,
7552 	upl_page_info_t         *page_list,
7553 	mach_msg_type_number_t  count,
7554 	boolean_t               *empty)
7555 {
7556 	upl_size_t              xfer_size, subupl_size;
7557 	vm_object_t             shadow_object;
7558 	vm_object_t             object;
7559 	vm_object_t             m_object;
7560 	vm_object_offset_t      target_offset;
7561 	upl_offset_t            subupl_offset = offset;
7562 	int                     entry;
7563 	wpl_array_t             lite_list;
7564 	int                     occupied;
7565 	int                     clear_refmod = 0;
7566 	int                     pgpgout_count = 0;
7567 	struct  vm_page_delayed_work    dw_array;
7568 	struct  vm_page_delayed_work    *dwp, *dwp_start;
7569 	bool                    dwp_finish_ctx = TRUE;
7570 	int                     dw_count;
7571 	int                     dw_limit;
7572 	int                     isVectorUPL = 0;
7573 	upl_t                   vector_upl = NULL;
7574 	boolean_t               should_be_throttled = FALSE;
7575 
7576 	vm_page_t               nxt_page = VM_PAGE_NULL;
7577 	int                     fast_path_possible = 0;
7578 	int                     fast_path_full_commit = 0;
7579 	int                     throttle_page = 0;
7580 	int                     unwired_count = 0;
7581 	int                     local_queue_count = 0;
7582 	vm_page_t               first_local, last_local;
7583 	vm_object_offset_t      obj_start, obj_end, obj_offset;
7584 	kern_return_t           kr = KERN_SUCCESS;
7585 
7586 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7587 
7588 	dwp_start = dwp = NULL;
7589 
7590 	subupl_size = size;
7591 	*empty = FALSE;
7592 
7593 	if (upl == UPL_NULL) {
7594 		return KERN_INVALID_ARGUMENT;
7595 	}
7596 
7597 	dw_count = 0;
7598 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7599 	dwp_start = vm_page_delayed_work_get_ctx();
7600 	if (dwp_start == NULL) {
7601 		dwp_start = &dw_array;
7602 		dw_limit = 1;
7603 		dwp_finish_ctx = FALSE;
7604 	}
7605 
7606 	dwp = dwp_start;
7607 
7608 	if (count == 0) {
7609 		page_list = NULL;
7610 	}
7611 
7612 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7613 		vector_upl = upl;
7614 		upl_lock(vector_upl);
7615 	} else {
7616 		upl_lock(upl);
7617 	}
7618 
7619 process_upl_to_commit:
7620 
7621 	if (isVectorUPL) {
7622 		size = subupl_size;
7623 		offset = subupl_offset;
7624 		if (size == 0) {
7625 			upl_unlock(vector_upl);
7626 			kr = KERN_SUCCESS;
7627 			goto done;
7628 		}
7629 		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7630 		if (upl == NULL) {
7631 			upl_unlock(vector_upl);
7632 			kr = KERN_FAILURE;
7633 			goto done;
7634 		}
7635 		page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
7636 		subupl_size -= size;
7637 		subupl_offset += size;
7638 	}
7639 
7640 #if UPL_DEBUG
7641 	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7642 		(void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7643 
7644 		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7645 		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7646 
7647 		upl->upl_commit_index++;
7648 	}
7649 #endif
7650 	if (upl->flags & UPL_DEVICE_MEMORY) {
7651 		xfer_size = 0;
7652 	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7653 		xfer_size = size;
7654 	} else {
7655 		if (!isVectorUPL) {
7656 			upl_unlock(upl);
7657 		} else {
7658 			upl_unlock(vector_upl);
7659 		}
7660 		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7661 		kr = KERN_FAILURE;
7662 		goto done;
7663 	}
7664 	if (upl->flags & UPL_SET_DIRTY) {
7665 		flags |= UPL_COMMIT_SET_DIRTY;
7666 	}
7667 	if (upl->flags & UPL_CLEAR_DIRTY) {
7668 		flags |= UPL_COMMIT_CLEAR_DIRTY;
7669 	}
7670 
7671 	if (upl->flags & UPL_INTERNAL) {
7672 		lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
7673 		    + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));
7674 	} else {
7675 		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
7676 	}
7677 
7678 	object = upl->map_object;
7679 
7680 	if (upl->flags & UPL_SHADOWED) {
7681 		vm_object_lock(object);
7682 		shadow_object = object->shadow;
7683 	} else {
7684 		shadow_object = object;
7685 	}
7686 	entry = offset / PAGE_SIZE;
7687 	target_offset = (vm_object_offset_t)offset;
7688 
7689 	if (upl->flags & UPL_KERNEL_OBJECT) {
7690 		vm_object_lock_shared(shadow_object);
7691 	} else {
7692 		vm_object_lock(shadow_object);
7693 	}
7694 
7695 	VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7696 
7697 	if (upl->flags & UPL_ACCESS_BLOCKED) {
7698 		assert(shadow_object->blocked_access);
7699 		shadow_object->blocked_access = FALSE;
7700 		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7701 	}
7702 
7703 	if (shadow_object->code_signed) {
7704 		/*
7705 		 * CODE SIGNING:
7706 		 * If the object is code-signed, do not let this UPL tell
7707 		 * us if the pages are valid or not.  Let the pages be
7708 		 * validated by VM the normal way (when they get mapped or
7709 		 * copied).
7710 		 */
7711 		flags &= ~UPL_COMMIT_CS_VALIDATED;
7712 	}
7713 	if (!page_list) {
7714 		/*
7715 		 * No page list to get the code-signing info from !?
7716 		 */
7717 		flags &= ~UPL_COMMIT_CS_VALIDATED;
7718 	}
7719 	if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7720 		should_be_throttled = TRUE;
7721 	}
7722 
7723 	if ((upl->flags & UPL_IO_WIRE) &&
7724 	    !(flags & UPL_COMMIT_FREE_ABSENT) &&
7725 	    !isVectorUPL &&
7726 	    shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7727 	    shadow_object->purgable != VM_PURGABLE_EMPTY) {
7728 		if (!vm_page_queue_empty(&shadow_object->memq)) {
7729 			if (size == shadow_object->vo_size) {
7730 				nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7731 				fast_path_full_commit = 1;
7732 			}
7733 			fast_path_possible = 1;
7734 
7735 			if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7736 			    (shadow_object->purgable == VM_PURGABLE_DENY ||
7737 			    shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7738 			    shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7739 				throttle_page = 1;
7740 			}
7741 		}
7742 	}
7743 	first_local = VM_PAGE_NULL;
7744 	last_local = VM_PAGE_NULL;
7745 
7746 	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7747 	obj_end = obj_start + xfer_size;
7748 	obj_start = vm_object_trunc_page(obj_start);
7749 	obj_end = vm_object_round_page(obj_end);
7750 	for (obj_offset = obj_start;
7751 	    obj_offset < obj_end;
7752 	    obj_offset += PAGE_SIZE) {
7753 		vm_page_t       t, m;
7754 
7755 		dwp->dw_mask = 0;
7756 		clear_refmod = 0;
7757 
7758 		m = VM_PAGE_NULL;
7759 
7760 		if (upl->flags & UPL_LITE) {
7761 			unsigned int    pg_num;
7762 
7763 			if (nxt_page != VM_PAGE_NULL) {
7764 				m = nxt_page;
7765 				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7766 				target_offset = m->vmp_offset;
7767 			}
7768 			pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7769 			assert(pg_num == target_offset / PAGE_SIZE);
7770 
7771 			if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
7772 				lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
7773 
7774 				if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7775 					m = vm_page_lookup(shadow_object, obj_offset);
7776 				}
7777 			} else {
7778 				m = NULL;
7779 			}
7780 		}
7781 		if (upl->flags & UPL_SHADOWED) {
7782 			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7783 				t->vmp_free_when_done = FALSE;
7784 
7785 				VM_PAGE_FREE(t);
7786 
7787 				if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7788 					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7789 				}
7790 			}
7791 		}
7792 		if (m == VM_PAGE_NULL) {
7793 			goto commit_next_page;
7794 		}
7795 
7796 		m_object = VM_PAGE_OBJECT(m);
7797 
7798 		if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7799 			assert(m->vmp_busy);
7800 
7801 			dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7802 			goto commit_next_page;
7803 		}
7804 
7805 		if (flags & UPL_COMMIT_CS_VALIDATED) {
7806 			/*
7807 			 * CODE SIGNING:
7808 			 * Set the code signing bits according to
7809 			 * what the UPL says they should be.
7810 			 */
7811 			m->vmp_cs_validated |= page_list[entry].cs_validated;
7812 			m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7813 			m->vmp_cs_nx |= page_list[entry].cs_nx;
7814 		}
7815 		if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7816 			m->vmp_written_by_kernel = TRUE;
7817 		}
7818 
7819 		if (upl->flags & UPL_IO_WIRE) {
7820 			if (page_list) {
7821 				page_list[entry].phys_addr = 0;
7822 			}
7823 
7824 			if (flags & UPL_COMMIT_SET_DIRTY) {
7825 				SET_PAGE_DIRTY(m, FALSE);
7826 			} else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7827 				m->vmp_dirty = FALSE;
7828 
7829 				if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7830 				    m->vmp_cs_validated &&
7831 				    m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7832 					/*
7833 					 * CODE SIGNING:
7834 					 * This page is no longer dirty
7835 					 * but could have been modified,
7836 					 * so it will need to be
7837 					 * re-validated.
7838 					 */
7839 					m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7840 
7841 					VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7842 
7843 					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7844 				}
7845 				clear_refmod |= VM_MEM_MODIFIED;
7846 			}
7847 			if (upl->flags & UPL_ACCESS_BLOCKED) {
7848 				/*
7849 				 * We blocked access to the pages in this UPL.
7850 				 * Clear the "busy" bit and wake up any waiter
7851 				 * for this page.
7852 				 */
7853 				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7854 			}
7855 			if (fast_path_possible) {
7856 				assert(m_object->purgable != VM_PURGABLE_EMPTY);
7857 				assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7858 				if (m->vmp_absent) {
7859 					assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7860 					assert(m->vmp_wire_count == 0);
7861 					assert(m->vmp_busy);
7862 
7863 					m->vmp_absent = FALSE;
7864 					dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7865 				} else {
7866 					if (m->vmp_wire_count == 0) {
7867 						panic("wire_count == 0, m = %p, obj = %p", m, shadow_object);
7868 					}
7869 					assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7870 
7871 					/*
7872 					 * XXX FBDP need to update some other
7873 					 * counters here (purgeable_wired_count)
7874 					 * (ledgers), ...
7875 					 */
7876 					assert(m->vmp_wire_count > 0);
7877 					m->vmp_wire_count--;
7878 
7879 					if (m->vmp_wire_count == 0) {
7880 						m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7881 						unwired_count++;
7882 					}
7883 				}
7884 				if (m->vmp_wire_count == 0) {
7885 					assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7886 
7887 					if (last_local == VM_PAGE_NULL) {
7888 						assert(first_local == VM_PAGE_NULL);
7889 
7890 						last_local = m;
7891 						first_local = m;
7892 					} else {
7893 						assert(first_local != VM_PAGE_NULL);
7894 
7895 						m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7896 						first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7897 						first_local = m;
7898 					}
7899 					local_queue_count++;
7900 
7901 					if (throttle_page) {
7902 						m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7903 					} else {
7904 						if (flags & UPL_COMMIT_INACTIVATE) {
7905 							if (shadow_object->internal) {
7906 								m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7907 							} else {
7908 								m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7909 							}
7910 						} else {
7911 							m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7912 						}
7913 					}
7914 				}
7915 			} else {
7916 				if (flags & UPL_COMMIT_INACTIVATE) {
7917 					dwp->dw_mask |= DW_vm_page_deactivate_internal;
7918 					clear_refmod |= VM_MEM_REFERENCED;
7919 				}
7920 				if (m->vmp_absent) {
7921 					if (flags & UPL_COMMIT_FREE_ABSENT) {
7922 						dwp->dw_mask |= DW_vm_page_free;
7923 					} else {
7924 						m->vmp_absent = FALSE;
7925 						dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7926 
7927 						if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7928 							dwp->dw_mask |= DW_vm_page_activate;
7929 						}
7930 					}
7931 				} else {
7932 					dwp->dw_mask |= DW_vm_page_unwire;
7933 				}
7934 			}
7935 			goto commit_next_page;
7936 		}
7937 		assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7938 
7939 		if (page_list) {
7940 			page_list[entry].phys_addr = 0;
7941 		}
7942 
7943 		/*
7944 		 * make sure to clear the hardware
7945 		 * modify or reference bits before
7946 		 * releasing the BUSY bit on this page
7947 		 * otherwise we risk losing a legitimate
7948 		 * change of state
7949 		 */
7950 		if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7951 			m->vmp_dirty = FALSE;
7952 
7953 			clear_refmod |= VM_MEM_MODIFIED;
7954 		}
7955 		if (m->vmp_laundry) {
7956 			dwp->dw_mask |= DW_vm_pageout_throttle_up;
7957 		}
7958 
7959 		if (VM_PAGE_WIRED(m)) {
7960 			m->vmp_free_when_done = FALSE;
7961 		}
7962 
7963 		if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7964 		    m->vmp_cs_validated &&
7965 		    m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7966 			/*
7967 			 * CODE SIGNING:
7968 			 * This page is no longer dirty
7969 			 * but could have been modified,
7970 			 * so it will need to be
7971 			 * re-validated.
7972 			 */
7973 			m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7974 
7975 			VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7976 
7977 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7978 		}
7979 		if (m->vmp_overwriting) {
7980 			/*
7981 			 * the (COPY_OUT_FROM == FALSE) request_page_list case
7982 			 */
7983 			if (m->vmp_busy) {
7984 #if CONFIG_PHANTOM_CACHE
7985 				if (m->vmp_absent && !m_object->internal) {
7986 					dwp->dw_mask |= DW_vm_phantom_cache_update;
7987 				}
7988 #endif
7989 				m->vmp_absent = FALSE;
7990 
7991 				dwp->dw_mask |= DW_clear_busy;
7992 			} else {
7993 				/*
7994 				 * alternate (COPY_OUT_FROM == FALSE) page_list case
7995 				 * Occurs when the original page was wired
7996 				 * at the time of the list request
7997 				 */
7998 				assert(VM_PAGE_WIRED(m));
7999 
8000 				dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
8001 			}
8002 			m->vmp_overwriting = FALSE;
8003 		}
8004 		m->vmp_cleaning = FALSE;
8005 
8006 		if (m->vmp_free_when_done) {
8007 			/*
8008 			 * With the clean queue enabled, UPL_PAGEOUT should
8009 			 * no longer set the pageout bit. Its pages now go
8010 			 * to the clean queue.
8011 			 *
8012 			 * We don't use the cleaned Q anymore and so this
8013 			 * assert isn't correct. The code for the clean Q
8014 			 * still exists and might be used in the future. If we
8015 			 * go back to the cleaned Q, we will re-enable this
8016 			 * assert.
8017 			 *
8018 			 * assert(!(upl->flags & UPL_PAGEOUT));
8019 			 */
8020 			assert(!m_object->internal);
8021 
8022 			m->vmp_free_when_done = FALSE;
8023 
8024 			if ((flags & UPL_COMMIT_SET_DIRTY) ||
8025 			    (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
8026 				/*
8027 				 * page was re-dirtied after we started
8028 				 * the pageout... reactivate it since
8029 				 * we don't know whether the on-disk
8030 				 * copy matches what is now in memory
8031 				 */
8032 				SET_PAGE_DIRTY(m, FALSE);
8033 
8034 				dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
8035 
8036 				if (upl->flags & UPL_PAGEOUT) {
8037 					counter_inc(&vm_statistics_reactivations);
8038 					DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
8039 				}
8040 			} else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) {
8041 				/*
8042 				 * Someone else might still be handling this
8043 				 * page (vm_fault() for example), so let's not
8044 				 * free it or "un-busy" it!
8045 				 * Put that page in the "speculative" queue
8046 				 * for now (since we would otherwise have freed
8047 				 * it) and let whoever is keeping the page
8048 				 * "busy" move it if needed when they're done
8049 				 * with it.
8050 				 */
8051 				dwp->dw_mask |= DW_vm_page_speculate;
8052 			} else {
8053 				/*
8054 				 * page has been successfully cleaned
8055 				 * go ahead and free it for other use
8056 				 */
8057 				if (m_object->internal) {
8058 					DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
8059 				} else {
8060 					DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
8061 				}
8062 				m->vmp_dirty = FALSE;
8063 				if (!(upl->flags & UPL_HAS_BUSY)) {
8064 					assert(!m->vmp_busy);
8065 				}
8066 				m->vmp_busy = TRUE;
8067 
8068 				dwp->dw_mask |= DW_vm_page_free;
8069 			}
8070 			goto commit_next_page;
8071 		}
8072 		/*
8073 		 * It is a part of the semantic of COPYOUT_FROM
8074 		 * UPLs that a commit implies cache sync
8075 		 * between the vm page and the backing store
8076 		 * this can be used to strip the precious bit
8077 		 * as well as clean
8078 		 */
8079 		if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
8080 			m->vmp_precious = FALSE;
8081 		}
8082 
8083 		if (flags & UPL_COMMIT_SET_DIRTY) {
8084 			SET_PAGE_DIRTY(m, FALSE);
8085 		} else {
8086 			m->vmp_dirty = FALSE;
8087 		}
8088 
8089 		/* with the clean queue on, move *all* cleaned pages to the clean queue */
8090 		if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
8091 			pgpgout_count++;
8092 
8093 			counter_inc(&vm_statistics_pageouts);
8094 			DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
8095 
8096 			dwp->dw_mask |= DW_enqueue_cleaned;
8097 		} else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
8098 			/*
8099 			 * page coming back in from being 'frozen'...
8100 			 * it was dirty before it was frozen, so keep it so
8101 			 * the vm_page_activate will notice that it really belongs
8102 			 * on the throttle queue and put it there
8103 			 */
8104 			SET_PAGE_DIRTY(m, FALSE);
8105 			dwp->dw_mask |= DW_vm_page_activate;
8106 		} else {
8107 			if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
8108 				dwp->dw_mask |= DW_vm_page_deactivate_internal;
8109 				clear_refmod |= VM_MEM_REFERENCED;
8110 			} else if (!VM_PAGE_PAGEABLE(m)) {
8111 				if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
8112 					dwp->dw_mask |= DW_vm_page_speculate;
8113 				} else if (m->vmp_reference) {
8114 					dwp->dw_mask |= DW_vm_page_activate;
8115 				} else {
8116 					dwp->dw_mask |= DW_vm_page_deactivate_internal;
8117 					clear_refmod |= VM_MEM_REFERENCED;
8118 				}
8119 			}
8120 		}
8121 		if (upl->flags & UPL_ACCESS_BLOCKED) {
8122 			/*
8123 			 * We blocked access to the pages in this URL.
8124 			 * Clear the "busy" bit on this page before we
8125 			 * wake up any waiter.
8126 			 */
8127 			dwp->dw_mask |= DW_clear_busy;
8128 		}
8129 		/*
8130 		 * Wakeup any thread waiting for the page to be un-cleaning.
8131 		 */
8132 		dwp->dw_mask |= DW_PAGE_WAKEUP;
8133 
8134 commit_next_page:
8135 		if (clear_refmod) {
8136 			pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
8137 		}
8138 
8139 		target_offset += PAGE_SIZE_64;
8140 		xfer_size -= PAGE_SIZE;
8141 		entry++;
8142 
8143 		if (dwp->dw_mask) {
8144 			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8145 				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8146 
8147 				if (dw_count >= dw_limit) {
8148 					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8149 
8150 					dwp = dwp_start;
8151 					dw_count = 0;
8152 				}
8153 			} else {
8154 				if (dwp->dw_mask & DW_clear_busy) {
8155 					m->vmp_busy = FALSE;
8156 				}
8157 
8158 				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8159 					PAGE_WAKEUP(m);
8160 				}
8161 			}
8162 		}
8163 	}
8164 	if (dw_count) {
8165 		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8166 		dwp = dwp_start;
8167 		dw_count = 0;
8168 	}
8169 
8170 	if (fast_path_possible) {
8171 		assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
8172 		assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
8173 
8174 		if (local_queue_count || unwired_count) {
8175 			if (local_queue_count) {
8176 				vm_page_t       first_target;
8177 				vm_page_queue_head_t    *target_queue;
8178 
8179 				if (throttle_page) {
8180 					target_queue = &vm_page_queue_throttled;
8181 				} else {
8182 					if (flags & UPL_COMMIT_INACTIVATE) {
8183 						if (shadow_object->internal) {
8184 							target_queue = &vm_page_queue_anonymous;
8185 						} else {
8186 							target_queue = &vm_page_queue_inactive;
8187 						}
8188 					} else {
8189 						target_queue = &vm_page_queue_active;
8190 					}
8191 				}
8192 				/*
8193 				 * Transfer the entire local queue to a regular LRU page queues.
8194 				 */
8195 				vm_page_lockspin_queues();
8196 
8197 				first_target = (vm_page_t) vm_page_queue_first(target_queue);
8198 
8199 				if (vm_page_queue_empty(target_queue)) {
8200 					target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8201 				} else {
8202 					first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8203 				}
8204 
8205 				target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
8206 				first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
8207 				last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
8208 
8209 				/*
8210 				 * Adjust the global page counts.
8211 				 */
8212 				if (throttle_page) {
8213 					vm_page_throttled_count += local_queue_count;
8214 				} else {
8215 					if (flags & UPL_COMMIT_INACTIVATE) {
8216 						if (shadow_object->internal) {
8217 							vm_page_anonymous_count += local_queue_count;
8218 						}
8219 						vm_page_inactive_count += local_queue_count;
8220 
8221 						token_new_pagecount += local_queue_count;
8222 					} else {
8223 						vm_page_active_count += local_queue_count;
8224 					}
8225 
8226 					if (shadow_object->internal) {
8227 						vm_page_pageable_internal_count += local_queue_count;
8228 					} else {
8229 						vm_page_pageable_external_count += local_queue_count;
8230 					}
8231 				}
8232 			} else {
8233 				vm_page_lockspin_queues();
8234 			}
8235 			if (unwired_count) {
8236 				vm_page_wire_count -= unwired_count;
8237 				VM_CHECK_MEMORYSTATUS;
8238 			}
8239 			vm_page_unlock_queues();
8240 
8241 			VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
8242 		}
8243 	}
8244 	occupied = 1;
8245 
8246 	if (upl->flags & UPL_DEVICE_MEMORY) {
8247 		occupied = 0;
8248 	} else if (upl->flags & UPL_LITE) {
8249 		int     pg_num;
8250 		int     i;
8251 
8252 		occupied = 0;
8253 
8254 		if (!fast_path_full_commit) {
8255 			pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8256 			pg_num = (pg_num + 31) >> 5;
8257 
8258 			for (i = 0; i < pg_num; i++) {
8259 				if (lite_list[i] != 0) {
8260 					occupied = 1;
8261 					break;
8262 				}
8263 			}
8264 		}
8265 	} else {
8266 		if (vm_page_queue_empty(&upl->map_object->memq)) {
8267 			occupied = 0;
8268 		}
8269 	}
8270 	if (occupied == 0) {
8271 		/*
8272 		 * If this UPL element belongs to a Vector UPL and is
8273 		 * empty, then this is the right function to deallocate
8274 		 * it. So go ahead set the *empty variable. The flag
8275 		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8276 		 * should be considered relevant for the Vector UPL and not
8277 		 * the internal UPLs.
8278 		 */
8279 		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8280 			*empty = TRUE;
8281 		}
8282 
8283 		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8284 			/*
8285 			 * this is not a paging object
8286 			 * so we need to drop the paging reference
8287 			 * that was taken when we created the UPL
8288 			 * against this object
8289 			 */
8290 			vm_object_activity_end(shadow_object);
8291 			vm_object_collapse(shadow_object, 0, TRUE);
8292 		} else {
8293 			/*
			 * we donated the paging reference to
8295 			 * the map object... vm_pageout_object_terminate
8296 			 * will drop this reference
8297 			 */
8298 		}
8299 	}
8300 	VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
8301 	vm_object_unlock(shadow_object);
8302 	if (object != shadow_object) {
8303 		vm_object_unlock(object);
8304 	}
8305 
8306 	if (!isVectorUPL) {
8307 		upl_unlock(upl);
8308 	} else {
8309 		/*
8310 		 * If we completed our operations on an UPL that is
8311 		 * part of a Vectored UPL and if empty is TRUE, then
8312 		 * we should go ahead and deallocate this UPL element.
8313 		 * Then we check if this was the last of the UPL elements
8314 		 * within that Vectored UPL. If so, set empty to TRUE
8315 		 * so that in ubc_upl_commit_range or ubc_upl_commit, we
8316 		 * can go ahead and deallocate the Vector UPL too.
8317 		 */
8318 		if (*empty == TRUE) {
8319 			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
8320 			upl_deallocate(upl);
8321 		}
8322 		goto process_upl_to_commit;
8323 	}
8324 	if (pgpgout_count) {
8325 		DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
8326 	}
8327 
8328 	kr = KERN_SUCCESS;
8329 done:
8330 	if (dwp_start && dwp_finish_ctx) {
8331 		vm_page_delayed_work_finish_ctx(dwp_start);
8332 		dwp_start = dwp = NULL;
8333 	}
8334 
8335 	return kr;
8336 }
8337 
/*
 * upl_abort_range:
 *
 * Abort the pages covered by [offset, offset + size) of "upl".  The
 * disposition of each page is driven by the UPL_ABORT_* bits in
 * "error" (dump/free, restart, mark unavailable, mark in error,
 * re-reference).  For a vector UPL, each sub-UPL intersecting the
 * range is processed in turn.
 *
 * Parameters:
 *	upl	the UPL (or vector UPL) being aborted
 *	offset	start of the range within the UPL
 *	size	length of the range
 *	error	UPL_ABORT_* flags describing how pages are aborted
 *	empty	out: set TRUE when the operation left the UPL with no
 *		remaining pages (caller may then deallocate it)
 *
 * Returns: KERN_SUCCESS, KERN_INVALID_ARGUMENT for a NULL UPL, or
 * KERN_FAILURE when the range does not fit within the UPL (or a
 * sub-UPL lookup fails for a vector UPL).
 */
kern_return_t
upl_abort_range(
	upl_t                   upl,
	upl_offset_t            offset,
	upl_size_t              size,
	int                     error,
	boolean_t               *empty)
{
	upl_page_info_t         *user_page_list = NULL;
	upl_size_t              xfer_size, subupl_size;
	vm_object_t             shadow_object;
	vm_object_t             object;
	vm_object_offset_t      target_offset;
	upl_offset_t            subupl_offset = offset;
	int                     entry;
	wpl_array_t             lite_list;
	int                     occupied;
	struct  vm_page_delayed_work    dw_array;
	struct  vm_page_delayed_work    *dwp, *dwp_start;
	bool                    dwp_finish_ctx = TRUE;
	int                     dw_count;
	int                     dw_limit;
	int                     isVectorUPL = 0;
	upl_t                   vector_upl = NULL;
	vm_object_offset_t      obj_start, obj_end, obj_offset;
	kern_return_t           kr = KERN_SUCCESS;

//	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);

	dwp_start = dwp = NULL;

	subupl_size = size;
	*empty = FALSE;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * An I/O-wired UPL with no request to dump pages is handled as
	 * a commit that frees the absent pages.
	 */
	if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
		return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
	}

	/*
	 * Grab a delayed-work context so page state changes can be
	 * batched under a single queues lock acquisition; fall back to
	 * a single on-stack entry if the context allocation fails.
	 */
	dw_count = 0;
	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
	dwp_start = vm_page_delayed_work_get_ctx();
	if (dwp_start == NULL) {
		dwp_start = &dw_array;
		dw_limit = 1;
		dwp_finish_ctx = FALSE;
	}

	dwp = dwp_start;

	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		vector_upl = upl;
		upl_lock(vector_upl);
	} else {
		upl_lock(upl);
	}

process_upl_to_abort:
	if (isVectorUPL) {
		/*
		 * Peel off the next sub-UPL intersecting the remaining
		 * range; we loop back here until the range is exhausted.
		 */
		size = subupl_size;
		offset = subupl_offset;
		if (size == 0) {
			upl_unlock(vector_upl);
			kr = KERN_SUCCESS;
			goto done;
		}
		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
		if (upl == NULL) {
			upl_unlock(vector_upl);
			kr = KERN_FAILURE;
			goto done;
		}
		subupl_size -= size;
		subupl_offset += size;
	}

	*empty = FALSE;

#if UPL_DEBUG
	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
		(void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);

		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
		upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;

		upl->upl_commit_index++;
	}
#endif
	if (upl->flags & UPL_DEVICE_MEMORY) {
		xfer_size = 0;
	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
		xfer_size = size;
	} else {
		/* requested range extends past the end of the UPL */
		if (!isVectorUPL) {
			upl_unlock(upl);
		} else {
			upl_unlock(vector_upl);
		}
		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
		kr = KERN_FAILURE;
		goto done;
	}
	/*
	 * Locate the lite bitmap (and, for internal UPLs, the user page
	 * list); both are laid out immediately after struct upl itself.
	 */
	if (upl->flags & UPL_INTERNAL) {
		lite_list = (wpl_array_t)
		    ((((uintptr_t)upl) + sizeof(struct upl))
		    + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));

		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
	} else {
		lite_list = (wpl_array_t)
		    (((uintptr_t)upl) + sizeof(struct upl));
	}
	object = upl->map_object;

	if (upl->flags & UPL_SHADOWED) {
		vm_object_lock(object);
		shadow_object = object->shadow;
	} else {
		shadow_object = object;
	}

	entry = offset / PAGE_SIZE;
	target_offset = (vm_object_offset_t)offset;

	/* kernel-object UPLs only need a shared lock: no page state is modified */
	if (upl->flags & UPL_KERNEL_OBJECT) {
		vm_object_lock_shared(shadow_object);
	} else {
		vm_object_lock(shadow_object);
	}

	if (upl->flags & UPL_ACCESS_BLOCKED) {
		assert(shadow_object->blocked_access);
		shadow_object->blocked_access = FALSE;
		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
	}

	if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
		panic("upl_abort_range: kernel_object being DUMPED");
	}

	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
	obj_end = obj_start + xfer_size;
	obj_start = vm_object_trunc_page(obj_start);
	obj_end = vm_object_round_page(obj_end);
	for (obj_offset = obj_start;
	    obj_offset < obj_end;
	    obj_offset += PAGE_SIZE) {
		vm_page_t       t, m;
		unsigned int    pg_num;
		boolean_t       needed;

		pg_num = (unsigned int) (target_offset / PAGE_SIZE);
		assert(pg_num == target_offset / PAGE_SIZE);

		needed = FALSE;

		if (user_page_list) {
			needed = user_page_list[pg_num].needed;
		}

		dwp->dw_mask = 0;
		m = VM_PAGE_NULL;

		if (upl->flags & UPL_LITE) {
			/* clear this page's bit in the lite bitmap */
			if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
				lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));

				if (!(upl->flags & UPL_KERNEL_OBJECT)) {
					m = vm_page_lookup(shadow_object, obj_offset);
				}
			}
		}
		if (upl->flags & UPL_SHADOWED) {
			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
				t->vmp_free_when_done = FALSE;

				VM_PAGE_FREE(t);

				if (m == VM_PAGE_NULL) {
					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
				}
			}
		}
		if ((upl->flags & UPL_KERNEL_OBJECT)) {
			goto abort_next_page;
		}

		if (m != VM_PAGE_NULL) {
			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);

			if (m->vmp_absent) {
				boolean_t must_free = TRUE;

				/*
				 * COPYOUT = FALSE case
				 * check for error conditions which must
				 * be passed back to the page's customer
				 */
				if (error & UPL_ABORT_RESTART) {
					m->vmp_restart = TRUE;
					m->vmp_absent = FALSE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				} else if (error & UPL_ABORT_UNAVAILABLE) {
					m->vmp_restart = FALSE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				} else if (error & UPL_ABORT_ERROR) {
					m->vmp_restart = FALSE;
					m->vmp_absent = FALSE;
					m->vmp_error = TRUE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				}
				if (m->vmp_clustered && needed == FALSE) {
					/*
					 * This page was a part of a speculative
					 * read-ahead initiated by the kernel
					 * itself.  No one is expecting this
					 * page and no one will clean up its
					 * error state if it ever becomes valid
					 * in the future.
					 * We have to free it here.
					 */
					must_free = TRUE;
				}
				m->vmp_cleaning = FALSE;

				if (m->vmp_overwriting && !m->vmp_busy) {
					/*
					 * this shouldn't happen since
					 * this is an 'absent' page, but
					 * it doesn't hurt to check for
					 * the 'alternate' method of
					 * stabilizing the page...
					 * we will mark 'busy' to be cleared
					 * in the following code which will
					 * take care of the primary stabilization
					 * method (i.e. setting 'busy' to TRUE)
					 */
					dwp->dw_mask |= DW_vm_page_unwire;
				}
				m->vmp_overwriting = FALSE;

				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);

				if (must_free == TRUE) {
					dwp->dw_mask |= DW_vm_page_free;
				} else {
					dwp->dw_mask |= DW_vm_page_activate;
				}
			} else {
				/*
				 * Handle the trusted pager throttle.
				 */
				if (m->vmp_laundry) {
					dwp->dw_mask |= DW_vm_pageout_throttle_up;
				}

				if (upl->flags & UPL_ACCESS_BLOCKED) {
					/*
					 * We blocked access to the pages in this UPL.
					 * Clear the "busy" bit and wake up any waiter
					 * for this page.
					 */
					dwp->dw_mask |= DW_clear_busy;
				}
				if (m->vmp_overwriting) {
					if (m->vmp_busy) {
						dwp->dw_mask |= DW_clear_busy;
					} else {
						/*
						 * deal with the 'alternate' method
						 * of stabilizing the page...
						 * we will either free the page
						 * or mark 'busy' to be cleared
						 * in the following code which will
						 * take care of the primary stabilization
						 * method (i.e. setting 'busy' to TRUE)
						 */
						dwp->dw_mask |= DW_vm_page_unwire;
					}
					m->vmp_overwriting = FALSE;
				}
				m->vmp_free_when_done = FALSE;
				m->vmp_cleaning = FALSE;

				if (error & UPL_ABORT_DUMP_PAGES) {
					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

					dwp->dw_mask |= DW_vm_page_free;
				} else {
					if (!(dwp->dw_mask & DW_vm_page_unwire)) {
						if (error & UPL_ABORT_REFERENCE) {
							/*
							 * we've been told to explicitly
							 * reference this page... for
							 * file I/O, this is done by
							 * implementing an LRU on the inactive q
							 */
							dwp->dw_mask |= DW_vm_page_lru;
						} else if (!VM_PAGE_PAGEABLE(m)) {
							dwp->dw_mask |= DW_vm_page_deactivate_internal;
						}
					}
					dwp->dw_mask |= DW_PAGE_WAKEUP;
				}
			}
		}
abort_next_page:
		target_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;
		entry++;

		if (dwp->dw_mask) {
			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
				/* queue the work; flush the batch when it fills up */
				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);

				if (dw_count >= dw_limit) {
					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);

					dwp = dwp_start;
					dw_count = 0;
				}
			} else {
				/*
				 * only busy-clearing and/or a wakeup is
				 * needed... do it right here, no need to
				 * go through the delayed-work machinery
				 */
				if (dwp->dw_mask & DW_clear_busy) {
					m->vmp_busy = FALSE;
				}

				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
					PAGE_WAKEUP(m);
				}
			}
		}
	}
	if (dw_count) {
		/* flush any remaining batched page work */
		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}

	/* determine whether any pages remain associated with this UPL */
	occupied = 1;

	if (upl->flags & UPL_DEVICE_MEMORY) {
		occupied = 0;
	} else if (upl->flags & UPL_LITE) {
		int     pg_num;
		int     i;

		pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
		pg_num = (pg_num + 31) >> 5;
		occupied = 0;

		for (i = 0; i < pg_num; i++) {
			if (lite_list[i] != 0) {
				occupied = 1;
				break;
			}
		}
	} else {
		if (vm_page_queue_empty(&upl->map_object->memq)) {
			occupied = 0;
		}
	}
	if (occupied == 0) {
		/*
		 * If this UPL element belongs to a Vector UPL and is
		 * empty, then this is the right function to deallocate
		 * it. So go ahead set the *empty variable. The flag
		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
		 * should be considered relevant for the Vector UPL and
		 * not the internal UPLs.
		 */
		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
			*empty = TRUE;
		}

		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
			/*
			 * this is not a paging object
			 * so we need to drop the paging reference
			 * that was taken when we created the UPL
			 * against this object
			 */
			vm_object_activity_end(shadow_object);
			vm_object_collapse(shadow_object, 0, TRUE);
		} else {
			/*
			 * we donated the paging reference to
			 * the map object... vm_pageout_object_terminate
			 * will drop this reference
			 */
		}
	}
	vm_object_unlock(shadow_object);
	if (object != shadow_object) {
		vm_object_unlock(object);
	}

	if (!isVectorUPL) {
		upl_unlock(upl);
	} else {
		/*
		 * If we completed our operations on an UPL that is
		 * part of a Vectored UPL and if empty is TRUE, then
		 * we should go ahead and deallocate this UPL element.
		 * Then we check if this was the last of the UPL elements
		 * within that Vectored UPL. If so, set empty to TRUE
		 * so that in ubc_upl_abort_range or ubc_upl_abort, we
		 * can go ahead and deallocate the Vector UPL too.
		 */
		if (*empty == TRUE) {
			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
			upl_deallocate(upl);
		}
		goto process_upl_to_abort;
	}

	kr = KERN_SUCCESS;

done:
	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return kr;
}
8770 
8771 
8772 kern_return_t
upl_abort(upl_t upl,int error)8773 upl_abort(
8774 	upl_t   upl,
8775 	int     error)
8776 {
8777 	boolean_t       empty;
8778 
8779 	if (upl == UPL_NULL) {
8780 		return KERN_INVALID_ARGUMENT;
8781 	}
8782 
8783 	return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8784 }
8785 
8786 
8787 /* an option on commit should be wire */
8788 kern_return_t
upl_commit(upl_t upl,upl_page_info_t * page_list,mach_msg_type_number_t count)8789 upl_commit(
8790 	upl_t                   upl,
8791 	upl_page_info_t         *page_list,
8792 	mach_msg_type_number_t  count)
8793 {
8794 	boolean_t       empty;
8795 
8796 	if (upl == UPL_NULL) {
8797 		return KERN_INVALID_ARGUMENT;
8798 	}
8799 
8800 	return upl_commit_range(upl, 0, upl->u_size, 0,
8801 	           page_list, count, &empty);
8802 }
8803 
8804 
/*
 * iopl_valid_data:
 *
 * Declare that the absent pages of an I/O-wired UPL now contain
 * valid data: clear their "absent" state, mark them dirty, wire
 * them, and wake any threads waiting on them.
 *
 * Parameters:
 *	upl	must be a lite, non-internal, non-shadowed,
 *		non-device UPL_IO_WIRE UPL (anything else panics);
 *		its object must not be the kernel or compressor
 *		object and must not be volatile/empty purgeable
 *	tag	VM tag the newly wired pages are accounted against
 */
void
iopl_valid_data(
	upl_t    upl,
	vm_tag_t tag)
{
	vm_object_t     object;
	vm_offset_t     offset;
	vm_page_t       m, nxt_page = VM_PAGE_NULL;
	upl_size_t      size;
	int             wired_count = 0;

	if (upl == NULL) {
		panic("iopl_valid_data: NULL upl");
	}
	if (vector_upl_is_valid(upl)) {
		panic("iopl_valid_data: vector upl");
	}
	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
		panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
	}

	object = upl->map_object;

	if (object == kernel_object || object == compressor_object) {
		panic("iopl_valid_data: object == kernel or compressor");
	}

	if (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) {
		panic("iopl_valid_data: object %p purgable %d",
		    object, object->purgable);
	}

	size = upl_adjusted_size(upl, PAGE_MASK);

	vm_object_lock(object);
	VM_OBJECT_WIRED_PAGE_UPDATE_START(object);

	bool whole_object;

	/*
	 * When the UPL spans the entire object and every page is
	 * resident, walk the object's page list directly instead of
	 * doing a hash lookup for each page.
	 */
	if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
		nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
		whole_object = true;
	} else {
		offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
		whole_object = false;
	}

	while (size) {
		if (whole_object) {
			if (nxt_page != VM_PAGE_NULL) {
				m = nxt_page;
				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
			}
		} else {
			m = vm_page_lookup(object, offset);
			offset += PAGE_SIZE;

			if (m == VM_PAGE_NULL) {
				panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
			}
		}
		/* only busy (i.e. still absent) pages need transitioning */
		if (m->vmp_busy) {
			if (!m->vmp_absent) {
				panic("iopl_valid_data: busy page w/o absent");
			}

			if (m->vmp_pageq.next || m->vmp_pageq.prev) {
				panic("iopl_valid_data: busy+absent page on page queue");
			}
			if (m->vmp_reusable) {
				panic("iopl_valid_data: %p is reusable", m);
			}

			/* the page now holds valid data: dirty it and wire it */
			m->vmp_absent = FALSE;
			m->vmp_dirty = TRUE;
			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
			assert(m->vmp_wire_count == 0);
			m->vmp_wire_count++;
			assert(m->vmp_wire_count);
			if (m->vmp_wire_count == 1) {
				m->vmp_q_state = VM_PAGE_IS_WIRED;
				wired_count++;
			} else {
				panic("iopl_valid_data: %p already wired", m);
			}

			/* clear busy and wake anyone waiting on this page */
			PAGE_WAKEUP_DONE(m);
		}
		size -= PAGE_SIZE;
	}
	if (wired_count) {
		VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
		assert(object->resident_page_count >= object->wired_page_count);

		/* no need to adjust purgeable accounting for this object: */
		assert(object->purgable != VM_PURGABLE_VOLATILE);
		assert(object->purgable != VM_PURGABLE_EMPTY);

		/* fold the new wirings into the global wired-page count */
		vm_page_lockspin_queues();
		vm_page_wire_count += wired_count;
		vm_page_unlock_queues();
	}
	VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
	vm_object_unlock(object);
}
8911 
8912 
8913 void
vm_object_set_pmap_cache_attr(vm_object_t object,upl_page_info_array_t user_page_list,unsigned int num_pages,boolean_t batch_pmap_op)8914 vm_object_set_pmap_cache_attr(
8915 	vm_object_t             object,
8916 	upl_page_info_array_t   user_page_list,
8917 	unsigned int            num_pages,
8918 	boolean_t               batch_pmap_op)
8919 {
8920 	unsigned int    cache_attr = 0;
8921 
8922 	cache_attr = object->wimg_bits & VM_WIMG_MASK;
8923 	assert(user_page_list);
8924 	if (cache_attr != VM_WIMG_USE_DEFAULT) {
8925 		PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8926 	}
8927 }
8928 
8929 
8930 boolean_t       vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t);
8931 kern_return_t   vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t, vm_object_offset_t *, int, int*);
8932 
8933 
8934 
/*
 * vm_object_iopl_wire_full:
 *
 * Fast path for wiring an IOPL that covers a fully resident object:
 * a single pass over the object's page list wires every page, sets
 * its bit in the lite bitmap and fills in the optional user page
 * list.  Caller holds the object locked exclusively; the object has
 * no pager, no copy, no shadow and is not volatile/empty purgeable.
 *
 * Returns TRUE on success.  Returns FALSE as soon as any page is in
 * a transient state (busy, fictitious, absent, error, cleaning,
 * restart, laundry) that disqualifies the fast path.
 * NOTE(review): on the FALSE path, pages wired earlier in the scan
 * remain wired here — presumably unwound by the caller; confirm in
 * vm_object_iopl_request().
 */
boolean_t
vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
    wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag)
{
	vm_page_t       dst_page;
	unsigned int    entry;
	int             page_count;
	int             delayed_unlock = 0;
	boolean_t       retval = TRUE;
	ppnum_t         phys_page;

	vm_object_lock_assert_exclusive(object);
	assert(object->purgable != VM_PURGABLE_VOLATILE);
	assert(object->purgable != VM_PURGABLE_EMPTY);
	assert(object->pager == NULL);
	assert(object->copy == NULL);
	assert(object->shadow == NULL);

	page_count = object->resident_page_count;
	dst_page = (vm_page_t)vm_page_queue_first(&object->memq);

	vm_page_lock_queues();

	while (page_count--) {
		/* any transient page state forces fallback to the slow path */
		if (dst_page->vmp_busy ||
		    dst_page->vmp_fictitious ||
		    dst_page->vmp_absent ||
		    VMP_ERROR_GET(dst_page) ||
		    dst_page->vmp_cleaning ||
		    dst_page->vmp_restart ||
		    dst_page->vmp_laundry) {
			retval = FALSE;
			goto done;
		}
		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
			retval = FALSE;
			goto done;
		}
		dst_page->vmp_reference = TRUE;

		vm_page_wire(dst_page, tag, FALSE);

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			SET_PAGE_DIRTY(dst_page, FALSE);
		}
		/* mark this page present in the UPL's lite bitmap */
		entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
		assert(entry >= 0 && entry < object->resident_page_count);
		lite_list[entry >> 5] |= 1U << (entry & 31);

		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		/* track the highest physical page covered by this UPL */
		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].absent    = dst_page->vmp_absent;
			user_page_list[entry].dirty     = dst_page->vmp_dirty;
			user_page_list[entry].free_when_done   = dst_page->vmp_free_when_done;
			user_page_list[entry].precious  = dst_page->vmp_precious;
			user_page_list[entry].device    = FALSE;
			user_page_list[entry].speculative = FALSE;
			user_page_list[entry].cs_validated = FALSE;
			user_page_list[entry].cs_tainted = FALSE;
			user_page_list[entry].cs_nx     = FALSE;
			user_page_list[entry].needed    = FALSE;
			user_page_list[entry].mark      = FALSE;
		}
		/*
		 * periodically yield the page-queues lock so we don't
		 * hold it across the whole (possibly long) scan
		 */
		if (delayed_unlock++ > 256) {
			delayed_unlock = 0;
			lck_mtx_yield(&vm_page_queue_lock);

			VM_CHECK_MEMORYSTATUS;
		}
		dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
	}
done:
	vm_page_unlock_queues();

	VM_CHECK_MEMORYSTATUS;

	return retval;
}
9019 
9020 
9021 kern_return_t
vm_object_iopl_wire_empty(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,wpl_array_t lite_list,upl_control_flags_t cntrl_flags,vm_tag_t tag,vm_object_offset_t * dst_offset,int page_count,int * page_grab_count)9022 vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
9023     wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag, vm_object_offset_t *dst_offset,
9024     int page_count, int* page_grab_count)
9025 {
9026 	vm_page_t       dst_page;
9027 	boolean_t       no_zero_fill = FALSE;
9028 	int             interruptible;
9029 	int             pages_wired = 0;
9030 	int             pages_inserted = 0;
9031 	int             entry = 0;
9032 	uint64_t        delayed_ledger_update = 0;
9033 	kern_return_t   ret = KERN_SUCCESS;
9034 	int             grab_options;
9035 	ppnum_t         phys_page;
9036 
9037 	vm_object_lock_assert_exclusive(object);
9038 	assert(object->purgable != VM_PURGABLE_VOLATILE);
9039 	assert(object->purgable != VM_PURGABLE_EMPTY);
9040 	assert(object->pager == NULL);
9041 	assert(object->copy == NULL);
9042 	assert(object->shadow == NULL);
9043 
9044 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9045 		interruptible = THREAD_ABORTSAFE;
9046 	} else {
9047 		interruptible = THREAD_UNINT;
9048 	}
9049 
9050 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9051 		no_zero_fill = TRUE;
9052 	}
9053 
9054 	grab_options = 0;
9055 #if CONFIG_SECLUDED_MEMORY
9056 	if (object->can_grab_secluded) {
9057 		grab_options |= VM_PAGE_GRAB_SECLUDED;
9058 	}
9059 #endif /* CONFIG_SECLUDED_MEMORY */
9060 
9061 	while (page_count--) {
9062 		while ((dst_page = vm_page_grab_options(grab_options))
9063 		    == VM_PAGE_NULL) {
9064 			OSAddAtomic(page_count, &vm_upl_wait_for_pages);
9065 
9066 			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9067 
9068 			if (vm_page_wait(interruptible) == FALSE) {
9069 				/*
9070 				 * interrupted case
9071 				 */
9072 				OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9073 
9074 				VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9075 
9076 				ret = MACH_SEND_INTERRUPTED;
9077 				goto done;
9078 			}
9079 			OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9080 
9081 			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9082 		}
9083 		if (no_zero_fill == FALSE) {
9084 			vm_page_zero_fill(dst_page);
9085 		} else {
9086 			dst_page->vmp_absent = TRUE;
9087 		}
9088 
9089 		dst_page->vmp_reference = TRUE;
9090 
9091 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9092 			SET_PAGE_DIRTY(dst_page, FALSE);
9093 		}
9094 		if (dst_page->vmp_absent == FALSE) {
9095 			assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
9096 			assert(dst_page->vmp_wire_count == 0);
9097 			dst_page->vmp_wire_count++;
9098 			dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
9099 			assert(dst_page->vmp_wire_count);
9100 			pages_wired++;
9101 			PAGE_WAKEUP_DONE(dst_page);
9102 		}
9103 		pages_inserted++;
9104 
9105 		vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
9106 
9107 		lite_list[entry >> 5] |= 1U << (entry & 31);
9108 
9109 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9110 
9111 		if (phys_page > upl->highest_page) {
9112 			upl->highest_page = phys_page;
9113 		}
9114 
9115 		if (user_page_list) {
9116 			user_page_list[entry].phys_addr = phys_page;
9117 			user_page_list[entry].absent    = dst_page->vmp_absent;
9118 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
9119 			user_page_list[entry].free_when_done    = FALSE;
9120 			user_page_list[entry].precious  = FALSE;
9121 			user_page_list[entry].device    = FALSE;
9122 			user_page_list[entry].speculative = FALSE;
9123 			user_page_list[entry].cs_validated = FALSE;
9124 			user_page_list[entry].cs_tainted = FALSE;
9125 			user_page_list[entry].cs_nx     = FALSE;
9126 			user_page_list[entry].needed    = FALSE;
9127 			user_page_list[entry].mark      = FALSE;
9128 		}
9129 		entry++;
9130 		*dst_offset += PAGE_SIZE_64;
9131 	}
9132 done:
9133 	if (pages_wired) {
9134 		vm_page_lockspin_queues();
9135 		vm_page_wire_count += pages_wired;
9136 		vm_page_unlock_queues();
9137 	}
9138 	if (pages_inserted) {
9139 		if (object->internal) {
9140 			OSAddAtomic(pages_inserted, &vm_page_internal_count);
9141 		} else {
9142 			OSAddAtomic(pages_inserted, &vm_page_external_count);
9143 		}
9144 	}
9145 	if (delayed_ledger_update) {
9146 		task_t          owner;
9147 		int             ledger_idx_volatile;
9148 		int             ledger_idx_nonvolatile;
9149 		int             ledger_idx_volatile_compressed;
9150 		int             ledger_idx_nonvolatile_compressed;
9151 		boolean_t       do_footprint;
9152 
9153 		owner = VM_OBJECT_OWNER(object);
9154 		assert(owner);
9155 
9156 		vm_object_ledger_tag_ledgers(object,
9157 		    &ledger_idx_volatile,
9158 		    &ledger_idx_nonvolatile,
9159 		    &ledger_idx_volatile_compressed,
9160 		    &ledger_idx_nonvolatile_compressed,
9161 		    &do_footprint);
9162 
9163 		/* more non-volatile bytes */
9164 		ledger_credit(owner->ledger,
9165 		    ledger_idx_nonvolatile,
9166 		    delayed_ledger_update);
9167 		if (do_footprint) {
9168 			/* more footprint */
9169 			ledger_credit(owner->ledger,
9170 			    task_ledgers.phys_footprint,
9171 			    delayed_ledger_update);
9172 		}
9173 	}
9174 
9175 	assert(page_grab_count);
9176 	*page_grab_count = pages_inserted;
9177 
9178 	return ret;
9179 }
9180 
9181 
9182 
/*
 * vm_object_iopl_request:
 *
 * Create a "lite" I/O UPL describing the pages of "object" in the
 * (page-rounded) range ["offset", "offset" + "size") and wire those
 * pages for I/O.  Pages that are not resident are either grabbed and
 * zero-filled / marked absent (fast empty-object path) or faulted in
 * via vm_fault_page() (slow path).
 *
 * Parameters:
 *	object		backing VM object; expected unlocked on entry
 *			(this routine takes and drops its lock)
 *	offset		starting offset; truncated to a page boundary below
 *	size		byte count; rounded so whole pages are covered
 *	upl_ptr		out: the newly created UPL
 *	user_page_list	optional per-page info array to fill in; replaced
 *			by the UPL-internal array when UPL_SET_INTERNAL
 *	page_list_count	in/out: capacity / number of valid entries
 *	cntrl_flags	UPL_* control flags; anything outside
 *			UPL_VALID_FLAGS is rejected
 *	tag		VM tag used to account the wired pages
 *
 * Returns KERN_SUCCESS, or an error (KERN_INVALID_VALUE,
 * KERN_INVALID_ADDRESS, KERN_MEMORY_ERROR, KERN_PROTECTION_FAILURE,
 * KERN_RESOURCE_SHORTAGE, MACH_SEND_INTERRUPTED, ...) after undoing
 * any partial wiring via the "return_err" path.
 */
kern_return_t
vm_object_iopl_request(
	vm_object_t             object,
	vm_object_offset_t      offset,
	upl_size_t              size,
	upl_t                   *upl_ptr,
	upl_page_info_array_t   user_page_list,
	unsigned int            *page_list_count,
	upl_control_flags_t     cntrl_flags,
	vm_tag_t                tag)
{
	vm_page_t               dst_page;
	vm_object_offset_t      dst_offset;
	upl_size_t              xfer_size;
	upl_t                   upl = NULL;
	unsigned int            entry;
	wpl_array_t             lite_list = NULL;
	int                     no_zero_fill = FALSE;
	unsigned int            size_in_pages;
	int                     page_grab_count = 0;
	u_int32_t               psize;
	kern_return_t           ret;
	vm_prot_t               prot;
	struct vm_object_fault_info fault_info = {};
	struct  vm_page_delayed_work    dw_array;
	struct  vm_page_delayed_work    *dwp, *dwp_start;
	bool                    dwp_finish_ctx = TRUE;
	int                     dw_count;
	int                     dw_limit;
	int                     dw_index;
	boolean_t               caller_lookup;
	int                     io_tracking_flag = 0;
	int                     interruptible;
	ppnum_t                 phys_page;

	boolean_t               set_cache_attr_needed = FALSE;
	boolean_t               free_wired_pages = FALSE;
	boolean_t               fast_path_empty_req = FALSE;
	boolean_t               fast_path_full_req = FALSE;

#if DEVELOPMENT || DEBUG
	task_t                  task = current_task();
#endif /* DEVELOPMENT || DEBUG */

	dwp_start = dwp = NULL;

	vm_object_offset_t original_offset = offset;
	upl_size_t original_size = size;

//	DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);

	/*
	 * Round the request out to whole pages; remember the caller's
	 * original offset/size for the UPL bookkeeping below.
	 */
	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
	offset = vm_object_trunc_page(offset);
	if (size != original_size || offset != original_offset) {
		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
	}

	if (cntrl_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		return KERN_INVALID_VALUE;
	}
	if (vm_lopage_needed == FALSE) {
		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
	}

	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
		/* 32-bit DMA constraint only makes sense for wired lite UPLs */
		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
			return KERN_INVALID_VALUE;
		}

		if (object->phys_contiguous) {
			/*
			 * physically contiguous memory can't be substituted
			 * page-by-page, so the whole range must already sit
			 * below max_valid_dma_address
			 */
			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
				return KERN_INVALID_ADDRESS;
			}

			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
				return KERN_INVALID_ADDRESS;
			}
		}
	}
	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
		no_zero_fill = TRUE;
	}

	/*
	 * COPYOUT_FROM means the caller only reads the pages;
	 * otherwise it may write them.
	 */
	if (cntrl_flags & UPL_COPYOUT_FROM) {
		prot = VM_PROT_READ;
	} else {
		prot = VM_PROT_READ | VM_PROT_WRITE;
	}

	if ((!object->internal) && (object->paging_offset != 0)) {
		panic("vm_object_iopl_request: external object with non-zero paging offset");
	}


	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);

#if CONFIG_IOSCHED || UPL_DEBUG
	if ((object->io_tracking && object != kernel_object) || upl_debug_enabled) {
		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
	}
#endif

#if CONFIG_IOSCHED
	if (object->io_tracking) {
		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
		if (object != kernel_object) {
			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
		}
	}
#endif

	if (object->phys_contiguous) {
		/* device memory gets a single-entry UPL; no per-page work */
		psize = PAGE_SIZE;
	} else {
		psize = size;

		/*
		 * Set up the delayed-work context used to batch page-queue
		 * operations (wire / set-reference) while holding the object
		 * lock.  Fall back to a single on-stack entry if a context
		 * can't be obtained.
		 */
		dw_count = 0;
		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
		dwp_start = vm_page_delayed_work_get_ctx();
		if (dwp_start == NULL) {
			dwp_start = &dw_array;
			dw_limit = 1;
			dwp_finish_ctx = FALSE;
		}

		dwp = dwp_start;
	}

	/*
	 * Create the UPL.  For UPL_SET_INTERNAL the page-info array and
	 * lite bitmap live right after the upl structure itself.
	 */
	if (cntrl_flags & UPL_SET_INTERNAL) {
		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);

		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
		lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
		    ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
		if (size == 0) {
			user_page_list = NULL;
			lite_list = NULL;
		}
	} else {
		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);

		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
		if (size == 0) {
			lite_list = NULL;
		}
	}
	if (user_page_list) {
		user_page_list[0].device = FALSE;
	}
	*upl_ptr = upl;

	if (cntrl_flags & UPL_NOZEROFILLIO) {
		DTRACE_VM4(upl_nozerofillio,
		    vm_object_t, object,
		    vm_object_offset_t, offset,
		    upl_size_t, size,
		    upl_t, upl);
	}

	upl->map_object = object;
	upl->u_offset = original_offset;
	upl->u_size = original_size;

	size_in_pages = size / PAGE_SIZE;

	/*
	 * kernel-object requests (without 32-bit-DMA or access-blocking
	 * constraints) only need a shared object lock; everyone else gets
	 * an exclusive lock plus an activity reference.
	 */
	if (object == kernel_object &&
	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
		upl->flags |= UPL_KERNEL_OBJECT;
#if UPL_DEBUG
		vm_object_lock(object);
#else
		vm_object_lock_shared(object);
#endif
	} else {
		vm_object_lock(object);
		vm_object_activity_begin(object);
	}
	/*
	 * paging in progress also protects the paging_offset
	 */
	upl->u_offset = original_offset + object->paging_offset;

	if (cntrl_flags & UPL_BLOCK_ACCESS) {
		/*
		 * The user requested that access to the pages in this UPL
		 * be blocked until the UPL is commited or aborted.
		 */
		upl->flags |= UPL_ACCESS_BLOCKED;
	}

#if CONFIG_IOSCHED || UPL_DEBUG
	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
		vm_object_activity_begin(object);
		queue_enter(&object->uplq, upl, upl_t, uplq);
	}
#endif

	if (object->phys_contiguous) {
		/*
		 * Device-memory fast path: record the single physical range
		 * and return without touching any vm_page structures.
		 */
		if (upl->flags & UPL_ACCESS_BLOCKED) {
			assert(!object->blocked_access);
			object->blocked_access = TRUE;
		}

		vm_object_unlock(object);

		/*
		 * don't need any shadow mappings for this one
		 * since it is already I/O memory
		 */
		upl->flags |= UPL_DEVICE_MEMORY;

		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);

		if (user_page_list) {
			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
			user_page_list[0].device = TRUE;
		}
		if (page_list_count != NULL) {
			if (upl->flags & UPL_INTERNAL) {
				*page_list_count = 0;
			} else {
				*page_list_count = 1;
			}
		}

		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
		if (task != NULL) {
			ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
		}
#endif /* DEVELOPMENT || DEBUG */
		return KERN_SUCCESS;
	}
	if (object != kernel_object && object != compressor_object) {
		/*
		 * Protect user space from future COW operations
		 */
#if VM_OBJECT_TRACKING_OP_TRUESHARE
		if (!object->true_share &&
		    vm_object_tracking_btlog) {
			btlog_record(vm_object_tracking_btlog, object,
			    VM_OBJECT_TRACKING_OP_TRUESHARE,
			    btref_get(__builtin_frame_address(0), 0));
		}
#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */

		vm_object_lock_assert_exclusive(object);
		object->true_share = TRUE;

		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
	}

	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
	    object->copy != VM_OBJECT_NULL) {
		/*
		 * Honor copy-on-write obligations
		 *
		 * The caller is gathering these pages and
		 * might modify their contents.  We need to
		 * make sure that the copy object has its own
		 * private copies of these pages before we let
		 * the caller modify them.
		 *
		 * NOTE: someone else could map the original object
		 * after we've done this copy-on-write here, and they
		 * could then see an inconsistent picture of the memory
		 * while it's being modified via the UPL.  To prevent this,
		 * we would have to block access to these pages until the
		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
		 * code path for that...
		 */
		vm_object_update(object,
		    offset,
		    size,
		    NULL,
		    NULL,
		    FALSE,              /* should_return */
		    MEMORY_OBJECT_COPY_SYNC,
		    VM_PROT_NO_CHANGE);
		VM_PAGEOUT_DEBUG(iopl_cow, 1);
		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
	}
	/*
	 * Detect the two fast-path shapes: a simple object that is either
	 * fully resident ("full") or completely empty ("empty") for a
	 * request covering the whole object.
	 */
	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
	    object->purgable != VM_PURGABLE_VOLATILE &&
	    object->purgable != VM_PURGABLE_EMPTY &&
	    object->copy == NULL &&
	    size == object->vo_size &&
	    offset == 0 &&
	    object->shadow == NULL &&
	    object->pager == NULL) {
		if (object->resident_page_count == size_in_pages) {
			assert(object != compressor_object);
			assert(object != kernel_object);
			fast_path_full_req = TRUE;
		} else if (object->resident_page_count == 0) {
			assert(object != compressor_object);
			assert(object != kernel_object);
			fast_path_empty_req = TRUE;
			set_cache_attr_needed = TRUE;
		}
	}

	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
		interruptible = THREAD_ABORTSAFE;
	} else {
		interruptible = THREAD_UNINT;
	}

	entry = 0;

	xfer_size = size;
	dst_offset = offset;

	if (fast_path_full_req) {
		if (vm_object_iopl_wire_full(object, upl, user_page_list, lite_list, cntrl_flags, tag) == TRUE) {
			goto finish;
		}
		/*
		 * we couldn't complete the processing of this request on the fast path
		 * so fall through to the slow path and finish up
		 */
	} else if (fast_path_empty_req) {
		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
			ret = KERN_MEMORY_ERROR;
			goto return_err;
		}
		ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);

		if (ret) {
			/* pages already wired by the helper must be freed on unwind */
			free_wired_pages = TRUE;
			goto return_err;
		}
		goto finish;
	}

	/* slow path: fault in / process pages one at a time */
	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.lo_offset = offset;
	fault_info.hi_offset = offset + xfer_size;
	fault_info.mark_zf_absent = TRUE;
	fault_info.interruptible = interruptible;
	fault_info.batch_pmap_op = TRUE;

	while (xfer_size) {
		vm_fault_return_t       result;

		dwp->dw_mask = 0;

		if (fast_path_full_req) {
			/*
			 * if we get here, it means that we ran into a page
			 * state we couldn't handle in the fast path and
			 * bailed out to the slow path... since the order
			 * we look at pages is different between the 2 paths,
			 * the following check is needed to determine whether
			 * this page was already processed in the fast path
			 */
			if (lite_list[entry >> 5] & (1 << (entry & 31))) {
				goto skip_page;
			}
		}
		dst_page = vm_page_lookup(object, dst_offset);

		if (dst_page == VM_PAGE_NULL ||
		    dst_page->vmp_busy ||
		    VMP_ERROR_GET(dst_page) ||
		    dst_page->vmp_restart ||
		    dst_page->vmp_absent ||
		    dst_page->vmp_fictitious) {
			if (object == kernel_object) {
				panic("vm_object_iopl_request: missing/bad page in kernel object");
			}
			if (object == compressor_object) {
				panic("vm_object_iopl_request: missing/bad page in compressor object");
			}

			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
				ret = KERN_MEMORY_ERROR;
				goto return_err;
			}
			set_cache_attr_needed = TRUE;

			/*
			 * We just looked up the page and the result remains valid
			 * until the object lock is release, so send it to
			 * vm_fault_page() (as "dst_page"), to avoid having to
			 * look it up again there.
			 */
			caller_lookup = TRUE;

			do {
				vm_page_t       top_page;
				kern_return_t   error_code;

				fault_info.cluster_size = xfer_size;

				vm_object_paging_begin(object);

				result = vm_fault_page(object, dst_offset,
				    prot | VM_PROT_WRITE, FALSE,
				    caller_lookup,
				    &prot, &dst_page, &top_page,
				    (int *)0,
				    &error_code, no_zero_fill,
				    &fault_info);

				/* our lookup is no longer valid at this point */
				caller_lookup = FALSE;

				switch (result) {
				case VM_FAULT_SUCCESS:
					page_grab_count++;

					if (!dst_page->vmp_absent) {
						PAGE_WAKEUP_DONE(dst_page);
					} else {
						/*
						 * we only get back an absent page if we
						 * requested that it not be zero-filled
						 * because we are about to fill it via I/O
						 *
						 * absent pages should be left BUSY
						 * to prevent them from being faulted
						 * into an address space before we've
						 * had a chance to complete the I/O on
						 * them since they may contain info that
						 * shouldn't be seen by the faulting task
						 */
					}
					/*
					 *	Release paging references and
					 *	top-level placeholder page, if any.
					 */
					if (top_page != VM_PAGE_NULL) {
						vm_object_t local_object;

						local_object = VM_PAGE_OBJECT(top_page);

						/*
						 * comparing 2 packed pointers
						 */
						if (top_page->vmp_object != dst_page->vmp_object) {
							vm_object_lock(local_object);
							VM_PAGE_FREE(top_page);
							vm_object_paging_end(local_object);
							vm_object_unlock(local_object);
						} else {
							VM_PAGE_FREE(top_page);
							vm_object_paging_end(local_object);
						}
					}
					vm_object_paging_end(object);
					break;

				case VM_FAULT_RETRY:
					/* vm_fault_page() dropped the object lock; retake and retry */
					vm_object_lock(object);
					break;

				case VM_FAULT_MEMORY_SHORTAGE:
					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

					if (vm_page_wait(interruptible)) {
						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);

						VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
						vm_object_lock(object);

						break;
					}
					/* wait was interrupted: fall through to the error paths */
					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
					OS_FALLTHROUGH;

				case VM_FAULT_INTERRUPTED:
					error_code = MACH_SEND_INTERRUPTED;
					OS_FALLTHROUGH;
				case VM_FAULT_MEMORY_ERROR:
memory_error:
					ret = (error_code ? error_code: KERN_MEMORY_ERROR);

					vm_object_lock(object);
					goto return_err;

				case VM_FAULT_SUCCESS_NO_VM_PAGE:
					/* success but no page: fail */
					vm_object_paging_end(object);
					vm_object_unlock(object);
					goto memory_error;

				default:
					panic("vm_object_iopl_request: unexpected error"
					    " 0x%x from vm_fault_page()\n", result);
				}
			} while (result != VM_FAULT_SUCCESS);
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		if (upl->flags & UPL_KERNEL_OBJECT) {
			/* kernel-object pages are recorded but not wired/dirtied here */
			goto record_phys_addr;
		}

		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
			dst_page->vmp_busy = TRUE;
			goto record_phys_addr;
		}

		if (dst_page->vmp_cleaning) {
			/*
			 * Someone else is cleaning this page in place.
			 * In theory, we should be able to  proceed and use this
			 * page but they'll probably end up clearing the "busy"
			 * bit on it in upl_commit_range() but they didn't set
			 * it, so they would clear our "busy" bit and open
			 * us to race conditions.
			 * We'd better wait for the cleaning to complete and
			 * then try again.
			 */
			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
			PAGE_SLEEP(object, dst_page, THREAD_UNINT);
			continue;
		}
		if (dst_page->vmp_laundry) {
			vm_pageout_steal_laundry(dst_page, FALSE);
		}

		if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
		    phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
			vm_page_t       low_page;
			int             refmod;

			/*
			 * support devices that can't DMA above 32 bits
			 * by substituting pages from a pool of low address
			 * memory for any pages we find above the 4G mark
			 * can't substitute if the page is already wired because
			 * we don't know whether that physical address has been
			 * handed out to some other 64 bit capable DMA device to use
			 */
			if (VM_PAGE_WIRED(dst_page)) {
				ret = KERN_PROTECTION_FAILURE;
				goto return_err;
			}
			low_page = vm_page_grablo();

			if (low_page == VM_PAGE_NULL) {
				ret = KERN_RESOURCE_SHORTAGE;
				goto return_err;
			}
			/*
			 * from here until the vm_page_replace completes
			 * we musn't drop the object lock... we don't
			 * want anyone refaulting this page in and using
			 * it after we disconnect it... we want the fault
			 * to find the new page being substituted.
			 */
			if (dst_page->vmp_pmapped) {
				refmod = pmap_disconnect(phys_page);
			} else {
				refmod = 0;
			}

			if (!dst_page->vmp_absent) {
				vm_page_copy(dst_page, low_page);
			}

			/* carry the old page's state over to its replacement */
			low_page->vmp_reference = dst_page->vmp_reference;
			low_page->vmp_dirty     = dst_page->vmp_dirty;
			low_page->vmp_absent    = dst_page->vmp_absent;

			if (refmod & VM_MEM_REFERENCED) {
				low_page->vmp_reference = TRUE;
			}
			if (refmod & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(low_page, FALSE);
			}

			vm_page_replace(low_page, object, dst_offset);

			dst_page = low_page;
			/*
			 * vm_page_grablo returned the page marked
			 * BUSY... we don't need a PAGE_WAKEUP_DONE
			 * here, because we've never dropped the object lock
			 */
			if (!dst_page->vmp_absent) {
				dst_page->vmp_busy = FALSE;
			}

			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
		}
		if (!dst_page->vmp_busy) {
			/* wiring is batched through the delayed-work mechanism */
			dwp->dw_mask |= DW_vm_page_wire;
		}

		if (cntrl_flags & UPL_BLOCK_ACCESS) {
			/*
			 * Mark the page "busy" to block any future page fault
			 * on this page in addition to wiring it.
			 * We'll also remove the mapping
			 * of all these pages before leaving this routine.
			 */
			assert(!dst_page->vmp_fictitious);
			dst_page->vmp_busy = TRUE;
		}
		/*
		 * expect the page to be used
		 * page queues lock must be held to set 'reference'
		 */
		dwp->dw_mask |= DW_set_reference;

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			SET_PAGE_DIRTY(dst_page, TRUE);
			/*
			 * Page belonging to a code-signed object is about to
			 * be written. Mark it tainted and disconnect it from
			 * all pmaps so processes have to fault it back in and
			 * deal with the tainted bit.
			 */
			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
				vm_page_iopl_tainted++;
				if (dst_page->vmp_pmapped) {
					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
					if (refmod & VM_MEM_REFERENCED) {
						dst_page->vmp_reference = TRUE;
					}
				}
			}
		}
		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
			pmap_sync_page_attributes_phys(phys_page);
			dst_page->vmp_written_by_kernel = FALSE;
		}

record_phys_addr:
		if (dst_page->vmp_busy) {
			upl->flags |= UPL_HAS_BUSY;
		}

		/* mark this entry as covered in the lite bitmap */
		lite_list[entry >> 5] |= 1U << (entry & 31);

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
			user_page_list[entry].absent    = dst_page->vmp_absent;
			user_page_list[entry].dirty     = dst_page->vmp_dirty;
			user_page_list[entry].precious  = dst_page->vmp_precious;
			user_page_list[entry].device    = FALSE;
			user_page_list[entry].needed    = FALSE;
			if (dst_page->vmp_clustered == TRUE) {
				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
			} else {
				user_page_list[entry].speculative = FALSE;
			}
			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
			user_page_list[entry].mark      = FALSE;
		}
		if (object != kernel_object && object != compressor_object) {
			/*
			 * someone is explicitly grabbing this page...
			 * update clustered and speculative state
			 *
			 */
			if (dst_page->vmp_clustered) {
				VM_PAGE_CONSUME_CLUSTERED(dst_page);
			}
		}
skip_page:
		entry++;
		dst_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;

		if (dwp->dw_mask) {
			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);

			if (dw_count >= dw_limit) {
				/* flush the batch before it overflows */
				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);

				dwp = dwp_start;
				dw_count = 0;
			}
		}
	}
	assert(entry == size_in_pages);

	if (dw_count) {
		/* flush any remaining batched page work */
		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}
finish:
	if (user_page_list && set_cache_attr_needed == TRUE) {
		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
	}

	if (page_list_count != NULL) {
		if (upl->flags & UPL_INTERNAL) {
			*page_list_count = 0;
		} else if (*page_list_count > size_in_pages) {
			*page_list_count = size_in_pages;
		}
	}
	vm_object_unlock(object);

	if (cntrl_flags & UPL_BLOCK_ACCESS) {
		/*
		 * We've marked all the pages "busy" so that future
		 * page faults will block.
		 * Now remove the mapping for these pages, so that they
		 * can't be accessed without causing a page fault.
		 */
		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
		    PMAP_NULL,
		    PAGE_SIZE,
		    0, VM_PROT_NONE);
		assert(!object->blocked_access);
		object->blocked_access = TRUE;
	}

	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return KERN_SUCCESS;

return_err:
	/*
	 * Error unwind: walk back over every page we processed
	 * ([offset, dst_offset)), unwiring or freeing as appropriate,
	 * then tear down the UPL.
	 */
	dw_index = 0;

	for (; offset < dst_offset; offset += PAGE_SIZE) {
		boolean_t need_unwire;

		dst_page = vm_page_lookup(object, offset);

		if (dst_page == VM_PAGE_NULL) {
			panic("vm_object_iopl_request: Wired page missing.");
		}

		/*
		 * if we've already processed this page in an earlier
		 * dw_do_work, we need to undo the wiring... we will
		 * leave the dirty and reference bits on if they
		 * were set, since we don't have a good way of knowing
		 * what the previous state was and we won't get here
		 * under any normal circumstances...  we will always
		 * clear BUSY and wakeup any waiters via vm_page_free
		 * or PAGE_WAKEUP_DONE
		 */
		need_unwire = TRUE;

		if (dw_count) {
			if ((dwp_start)[dw_index].dw_m == dst_page) {
				/*
				 * still in the deferred work list
				 * which means we haven't yet called
				 * vm_page_wire on this page
				 */
				need_unwire = FALSE;

				dw_index++;
				dw_count--;
			}
		}
		vm_page_lock_queues();

		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
			vm_page_free(dst_page);

			need_unwire = FALSE;
		} else {
			if (need_unwire == TRUE) {
				vm_page_unwire(dst_page, TRUE);
			}

			PAGE_WAKEUP_DONE(dst_page);
		}
		vm_page_unlock_queues();

		if (need_unwire == TRUE) {
			counter_inc(&vm_statistics_reactivations);
		}
	}
#if UPL_DEBUG
	upl->upl_state = 2;
#endif
	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
	}
	vm_object_unlock(object);
	upl_destroy(upl);

	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}
	return ret;
}
10010 
/*
 * upl_transpose:
 *
 * Exchange the backing store of the two VM objects underlying
 * "upl1" and "upl2" (via vm_object_transpose()) and re-point each
 * UPL at the object that now holds its pages.
 *
 * Both UPLs must be non-NULL, distinct, non-vector, start at object
 * offset 0 and have equal sizes: the transpose swaps the entire
 * backing store (pager, resident pages, ...), so partial ranges are
 * rejected with KERN_INVALID_VALUE.
 *
 * Returns KERN_INVALID_ARGUMENT / KERN_INVALID_VALUE on the checks
 * above, otherwise the result of vm_object_transpose().
 */
kern_return_t
upl_transpose(
	upl_t           upl1,
	upl_t           upl2)
{
	kern_return_t           retval;
	boolean_t               upls_locked;
	vm_object_t             object1, object2;

	/* LD: Should mapped UPLs be eligible for a transpose? */
	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
		return KERN_INVALID_ARGUMENT;
	}

	upls_locked = FALSE;

	/*
	 * Since we need to lock both UPLs at the same time,
	 * avoid deadlocks by always taking locks in the same order.
	 */
	if (upl1 < upl2) {
		upl_lock(upl1);
		upl_lock(upl2);
	} else {
		upl_lock(upl2);
		upl_lock(upl1);
	}
	upls_locked = TRUE;     /* the UPLs will need to be unlocked */

	object1 = upl1->map_object;
	object2 = upl2->map_object;

	if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
	    upl1->u_size != upl2->u_size) {
		/*
		 * We deal only with full objects, not subsets.
		 * That's because we exchange the entire backing store info
		 * for the objects: pager, resident pages, etc...  We can't do
		 * only part of it.
		 */
		retval = KERN_INVALID_VALUE;
		goto done;
	}

	/*
	 * Tranpose the VM objects' backing store.
	 */
	retval = vm_object_transpose(object1, object2,
	    upl_adjusted_size(upl1, PAGE_MASK));

	if (retval == KERN_SUCCESS) {
		/*
		 * Make each UPL point to the correct VM object, i.e. the
		 * object holding the pages that the UPL refers to...
		 */
#if CONFIG_IOSCHED || UPL_DEBUG
		/*
		 * Tracked UPLs sit on their object's uplq, so they must be
		 * dequeued before the map_object swap and re-enqueued on
		 * the other object afterwards (under both object locks).
		 */
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
			vm_object_lock(object1);
			vm_object_lock(object2);
		}
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_remove(&object1->uplq, upl1, upl_t, uplq);
		}
		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_remove(&object2->uplq, upl2, upl_t, uplq);
		}
#endif
		upl1->map_object = object2;
		upl2->map_object = object1;

#if CONFIG_IOSCHED || UPL_DEBUG
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_enter(&object2->uplq, upl1, upl_t, uplq);
		}
		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_enter(&object1->uplq, upl2, upl_t, uplq);
		}
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
			vm_object_unlock(object2);
			vm_object_unlock(object1);
		}
#endif
	}

done:
	/*
	 * Cleanup.
	 */
	if (upls_locked) {
		upl_unlock(upl1);
		upl_unlock(upl2);
		upls_locked = FALSE;
	}

	return retval;
}
10107 
10108 void
upl_range_needed(upl_t upl,int index,int count)10109 upl_range_needed(
10110 	upl_t           upl,
10111 	int             index,
10112 	int             count)
10113 {
10114 	upl_page_info_t *user_page_list;
10115 	int             size_in_pages;
10116 
10117 	if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
10118 		return;
10119 	}
10120 
10121 	size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
10122 
10123 	user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
10124 
10125 	while (count-- && index < size_in_pages) {
10126 		user_page_list[index++].needed = TRUE;
10127 	}
10128 }
10129 
10130 
10131 /*
10132  * Reserve of virtual addresses in the kernel address space.
10133  * We need to map the physical pages in the kernel, so that we
10134  * can call the code-signing or slide routines with a kernel
10135  * virtual address.  We keep this pool of pre-allocated kernel
10136  * virtual addresses so that we don't have to scan the kernel's
10137  * virtaul address space each time we need to work with
10138  * a physical page.
10139  */
SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);         /* guards the slot map and waiter counters below */
#define VM_PAGING_NUM_PAGES     64              /* number of reserved kernel VA slots */
SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;        /* base of the reserved VA range */
bool            vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, }; /* per-slot in-use map */
int             vm_paging_max_index = 0;        /* highest slot index ever handed out */
int             vm_paging_page_waiter = 0;      /* threads currently waiting for a free slot */
int             vm_paging_page_waiter_total = 0;        /* cumulative number of waits (stats) */

/* best-effort statistics counters (updated without the lock) */
unsigned long   vm_paging_no_kernel_page = 0;
unsigned long   vm_paging_objects_mapped = 0;
unsigned long   vm_paging_pages_mapped = 0;
unsigned long   vm_paging_objects_mapped_slow = 0;
unsigned long   vm_paging_pages_mapped_slow = 0;
10153 
10154 __startup_func
10155 static void
vm_paging_map_init(void)10156 vm_paging_map_init(void)
10157 {
10158 	kmem_alloc(kernel_map, &vm_paging_base_address,
10159 	    ptoa(VM_PAGING_NUM_PAGES),
10160 	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
10161 	    VM_KERN_MEMORY_NONE);
10162 }
10163 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
10164 
10165 /*
10166  * vm_paging_map_object:
10167  *	Maps part of a VM object's pages in the kernel
10168  *      virtual address space, using the pre-allocated
10169  *	kernel virtual addresses, if possible.
10170  * Context:
10171  *      The VM object is locked.  This lock will get
10172  *      dropped and re-acquired though, so the caller
10173  *      must make sure the VM object is kept alive
10174  *	(by holding a VM map that has a reference
10175  *      on it, for example, or taking an extra reference).
10176  *      The page should also be kept busy to prevent
10177  *	it from being reclaimed.
10178  */
10179 kern_return_t
vm_paging_map_object(vm_page_t page,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection,boolean_t can_unlock_object,vm_map_size_t * size,vm_map_offset_t * address,boolean_t * need_unmap)10180 vm_paging_map_object(
10181 	vm_page_t               page,
10182 	vm_object_t             object,
10183 	vm_object_offset_t      offset,
10184 	vm_prot_t               protection,
10185 	boolean_t               can_unlock_object,
10186 	vm_map_size_t           *size,          /* IN/OUT */
10187 	vm_map_offset_t         *address,       /* OUT */
10188 	boolean_t               *need_unmap)    /* OUT */
10189 {
10190 	kern_return_t           kr;
10191 	vm_map_offset_t         page_map_offset;
10192 	vm_map_size_t           map_size;
10193 	vm_object_offset_t      object_offset;
10194 	int                     i;
10195 
10196 	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
10197 		/* use permanent 1-to-1 kernel mapping of physical memory ? */
10198 		*address = (vm_map_offset_t)
10199 		    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
10200 		*need_unmap = FALSE;
10201 		return KERN_SUCCESS;
10202 
10203 		assert(page->vmp_busy);
10204 		/*
10205 		 * Use one of the pre-allocated kernel virtual addresses
10206 		 * and just enter the VM page in the kernel address space
10207 		 * at that virtual address.
10208 		 */
10209 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10210 
10211 		/*
10212 		 * Try and find an available kernel virtual address
10213 		 * from our pre-allocated pool.
10214 		 */
10215 		page_map_offset = 0;
10216 		for (;;) {
10217 			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
10218 				if (vm_paging_page_inuse[i] == FALSE) {
10219 					page_map_offset =
10220 					    vm_paging_base_address +
10221 					    (i * PAGE_SIZE);
10222 					break;
10223 				}
10224 			}
10225 			if (page_map_offset != 0) {
10226 				/* found a space to map our page ! */
10227 				break;
10228 			}
10229 
10230 			if (can_unlock_object) {
10231 				/*
10232 				 * If we can afford to unlock the VM object,
10233 				 * let's take the slow path now...
10234 				 */
10235 				break;
10236 			}
10237 			/*
10238 			 * We can't afford to unlock the VM object, so
10239 			 * let's wait for a space to become available...
10240 			 */
10241 			vm_paging_page_waiter_total++;
10242 			vm_paging_page_waiter++;
10243 			kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
10244 			if (kr == THREAD_WAITING) {
10245 				simple_unlock(&vm_paging_lock);
10246 				kr = thread_block(THREAD_CONTINUE_NULL);
10247 				simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10248 			}
10249 			vm_paging_page_waiter--;
10250 			/* ... and try again */
10251 		}
10252 
10253 		if (page_map_offset != 0) {
10254 			/*
10255 			 * We found a kernel virtual address;
10256 			 * map the physical page to that virtual address.
10257 			 */
10258 			if (i > vm_paging_max_index) {
10259 				vm_paging_max_index = i;
10260 			}
10261 			vm_paging_page_inuse[i] = TRUE;
10262 			simple_unlock(&vm_paging_lock);
10263 
10264 			page->vmp_pmapped = TRUE;
10265 
10266 			/*
10267 			 * Keep the VM object locked over the PMAP_ENTER
10268 			 * and the actual use of the page by the kernel,
10269 			 * or this pmap mapping might get undone by a
10270 			 * vm_object_pmap_protect() call...
10271 			 */
10272 			PMAP_ENTER(kernel_pmap,
10273 			    page_map_offset,
10274 			    page,
10275 			    protection,
10276 			    VM_PROT_NONE,
10277 			    0,
10278 			    TRUE,
10279 			    kr);
10280 			assert(kr == KERN_SUCCESS);
10281 			vm_paging_objects_mapped++;
10282 			vm_paging_pages_mapped++;
10283 			*address = page_map_offset;
10284 			*need_unmap = TRUE;
10285 
10286 #if KASAN
10287 			kasan_notify_address(page_map_offset, PAGE_SIZE);
10288 #endif
10289 
10290 			/* all done and mapped, ready to use ! */
10291 			return KERN_SUCCESS;
10292 		}
10293 
10294 		/*
10295 		 * We ran out of pre-allocated kernel virtual
10296 		 * addresses.  Just map the page in the kernel
10297 		 * the slow and regular way.
10298 		 */
10299 		vm_paging_no_kernel_page++;
10300 		simple_unlock(&vm_paging_lock);
10301 	}
10302 
10303 	if (!can_unlock_object) {
10304 		*address = 0;
10305 		*size = 0;
10306 		*need_unmap = FALSE;
10307 		return KERN_NOT_SUPPORTED;
10308 	}
10309 
10310 	object_offset = vm_object_trunc_page(offset);
10311 	map_size = vm_map_round_page(*size,
10312 	    VM_MAP_PAGE_MASK(kernel_map));
10313 
10314 	/*
10315 	 * Try and map the required range of the object
10316 	 * in the kernel_map. Given that allocation is
10317 	 * for pageable memory, it shouldn't contain
10318 	 * pointers and is mapped into the data range.
10319 	 */
10320 
10321 	vm_object_reference_locked(object);     /* for the map entry */
10322 	vm_object_unlock(object);
10323 
10324 	kr = vm_map_enter(kernel_map,
10325 	    address,
10326 	    map_size,
10327 	    0,
10328 	    VM_FLAGS_ANYWHERE,
10329 	    VM_MAP_KERNEL_FLAGS_DATA,
10330 	    VM_KERN_MEMORY_NONE,
10331 	    object,
10332 	    object_offset,
10333 	    FALSE,
10334 	    protection,
10335 	    VM_PROT_ALL,
10336 	    VM_INHERIT_NONE);
10337 	if (kr != KERN_SUCCESS) {
10338 		*address = 0;
10339 		*size = 0;
10340 		*need_unmap = FALSE;
10341 		vm_object_deallocate(object);   /* for the map entry */
10342 		vm_object_lock(object);
10343 		return kr;
10344 	}
10345 
10346 	*size = map_size;
10347 
10348 	/*
10349 	 * Enter the mapped pages in the page table now.
10350 	 */
10351 	vm_object_lock(object);
10352 	/*
10353 	 * VM object must be kept locked from before PMAP_ENTER()
10354 	 * until after the kernel is done accessing the page(s).
10355 	 * Otherwise, the pmap mappings in the kernel could be
10356 	 * undone by a call to vm_object_pmap_protect().
10357 	 */
10358 
10359 	for (page_map_offset = 0;
10360 	    map_size != 0;
10361 	    map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
10362 		page = vm_page_lookup(object, offset + page_map_offset);
10363 		if (page == VM_PAGE_NULL) {
10364 			printf("vm_paging_map_object: no page !?");
10365 			vm_object_unlock(object);
10366 			vm_map_remove(kernel_map, *address, *size);
10367 			*address = 0;
10368 			*size = 0;
10369 			*need_unmap = FALSE;
10370 			vm_object_lock(object);
10371 			return KERN_MEMORY_ERROR;
10372 		}
10373 		page->vmp_pmapped = TRUE;
10374 
10375 		PMAP_ENTER(kernel_pmap,
10376 		    *address + page_map_offset,
10377 		    page,
10378 		    protection,
10379 		    VM_PROT_NONE,
10380 		    0,
10381 		    TRUE,
10382 		    kr);
10383 		assert(kr == KERN_SUCCESS);
10384 #if KASAN
10385 		kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
10386 #endif
10387 	}
10388 
10389 	vm_paging_objects_mapped_slow++;
10390 	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
10391 
10392 	*need_unmap = TRUE;
10393 
10394 	return KERN_SUCCESS;
10395 }
10396 
10397 /*
10398  * vm_paging_unmap_object:
10399  *	Unmaps part of a VM object's pages from the kernel
10400  *      virtual address space.
10401  * Context:
10402  *      The VM object is locked.  This lock will get
10403  *      dropped and re-acquired though.
10404  */
10405 void
vm_paging_unmap_object(vm_object_t object,vm_map_offset_t start,vm_map_offset_t end)10406 vm_paging_unmap_object(
10407 	vm_object_t     object,
10408 	vm_map_offset_t start,
10409 	vm_map_offset_t end)
10410 {
10411 	int             i;
10412 
10413 	if ((vm_paging_base_address == 0) ||
10414 	    (start < vm_paging_base_address) ||
10415 	    (end > (vm_paging_base_address
10416 	    + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
10417 		/*
10418 		 * We didn't use our pre-allocated pool of
10419 		 * kernel virtual address.  Deallocate the
10420 		 * virtual memory.
10421 		 */
10422 		if (object != VM_OBJECT_NULL) {
10423 			vm_object_unlock(object);
10424 		}
10425 		vm_map_remove(kernel_map, start, end);
10426 		if (object != VM_OBJECT_NULL) {
10427 			vm_object_lock(object);
10428 		}
10429 	} else {
10430 		/*
10431 		 * We used a kernel virtual address from our
10432 		 * pre-allocated pool.  Put it back in the pool
10433 		 * for next time.
10434 		 */
10435 		assert(end - start == PAGE_SIZE);
10436 		i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
10437 		assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
10438 
10439 		/* undo the pmap mapping */
10440 		pmap_remove(kernel_pmap, start, end);
10441 
10442 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10443 		vm_paging_page_inuse[i] = FALSE;
10444 		if (vm_paging_page_waiter) {
10445 			thread_wakeup(&vm_paging_page_waiter);
10446 		}
10447 		simple_unlock(&vm_paging_lock);
10448 	}
10449 }
10450 
10451 
10452 /*
10453  * page->vmp_object must be locked
10454  */
10455 void
vm_pageout_steal_laundry(vm_page_t page,boolean_t queues_locked)10456 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10457 {
10458 	if (!queues_locked) {
10459 		vm_page_lockspin_queues();
10460 	}
10461 
10462 	page->vmp_free_when_done = FALSE;
10463 	/*
10464 	 * need to drop the laundry count...
10465 	 * we may also need to remove it
10466 	 * from the I/O paging queue...
10467 	 * vm_pageout_throttle_up handles both cases
10468 	 *
10469 	 * the laundry and pageout_queue flags are cleared...
10470 	 */
10471 	vm_pageout_throttle_up(page);
10472 
10473 	if (!queues_locked) {
10474 		vm_page_unlock_queues();
10475 	}
10476 }
10477 
10478 upl_t
vector_upl_create(vm_offset_t upl_offset)10479 vector_upl_create(vm_offset_t upl_offset)
10480 {
10481 	int i = 0;
10482 	upl_t   upl;
10483 	vector_upl_t vector_upl = kalloc_type(struct _vector_upl, Z_WAITOK);
10484 
10485 	upl = upl_create(0, UPL_VECTOR, 0);
10486 	upl->vector_upl = vector_upl;
10487 	upl->u_offset = upl_offset;
10488 	vector_upl->size = 0;
10489 	vector_upl->offset = upl_offset;
10490 	vector_upl->invalid_upls = 0;
10491 	vector_upl->num_upls = 0;
10492 	vector_upl->pagelist = NULL;
10493 
10494 	for (i = 0; i < MAX_VECTOR_UPL_ELEMENTS; i++) {
10495 		vector_upl->upl_iostates[i].size = 0;
10496 		vector_upl->upl_iostates[i].offset = 0;
10497 	}
10498 	return upl;
10499 }
10500 
/*
 * vector_upl_deallocate:
 *	Tear down the vector-UPL bookkeeping attached to 'upl'.  Every
 *	sub-UPL must already have been invalidated (invalid_upls ==
 *	num_upls) or we panic.  Panics on NULL or non-vectored input.
 */
void
vector_upl_deallocate(upl_t upl)
{
	if (upl) {
		vector_upl_t vector_upl = upl->vector_upl;
		if (vector_upl) {
			if (vector_upl->invalid_upls != vector_upl->num_upls) {
				panic("Deallocating non-empty Vectored UPL");
			}
			kfree_data(vector_upl->pagelist, sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE));
			vector_upl->invalid_upls = 0;
			vector_upl->num_upls = 0;
			vector_upl->pagelist = NULL;
			vector_upl->size = 0;
			vector_upl->offset = 0;
			kfree_type(struct _vector_upl, vector_upl);
			/*
			 * NOTE(review): this poisons only the local variable,
			 * not upl->vector_upl, yet vector_upl_is_valid()
			 * compares upl->vector_upl against 0xfeedfeed —
			 * confirm the caller frees the upl immediately after.
			 */
			vector_upl = (vector_upl_t)0xfeedfeed;
		} else {
			panic("vector_upl_deallocate was passed a non-vectored upl");
		}
	} else {
		panic("vector_upl_deallocate was passed a NULL upl");
	}
}
10525 
10526 boolean_t
vector_upl_is_valid(upl_t upl)10527 vector_upl_is_valid(upl_t upl)
10528 {
10529 	if (upl && ((upl->flags & UPL_VECTOR) == UPL_VECTOR)) {
10530 		vector_upl_t vector_upl = upl->vector_upl;
10531 		if (vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef) {
10532 			return FALSE;
10533 		} else {
10534 			return TRUE;
10535 		}
10536 	}
10537 	return FALSE;
10538 }
10539 
10540 boolean_t
vector_upl_set_subupl(upl_t upl,upl_t subupl,uint32_t io_size)10541 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
10542 {
10543 	if (vector_upl_is_valid(upl)) {
10544 		vector_upl_t vector_upl = upl->vector_upl;
10545 
10546 		if (vector_upl) {
10547 			if (subupl) {
10548 				if (io_size) {
10549 					if (io_size < PAGE_SIZE) {
10550 						io_size = PAGE_SIZE;
10551 					}
10552 					subupl->vector_upl = (void*)vector_upl;
10553 					vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
10554 					vector_upl->size += io_size;
10555 					upl->u_size += io_size;
10556 				} else {
10557 					uint32_t i = 0, invalid_upls = 0;
10558 					for (i = 0; i < vector_upl->num_upls; i++) {
10559 						if (vector_upl->upl_elems[i] == subupl) {
10560 							break;
10561 						}
10562 					}
10563 					if (i == vector_upl->num_upls) {
10564 						panic("Trying to remove sub-upl when none exists");
10565 					}
10566 
10567 					vector_upl->upl_elems[i] = NULL;
10568 					invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
10569 					    relaxed);
10570 					if (invalid_upls == vector_upl->num_upls) {
10571 						return TRUE;
10572 					} else {
10573 						return FALSE;
10574 					}
10575 				}
10576 			} else {
10577 				panic("vector_upl_set_subupl was passed a NULL upl element");
10578 			}
10579 		} else {
10580 			panic("vector_upl_set_subupl was passed a non-vectored upl");
10581 		}
10582 	} else {
10583 		panic("vector_upl_set_subupl was passed a NULL upl");
10584 	}
10585 
10586 	return FALSE;
10587 }
10588 
10589 void
vector_upl_set_pagelist(upl_t upl)10590 vector_upl_set_pagelist(upl_t upl)
10591 {
10592 	if (vector_upl_is_valid(upl)) {
10593 		uint32_t i = 0;
10594 		vector_upl_t vector_upl = upl->vector_upl;
10595 
10596 		if (vector_upl) {
10597 			vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
10598 
10599 			vector_upl->pagelist = kalloc_data(sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE), Z_WAITOK);
10600 
10601 			for (i = 0; i < vector_upl->num_upls; i++) {
10602 				cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upl_elems[i], PAGE_MASK) / PAGE_SIZE;
10603 				bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10604 				pagelist_size += cur_upl_pagelist_size;
10605 				if (vector_upl->upl_elems[i]->highest_page > upl->highest_page) {
10606 					upl->highest_page = vector_upl->upl_elems[i]->highest_page;
10607 				}
10608 			}
10609 			assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10610 		} else {
10611 			panic("vector_upl_set_pagelist was passed a non-vectored upl");
10612 		}
10613 	} else {
10614 		panic("vector_upl_set_pagelist was passed a NULL upl");
10615 	}
10616 }
10617 
10618 upl_t
vector_upl_subupl_byindex(upl_t upl,uint32_t index)10619 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10620 {
10621 	if (vector_upl_is_valid(upl)) {
10622 		vector_upl_t vector_upl = upl->vector_upl;
10623 		if (vector_upl) {
10624 			if (index < vector_upl->num_upls) {
10625 				return vector_upl->upl_elems[index];
10626 			}
10627 		} else {
10628 			panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10629 		}
10630 	}
10631 	return NULL;
10632 }
10633 
/*
 * vector_upl_subupl_byoffset:
 *	Locate the sub-UPL whose recorded iostate covers *upl_offset and
 *	translate *upl_offset / *upl_size from vector-relative to
 *	sub-UPL-relative values in place.  Returns NULL if the matching
 *	element has already been committed/aborted or nothing covers the
 *	offset.
 */
upl_t
vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
{
	if (vector_upl_is_valid(upl)) {
		uint32_t i = 0;
		vector_upl_t vector_upl = upl->vector_upl;

		if (vector_upl) {
			upl_t subupl = NULL;
			vector_upl_iostates_t subupl_state;

			/* iostates are laid out in ascending offset order */
			for (i = 0; i < vector_upl->num_upls; i++) {
				subupl = vector_upl->upl_elems[i];
				subupl_state = vector_upl->upl_iostates[i];
				if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
					/* We could have been passed an offset/size pair that belongs
					 * to an UPL element that has already been committed/aborted.
					 * If so, return NULL.
					 */
					if (subupl == NULL) {
						return NULL;
					}
					/* clip *upl_size to the end of this element */
					if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
						if (*upl_size > subupl_state.size) {
							*upl_size = subupl_state.size;
						}
					}
					/* rebase *upl_offset to the element's start */
					if (*upl_offset >= subupl_state.offset) {
						*upl_offset -= subupl_state.offset;
					} else if (i) {
						panic("Vector UPL offset miscalculation");
					}
					return subupl;
				}
			}
		} else {
			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
		}
	}
	return NULL;
}
10676 
10677 void
vector_upl_get_submap(upl_t upl,vm_map_t * v_upl_submap,vm_offset_t * submap_dst_addr)10678 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10679 {
10680 	*v_upl_submap = NULL;
10681 
10682 	if (vector_upl_is_valid(upl)) {
10683 		vector_upl_t vector_upl = upl->vector_upl;
10684 		if (vector_upl) {
10685 			*v_upl_submap = vector_upl->submap;
10686 			*submap_dst_addr = vector_upl->submap_dst_addr;
10687 		} else {
10688 			panic("vector_upl_get_submap was passed a non-vectored UPL");
10689 		}
10690 	} else {
10691 		panic("vector_upl_get_submap was passed a null UPL");
10692 	}
10693 }
10694 
10695 void
vector_upl_set_submap(upl_t upl,vm_map_t submap,vm_offset_t submap_dst_addr)10696 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10697 {
10698 	if (vector_upl_is_valid(upl)) {
10699 		vector_upl_t vector_upl = upl->vector_upl;
10700 		if (vector_upl) {
10701 			vector_upl->submap = submap;
10702 			vector_upl->submap_dst_addr = submap_dst_addr;
10703 		} else {
10704 			panic("vector_upl_get_submap was passed a non-vectored UPL");
10705 		}
10706 	} else {
10707 		panic("vector_upl_get_submap was passed a NULL UPL");
10708 	}
10709 }
10710 
10711 void
vector_upl_set_iostate(upl_t upl,upl_t subupl,upl_offset_t offset,upl_size_t size)10712 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10713 {
10714 	if (vector_upl_is_valid(upl)) {
10715 		uint32_t i = 0;
10716 		vector_upl_t vector_upl = upl->vector_upl;
10717 
10718 		if (vector_upl) {
10719 			for (i = 0; i < vector_upl->num_upls; i++) {
10720 				if (vector_upl->upl_elems[i] == subupl) {
10721 					break;
10722 				}
10723 			}
10724 
10725 			if (i == vector_upl->num_upls) {
10726 				panic("setting sub-upl iostate when none exists");
10727 			}
10728 
10729 			vector_upl->upl_iostates[i].offset = offset;
10730 			if (size < PAGE_SIZE) {
10731 				size = PAGE_SIZE;
10732 			}
10733 			vector_upl->upl_iostates[i].size = size;
10734 		} else {
10735 			panic("vector_upl_set_iostate was passed a non-vectored UPL");
10736 		}
10737 	} else {
10738 		panic("vector_upl_set_iostate was passed a NULL UPL");
10739 	}
10740 }
10741 
10742 void
vector_upl_get_iostate(upl_t upl,upl_t subupl,upl_offset_t * offset,upl_size_t * size)10743 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10744 {
10745 	if (vector_upl_is_valid(upl)) {
10746 		uint32_t i = 0;
10747 		vector_upl_t vector_upl = upl->vector_upl;
10748 
10749 		if (vector_upl) {
10750 			for (i = 0; i < vector_upl->num_upls; i++) {
10751 				if (vector_upl->upl_elems[i] == subupl) {
10752 					break;
10753 				}
10754 			}
10755 
10756 			if (i == vector_upl->num_upls) {
10757 				panic("getting sub-upl iostate when none exists");
10758 			}
10759 
10760 			*offset = vector_upl->upl_iostates[i].offset;
10761 			*size = vector_upl->upl_iostates[i].size;
10762 		} else {
10763 			panic("vector_upl_get_iostate was passed a non-vectored UPL");
10764 		}
10765 	} else {
10766 		panic("vector_upl_get_iostate was passed a NULL UPL");
10767 	}
10768 }
10769 
10770 void
vector_upl_get_iostate_byindex(upl_t upl,uint32_t index,upl_offset_t * offset,upl_size_t * size)10771 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10772 {
10773 	if (vector_upl_is_valid(upl)) {
10774 		vector_upl_t vector_upl = upl->vector_upl;
10775 		if (vector_upl) {
10776 			if (index < vector_upl->num_upls) {
10777 				*offset = vector_upl->upl_iostates[index].offset;
10778 				*size = vector_upl->upl_iostates[index].size;
10779 			} else {
10780 				*offset = *size = 0;
10781 			}
10782 		} else {
10783 			panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
10784 		}
10785 	} else {
10786 		panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10787 	}
10788 }
10789 
10790 upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)10791 upl_get_internal_vectorupl_pagelist(upl_t upl)
10792 {
10793 	return ((vector_upl_t)(upl->vector_upl))->pagelist;
10794 }
10795 
10796 void *
upl_get_internal_vectorupl(upl_t upl)10797 upl_get_internal_vectorupl(upl_t upl)
10798 {
10799 	return upl->vector_upl;
10800 }
10801 
10802 vm_size_t
upl_get_internal_pagelist_offset(void)10803 upl_get_internal_pagelist_offset(void)
10804 {
10805 	return sizeof(struct upl);
10806 }
10807 
10808 void
upl_clear_dirty(upl_t upl,boolean_t value)10809 upl_clear_dirty(
10810 	upl_t           upl,
10811 	boolean_t       value)
10812 {
10813 	if (value) {
10814 		upl->flags |= UPL_CLEAR_DIRTY;
10815 	} else {
10816 		upl->flags &= ~UPL_CLEAR_DIRTY;
10817 	}
10818 }
10819 
10820 void
upl_set_referenced(upl_t upl,boolean_t value)10821 upl_set_referenced(
10822 	upl_t           upl,
10823 	boolean_t       value)
10824 {
10825 	upl_lock(upl);
10826 	if (value) {
10827 		upl->ext_ref_count++;
10828 	} else {
10829 		if (!upl->ext_ref_count) {
10830 			panic("upl_set_referenced not %p", upl);
10831 		}
10832 		upl->ext_ref_count--;
10833 	}
10834 	upl_unlock(upl);
10835 }
10836 
10837 #if CONFIG_IOSCHED
10838 void
upl_set_blkno(upl_t upl,vm_offset_t upl_offset,int io_size,int64_t blkno)10839 upl_set_blkno(
10840 	upl_t           upl,
10841 	vm_offset_t     upl_offset,
10842 	int             io_size,
10843 	int64_t         blkno)
10844 {
10845 	int i, j;
10846 	if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
10847 		return;
10848 	}
10849 
10850 	assert(upl->upl_reprio_info != 0);
10851 	for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
10852 		UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
10853 	}
10854 }
10855 #endif
10856 
10857 void inline
memoryshot(unsigned int event,unsigned int control)10858 memoryshot(unsigned int event, unsigned int control)
10859 {
10860 	if (vm_debug_events) {
10861 		KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10862 		    vm_page_active_count, vm_page_inactive_count,
10863 		    vm_page_free_count, vm_page_speculative_count,
10864 		    vm_page_throttled_count);
10865 	} else {
10866 		(void) event;
10867 		(void) control;
10868 	}
10869 }
10870 
10871 #ifdef MACH_BSD
10872 
10873 boolean_t
upl_device_page(upl_page_info_t * upl)10874 upl_device_page(upl_page_info_t *upl)
10875 {
10876 	return UPL_DEVICE_PAGE(upl);
10877 }
10878 boolean_t
upl_page_present(upl_page_info_t * upl,int index)10879 upl_page_present(upl_page_info_t *upl, int index)
10880 {
10881 	return UPL_PAGE_PRESENT(upl, index);
10882 }
10883 boolean_t
upl_speculative_page(upl_page_info_t * upl,int index)10884 upl_speculative_page(upl_page_info_t *upl, int index)
10885 {
10886 	return UPL_SPECULATIVE_PAGE(upl, index);
10887 }
10888 boolean_t
upl_dirty_page(upl_page_info_t * upl,int index)10889 upl_dirty_page(upl_page_info_t *upl, int index)
10890 {
10891 	return UPL_DIRTY_PAGE(upl, index);
10892 }
10893 boolean_t
upl_valid_page(upl_page_info_t * upl,int index)10894 upl_valid_page(upl_page_info_t *upl, int index)
10895 {
10896 	return UPL_VALID_PAGE(upl, index);
10897 }
10898 ppnum_t
upl_phys_page(upl_page_info_t * upl,int index)10899 upl_phys_page(upl_page_info_t *upl, int index)
10900 {
10901 	return UPL_PHYS_PAGE(upl, index);
10902 }
10903 
10904 void
upl_page_set_mark(upl_page_info_t * upl,int index,boolean_t v)10905 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
10906 {
10907 	upl[index].mark = v;
10908 }
10909 
10910 boolean_t
upl_page_get_mark(upl_page_info_t * upl,int index)10911 upl_page_get_mark(upl_page_info_t *upl, int index)
10912 {
10913 	return upl[index].mark;
10914 }
10915 
10916 void
vm_countdirtypages(void)10917 vm_countdirtypages(void)
10918 {
10919 	vm_page_t m;
10920 	int dpages;
10921 	int pgopages;
10922 	int precpages;
10923 
10924 
10925 	dpages = 0;
10926 	pgopages = 0;
10927 	precpages = 0;
10928 
10929 	vm_page_lock_queues();
10930 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10931 	do {
10932 		if (m == (vm_page_t)0) {
10933 			break;
10934 		}
10935 
10936 		if (m->vmp_dirty) {
10937 			dpages++;
10938 		}
10939 		if (m->vmp_free_when_done) {
10940 			pgopages++;
10941 		}
10942 		if (m->vmp_precious) {
10943 			precpages++;
10944 		}
10945 
10946 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10947 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10948 		if (m == (vm_page_t)0) {
10949 			break;
10950 		}
10951 	} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
10952 	vm_page_unlock_queues();
10953 
10954 	vm_page_lock_queues();
10955 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
10956 	do {
10957 		if (m == (vm_page_t)0) {
10958 			break;
10959 		}
10960 
10961 		dpages++;
10962 		assert(m->vmp_dirty);
10963 		assert(!m->vmp_free_when_done);
10964 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10965 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10966 		if (m == (vm_page_t)0) {
10967 			break;
10968 		}
10969 	} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
10970 	vm_page_unlock_queues();
10971 
10972 	vm_page_lock_queues();
10973 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10974 	do {
10975 		if (m == (vm_page_t)0) {
10976 			break;
10977 		}
10978 
10979 		if (m->vmp_dirty) {
10980 			dpages++;
10981 		}
10982 		if (m->vmp_free_when_done) {
10983 			pgopages++;
10984 		}
10985 		if (m->vmp_precious) {
10986 			precpages++;
10987 		}
10988 
10989 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10990 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10991 		if (m == (vm_page_t)0) {
10992 			break;
10993 		}
10994 	} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
10995 	vm_page_unlock_queues();
10996 
10997 	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10998 
10999 	dpages = 0;
11000 	pgopages = 0;
11001 	precpages = 0;
11002 
11003 	vm_page_lock_queues();
11004 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
11005 
11006 	do {
11007 		if (m == (vm_page_t)0) {
11008 			break;
11009 		}
11010 		if (m->vmp_dirty) {
11011 			dpages++;
11012 		}
11013 		if (m->vmp_free_when_done) {
11014 			pgopages++;
11015 		}
11016 		if (m->vmp_precious) {
11017 			precpages++;
11018 		}
11019 
11020 		assert(VM_PAGE_OBJECT(m) != kernel_object);
11021 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
11022 		if (m == (vm_page_t)0) {
11023 			break;
11024 		}
11025 	} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
11026 	vm_page_unlock_queues();
11027 
11028 	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
11029 }
11030 #endif /* MACH_BSD */
11031 
11032 
11033 #if CONFIG_IOSCHED
11034 int
upl_get_cached_tier(upl_t upl)11035 upl_get_cached_tier(upl_t  upl)
11036 {
11037 	assert(upl);
11038 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
11039 		return upl->upl_priority;
11040 	}
11041 	return -1;
11042 }
11043 #endif /* CONFIG_IOSCHED */
11044 
11045 
11046 void
upl_callout_iodone(upl_t upl)11047 upl_callout_iodone(upl_t upl)
11048 {
11049 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
11050 
11051 	if (upl_ctx) {
11052 		void    (*iodone_func)(void *, int) = upl_ctx->io_done;
11053 
11054 		assert(upl_ctx->io_done);
11055 
11056 		(*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
11057 	}
11058 }
11059 
11060 void
upl_set_iodone(upl_t upl,void * upl_iodone)11061 upl_set_iodone(upl_t upl, void *upl_iodone)
11062 {
11063 	upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
11064 }
11065 
11066 void
upl_set_iodone_error(upl_t upl,int error)11067 upl_set_iodone_error(upl_t upl, int error)
11068 {
11069 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
11070 
11071 	if (upl_ctx) {
11072 		upl_ctx->io_error = error;
11073 	}
11074 }
11075 
11076 
11077 ppnum_t
upl_get_highest_page(upl_t upl)11078 upl_get_highest_page(
11079 	upl_t                      upl)
11080 {
11081 	return upl->highest_page;
11082 }
11083 
11084 upl_size_t
upl_get_size(upl_t upl)11085 upl_get_size(
11086 	upl_t                      upl)
11087 {
11088 	return upl_adjusted_size(upl, PAGE_MASK);
11089 }
11090 
11091 upl_size_t
upl_adjusted_size(upl_t upl,vm_map_offset_t pgmask)11092 upl_adjusted_size(
11093 	upl_t upl,
11094 	vm_map_offset_t pgmask)
11095 {
11096 	vm_object_offset_t start_offset, end_offset;
11097 
11098 	start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
11099 	end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
11100 
11101 	return (upl_size_t)(end_offset - start_offset);
11102 }
11103 
11104 vm_object_offset_t
upl_adjusted_offset(upl_t upl,vm_map_offset_t pgmask)11105 upl_adjusted_offset(
11106 	upl_t upl,
11107 	vm_map_offset_t pgmask)
11108 {
11109 	return trunc_page_mask_64(upl->u_offset, pgmask);
11110 }
11111 
11112 vm_object_offset_t
upl_get_data_offset(upl_t upl)11113 upl_get_data_offset(
11114 	upl_t upl)
11115 {
11116 	return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
11117 }
11118 
11119 upl_t
upl_associated_upl(upl_t upl)11120 upl_associated_upl(upl_t upl)
11121 {
11122 	return upl->associated_upl;
11123 }
11124 
11125 void
upl_set_associated_upl(upl_t upl,upl_t associated_upl)11126 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
11127 {
11128 	upl->associated_upl = associated_upl;
11129 }
11130 
11131 struct vnode *
upl_lookup_vnode(upl_t upl)11132 upl_lookup_vnode(upl_t upl)
11133 {
11134 	if (!upl->map_object->internal) {
11135 		return vnode_pager_lookup_vnode(upl->map_object->pager);
11136 	} else {
11137 		return NULL;
11138 	}
11139 }
11140 
11141 #if UPL_DEBUG
11142 kern_return_t
upl_ubc_alias_set(upl_t upl,uintptr_t alias1,uintptr_t alias2)11143 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
11144 {
11145 	upl->ubc_alias1 = alias1;
11146 	upl->ubc_alias2 = alias2;
11147 	return KERN_SUCCESS;
11148 }
11149 int
upl_ubc_alias_get(upl_t upl,uintptr_t * al,uintptr_t * al2)11150 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
11151 {
11152 	if (al) {
11153 		*al = upl->ubc_alias1;
11154 	}
11155 	if (al2) {
11156 		*al2 = upl->ubc_alias2;
11157 	}
11158 	return KERN_SUCCESS;
11159 }
11160 #endif /* UPL_DEBUG */
11161 
11162 #if VM_PRESSURE_EVENTS
11163 /*
11164  * Upward trajectory.
11165  */
11166 extern boolean_t vm_compressor_low_on_space(void);
11167 
11168 boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)11169 VM_PRESSURE_NORMAL_TO_WARNING(void)
11170 {
11171 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11172 		/* Available pages below our threshold */
11173 		if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11174 			/* No frozen processes to kill */
11175 			if (memorystatus_frozen_count == 0) {
11176 				/* Not enough suspended processes available. */
11177 				if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11178 					return TRUE;
11179 				}
11180 			}
11181 		}
11182 		return FALSE;
11183 	} else {
11184 		return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
11185 	}
11186 }
11187 
11188 boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)11189 VM_PRESSURE_WARNING_TO_CRITICAL(void)
11190 {
11191 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11192 		/* Available pages below our threshold */
11193 		if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11194 			return TRUE;
11195 		}
11196 		return FALSE;
11197 	} else {
11198 		return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11199 	}
11200 }
11201 
11202 /*
11203  * Downward trajectory.
11204  */
11205 boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)11206 VM_PRESSURE_WARNING_TO_NORMAL(void)
11207 {
11208 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11209 		/* Available pages above our threshold */
11210 		unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
11211 		if (memorystatus_available_pages > target_threshold) {
11212 			return TRUE;
11213 		}
11214 		return FALSE;
11215 	} else {
11216 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
11217 	}
11218 }
11219 
11220 boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)11221 VM_PRESSURE_CRITICAL_TO_WARNING(void)
11222 {
11223 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11224 		/* Available pages above our threshold */
11225 		unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
11226 		if (memorystatus_available_pages > target_threshold) {
11227 			return TRUE;
11228 		}
11229 		return FALSE;
11230 	} else {
11231 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11232 	}
11233 }
11234 #endif /* VM_PRESSURE_EVENTS */
11235 
11236 #if DEVELOPMENT || DEBUG
11237 bool compressor_running_perf_test;
11238 uint64_t compressor_perf_test_pages_processed;
11239 
11240 kern_return_t
11241 run_compressor_perf_test(
11242 	user_addr_t buf,
11243 	size_t buffer_size,
11244 	uint64_t *time,
11245 	uint64_t *bytes_compressed,
11246 	uint64_t *compressor_growth);
11247 
11248 static kern_return_t
move_pages_to_queue(vm_map_t map,user_addr_t start_addr,size_t buffer_size,vm_page_queue_head_t * queue,size_t * pages_moved)11249 move_pages_to_queue(
11250 	vm_map_t map,
11251 	user_addr_t start_addr,
11252 	size_t buffer_size,
11253 	vm_page_queue_head_t *queue,
11254 	size_t *pages_moved)
11255 {
11256 	kern_return_t err = KERN_SUCCESS;
11257 	vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
11258 	boolean_t addr_in_map = FALSE;
11259 	user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
11260 	vm_object_t curr_object = VM_OBJECT_NULL;
11261 	*pages_moved = 0;
11262 
11263 
11264 	if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
11265 		/*
11266 		 * We don't currently support benchmarking maps with a different page size
11267 		 * than the kernel.
11268 		 */
11269 		return KERN_INVALID_ARGUMENT;
11270 	}
11271 
11272 	if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
11273 		return KERN_INVALID_ARGUMENT;
11274 	}
11275 
11276 	vm_map_lock_read(map);
11277 	curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
11278 	end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
11279 
11280 
11281 	while (curr_addr < end_addr) {
11282 		addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
11283 		if (!addr_in_map) {
11284 			err = KERN_INVALID_ARGUMENT;
11285 			break;
11286 		}
11287 		curr_object = VME_OBJECT(curr_entry);
11288 		if (curr_object) {
11289 			vm_object_lock(curr_object);
11290 			/* We really only want anonymous memory that's in the top level map and object here. */
11291 			if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
11292 			    curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
11293 				err = KERN_INVALID_ARGUMENT;
11294 				vm_object_unlock(curr_object);
11295 				break;
11296 			}
11297 			vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
11298 			vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
11299 			    (curr_entry->vme_start + VME_OFFSET(curr_entry));
11300 			vm_map_offset_t curr_offset = start_offset;
11301 			vm_page_t curr_page;
11302 			while (curr_offset < end_offset) {
11303 				curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
11304 				if (curr_page != VM_PAGE_NULL) {
11305 					vm_page_lock_queues();
11306 					if (curr_page->vmp_laundry) {
11307 						vm_pageout_steal_laundry(curr_page, TRUE);
11308 					}
11309 					/*
11310 					 * we've already factored out pages in the laundry which
11311 					 * means this page can't be on the pageout queue so it's
11312 					 * safe to do the vm_page_queues_remove
11313 					 */
11314 					bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
11315 					vm_page_queues_remove(curr_page, TRUE);
11316 					if (donate) {
11317 						/*
11318 						 * The compressor needs to see this bit to know
11319 						 * where this page needs to land. Also if stolen,
11320 						 * this bit helps put the page back in the right
11321 						 * special queue where it belongs.
11322 						 */
11323 						curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
11324 					}
11325 					// Clear the referenced bit so we ensure this gets paged out
11326 					curr_page->vmp_reference = false;
11327 					if (curr_page->vmp_pmapped) {
11328 						pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
11329 						    VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
11330 					}
11331 					vm_page_queue_enter(queue, curr_page, vmp_pageq);
11332 					vm_page_unlock_queues();
11333 					*pages_moved += 1;
11334 				}
11335 				curr_offset += PAGE_SIZE_64;
11336 				curr_addr += PAGE_SIZE_64;
11337 			}
11338 		}
11339 		vm_object_unlock(curr_object);
11340 	}
11341 	vm_map_unlock_read(map);
11342 	return err;
11343 }
11344 
11345 /*
11346  * Local queue for processing benchmark pages.
11347  * Can't be allocated on the stack because the pointer has to
11348  * be packable.
11349  */
11350 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
/*
 * Compressor throughput benchmark (DEVELOPMENT || DEBUG only).
 *
 * Moves the caller's resident anonymous pages backing [buf, buf + buffer_size)
 * onto a private queue, pushes them through vm_pageout_page_queue(), and waits
 * until the compressor threads have processed at least that many pages.
 *
 * buf / buffer_size    user buffer whose pages are benchmarked
 * time                 out: elapsed wall time in nanoseconds
 * bytes_compressed     out: page_count * PAGE_SIZE_64 on success, else 0
 * compressor_growth    out: delta of c_segment_compressed_bytes across the run
 *
 * Returns KERN_NOT_SUPPORTED when no compressor is configured,
 * KERN_INVALID_ARGUMENT for the kernel task or unsupported mappings, and
 * KERN_RESOURCE_SHORTAGE when a benchmark is already in flight.
 */
kern_return_t
run_compressor_perf_test(
	user_addr_t buf,
	size_t buffer_size,
	uint64_t *time,
	uint64_t *bytes_compressed,
	uint64_t *compressor_growth)
{
	kern_return_t err = KERN_SUCCESS;
	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		/* Benchmark is meaningless without an active compressor. */
		return KERN_NOT_SUPPORTED;
	}
	if (current_task() == kernel_task) {
		/* Must be run from a user task owning the buffer. */
		return KERN_INVALID_ARGUMENT;
	}
	vm_page_lock_queues();
	if (compressor_running_perf_test) {
		/* Only run one instance of the benchmark at a time. */
		vm_page_unlock_queues();
		return KERN_RESOURCE_SHORTAGE;
	}
	/*
	 * NOTE(review): the flag is tested here but only set further down under a
	 * later lock hold -- two concurrent callers could both pass this check.
	 * Presumably callers are serialized by the sysctl path; confirm.
	 */
	vm_page_unlock_queues();
	size_t page_count = 0;
	vm_map_t map;
	vm_page_t p, next;
	uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
	uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
	*bytes_compressed = *compressor_growth = 0;

	/* Stage the target pages on the benchmark's private queue. */
	vm_page_queue_init(&compressor_perf_test_queue);
	map = current_task()->map;
	err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
	if (err != KERN_SUCCESS) {
		goto out;
	}

	vm_page_lock_queues();
	compressor_running_perf_test = true;
	compressor_perf_test_pages_processed = 0;
	/*
	 * At this point the compressor threads should only process the benchmark queue
	 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
	 * to determine how many compressed bytes we ended up using.
	 */
	compressed_bytes_start = c_segment_compressed_bytes;
	vm_page_unlock_queues();

	compressor_perf_test_start = mach_absolute_time();
	page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);

	vm_page_lock_queues();
	/*
	 * Depending on when this test is run we could overshoot or be right on the mark
	 * with our page_count. So the comparison is of the _less than_ variety.
	 */
	while (compressor_perf_test_pages_processed < page_count) {
		/* Sleep until a compressor thread bumps the processed count and wakes us. */
		assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
		vm_page_unlock_queues();
		thread_block(THREAD_CONTINUE_NULL);
		vm_page_lock_queues();
	}
	compressor_perf_test_end = mach_absolute_time();
	compressed_bytes_end = c_segment_compressed_bytes;
	vm_page_unlock_queues();


out:
	/*
	 * If we errored out above, then we could still have some pages
	 * on the local queue. Make sure to put them back on the active queue before
	 * returning so they're not orphaned.
	 */
	vm_page_lock_queues();
	/* On the error path both timestamps are still 0, so *time reports 0. */
	absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
	p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
	while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
		/* Capture the successor before re-queueing invalidates the link. */
		next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);

		vm_page_enqueue_active(p, FALSE);
		p = next;
	}

	compressor_running_perf_test = false;
	vm_page_unlock_queues();
	if (err == KERN_SUCCESS) {
		*bytes_compressed = page_count * PAGE_SIZE_64;
		*compressor_growth = compressed_bytes_end - compressed_bytes_start;
	}

	/*
	 * pageout_scan will consider waking the compactor swapper
	 * before it blocks. Do the same thing here before we return
	 * to ensure that back to back benchmark runs can't overly fragment the
	 * compressor pool.
	 */
	vm_consider_waking_compactor_swapper();
	return err;
}
11449 #endif /* DEVELOPMENT || DEBUG */
11450