xref: /xnu-8020.101.4/osfmk/vm/vm_pageout.c (revision e7776783b89a353188416a9a346c6cdb4928faad)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_pageout.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	The proverbial page-out daemon.
64  */
65 
66 #include <stdint.h>
67 #include <ptrauth.h>
68 
69 #include <debug.h>
70 #include <mach_pagemap.h>
71 #include <mach_cluster_stats.h>
72 
73 #include <mach/mach_types.h>
74 #include <mach/memory_object.h>
75 #include <mach/memory_object_default.h>
76 #include <mach/memory_object_control_server.h>
77 #include <mach/mach_host_server.h>
78 #include <mach/upl.h>
79 #include <mach/vm_map.h>
80 #include <mach/vm_param.h>
81 #include <mach/vm_statistics.h>
82 #include <mach/sdt.h>
83 
84 #include <kern/kern_types.h>
85 #include <kern/counter.h>
86 #include <kern/host_statistics.h>
87 #include <kern/machine.h>
88 #include <kern/misc_protos.h>
89 #include <kern/sched.h>
90 #include <kern/thread.h>
91 #include <kern/kalloc.h>
92 #include <kern/zalloc_internal.h>
93 #include <kern/policy_internal.h>
94 #include <kern/thread_group.h>
95 
96 #include <machine/vm_tuning.h>
97 #include <machine/commpage.h>
98 
99 #include <vm/pmap.h>
100 #include <vm/vm_compressor_pager.h>
101 #include <vm/vm_fault.h>
102 #include <vm/vm_map.h>
103 #include <vm/vm_object.h>
104 #include <vm/vm_page.h>
105 #include <vm/vm_pageout.h>
106 #include <vm/vm_protos.h> /* must be last */
107 #include <vm/memory_object.h>
108 #include <vm/vm_purgeable_internal.h>
109 #include <vm/vm_shared_region.h>
110 #include <vm/vm_compressor.h>
111 
112 #include <san/kasan.h>
113 
114 #if CONFIG_PHANTOM_CACHE
115 #include <vm/vm_phantom_cache.h>
116 #endif
117 
118 #if UPL_DEBUG
119 #include <libkern/OSDebug.h>
120 #endif
121 
122 extern int cs_debug;
123 
124 extern void mbuf_drain(boolean_t);
125 
126 #if VM_PRESSURE_EVENTS
127 #if CONFIG_JETSAM
128 extern unsigned int memorystatus_available_pages;
129 extern unsigned int memorystatus_available_pages_pressure;
130 extern unsigned int memorystatus_available_pages_critical;
131 #else /* CONFIG_JETSAM */
132 extern uint64_t memorystatus_available_pages;
133 extern uint64_t memorystatus_available_pages_pressure;
134 extern uint64_t memorystatus_available_pages_critical;
135 #endif /* CONFIG_JETSAM */
136 
137 extern unsigned int memorystatus_frozen_count;
138 extern unsigned int memorystatus_suspended_count;
139 extern vm_pressure_level_t memorystatus_vm_pressure_level;
140 
141 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
142 extern uint32_t memorystatus_jetsam_fg_band_waiters;
143 
144 void vm_pressure_response(void);
145 extern void consider_vm_pressure_events(void);
146 
147 #define MEMORYSTATUS_SUSPENDED_THRESHOLD  4
148 #endif /* VM_PRESSURE_EVENTS */
149 
150 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
151 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
152 boolean_t vps_dynamic_priority_enabled = FALSE;
153 
154 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
155 #if !XNU_TARGET_OS_OSX
156 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
157 #else /* !XNU_TARGET_OS_OSX */
158 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
159 #endif /* !XNU_TARGET_OS_OSX */
160 #endif
161 
162 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
163 #define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
164 #endif
165 
166 #ifndef VM_PAGE_LAUNDRY_MAX
167 #define VM_PAGE_LAUNDRY_MAX     128UL   /* maximum pageouts on a given pageout queue */
#endif  /* VM_PAGE_LAUNDRY_MAX */
169 
170 #ifndef VM_PAGEOUT_BURST_WAIT
171 #define VM_PAGEOUT_BURST_WAIT   1       /* milliseconds */
172 #endif  /* VM_PAGEOUT_BURST_WAIT */
173 
174 #ifndef VM_PAGEOUT_EMPTY_WAIT
175 #define VM_PAGEOUT_EMPTY_WAIT   50      /* milliseconds */
176 #endif  /* VM_PAGEOUT_EMPTY_WAIT */
177 
178 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
179 #define VM_PAGEOUT_DEADLOCK_WAIT 100    /* milliseconds */
180 #endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
181 
182 #ifndef VM_PAGEOUT_IDLE_WAIT
183 #define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
184 #endif  /* VM_PAGEOUT_IDLE_WAIT */
185 
186 #ifndef VM_PAGEOUT_SWAP_WAIT
187 #define VM_PAGEOUT_SWAP_WAIT    10      /* milliseconds */
188 #endif  /* VM_PAGEOUT_SWAP_WAIT */
189 
190 
191 #ifndef VM_PAGE_SPECULATIVE_TARGET
192 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
193 #endif /* VM_PAGE_SPECULATIVE_TARGET */
194 
195 
196 /*
197  *	To obtain a reasonable LRU approximation, the inactive queue
198  *	needs to be large enough to give pages on it a chance to be
199  *	referenced a second time.  This macro defines the fraction
200  *	of active+inactive pages that should be inactive.
201  *	The pageout daemon uses it to update vm_page_inactive_target.
202  *
203  *	If vm_page_free_count falls below vm_page_free_target and
204  *	vm_page_inactive_count is below vm_page_inactive_target,
205  *	then the pageout daemon starts running.
206  */
207 
208 #ifndef VM_PAGE_INACTIVE_TARGET
209 #define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
210 #endif  /* VM_PAGE_INACTIVE_TARGET */
211 
212 /*
213  *	Once the pageout daemon starts running, it keeps going
214  *	until vm_page_free_count meets or exceeds vm_page_free_target.
215  */
216 
217 #ifndef VM_PAGE_FREE_TARGET
218 #if !XNU_TARGET_OS_OSX
219 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
220 #else /* !XNU_TARGET_OS_OSX */
221 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
222 #endif /* !XNU_TARGET_OS_OSX */
223 #endif  /* VM_PAGE_FREE_TARGET */
224 
225 
226 /*
227  *	The pageout daemon always starts running once vm_page_free_count
228  *	falls below vm_page_free_min.
229  */
230 
231 #ifndef VM_PAGE_FREE_MIN
232 #if !XNU_TARGET_OS_OSX
233 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
234 #else /* !XNU_TARGET_OS_OSX */
235 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
236 #endif /* !XNU_TARGET_OS_OSX */
237 #endif  /* VM_PAGE_FREE_MIN */
238 
239 #if !XNU_TARGET_OS_OSX
240 #define VM_PAGE_FREE_RESERVED_LIMIT     100
241 #define VM_PAGE_FREE_MIN_LIMIT          1500
242 #define VM_PAGE_FREE_TARGET_LIMIT       2000
243 #else /* !XNU_TARGET_OS_OSX */
244 #define VM_PAGE_FREE_RESERVED_LIMIT     1700
245 #define VM_PAGE_FREE_MIN_LIMIT          3500
246 #define VM_PAGE_FREE_TARGET_LIMIT       4000
247 #endif /* !XNU_TARGET_OS_OSX */
248 
249 /*
250  *	When vm_page_free_count falls below vm_page_free_reserved,
251  *	only vm-privileged threads can allocate pages.  vm-privilege
252  *	allows the pageout daemon and default pager (and any other
253  *	associated threads needed for default pageout) to continue
254  *	operation by dipping into the reserved pool of pages.
255  */
256 
257 #ifndef VM_PAGE_FREE_RESERVED
258 #define VM_PAGE_FREE_RESERVED(n)        \
259 	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
260 #endif  /* VM_PAGE_FREE_RESERVED */
261 
262 /*
263  *	When we dequeue pages from the inactive list, they are
264  *	reactivated (ie, put back on the active queue) if referenced.
265  *	However, it is possible to starve the free list if other
266  *	processors are referencing pages faster than we can turn off
267  *	the referenced bit.  So we limit the number of reactivations
268  *	we will make per call of vm_pageout_scan().
269  */
270 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
271 
272 #ifndef VM_PAGE_REACTIVATE_LIMIT
273 #if !XNU_TARGET_OS_OSX
274 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
275 #else /* !XNU_TARGET_OS_OSX */
276 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
277 #endif /* !XNU_TARGET_OS_OSX */
278 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
279 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
280 
281 extern boolean_t hibernate_cleaning_in_progress;
282 
283 /*
284  * Forward declarations for internal routines.
285  */
286 struct cq {
287 	struct vm_pageout_queue *q;
288 	void                    *current_chead;
289 	char                    *scratch_buf;
290 	int                     id;
291 };
292 
293 struct cq ciq[MAX_COMPRESSOR_THREAD_COUNT];
294 
295 
296 #if VM_PRESSURE_EVENTS
297 void vm_pressure_thread(void);
298 
299 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
300 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
301 
302 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
303 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
304 #endif
305 
306 static void vm_pageout_iothread_external(void);
307 static void vm_pageout_iothread_internal(struct cq *cq);
308 static void vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *, boolean_t);
309 
310 extern void vm_pageout_continue(void);
311 extern void vm_pageout_scan(void);
312 
313 boolean_t vm_pageout_running = FALSE;
314 
315 uint32_t vm_page_upl_tainted = 0;
316 uint32_t vm_page_iopl_tainted = 0;
317 
318 #if XNU_TARGET_OS_OSX
319 static boolean_t vm_pageout_waiter  = FALSE;
320 #endif /* XNU_TARGET_OS_OSX */
321 
322 
323 #if DEVELOPMENT || DEBUG
324 struct vm_pageout_debug vm_pageout_debug;
325 #endif
326 struct vm_pageout_vminfo vm_pageout_vminfo;
327 struct vm_pageout_state  vm_pageout_state;
328 struct vm_config         vm_config;
329 
330 struct  vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
331 struct  vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
332 
333 int         vm_upl_wait_for_pages = 0;
334 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
335 
336 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
337 
338 int     vm_debug_events = 0;
339 
340 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
341 
342 #if CONFIG_MEMORYSTATUS
343 extern boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async);
344 
345 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
346 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
347 
348 #endif
349 
350 #if __AMP__
351 int vm_compressor_ebound = 1;
352 int vm_pgo_pbound = 0;
353 extern void thread_bind_cluster_type(thread_t, char, bool);
354 #endif /* __AMP__ */
355 
356 
357 /*
358  *	Routine:	vm_pageout_object_terminate
359  *	Purpose:
360  *		Destroy the pageout_object, and perform all of the
361  *		required cleanup actions.
362  *
363  *	In/Out conditions:
364  *		The object must be locked, and will be returned locked.
365  */
366 void
vm_pageout_object_terminate(vm_object_t object)367 vm_pageout_object_terminate(
368 	vm_object_t     object)
369 {
370 	vm_object_t     shadow_object;
371 
372 	/*
373 	 * Deal with the deallocation (last reference) of a pageout object
374 	 * (used for cleaning-in-place) by dropping the paging references/
375 	 * freeing pages in the original object.
376 	 */
377 
378 	assert(object->pageout);
379 	shadow_object = object->shadow;
380 	vm_object_lock(shadow_object);
381 
382 	while (!vm_page_queue_empty(&object->memq)) {
383 		vm_page_t               p, m;
384 		vm_object_offset_t      offset;
385 
386 		p = (vm_page_t) vm_page_queue_first(&object->memq);
387 
388 		assert(p->vmp_private);
389 		assert(p->vmp_free_when_done);
390 		p->vmp_free_when_done = FALSE;
391 		assert(!p->vmp_cleaning);
392 		assert(!p->vmp_laundry);
393 
394 		offset = p->vmp_offset;
395 		VM_PAGE_FREE(p);
396 		p = VM_PAGE_NULL;
397 
398 		m = vm_page_lookup(shadow_object,
399 		    offset + object->vo_shadow_offset);
400 
401 		if (m == VM_PAGE_NULL) {
402 			continue;
403 		}
404 
405 		assert((m->vmp_dirty) || (m->vmp_precious) ||
406 		    (m->vmp_busy && m->vmp_cleaning));
407 
408 		/*
409 		 * Handle the trusted pager throttle.
410 		 * Also decrement the burst throttle (if external).
411 		 */
412 		vm_page_lock_queues();
413 		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
414 			vm_pageout_throttle_up(m);
415 		}
416 
417 		/*
418 		 * Handle the "target" page(s). These pages are to be freed if
419 		 * successfully cleaned. Target pages are always busy, and are
420 		 * wired exactly once. The initial target pages are not mapped,
421 		 * (so cannot be referenced or modified) but converted target
422 		 * pages may have been modified between the selection as an
423 		 * adjacent page and conversion to a target.
424 		 */
425 		if (m->vmp_free_when_done) {
426 			assert(m->vmp_busy);
427 			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
428 			assert(m->vmp_wire_count == 1);
429 			m->vmp_cleaning = FALSE;
430 			m->vmp_free_when_done = FALSE;
431 			/*
432 			 * Revoke all access to the page. Since the object is
433 			 * locked, and the page is busy, this prevents the page
434 			 * from being dirtied after the pmap_disconnect() call
435 			 * returns.
436 			 *
437 			 * Since the page is left "dirty" but "not modifed", we
438 			 * can detect whether the page was redirtied during
439 			 * pageout by checking the modify state.
440 			 */
441 			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
442 				SET_PAGE_DIRTY(m, FALSE);
443 			} else {
444 				m->vmp_dirty = FALSE;
445 			}
446 
447 			if (m->vmp_dirty) {
448 				vm_page_unwire(m, TRUE);        /* reactivates */
449 				counter_inc(&vm_statistics_reactivations);
450 				PAGE_WAKEUP_DONE(m);
451 			} else {
452 				vm_page_free(m);  /* clears busy, etc. */
453 			}
454 			vm_page_unlock_queues();
455 			continue;
456 		}
457 		/*
458 		 * Handle the "adjacent" pages. These pages were cleaned in
459 		 * place, and should be left alone.
460 		 * If prep_pin_count is nonzero, then someone is using the
461 		 * page, so make it active.
462 		 */
463 		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
464 			if (m->vmp_reference) {
465 				vm_page_activate(m);
466 			} else {
467 				vm_page_deactivate(m);
468 			}
469 		}
470 		if (m->vmp_overwriting) {
471 			/*
472 			 * the (COPY_OUT_FROM == FALSE) request_page_list case
473 			 */
474 			if (m->vmp_busy) {
475 				/*
476 				 * We do not re-set m->vmp_dirty !
477 				 * The page was busy so no extraneous activity
478 				 * could have occurred. COPY_INTO is a read into the
479 				 * new pages. CLEAN_IN_PLACE does actually write
480 				 * out the pages but handling outside of this code
481 				 * will take care of resetting dirty. We clear the
482 				 * modify however for the Programmed I/O case.
483 				 */
484 				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
485 
486 				m->vmp_busy = FALSE;
487 				m->vmp_absent = FALSE;
488 			} else {
489 				/*
490 				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
491 				 * Occurs when the original page was wired
492 				 * at the time of the list request
493 				 */
494 				assert(VM_PAGE_WIRED(m));
495 				vm_page_unwire(m, TRUE);        /* reactivates */
496 			}
497 			m->vmp_overwriting = FALSE;
498 		} else {
499 			m->vmp_dirty = FALSE;
500 		}
501 		m->vmp_cleaning = FALSE;
502 
503 		/*
504 		 * Wakeup any thread waiting for the page to be un-cleaning.
505 		 */
506 		PAGE_WAKEUP(m);
507 		vm_page_unlock_queues();
508 	}
509 	/*
510 	 * Account for the paging reference taken in vm_paging_object_allocate.
511 	 */
512 	vm_object_activity_end(shadow_object);
513 	vm_object_unlock(shadow_object);
514 
515 	assert(object->ref_count == 0);
516 	assert(object->paging_in_progress == 0);
517 	assert(object->activity_in_progress == 0);
518 	assert(object->resident_page_count == 0);
519 	return;
520 }
521 
522 /*
523  * Routine:	vm_pageclean_setup
524  *
525  * Purpose:	setup a page to be cleaned (made non-dirty), but not
526  *		necessarily flushed from the VM page cache.
527  *		This is accomplished by cleaning in place.
528  *
529  *		The page must not be busy, and new_object
530  *		must be locked.
531  *
532  */
533 static void
vm_pageclean_setup(vm_page_t m,vm_page_t new_m,vm_object_t new_object,vm_object_offset_t new_offset)534 vm_pageclean_setup(
535 	vm_page_t               m,
536 	vm_page_t               new_m,
537 	vm_object_t             new_object,
538 	vm_object_offset_t      new_offset)
539 {
540 	assert(!m->vmp_busy);
541 #if 0
542 	assert(!m->vmp_cleaning);
543 #endif
544 
545 	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
546 
547 	/*
548 	 * Mark original page as cleaning in place.
549 	 */
550 	m->vmp_cleaning = TRUE;
551 	SET_PAGE_DIRTY(m, FALSE);
552 	m->vmp_precious = FALSE;
553 
554 	/*
555 	 * Convert the fictitious page to a private shadow of
556 	 * the real page.
557 	 */
558 	assert(new_m->vmp_fictitious);
559 	assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
560 	new_m->vmp_fictitious = FALSE;
561 	new_m->vmp_private = TRUE;
562 	new_m->vmp_free_when_done = TRUE;
563 	VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
564 
565 	vm_page_lockspin_queues();
566 	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
567 	vm_page_unlock_queues();
568 
569 	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
570 	assert(!new_m->vmp_wanted);
571 	new_m->vmp_busy = FALSE;
572 }
573 
574 /*
575  *	Routine:	vm_pageout_initialize_page
576  *	Purpose:
577  *		Causes the specified page to be initialized in
578  *		the appropriate memory object. This routine is used to push
579  *		pages into a copy-object when they are modified in the
580  *		permanent object.
581  *
582  *		The page is moved to a temporary object and paged out.
583  *
584  *	In/out conditions:
585  *		The page in question must not be on any pageout queues.
586  *		The object to which it belongs must be locked.
587  *		The page must be busy, but not hold a paging reference.
588  *
589  *	Implementation:
590  *		Move this page to a completely new object.
591  */
592 void
vm_pageout_initialize_page(vm_page_t m)593 vm_pageout_initialize_page(
594 	vm_page_t       m)
595 {
596 	vm_object_t             object;
597 	vm_object_offset_t      paging_offset;
598 	memory_object_t         pager;
599 
600 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
601 
602 	object = VM_PAGE_OBJECT(m);
603 
604 	assert(m->vmp_busy);
605 	assert(object->internal);
606 
607 	/*
608 	 *	Verify that we really want to clean this page
609 	 */
610 	assert(!m->vmp_absent);
611 	assert(!m->vmp_error);
612 	assert(m->vmp_dirty);
613 
614 	/*
615 	 *	Create a paging reference to let us play with the object.
616 	 */
617 	paging_offset = m->vmp_offset + object->paging_offset;
618 
619 	if (m->vmp_absent || m->vmp_error || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
620 		panic("reservation without pageout?"); /* alan */
621 
622 		VM_PAGE_FREE(m);
623 		vm_object_unlock(object);
624 
625 		return;
626 	}
627 
628 	/*
629 	 * If there's no pager, then we can't clean the page.  This should
630 	 * never happen since this should be a copy object and therefore not
631 	 * an external object, so the pager should always be there.
632 	 */
633 
634 	pager = object->pager;
635 
636 	if (pager == MEMORY_OBJECT_NULL) {
637 		panic("missing pager for copy object");
638 
639 		VM_PAGE_FREE(m);
640 		return;
641 	}
642 
643 	/*
644 	 * set the page for future call to vm_fault_list_request
645 	 */
646 	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
647 	SET_PAGE_DIRTY(m, FALSE);
648 
649 	/*
650 	 * keep the object from collapsing or terminating
651 	 */
652 	vm_object_paging_begin(object);
653 	vm_object_unlock(object);
654 
655 	/*
656 	 *	Write the data to its pager.
657 	 *	Note that the data is passed by naming the new object,
658 	 *	not a virtual address; the pager interface has been
659 	 *	manipulated to use the "internal memory" data type.
660 	 *	[The object reference from its allocation is donated
661 	 *	to the eventual recipient.]
662 	 */
663 	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
664 
665 	vm_object_lock(object);
666 	vm_object_paging_end(object);
667 }
668 
669 
670 /*
671  * vm_pageout_cluster:
672  *
673  * Given a page, queue it to the appropriate I/O thread,
674  * which will page it out and attempt to clean adjacent pages
675  * in the same operation.
676  *
677  * The object and queues must be locked. We will take a
678  * paging reference to prevent deallocation or collapse when we
679  * release the object lock back at the call site.  The I/O thread
680  * is responsible for consuming this reference
681  *
682  * The page must not be on any pageout queue.
683  */
684 #if DEVELOPMENT || DEBUG
685 vmct_stats_t vmct_stats;
686 
687 int32_t vmct_active = 0;
688 uint64_t vm_compressor_epoch_start = 0;
689 uint64_t vm_compressor_epoch_stop = 0;
690 
691 typedef enum vmct_state_t {
692 	VMCT_IDLE,
693 	VMCT_AWAKENED,
694 	VMCT_ACTIVE,
695 } vmct_state_t;
696 vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
697 #endif
698 
699 
700 void
vm_pageout_cluster(vm_page_t m)701 vm_pageout_cluster(vm_page_t m)
702 {
703 	vm_object_t     object = VM_PAGE_OBJECT(m);
704 	struct          vm_pageout_queue *q;
705 
706 	VM_PAGE_CHECK(m);
707 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
708 	vm_object_lock_assert_exclusive(object);
709 
710 	/*
711 	 * Only a certain kind of page is appreciated here.
712 	 */
713 	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
714 	assert(!m->vmp_cleaning && !m->vmp_laundry);
715 	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
716 
717 	/*
718 	 * protect the object from collapse or termination
719 	 */
720 	vm_object_activity_begin(object);
721 
722 	if (object->internal == TRUE) {
723 		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
724 
725 		m->vmp_busy = TRUE;
726 
727 		q = &vm_pageout_queue_internal;
728 	} else {
729 		q = &vm_pageout_queue_external;
730 	}
731 
732 	/*
733 	 * pgo_laundry count is tied to the laundry bit
734 	 */
735 	m->vmp_laundry = TRUE;
736 	q->pgo_laundry++;
737 
738 	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
739 	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
740 
741 	if (q->pgo_idle == TRUE) {
742 		q->pgo_idle = FALSE;
743 		thread_wakeup((event_t) &q->pgo_pending);
744 	}
745 	VM_PAGE_CHECK(m);
746 }
747 
748 
749 /*
750  * A page is back from laundry or we are stealing it back from
751  * the laundering state.  See if there are some pages waiting to
752  * go to laundry and if we can let some of them go now.
753  *
754  * Object and page queues must be locked.
755  */
756 void
vm_pageout_throttle_up(vm_page_t m)757 vm_pageout_throttle_up(
758 	vm_page_t       m)
759 {
760 	struct vm_pageout_queue *q;
761 	vm_object_t      m_object;
762 
763 	m_object = VM_PAGE_OBJECT(m);
764 
765 	assert(m_object != VM_OBJECT_NULL);
766 	assert(m_object != kernel_object);
767 
768 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
769 	vm_object_lock_assert_exclusive(m_object);
770 
771 	if (m_object->internal == TRUE) {
772 		q = &vm_pageout_queue_internal;
773 	} else {
774 		q = &vm_pageout_queue_external;
775 	}
776 
777 	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
778 		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
779 		m->vmp_q_state = VM_PAGE_NOT_ON_Q;
780 
781 		VM_PAGE_ZERO_PAGEQ_ENTRY(m);
782 
783 		vm_object_activity_end(m_object);
784 
785 		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
786 	}
787 	if (m->vmp_laundry == TRUE) {
788 		m->vmp_laundry = FALSE;
789 		q->pgo_laundry--;
790 
791 		if (q->pgo_throttled == TRUE) {
792 			q->pgo_throttled = FALSE;
793 			thread_wakeup((event_t) &q->pgo_laundry);
794 		}
795 		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
796 			q->pgo_draining = FALSE;
797 			thread_wakeup((event_t) (&q->pgo_laundry + 1));
798 		}
799 		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
800 	}
801 }
802 
803 
804 static void
vm_pageout_throttle_up_batch(struct vm_pageout_queue * q,int batch_cnt)805 vm_pageout_throttle_up_batch(
806 	struct vm_pageout_queue *q,
807 	int             batch_cnt)
808 {
809 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
810 
811 	VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
812 
813 	q->pgo_laundry -= batch_cnt;
814 
815 	if (q->pgo_throttled == TRUE) {
816 		q->pgo_throttled = FALSE;
817 		thread_wakeup((event_t) &q->pgo_laundry);
818 	}
819 	if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
820 		q->pgo_draining = FALSE;
821 		thread_wakeup((event_t) (&q->pgo_laundry + 1));
822 	}
823 }
824 
825 
826 
/*
 * VM memory pressure monitoring.
 *
 * vm_pageout_scan() keeps track of the number of pages it considers and
 * reclaims, in the currently active bucket,
 * vm_pageout_stats[vm_pageout_stat_now].
 *
 * record_memory_pressure() is called periodically from compute_averages()
 * (apparently eight times per second, given that vm_pageout_stats[] is
 * sized at eight buckets per second) and moves "vm_pageout_stat_now"
 * forward, to start accumulating the number of reclaimed pages in a new
 * vm_pageout_stats[] bucket.
 *
 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
 * The caller provides the number of seconds ("nsecs") worth of statistics
 * it wants, up to 30 seconds (the history depth is set by
 * VM_PAGEOUT_STAT_SIZE, which is smaller on non-DEVELOPMENT/DEBUG kernels).
 * It computes the number of pages reclaimed in the past "nsecs" seconds and
 * also returns the number of pages the system still needs to reclaim at this
 * moment in time.
 */
/*
 * Number of buckets in the vm_pageout_stats[] ring: eight buckets per
 * second of history (30 seconds on DEVELOPMENT/DEBUG kernels, 1 second
 * otherwise) plus one for the bucket currently being filled.
 *
 * The replacement list is fully parenthesized so that the macro expands
 * correctly inside any expression (e.g. "x % VM_PAGEOUT_STAT_SIZE").
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif

/*
 * One bucket of pageout statistics.  The counters are grouped as laid out
 * below; the explicit initializer supplies one zero per member.
 */
struct vm_pageout_stat {
	/* page queue population */
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	/* allocation / free activity */
	unsigned int pages_grabbed;
	unsigned int pages_freed;

	/* compressor activity */
	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	/* pages looked at by the scan */
	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	/* pages freed, by origin (summed by record_memory_pressure()) */
	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, };

/* index of the bucket currently being filled */
unsigned int vm_pageout_stat_now = 0;

/* ring-buffer neighbours of bucket "i" (note: "i" is evaluated twice) */
#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
908 
909 #if VM_PAGE_BUCKETS_CHECK
910 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
911 #endif /* VM_PAGE_BUCKETS_CHECK */
912 
913 
914 void
915 record_memory_pressure(void);
916 void
record_memory_pressure(void)917 record_memory_pressure(void)
918 {
919 	unsigned int vm_pageout_next;
920 
921 #if VM_PAGE_BUCKETS_CHECK
922 	/* check the consistency of VM page buckets at regular interval */
923 	static int counter = 0;
924 	if ((++counter % vm_page_buckets_check_interval) == 0) {
925 		vm_page_buckets_check();
926 	}
927 #endif /* VM_PAGE_BUCKETS_CHECK */
928 
929 	vm_pageout_state.vm_memory_pressure =
930 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
931 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
932 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
933 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
934 
935 	commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
936 
937 	/* move "now" forward */
938 	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
939 
940 	bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
941 
942 	vm_pageout_stat_now = vm_pageout_next;
943 }
944 
945 
946 /*
947  * IMPORTANT
948  * mach_vm_ctl_page_free_wanted() is called indirectly, via
949  * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
950  * it must be safe in the restricted stackshot context. Locks and/or
951  * blocking are not allowable.
952  */
953 unsigned int
mach_vm_ctl_page_free_wanted(void)954 mach_vm_ctl_page_free_wanted(void)
955 {
956 	unsigned int page_free_target, page_free_count, page_free_wanted;
957 
958 	page_free_target = vm_page_free_target;
959 	page_free_count = vm_page_free_count;
960 	if (page_free_target > page_free_count) {
961 		page_free_wanted = page_free_target - page_free_count;
962 	} else {
963 		page_free_wanted = 0;
964 	}
965 
966 	return page_free_wanted;
967 }
968 
969 
/*
 * IMPORTANT:
 * mach_vm_pressure_monitor() is called when taking a stackshot, with
 * wait_for_pressure FALSE, so that code path must remain safe in the
 * restricted stackshot context. No blocking or locks are allowable
 * on that code path.
 */
977 
978 kern_return_t
mach_vm_pressure_monitor(boolean_t wait_for_pressure,unsigned int nsecs_monitored,unsigned int * pages_reclaimed_p,unsigned int * pages_wanted_p)979 mach_vm_pressure_monitor(
980 	boolean_t       wait_for_pressure,
981 	unsigned int    nsecs_monitored,
982 	unsigned int    *pages_reclaimed_p,
983 	unsigned int    *pages_wanted_p)
984 {
985 	wait_result_t   wr;
986 	unsigned int    vm_pageout_then, vm_pageout_now;
987 	unsigned int    pages_reclaimed;
988 	unsigned int    units_of_monitor;
989 
990 	units_of_monitor = 8 * nsecs_monitored;
991 	/*
992 	 * We don't take the vm_page_queue_lock here because we don't want
993 	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
994 	 * thread when it's trying to reclaim memory.  We don't need fully
995 	 * accurate monitoring anyway...
996 	 */
997 
998 	if (wait_for_pressure) {
999 		/* wait until there's memory pressure */
1000 		while (vm_page_free_count >= vm_page_free_target) {
1001 			wr = assert_wait((event_t) &vm_page_free_wanted,
1002 			    THREAD_INTERRUPTIBLE);
1003 			if (wr == THREAD_WAITING) {
1004 				wr = thread_block(THREAD_CONTINUE_NULL);
1005 			}
1006 			if (wr == THREAD_INTERRUPTED) {
1007 				return KERN_ABORTED;
1008 			}
1009 			if (wr == THREAD_AWAKENED) {
1010 				/*
1011 				 * The memory pressure might have already
1012 				 * been relieved but let's not block again
1013 				 * and let's report that there was memory
1014 				 * pressure at some point.
1015 				 */
1016 				break;
1017 			}
1018 		}
1019 	}
1020 
1021 	/* provide the number of pages the system wants to reclaim */
1022 	if (pages_wanted_p != NULL) {
1023 		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
1024 	}
1025 
1026 	if (pages_reclaimed_p == NULL) {
1027 		return KERN_SUCCESS;
1028 	}
1029 
1030 	/* provide number of pages reclaimed in the last "nsecs_monitored" */
1031 	vm_pageout_now = vm_pageout_stat_now;
1032 	pages_reclaimed = 0;
1033 	for (vm_pageout_then =
1034 	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1035 	    vm_pageout_then != vm_pageout_now &&
1036 	    units_of_monitor-- != 0;
1037 	    vm_pageout_then =
1038 	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1039 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1040 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1041 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1042 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1043 	}
1044 	*pages_reclaimed_p = pages_reclaimed;
1045 
1046 	return KERN_SUCCESS;
1047 }
1048 
1049 
1050 
1051 #if DEVELOPMENT || DEBUG
1052 
1053 static void
1054 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1055 
1056 /*
1057  * condition variable used to make sure there is
1058  * only a single sweep going on at a time
1059  */
1060 boolean_t       vm_pageout_disconnect_all_pages_active = FALSE;
1061 
1062 
1063 void
vm_pageout_disconnect_all_pages()1064 vm_pageout_disconnect_all_pages()
1065 {
1066 	vm_page_lock_queues();
1067 
1068 	if (vm_pageout_disconnect_all_pages_active == TRUE) {
1069 		vm_page_unlock_queues();
1070 		return;
1071 	}
1072 	vm_pageout_disconnect_all_pages_active = TRUE;
1073 	vm_page_unlock_queues();
1074 
1075 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1076 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1077 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);
1078 
1079 	vm_pageout_disconnect_all_pages_active = FALSE;
1080 }
1081 
1082 
1083 void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t * q,int qcount)1084 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1085 {
1086 	vm_page_t       m;
1087 	vm_object_t     t_object = NULL;
1088 	vm_object_t     l_object = NULL;
1089 	vm_object_t     m_object = NULL;
1090 	int             delayed_unlock = 0;
1091 	int             try_failed_count = 0;
1092 	int             disconnected_count = 0;
1093 	int             paused_count = 0;
1094 	int             object_locked_count = 0;
1095 
1096 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
1097 	    q, qcount, 0, 0, 0);
1098 
1099 	vm_page_lock_queues();
1100 
1101 	while (qcount && !vm_page_queue_empty(q)) {
1102 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1103 
1104 		m = (vm_page_t) vm_page_queue_first(q);
1105 		m_object = VM_PAGE_OBJECT(m);
1106 
1107 		/*
1108 		 * check to see if we currently are working
1109 		 * with the same object... if so, we've
1110 		 * already got the lock
1111 		 */
1112 		if (m_object != l_object) {
1113 			/*
1114 			 * the object associated with candidate page is
1115 			 * different from the one we were just working
1116 			 * with... dump the lock if we still own it
1117 			 */
1118 			if (l_object != NULL) {
1119 				vm_object_unlock(l_object);
1120 				l_object = NULL;
1121 			}
1122 			if (m_object != t_object) {
1123 				try_failed_count = 0;
1124 			}
1125 
1126 			/*
1127 			 * Try to lock object; since we've alread got the
1128 			 * page queues lock, we can only 'try' for this one.
1129 			 * if the 'try' fails, we need to do a mutex_pause
1130 			 * to allow the owner of the object lock a chance to
1131 			 * run...
1132 			 */
1133 			if (!vm_object_lock_try_scan(m_object)) {
1134 				if (try_failed_count > 20) {
1135 					goto reenter_pg_on_q;
1136 				}
1137 				vm_page_unlock_queues();
1138 				mutex_pause(try_failed_count++);
1139 				vm_page_lock_queues();
1140 				delayed_unlock = 0;
1141 
1142 				paused_count++;
1143 
1144 				t_object = m_object;
1145 				continue;
1146 			}
1147 			object_locked_count++;
1148 
1149 			l_object = m_object;
1150 		}
1151 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error || m->vmp_free_when_done) {
1152 			/*
1153 			 * put it back on the head of its queue
1154 			 */
1155 			goto reenter_pg_on_q;
1156 		}
1157 		if (m->vmp_pmapped == TRUE) {
1158 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1159 
1160 			disconnected_count++;
1161 		}
1162 reenter_pg_on_q:
1163 		vm_page_queue_remove(q, m, vmp_pageq);
1164 		vm_page_queue_enter(q, m, vmp_pageq);
1165 
1166 		qcount--;
1167 		try_failed_count = 0;
1168 
1169 		if (delayed_unlock++ > 128) {
1170 			if (l_object != NULL) {
1171 				vm_object_unlock(l_object);
1172 				l_object = NULL;
1173 			}
1174 			lck_mtx_yield(&vm_page_queue_lock);
1175 			delayed_unlock = 0;
1176 		}
1177 	}
1178 	if (l_object != NULL) {
1179 		vm_object_unlock(l_object);
1180 		l_object = NULL;
1181 	}
1182 	vm_page_unlock_queues();
1183 
1184 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
1185 	    q, disconnected_count, object_locked_count, paused_count, 0);
1186 }
1187 
1188 #endif
1189 
1190 
1191 static void
1192 vm_pageout_page_queue(vm_page_queue_head_t *, int);
1193 
1194 /*
1195  * condition variable used to make sure there is
1196  * only a single sweep going on at a time
1197  */
1198 boolean_t       vm_pageout_anonymous_pages_active = FALSE;
1199 
1200 
1201 void
vm_pageout_anonymous_pages()1202 vm_pageout_anonymous_pages()
1203 {
1204 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1205 		vm_page_lock_queues();
1206 
1207 		if (vm_pageout_anonymous_pages_active == TRUE) {
1208 			vm_page_unlock_queues();
1209 			return;
1210 		}
1211 		vm_pageout_anonymous_pages_active = TRUE;
1212 		vm_page_unlock_queues();
1213 
1214 		vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1215 		vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1216 		vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count);
1217 
1218 		if (VM_CONFIG_SWAP_IS_PRESENT) {
1219 			vm_consider_swapping();
1220 		}
1221 
1222 		vm_page_lock_queues();
1223 		vm_pageout_anonymous_pages_active = FALSE;
1224 		vm_page_unlock_queues();
1225 	}
1226 }
1227 
1228 
1229 void
vm_pageout_page_queue(vm_page_queue_head_t * q,int qcount)1230 vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
1231 {
1232 	vm_page_t       m;
1233 	vm_object_t     t_object = NULL;
1234 	vm_object_t     l_object = NULL;
1235 	vm_object_t     m_object = NULL;
1236 	int             delayed_unlock = 0;
1237 	int             try_failed_count = 0;
1238 	int             refmod_state;
1239 	int             pmap_options;
1240 	struct          vm_pageout_queue *iq;
1241 	ppnum_t         phys_page;
1242 
1243 
1244 	iq = &vm_pageout_queue_internal;
1245 
1246 	vm_page_lock_queues();
1247 
1248 	while (qcount && !vm_page_queue_empty(q)) {
1249 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1250 
1251 		if (VM_PAGE_Q_THROTTLED(iq)) {
1252 			if (l_object != NULL) {
1253 				vm_object_unlock(l_object);
1254 				l_object = NULL;
1255 			}
1256 			iq->pgo_draining = TRUE;
1257 
1258 			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1259 			vm_page_unlock_queues();
1260 
1261 			thread_block(THREAD_CONTINUE_NULL);
1262 
1263 			vm_page_lock_queues();
1264 			delayed_unlock = 0;
1265 			continue;
1266 		}
1267 		m = (vm_page_t) vm_page_queue_first(q);
1268 		m_object = VM_PAGE_OBJECT(m);
1269 
1270 		/*
1271 		 * check to see if we currently are working
1272 		 * with the same object... if so, we've
1273 		 * already got the lock
1274 		 */
1275 		if (m_object != l_object) {
1276 			if (!m_object->internal) {
1277 				goto reenter_pg_on_q;
1278 			}
1279 
1280 			/*
1281 			 * the object associated with candidate page is
1282 			 * different from the one we were just working
1283 			 * with... dump the lock if we still own it
1284 			 */
1285 			if (l_object != NULL) {
1286 				vm_object_unlock(l_object);
1287 				l_object = NULL;
1288 			}
1289 			if (m_object != t_object) {
1290 				try_failed_count = 0;
1291 			}
1292 
1293 			/*
1294 			 * Try to lock object; since we've alread got the
1295 			 * page queues lock, we can only 'try' for this one.
1296 			 * if the 'try' fails, we need to do a mutex_pause
1297 			 * to allow the owner of the object lock a chance to
1298 			 * run...
1299 			 */
1300 			if (!vm_object_lock_try_scan(m_object)) {
1301 				if (try_failed_count > 20) {
1302 					goto reenter_pg_on_q;
1303 				}
1304 				vm_page_unlock_queues();
1305 				mutex_pause(try_failed_count++);
1306 				vm_page_lock_queues();
1307 				delayed_unlock = 0;
1308 
1309 				t_object = m_object;
1310 				continue;
1311 			}
1312 			l_object = m_object;
1313 		}
1314 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error || m->vmp_free_when_done) {
1315 			/*
1316 			 * page is not to be cleaned
1317 			 * put it back on the head of its queue
1318 			 */
1319 			goto reenter_pg_on_q;
1320 		}
1321 		phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1322 
1323 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
1324 			refmod_state = pmap_get_refmod(phys_page);
1325 
1326 			if (refmod_state & VM_MEM_REFERENCED) {
1327 				m->vmp_reference = TRUE;
1328 			}
1329 			if (refmod_state & VM_MEM_MODIFIED) {
1330 				SET_PAGE_DIRTY(m, FALSE);
1331 			}
1332 		}
1333 		if (m->vmp_reference == TRUE) {
1334 			m->vmp_reference = FALSE;
1335 			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1336 			goto reenter_pg_on_q;
1337 		}
1338 		if (m->vmp_pmapped == TRUE) {
1339 			if (m->vmp_dirty || m->vmp_precious) {
1340 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
1341 			} else {
1342 				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1343 			}
1344 			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1345 			if (refmod_state & VM_MEM_MODIFIED) {
1346 				SET_PAGE_DIRTY(m, FALSE);
1347 			}
1348 		}
1349 
1350 		if (!m->vmp_dirty && !m->vmp_precious) {
1351 			vm_page_unlock_queues();
1352 			VM_PAGE_FREE(m);
1353 			vm_page_lock_queues();
1354 			delayed_unlock = 0;
1355 
1356 			goto next_pg;
1357 		}
1358 		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1359 			if (!m_object->pager_initialized) {
1360 				vm_page_unlock_queues();
1361 
1362 				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1363 
1364 				if (!m_object->pager_initialized) {
1365 					vm_object_compressor_pager_create(m_object);
1366 				}
1367 
1368 				vm_page_lock_queues();
1369 				delayed_unlock = 0;
1370 			}
1371 			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1372 				goto reenter_pg_on_q;
1373 			}
1374 			/*
1375 			 * vm_object_compressor_pager_create will drop the object lock
1376 			 * which means 'm' may no longer be valid to use
1377 			 */
1378 			continue;
1379 		}
1380 		/*
1381 		 * we've already factored out pages in the laundry which
1382 		 * means this page can't be on the pageout queue so it's
1383 		 * safe to do the vm_page_queues_remove
1384 		 */
1385 		vm_page_queues_remove(m, TRUE);
1386 
1387 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1388 
1389 		vm_pageout_cluster(m);
1390 
1391 		goto next_pg;
1392 
1393 reenter_pg_on_q:
1394 		vm_page_queue_remove(q, m, vmp_pageq);
1395 		vm_page_queue_enter(q, m, vmp_pageq);
1396 next_pg:
1397 		qcount--;
1398 		try_failed_count = 0;
1399 
1400 		if (delayed_unlock++ > 128) {
1401 			if (l_object != NULL) {
1402 				vm_object_unlock(l_object);
1403 				l_object = NULL;
1404 			}
1405 			lck_mtx_yield(&vm_page_queue_lock);
1406 			delayed_unlock = 0;
1407 		}
1408 	}
1409 	if (l_object != NULL) {
1410 		vm_object_unlock(l_object);
1411 		l_object = NULL;
1412 	}
1413 	vm_page_unlock_queues();
1414 }
1415 
1416 
1417 
1418 /*
1419  * function in BSD to apply I/O throttle to the pageout thread
1420  */
1421 extern void vm_pageout_io_throttle(void);
1422 
1423 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
1424 	MACRO_BEGIN                                                     \
1425 	/* \
1426 	 * If a "reusable" page somehow made it back into \
1427 	 * the active queue, it's been re-used and is not \
1428 	 * quite re-usable. \
1429 	 * If the VM object was "all_reusable", consider it \
1430 	 * as "all re-used" instead of converting it to \
1431 	 * "partially re-used", which could be expensive. \
1432 	 */                                                             \
1433 	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
1434 	if ((m)->vmp_reusable ||                                        \
1435 	    (obj)->all_reusable) {                                      \
1436 	        vm_object_reuse_pages((obj),                            \
1437 	                              (m)->vmp_offset,                  \
1438 	                              (m)->vmp_offset + PAGE_SIZE_64,   \
1439 	                              FALSE);                           \
1440 	}                                                               \
1441 	MACRO_END
1442 
1443 
1444 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT         64
1445 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024
1446 
1447 #define FCS_IDLE                0
1448 #define FCS_DELAYED             1
1449 #define FCS_DEADLOCK_DETECTED   2
1450 
1451 struct flow_control {
1452 	int             state;
1453 	mach_timespec_t ts;
1454 };
1455 
1456 
1457 #if CONFIG_BACKGROUND_QUEUE
1458 uint64_t vm_pageout_rejected_bq_internal = 0;
1459 uint64_t vm_pageout_rejected_bq_external = 0;
1460 uint64_t vm_pageout_skipped_bq_internal = 0;
1461 #endif
1462 
1463 #define ANONS_GRABBED_LIMIT     2
1464 
1465 
1466 #if 0
1467 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1468 #endif
1469 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1470 
1471 #define VM_PAGEOUT_PB_NO_ACTION                         0
1472 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1473 #define VM_PAGEOUT_PB_THREAD_YIELD                      2
1474 
1475 
1476 #if 0
1477 static void
1478 vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
1479 {
1480 	if (*local_freeq) {
1481 		vm_page_unlock_queues();
1482 
1483 		VM_DEBUG_CONSTANT_EVENT(
1484 			vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1485 			vm_page_free_count, 0, 0, 1);
1486 
1487 		vm_page_free_list(*local_freeq, TRUE);
1488 
1489 		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1490 		    vm_page_free_count, *local_freed, 0, 1);
1491 
1492 		*local_freeq = NULL;
1493 		*local_freed = 0;
1494 
1495 		vm_page_lock_queues();
1496 	} else {
1497 		lck_mtx_yield(&vm_page_queue_lock);
1498 	}
1499 	*delayed_unlock = 1;
1500 }
1501 #endif
1502 
1503 
1504 static void
vm_pageout_prepare_to_block(vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int action)1505 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1506     vm_page_t *local_freeq, int *local_freed, int action)
1507 {
1508 	vm_page_unlock_queues();
1509 
1510 	if (*object != NULL) {
1511 		vm_object_unlock(*object);
1512 		*object = NULL;
1513 	}
1514 	if (*local_freeq) {
1515 		vm_page_free_list(*local_freeq, TRUE);
1516 
1517 		*local_freeq = NULL;
1518 		*local_freed = 0;
1519 	}
1520 	*delayed_unlock = 1;
1521 
1522 	switch (action) {
1523 	case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1524 		vm_consider_waking_compactor_swapper();
1525 		break;
1526 	case VM_PAGEOUT_PB_THREAD_YIELD:
1527 		thread_yield_internal(1);
1528 		break;
1529 	case VM_PAGEOUT_PB_NO_ACTION:
1530 	default:
1531 		break;
1532 	}
1533 	vm_page_lock_queues();
1534 }
1535 
1536 
1537 static struct vm_pageout_vminfo last;
1538 
1539 uint64_t last_vm_page_pages_grabbed = 0;
1540 
1541 extern  uint32_t c_segment_pages_compressed;
1542 
1543 extern uint64_t shared_region_pager_reclaimed;
1544 extern struct memory_object_pager_ops shared_region_pager_ops;
1545 
1546 void
update_vm_info(void)1547 update_vm_info(void)
1548 {
1549 	unsigned long tmp;
1550 	uint64_t tmp64;
1551 
1552 	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1553 	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1554 	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1555 	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1556 
1557 	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1558 	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1559 	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1560 
1561 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1562 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1563 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1564 	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1565 
1566 
1567 	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1568 	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1569 	last.vm_pageout_considered_page = tmp;
1570 
1571 	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1572 	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1573 	last.vm_pageout_compressions = tmp64;
1574 
1575 	tmp = vm_pageout_vminfo.vm_compressor_failed;
1576 	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1577 	last.vm_compressor_failed = tmp;
1578 
1579 	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1580 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1581 	last.vm_compressor_pages_grabbed = tmp64;
1582 
1583 	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1584 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1585 	last.vm_phantom_cache_found_ghost = tmp;
1586 
1587 	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1588 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1589 	last.vm_phantom_cache_added_ghost = tmp;
1590 
1591 	tmp64 = counter_load(&vm_page_grab_count);
1592 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1593 	last_vm_page_pages_grabbed = tmp64;
1594 
1595 	tmp = vm_pageout_vminfo.vm_page_pages_freed;
1596 	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1597 	last.vm_page_pages_freed = tmp;
1598 
1599 
1600 	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1601 		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1602 		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1603 		last.vm_pageout_pages_evicted = tmp;
1604 
1605 		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1606 		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1607 		last.vm_pageout_pages_purged = tmp;
1608 
1609 		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1610 		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1611 		last.vm_pageout_freed_speculative = tmp;
1612 
1613 		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1614 		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1615 		last.vm_pageout_freed_external = tmp;
1616 
1617 		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1618 		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1619 		last.vm_pageout_inactive_referenced = tmp;
1620 
1621 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1622 		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1623 		last.vm_pageout_scan_inactive_throttled_external = tmp;
1624 
1625 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1626 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1627 		last.vm_pageout_inactive_dirty_external = tmp;
1628 
1629 		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1630 		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1631 		last.vm_pageout_freed_cleaned = tmp;
1632 
1633 		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1634 		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1635 		last.vm_pageout_inactive_nolock = tmp;
1636 
1637 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1638 		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1639 		last.vm_pageout_scan_inactive_throttled_internal = tmp;
1640 
1641 		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1642 		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1643 		last.vm_pageout_skipped_external = tmp;
1644 
1645 		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1646 		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1647 		last.vm_pageout_skipped_internal = tmp;
1648 
1649 		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1650 		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1651 		last.vm_pageout_reactivation_limit_exceeded = tmp;
1652 
1653 		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1654 		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1655 		last.vm_pageout_inactive_force_reclaim = tmp;
1656 
1657 		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1658 		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1659 		last.vm_pageout_freed_internal = tmp;
1660 
1661 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1662 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1663 		last.vm_pageout_considered_bq_internal = tmp;
1664 
1665 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1666 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1667 		last.vm_pageout_considered_bq_external = tmp;
1668 
1669 		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1670 		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1671 		last.vm_pageout_filecache_min_reactivated = tmp;
1672 
1673 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1674 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1675 		last.vm_pageout_inactive_dirty_internal = tmp;
1676 	}
1677 
1678 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
1679 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1680 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1681 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1682 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
1683 	    0);
1684 
1685 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
1686 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1687 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1688 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
1689 	    0,
1690 	    0);
1691 
1692 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
1693 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1694 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1695 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1696 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
1697 	    0);
1698 
1699 	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1700 	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1701 	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1702 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
1703 		    vm_pageout_stats[vm_pageout_stat_now].considered,
1704 		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1705 		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
1706 		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
1707 		    0);
1708 
1709 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
1710 		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1711 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1712 		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1713 		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
1714 		    0);
1715 
1716 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
1717 		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1718 		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1719 		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1720 		    vm_pageout_stats[vm_pageout_stat_now].skipped_external,
1721 		    0);
1722 
1723 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
1724 		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1725 		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1726 		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1727 		    vm_pageout_stats[vm_pageout_stat_now].freed_internal,
1728 		    0);
1729 
1730 		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
1731 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1732 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1733 		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1734 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
1735 		    0);
1736 	}
1737 	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
1738 	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1739 	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1740 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1741 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
1742 	    0);
1743 
1744 	record_memory_pressure();
1745 }
1746 
1747 extern boolean_t hibernation_vmqueues_inspection;
1748 
1749 /*
1750  * Return values for functions called by vm_pageout_scan
1751  * that control its flow.
1752  *
1753  * PROCEED -- vm_pageout_scan will keep making forward progress.
1754  * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1755  * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1756  */
1757 
1758 #define VM_PAGEOUT_SCAN_PROCEED                 (0)
1759 #define VM_PAGEOUT_SCAN_DONE_RETURN             (1)
1760 #define VM_PAGEOUT_SCAN_NEXT_ITERATION          (2)
1761 
1762 /*
1763  * This function is called only from vm_pageout_scan and
1764  * it moves overflow secluded pages (one-at-a-time) to the
1765  * batched 'local' free Q or active Q.
1766  */
1767 static void
vps_deal_with_secluded_page_overflow(vm_page_t * local_freeq,int * local_freed)1768 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1769 {
1770 #if CONFIG_SECLUDED_MEMORY
1771 	/*
1772 	 * Deal with secluded_q overflow.
1773 	 */
1774 	if (vm_page_secluded_count > vm_page_secluded_target) {
1775 		vm_page_t secluded_page;
1776 
1777 		/*
1778 		 * SECLUDED_AGING_BEFORE_ACTIVE:
1779 		 * Excess secluded pages go to the active queue and
1780 		 * will later go to the inactive queue.
1781 		 */
1782 		assert((vm_page_secluded_count_free +
1783 		    vm_page_secluded_count_inuse) ==
1784 		    vm_page_secluded_count);
1785 		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1786 		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1787 
1788 		vm_page_queues_remove(secluded_page, FALSE);
1789 		assert(!secluded_page->vmp_fictitious);
1790 		assert(!VM_PAGE_WIRED(secluded_page));
1791 
1792 		if (secluded_page->vmp_object == 0) {
1793 			/* transfer to free queue */
1794 			assert(secluded_page->vmp_busy);
1795 			secluded_page->vmp_snext = *local_freeq;
1796 			*local_freeq = secluded_page;
1797 			*local_freed += 1;
1798 		} else {
1799 			/* transfer to head of active queue */
1800 			vm_page_enqueue_active(secluded_page, FALSE);
1801 			secluded_page = VM_PAGE_NULL;
1802 		}
1803 	}
1804 #else /* CONFIG_SECLUDED_MEMORY */
1805 
1806 #pragma unused(local_freeq)
1807 #pragma unused(local_freed)
1808 
1809 	return;
1810 
1811 #endif /* CONFIG_SECLUDED_MEMORY */
1812 }
1813 
1814 /*
1815  * This function is called only from vm_pageout_scan and
1816  * it initializes the loop targets for vm_pageout_scan().
1817  */
1818 static void
vps_init_page_targets(void)1819 vps_init_page_targets(void)
1820 {
1821 	/*
1822 	 * LD TODO: Other page targets should be calculated here too.
1823 	 */
1824 	vm_page_anonymous_min = vm_page_inactive_target / 20;
1825 
1826 	if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1827 		vm_pageout_state.vm_page_speculative_percentage = 50;
1828 	} else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1829 		vm_pageout_state.vm_page_speculative_percentage = 1;
1830 	}
1831 
1832 	vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1833 	    vm_page_inactive_count);
1834 }
1835 
1836 /*
1837  * This function is called only from vm_pageout_scan and
1838  * it purges a single VM object at-a-time and will either
1839  * make vm_pageout_scan() restart the loop or keeping moving forward.
1840  */
1841 static int
vps_purge_object()1842 vps_purge_object()
1843 {
1844 	int             force_purge;
1845 
1846 	assert(available_for_purge >= 0);
1847 	force_purge = 0; /* no force-purging */
1848 
1849 #if VM_PRESSURE_EVENTS
1850 	vm_pressure_level_t pressure_level;
1851 
1852 	pressure_level = memorystatus_vm_pressure_level;
1853 
1854 	if (pressure_level > kVMPressureNormal) {
1855 		if (pressure_level >= kVMPressureCritical) {
1856 			force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1857 		} else if (pressure_level >= kVMPressureUrgent) {
1858 			force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
1859 		} else if (pressure_level >= kVMPressureWarning) {
1860 			force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1861 		}
1862 	}
1863 #endif /* VM_PRESSURE_EVENTS */
1864 
1865 	if (available_for_purge || force_purge) {
1866 		memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
1867 
1868 		VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1869 		if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
1870 			VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
1871 			VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1872 			memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1873 
1874 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1875 		}
1876 		VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
1877 		memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1878 	}
1879 
1880 	return VM_PAGEOUT_SCAN_PROCEED;
1881 }
1882 
1883 /*
1884  * This function is called only from vm_pageout_scan and
1885  * it will try to age the next speculative Q if the oldest
1886  * one is empty.
1887  */
1888 static int
vps_age_speculative_queue(boolean_t force_speculative_aging)1889 vps_age_speculative_queue(boolean_t force_speculative_aging)
1890 {
1891 #define DELAY_SPECULATIVE_AGE   1000
1892 
1893 	/*
1894 	 * try to pull pages from the aging bins...
1895 	 * see vm_page.h for an explanation of how
1896 	 * this mechanism works
1897 	 */
1898 	boolean_t                       can_steal = FALSE;
1899 	int                             num_scanned_queues;
1900 	static int                      delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/
1901 	mach_timespec_t                 ts;
1902 	struct vm_speculative_age_q     *aq;
1903 	struct vm_speculative_age_q     *sq;
1904 
1905 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1906 
1907 	aq = &vm_page_queue_speculative[speculative_steal_index];
1908 
1909 	num_scanned_queues = 0;
1910 	while (vm_page_queue_empty(&aq->age_q) &&
1911 	    num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1912 		speculative_steal_index++;
1913 
1914 		if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1915 			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1916 		}
1917 
1918 		aq = &vm_page_queue_speculative[speculative_steal_index];
1919 	}
1920 
1921 	if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
1922 		/*
1923 		 * XXX We've scanned all the speculative
1924 		 * queues but still haven't found one
1925 		 * that is not empty, even though
1926 		 * vm_page_speculative_count is not 0.
1927 		 */
1928 		if (!vm_page_queue_empty(&sq->age_q)) {
1929 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1930 		}
1931 #if DEVELOPMENT || DEBUG
1932 		panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
1933 #endif
1934 		/* readjust... */
1935 		vm_page_speculative_count = 0;
1936 		/* ... and continue */
1937 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1938 	}
1939 
1940 	if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
1941 		can_steal = TRUE;
1942 	} else {
1943 		if (!delay_speculative_age) {
1944 			mach_timespec_t ts_fully_aged;
1945 
1946 			ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
1947 			ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
1948 			    * 1000 * NSEC_PER_USEC;
1949 
1950 			ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1951 
1952 			clock_sec_t sec;
1953 			clock_nsec_t nsec;
1954 			clock_get_system_nanotime(&sec, &nsec);
1955 			ts.tv_sec = (unsigned int) sec;
1956 			ts.tv_nsec = nsec;
1957 
1958 			if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
1959 				can_steal = TRUE;
1960 			} else {
1961 				delay_speculative_age++;
1962 			}
1963 		} else {
1964 			delay_speculative_age++;
1965 			if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
1966 				delay_speculative_age = 0;
1967 			}
1968 		}
1969 	}
1970 	if (can_steal == TRUE) {
1971 		vm_page_speculate_ageit(aq);
1972 	}
1973 
1974 	return VM_PAGEOUT_SCAN_PROCEED;
1975 }
1976 
1977 /*
1978  * This function is called only from vm_pageout_scan and
1979  * it evicts a single VM object from the cache.
1980  */
1981 static int inline
vps_object_cache_evict(vm_object_t * object_to_unlock)1982 vps_object_cache_evict(vm_object_t *object_to_unlock)
1983 {
1984 	static int                      cache_evict_throttle = 0;
1985 	struct vm_speculative_age_q     *sq;
1986 
1987 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1988 
1989 	if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
1990 		int     pages_evicted;
1991 
1992 		if (*object_to_unlock != NULL) {
1993 			vm_object_unlock(*object_to_unlock);
1994 			*object_to_unlock = NULL;
1995 		}
1996 		KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
1997 
1998 		pages_evicted = vm_object_cache_evict(100, 10);
1999 
2000 		KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
2001 
2002 		if (pages_evicted) {
2003 			vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2004 
2005 			VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2006 			    vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2007 			memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2008 
2009 			/*
2010 			 * we just freed up to 100 pages,
2011 			 * so go back to the top of the main loop
2012 			 * and re-evaulate the memory situation
2013 			 */
2014 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2015 		} else {
2016 			cache_evict_throttle = 1000;
2017 		}
2018 	}
2019 	if (cache_evict_throttle) {
2020 		cache_evict_throttle--;
2021 	}
2022 
2023 	return VM_PAGEOUT_SCAN_PROCEED;
2024 }
2025 
2026 
2027 /*
2028  * This function is called only from vm_pageout_scan and
2029  * it calculates the filecache min. that needs to be maintained
2030  * as we start to steal pages.
2031  */
2032 static void
vps_calculate_filecache_min(void)2033 vps_calculate_filecache_min(void)
2034 {
2035 	int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2036 
2037 #if CONFIG_JETSAM
2038 	/*
2039 	 * don't let the filecache_min fall below 15% of available memory
2040 	 * on systems with an active compressor that isn't nearing its
2041 	 * limits w/r to accepting new data
2042 	 *
2043 	 * on systems w/o the compressor/swapper, the filecache is always
2044 	 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2045 	 * since most (if not all) of the anonymous pages are in the
2046 	 * throttled queue (which isn't counted as available) which
2047 	 * effectively disables this filter
2048 	 */
2049 	if (vm_compressor_low_on_space() || divisor == 0) {
2050 		vm_pageout_state.vm_page_filecache_min = 0;
2051 	} else {
2052 		vm_pageout_state.vm_page_filecache_min =
2053 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2054 	}
2055 #else
2056 	if (vm_compressor_out_of_space() || divisor == 0) {
2057 		vm_pageout_state.vm_page_filecache_min = 0;
2058 	} else {
2059 		/*
2060 		 * don't let the filecache_min fall below the specified critical level
2061 		 */
2062 		vm_pageout_state.vm_page_filecache_min =
2063 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2064 	}
2065 #endif
2066 	if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2067 		vm_pageout_state.vm_page_filecache_min = 0;
2068 	}
2069 }
2070 
2071 /*
2072  * This function is called only from vm_pageout_scan and
2073  * it updates the flow control time to detect if VM pageoutscan
2074  * isn't making progress.
2075  */
2076 static void
vps_flow_control_reset_deadlock_timer(struct flow_control * flow_control)2077 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2078 {
2079 	mach_timespec_t ts;
2080 	clock_sec_t sec;
2081 	clock_nsec_t nsec;
2082 
2083 	ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2084 	ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2085 	clock_get_system_nanotime(&sec, &nsec);
2086 	flow_control->ts.tv_sec = (unsigned int) sec;
2087 	flow_control->ts.tv_nsec = nsec;
2088 	ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2089 
2090 	flow_control->state = FCS_DELAYED;
2091 
2092 	vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2093 }
2094 
2095 /*
2096  * This function is called only from vm_pageout_scan and
2097  * it is the flow control logic of VM pageout scan which
2098  * controls if it should block and for how long.
2099  * Any blocking of vm_pageout_scan happens ONLY in this function.
2100  */
2101 static int
vps_flow_control(struct flow_control * flow_control,int * anons_grabbed,vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int * vm_pageout_deadlock_target,unsigned int inactive_burst_count)2102 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2103     vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2104 {
2105 	boolean_t       exceeded_burst_throttle = FALSE;
2106 	unsigned int    msecs = 0;
2107 	uint32_t        inactive_external_count;
2108 	mach_timespec_t ts;
2109 	struct  vm_pageout_queue *iq;
2110 	struct  vm_pageout_queue *eq;
2111 	struct  vm_speculative_age_q *sq;
2112 
2113 	iq = &vm_pageout_queue_internal;
2114 	eq = &vm_pageout_queue_external;
2115 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2116 
2117 	/*
2118 	 * Sometimes we have to pause:
2119 	 *	1) No inactive pages - nothing to do.
2120 	 *	2) Loop control - no acceptable pages found on the inactive queue
2121 	 *         within the last vm_pageout_burst_inactive_throttle iterations
2122 	 *	3) Flow control - default pageout queue is full
2123 	 */
2124 	if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2125 	    vm_page_queue_empty(&vm_page_queue_anonymous) &&
2126 	    vm_page_queue_empty(&vm_page_queue_cleaned) &&
2127 	    vm_page_queue_empty(&sq->age_q)) {
2128 		VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2129 		msecs = vm_pageout_state.vm_pageout_empty_wait;
2130 	} else if (inactive_burst_count >=
2131 	    MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2132 	    (vm_page_inactive_count +
2133 	    vm_page_speculative_count))) {
2134 		VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2135 		msecs = vm_pageout_state.vm_pageout_burst_wait;
2136 
2137 		exceeded_burst_throttle = TRUE;
2138 	} else if (VM_PAGE_Q_THROTTLED(iq) &&
2139 	    VM_DYNAMIC_PAGING_ENABLED()) {
2140 		clock_sec_t sec;
2141 		clock_nsec_t nsec;
2142 
2143 		switch (flow_control->state) {
2144 		case FCS_IDLE:
2145 			if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2146 			    vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2147 				/*
2148 				 * since the compressor is running independently of vm_pageout_scan
2149 				 * let's not wait for it just yet... as long as we have a healthy supply
2150 				 * of filecache pages to work with, let's keep stealing those.
2151 				 */
2152 				inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2153 
2154 				if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2155 				    (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2156 					*anons_grabbed = ANONS_GRABBED_LIMIT;
2157 					VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2158 					return VM_PAGEOUT_SCAN_PROCEED;
2159 				}
2160 			}
2161 
2162 			vps_flow_control_reset_deadlock_timer(flow_control);
2163 			msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2164 
2165 			break;
2166 
2167 		case FCS_DELAYED:
2168 			clock_get_system_nanotime(&sec, &nsec);
2169 			ts.tv_sec = (unsigned int) sec;
2170 			ts.tv_nsec = nsec;
2171 
2172 			if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2173 				/*
2174 				 * the pageout thread for the default pager is potentially
2175 				 * deadlocked since the
2176 				 * default pager queue has been throttled for more than the
2177 				 * allowable time... we need to move some clean pages or dirty
2178 				 * pages belonging to the external pagers if they aren't throttled
2179 				 * vm_page_free_wanted represents the number of threads currently
2180 				 * blocked waiting for pages... we'll move one page for each of
2181 				 * these plus a fixed amount to break the logjam... once we're done
2182 				 * moving this number of pages, we'll re-enter the FSC_DELAYED state
2183 				 * with a new timeout target since we have no way of knowing
2184 				 * whether we've broken the deadlock except through observation
2185 				 * of the queue associated with the default pager... we need to
2186 				 * stop moving pages and allow the system to run to see what
2187 				 * state it settles into.
2188 				 */
2189 
2190 				*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2191 				    vm_page_free_wanted + vm_page_free_wanted_privileged;
2192 				VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2193 				flow_control->state = FCS_DEADLOCK_DETECTED;
2194 				thread_wakeup(VM_PAGEOUT_GC_EVENT);
2195 				return VM_PAGEOUT_SCAN_PROCEED;
2196 			}
2197 			/*
2198 			 * just resniff instead of trying
2199 			 * to compute a new delay time... we're going to be
2200 			 * awakened immediately upon a laundry completion,
2201 			 * so we won't wait any longer than necessary
2202 			 */
2203 			msecs = vm_pageout_state.vm_pageout_idle_wait;
2204 			break;
2205 
2206 		case FCS_DEADLOCK_DETECTED:
2207 			if (*vm_pageout_deadlock_target) {
2208 				return VM_PAGEOUT_SCAN_PROCEED;
2209 			}
2210 
2211 			vps_flow_control_reset_deadlock_timer(flow_control);
2212 			msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2213 
2214 			break;
2215 		}
2216 	} else {
2217 		/*
2218 		 * No need to pause...
2219 		 */
2220 		return VM_PAGEOUT_SCAN_PROCEED;
2221 	}
2222 
2223 	vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2224 
2225 	vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2226 	    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2227 
2228 	if (vm_page_free_count >= vm_page_free_target) {
2229 		/*
2230 		 * we're here because
2231 		 *  1) someone else freed up some pages while we had
2232 		 *     the queues unlocked above
2233 		 * and we've hit one of the 3 conditions that
2234 		 * cause us to pause the pageout scan thread
2235 		 *
2236 		 * since we already have enough free pages,
2237 		 * let's avoid stalling and return normally
2238 		 *
2239 		 * before we return, make sure the pageout I/O threads
2240 		 * are running throttled in case there are still requests
2241 		 * in the laundry... since we have enough free pages
2242 		 * we don't need the laundry to be cleaned in a timely
2243 		 * fashion... so let's avoid interfering with foreground
2244 		 * activity
2245 		 *
2246 		 * we don't want to hold vm_page_queue_free_lock when
2247 		 * calling vm_pageout_adjust_eq_iothrottle (since it
2248 		 * may cause other locks to be taken), we do the intitial
2249 		 * check outside of the lock.  Once we take the lock,
2250 		 * we recheck the condition since it may have changed.
2251 		 * if it has, no problem, we will make the threads
2252 		 * non-throttled before actually blocking
2253 		 */
2254 		vm_pageout_adjust_eq_iothrottle(eq, TRUE);
2255 	}
2256 	lck_mtx_lock(&vm_page_queue_free_lock);
2257 
2258 	if (vm_page_free_count >= vm_page_free_target &&
2259 	    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2260 		return VM_PAGEOUT_SCAN_DONE_RETURN;
2261 	}
2262 	lck_mtx_unlock(&vm_page_queue_free_lock);
2263 
2264 	if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2265 		/*
2266 		 * we're most likely about to block due to one of
2267 		 * the 3 conditions that cause vm_pageout_scan to
2268 		 * not be able to make forward progress w/r
2269 		 * to providing new pages to the free queue,
2270 		 * so unthrottle the I/O threads in case we
2271 		 * have laundry to be cleaned... it needs
2272 		 * to be completed ASAP.
2273 		 *
2274 		 * even if we don't block, we want the io threads
2275 		 * running unthrottled since the sum of free +
2276 		 * clean pages is still under our free target
2277 		 */
2278 		vm_pageout_adjust_eq_iothrottle(eq, FALSE);
2279 	}
2280 	if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2281 		/*
2282 		 * if we get here we're below our free target and
2283 		 * we're stalling due to a full laundry queue or
2284 		 * we don't have any inactive pages other then
2285 		 * those in the clean queue...
2286 		 * however, we have pages on the clean queue that
2287 		 * can be moved to the free queue, so let's not
2288 		 * stall the pageout scan
2289 		 */
2290 		flow_control->state = FCS_IDLE;
2291 		return VM_PAGEOUT_SCAN_PROCEED;
2292 	}
2293 	if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2294 		flow_control->state = FCS_IDLE;
2295 		return VM_PAGEOUT_SCAN_PROCEED;
2296 	}
2297 
2298 	VM_CHECK_MEMORYSTATUS;
2299 
2300 	if (flow_control->state != FCS_IDLE) {
2301 		VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2302 	}
2303 
2304 	iq->pgo_throttled = TRUE;
2305 	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2306 
2307 	vm_page_unlock_queues();
2308 
2309 	assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2310 
2311 	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2312 	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2313 	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2314 
2315 	thread_block(THREAD_CONTINUE_NULL);
2316 
2317 	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2318 	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2319 	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2320 
2321 	vm_page_lock_queues();
2322 
2323 	iq->pgo_throttled = FALSE;
2324 
2325 	vps_init_page_targets();
2326 
2327 	return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2328 }
2329 
2330 /*
2331  * This function is called only from vm_pageout_scan and
2332  * it will find and return the most appropriate page to be
2333  * reclaimed.
2334  */
2335 static int
vps_choose_victim_page(vm_page_t * victim_page,int * anons_grabbed,boolean_t * grab_anonymous,boolean_t force_anonymous,boolean_t * is_page_from_bg_q,unsigned int * reactivated_this_call)2336 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2337     boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2338 {
2339 	vm_page_t                       m = NULL;
2340 	vm_object_t                     m_object = VM_OBJECT_NULL;
2341 	uint32_t                        inactive_external_count;
2342 	struct vm_speculative_age_q     *sq;
2343 	struct vm_pageout_queue         *iq;
2344 	int                             retval = VM_PAGEOUT_SCAN_PROCEED;
2345 
2346 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2347 	iq = &vm_pageout_queue_internal;
2348 
2349 	*is_page_from_bg_q = FALSE;
2350 
2351 	m = NULL;
2352 	m_object = VM_OBJECT_NULL;
2353 
2354 	if (VM_DYNAMIC_PAGING_ENABLED()) {
2355 		assert(vm_page_throttled_count == 0);
2356 		assert(vm_page_queue_empty(&vm_page_queue_throttled));
2357 	}
2358 
2359 	/*
2360 	 * Try for a clean-queue inactive page.
2361 	 * These are pages that vm_pageout_scan tried to steal earlier, but
2362 	 * were dirty and had to be cleaned.  Pick them up now that they are clean.
2363 	 */
2364 	if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2365 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2366 
2367 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2368 
2369 		goto found_page;
2370 	}
2371 
2372 	/*
2373 	 * The next most eligible pages are ones we paged in speculatively,
2374 	 * but which have not yet been touched and have been aged out.
2375 	 */
2376 	if (!vm_page_queue_empty(&sq->age_q)) {
2377 		m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2378 
2379 		assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2380 
2381 		if (!m->vmp_dirty || force_anonymous == FALSE) {
2382 			goto found_page;
2383 		} else {
2384 			m = NULL;
2385 		}
2386 	}
2387 
2388 #if CONFIG_BACKGROUND_QUEUE
2389 	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2390 		vm_object_t     bg_m_object = NULL;
2391 
2392 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2393 
2394 		bg_m_object = VM_PAGE_OBJECT(m);
2395 
2396 		if (!VM_PAGE_PAGEABLE(m)) {
2397 			/*
2398 			 * This page is on the background queue
2399 			 * but not on a pageable queue.  This is
2400 			 * likely a transient state and whoever
2401 			 * took it out of its pageable queue
2402 			 * will likely put it back on a pageable
2403 			 * queue soon but we can't deal with it
2404 			 * at this point, so let's ignore this
2405 			 * page.
2406 			 */
2407 		} else if (force_anonymous == FALSE || bg_m_object->internal) {
2408 			if (bg_m_object->internal &&
2409 			    (VM_PAGE_Q_THROTTLED(iq) ||
2410 			    vm_compressor_out_of_space() == TRUE ||
2411 			    vm_page_free_count < (vm_page_free_reserved / 4))) {
2412 				vm_pageout_skipped_bq_internal++;
2413 			} else {
2414 				*is_page_from_bg_q = TRUE;
2415 
2416 				if (bg_m_object->internal) {
2417 					vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2418 				} else {
2419 					vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2420 				}
2421 				goto found_page;
2422 			}
2423 		}
2424 	}
2425 #endif /* CONFIG_BACKGROUND_QUEUE */
2426 
2427 	inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2428 
2429 	if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2430 	    (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2431 		*grab_anonymous = TRUE;
2432 		*anons_grabbed = 0;
2433 
2434 		if (VM_CONFIG_SWAP_IS_ACTIVE) {
2435 			vm_pageout_vminfo.vm_pageout_skipped_external++;
2436 		} else {
2437 			if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2438 				/*
2439 				 * No swap and we are in dangerously low levels of free memory.
2440 				 * If we keep going ahead with anonymous pages, we are going to run into a situation
2441 				 * where the compressor will be stuck waiting for free pages (if it isn't already).
2442 				 *
2443 				 * So, pick a file backed page...
2444 				 */
2445 				*grab_anonymous = FALSE;
2446 				*anons_grabbed = ANONS_GRABBED_LIMIT;
2447 				vm_pageout_vminfo.vm_pageout_skipped_internal++;
2448 			}
2449 		}
2450 		goto want_anonymous;
2451 	}
2452 	*grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2453 
2454 #if CONFIG_JETSAM
2455 	/* If the file-backed pool has accumulated
2456 	 * significantly more pages than the jetsam
2457 	 * threshold, prefer to reclaim those
2458 	 * inline to minimise compute overhead of reclaiming
2459 	 * anonymous pages.
2460 	 * This calculation does not account for the CPU local
2461 	 * external page queues, as those are expected to be
2462 	 * much smaller relative to the global pools.
2463 	 */
2464 
2465 	struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2466 
2467 	if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2468 		if (vm_page_pageable_external_count >
2469 		    vm_pageout_state.vm_page_filecache_min) {
2470 			if ((vm_page_pageable_external_count *
2471 			    vm_pageout_memorystatus_fb_factor_dr) >
2472 			    (memorystatus_available_pages_critical *
2473 			    vm_pageout_memorystatus_fb_factor_nr)) {
2474 				*grab_anonymous = FALSE;
2475 
2476 				VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2477 			}
2478 		}
2479 		if (*grab_anonymous) {
2480 			VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2481 		}
2482 	}
2483 #endif /* CONFIG_JETSAM */
2484 
2485 want_anonymous:
2486 	if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2487 		if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2488 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2489 
2490 			assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2491 			*anons_grabbed = 0;
2492 
2493 			if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2494 				if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2495 					if ((++(*reactivated_this_call) % 100)) {
2496 						vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2497 
2498 						vm_page_activate(m);
2499 						counter_inc(&vm_statistics_reactivations);
2500 #if CONFIG_BACKGROUND_QUEUE
2501 #if DEVELOPMENT || DEBUG
2502 						if (*is_page_from_bg_q == TRUE) {
2503 							if (m_object->internal) {
2504 								vm_pageout_rejected_bq_internal++;
2505 							} else {
2506 								vm_pageout_rejected_bq_external++;
2507 							}
2508 						}
2509 #endif /* DEVELOPMENT || DEBUG */
2510 #endif /* CONFIG_BACKGROUND_QUEUE */
2511 						vm_pageout_state.vm_pageout_inactive_used++;
2512 
2513 						m = NULL;
2514 						retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2515 
2516 						goto found_page;
2517 					}
2518 
2519 					/*
2520 					 * steal 1 of the file backed pages even if
2521 					 * we are under the limit that has been set
2522 					 * for a healthy filecache
2523 					 */
2524 				}
2525 			}
2526 			goto found_page;
2527 		}
2528 	}
2529 	if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2530 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2531 
2532 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2533 		*anons_grabbed += 1;
2534 
2535 		goto found_page;
2536 	}
2537 
2538 	m = NULL;
2539 
2540 found_page:
2541 	*victim_page = m;
2542 
2543 	return retval;
2544 }
2545 
2546 /*
2547  * This function is called only from vm_pageout_scan and
2548  * it will put a page back on the active/inactive queue
2549  * if we can't reclaim it for some reason.
2550  */
2551 static void
vps_requeue_page(vm_page_t m,int page_prev_q_state,__unused boolean_t page_from_bg_q)2552 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2553 {
2554 	if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2555 		vm_page_enqueue_inactive(m, FALSE);
2556 	} else {
2557 		vm_page_activate(m);
2558 	}
2559 
2560 #if CONFIG_BACKGROUND_QUEUE
2561 #if DEVELOPMENT || DEBUG
2562 	vm_object_t m_object = VM_PAGE_OBJECT(m);
2563 
2564 	if (page_from_bg_q == TRUE) {
2565 		if (m_object->internal) {
2566 			vm_pageout_rejected_bq_internal++;
2567 		} else {
2568 			vm_pageout_rejected_bq_external++;
2569 		}
2570 	}
2571 #endif /* DEVELOPMENT || DEBUG */
2572 #endif /* CONFIG_BACKGROUND_QUEUE */
2573 }
2574 
2575 /*
2576  * This function is called only from vm_pageout_scan and
2577  * it will try to grab the victim page's VM object (m_object)
2578  * which differs from the previous victim page's object (object).
2579  */
2580 static int
vps_switch_object(vm_page_t m,vm_object_t m_object,vm_object_t * object,int page_prev_q_state,boolean_t avoid_anon_pages,boolean_t page_from_bg_q)2581 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2582 {
2583 	struct vm_speculative_age_q *sq;
2584 
2585 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2586 
2587 	/*
2588 	 * the object associated with candidate page is
2589 	 * different from the one we were just working
2590 	 * with... dump the lock if we still own it
2591 	 */
2592 	if (*object != NULL) {
2593 		vm_object_unlock(*object);
2594 		*object = NULL;
2595 	}
2596 	/*
2597 	 * Try to lock object; since we've alread got the
2598 	 * page queues lock, we can only 'try' for this one.
2599 	 * if the 'try' fails, we need to do a mutex_pause
2600 	 * to allow the owner of the object lock a chance to
2601 	 * run... otherwise, we're likely to trip over this
2602 	 * object in the same state as we work our way through
2603 	 * the queue... clumps of pages associated with the same
2604 	 * object are fairly typical on the inactive and active queues
2605 	 */
2606 	if (!vm_object_lock_try_scan(m_object)) {
2607 		vm_page_t m_want = NULL;
2608 
2609 		vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2610 
2611 		if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2612 			VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2613 		}
2614 
2615 		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2616 
2617 		m->vmp_reference = FALSE;
2618 
2619 		if (!m_object->object_is_shared_cache) {
2620 			/*
2621 			 * don't apply this optimization if this is the shared cache
2622 			 * object, it's too easy to get rid of very hot and important
2623 			 * pages...
2624 			 * m->vmp_object must be stable since we hold the page queues lock...
2625 			 * we can update the scan_collisions field sans the object lock
2626 			 * since it is a separate field and this is the only spot that does
2627 			 * a read-modify-write operation and it is never executed concurrently...
2628 			 * we can asynchronously set this field to 0 when creating a UPL, so it
2629 			 * is possible for the value to be a bit non-determistic, but that's ok
2630 			 * since it's only used as a hint
2631 			 */
2632 			m_object->scan_collisions = 1;
2633 		}
2634 		if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2635 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2636 		} else if (!vm_page_queue_empty(&sq->age_q)) {
2637 			m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2638 		} else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2639 		    !vm_page_queue_empty(&vm_page_queue_inactive)) {
2640 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2641 		} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2642 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2643 		}
2644 
2645 		/*
2646 		 * this is the next object we're going to be interested in
2647 		 * try to make sure its available after the mutex_pause
2648 		 * returns control
2649 		 */
2650 		if (m_want) {
2651 			vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2652 		}
2653 
2654 		vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2655 
2656 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2657 	} else {
2658 		*object = m_object;
2659 		vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2660 	}
2661 
2662 	return VM_PAGEOUT_SCAN_PROCEED;
2663 }
2664 
2665 /*
2666  * This function is called only from vm_pageout_scan and
2667  * it notices that pageout scan may be rendered ineffective
2668  * due to a FS deadlock and will jetsam a process if possible.
2669  * If jetsam isn't supported, it'll move the page to the active
2670  * queue to try and get some different pages pushed onwards so
2671  * we can try to get out of this scenario.
2672  */
2673 static void
vps_deal_with_throttled_queues(vm_page_t m,vm_object_t * object,uint32_t * vm_pageout_inactive_external_forced_reactivate_limit,int * delayed_unlock,boolean_t * force_anonymous,__unused boolean_t is_page_from_bg_q)2674 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2675     int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2676 {
2677 	struct  vm_pageout_queue *eq;
2678 	vm_object_t cur_object = VM_OBJECT_NULL;
2679 
2680 	cur_object = *object;
2681 
2682 	eq = &vm_pageout_queue_external;
2683 
2684 	if (cur_object->internal == FALSE) {
2685 		/*
2686 		 * we need to break up the following potential deadlock case...
2687 		 *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2688 		 *  b) The thread doing the writing is waiting for pages while holding the truncate lock
2689 		 *  c) Most of the pages in the inactive queue belong to this file.
2690 		 *
2691 		 * we are potentially in this deadlock because...
2692 		 *  a) the external pageout queue is throttled
2693 		 *  b) we're done with the active queue and moved on to the inactive queue
2694 		 *  c) we've got a dirty external page
2695 		 *
2696 		 * since we don't know the reason for the external pageout queue being throttled we
2697 		 * must suspect that we are deadlocked, so move the current page onto the active queue
2698 		 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2699 		 *
2700 		 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2701 		 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2702 		 * pool the next time we select a victim page... if we can make enough new free pages,
2703 		 * the deadlock will break, the external pageout queue will empty and it will no longer
2704 		 * be throttled
2705 		 *
2706 		 * if we have jetsam configured, keep a count of the pages reactivated this way so
2707 		 * that we can try to find clean pages in the active/inactive queues before
2708 		 * deciding to jetsam a process
2709 		 */
2710 		vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2711 
2712 		vm_page_check_pageable_safe(m);
2713 		assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2714 		vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2715 		m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2716 		vm_page_active_count++;
2717 		vm_page_pageable_external_count++;
2718 
2719 		vm_pageout_adjust_eq_iothrottle(eq, FALSE);
2720 
2721 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2722 
2723 #pragma unused(force_anonymous)
2724 
2725 		*vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2726 
2727 		if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2728 			*vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2729 			/*
2730 			 * Possible deadlock scenario so request jetsam action
2731 			 */
2732 
2733 			assert(cur_object);
2734 			vm_object_unlock(cur_object);
2735 
2736 			cur_object = VM_OBJECT_NULL;
2737 
2738 			/*
2739 			 * VM pageout scan needs to know we have dropped this lock and so set the
2740 			 * object variable we got passed in to NULL.
2741 			 */
2742 			*object = VM_OBJECT_NULL;
2743 
2744 			vm_page_unlock_queues();
2745 
2746 			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
2747 			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2748 
2749 			/* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
2750 			if (memorystatus_kill_on_VM_page_shortage(FALSE) == TRUE) {
2751 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
2752 			}
2753 
2754 			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
2755 			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2756 
2757 			vm_page_lock_queues();
2758 			*delayed_unlock = 1;
2759 		}
2760 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2761 
2762 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2763 #pragma unused(delayed_unlock)
2764 
2765 		*force_anonymous = TRUE;
2766 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2767 	} else {
2768 		vm_page_activate(m);
2769 		counter_inc(&vm_statistics_reactivations);
2770 
2771 #if CONFIG_BACKGROUND_QUEUE
2772 #if DEVELOPMENT || DEBUG
2773 		if (is_page_from_bg_q == TRUE) {
2774 			if (cur_object->internal) {
2775 				vm_pageout_rejected_bq_internal++;
2776 			} else {
2777 				vm_pageout_rejected_bq_external++;
2778 			}
2779 		}
2780 #endif /* DEVELOPMENT || DEBUG */
2781 #endif /* CONFIG_BACKGROUND_QUEUE */
2782 
2783 		vm_pageout_state.vm_pageout_inactive_used++;
2784 	}
2785 }
2786 
2787 
2788 void
vm_page_balance_inactive(int max_to_move)2789 vm_page_balance_inactive(int max_to_move)
2790 {
2791 	vm_page_t m;
2792 
2793 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2794 
2795 	if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2796 		/*
2797 		 * It is likely that the hibernation code path is
2798 		 * dealing with these very queues as we are about
2799 		 * to move pages around in/from them and completely
2800 		 * change the linkage of the pages.
2801 		 *
2802 		 * And so we skip the rebalancing of these queues.
2803 		 */
2804 		return;
2805 	}
2806 	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2807 	    vm_page_inactive_count +
2808 	    vm_page_speculative_count);
2809 
2810 	while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2811 		VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2812 
2813 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2814 
2815 		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2816 		assert(!m->vmp_laundry);
2817 		assert(VM_PAGE_OBJECT(m) != kernel_object);
2818 		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2819 
2820 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2821 
2822 		/*
2823 		 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2824 		 *
2825 		 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2826 		 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2827 		 * new reference happens. If no futher references happen on the page after that remote TLB flushes
2828 		 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2829 		 * by pageout_scan, which is just fine since the last reference would have happened quite far
2830 		 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2831 		 * have happened before we moved the page
2832 		 */
2833 		if (m->vmp_pmapped == TRUE) {
2834 			/*
2835 			 * We might be holding the page queue lock as a
2836 			 * spin lock and clearing the "referenced" bit could
2837 			 * take a while if there are lots of mappings of
2838 			 * that page, so make sure we acquire the lock as
2839 			 * as mutex to avoid a spinlock timeout.
2840 			 */
2841 			vm_page_lockconvert_queues();
2842 			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2843 		}
2844 
2845 		/*
2846 		 * The page might be absent or busy,
2847 		 * but vm_page_deactivate can handle that.
2848 		 * FALSE indicates that we don't want a H/W clear reference
2849 		 */
2850 		vm_page_deactivate_internal(m, FALSE);
2851 	}
2852 }
2853 
2854 
2855 /*
2856  *	vm_pageout_scan does the dirty work for the pageout daemon.
2857  *	It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2858  *	held and vm_page_free_wanted == 0.
2859  */
2860 void
vm_pageout_scan(void)2861 vm_pageout_scan(void)
2862 {
2863 	unsigned int loop_count = 0;
2864 	unsigned int inactive_burst_count = 0;
2865 	unsigned int reactivated_this_call;
2866 	unsigned int reactivate_limit;
2867 	vm_page_t   local_freeq = NULL;
2868 	int         local_freed = 0;
2869 	int         delayed_unlock;
2870 	int         delayed_unlock_limit = 0;
2871 	int         refmod_state = 0;
2872 	int     vm_pageout_deadlock_target = 0;
2873 	struct  vm_pageout_queue *iq;
2874 	struct  vm_pageout_queue *eq;
2875 	struct  vm_speculative_age_q *sq;
2876 	struct  flow_control    flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
2877 	boolean_t inactive_throttled = FALSE;
2878 	vm_object_t     object = NULL;
2879 	uint32_t        inactive_reclaim_run;
2880 	boolean_t       grab_anonymous = FALSE;
2881 	boolean_t       force_anonymous = FALSE;
2882 	boolean_t       force_speculative_aging = FALSE;
2883 	int             anons_grabbed = 0;
2884 	int             page_prev_q_state = 0;
2885 	boolean_t       page_from_bg_q = FALSE;
2886 	uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
2887 	vm_object_t     m_object = VM_OBJECT_NULL;
2888 	int             retval = 0;
2889 	boolean_t       lock_yield_check = FALSE;
2890 
2891 
2892 	VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
2893 	    vm_pageout_vminfo.vm_pageout_freed_speculative,
2894 	    vm_pageout_state.vm_pageout_inactive_clean,
2895 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
2896 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
2897 
2898 	flow_control.state = FCS_IDLE;
2899 	iq = &vm_pageout_queue_internal;
2900 	eq = &vm_pageout_queue_external;
2901 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2902 
2903 	/* Ask the pmap layer to return any pages it no longer needs. */
2904 	uint64_t pmap_wired_pages_freed = pmap_release_pages_fast();
2905 
2906 	vm_page_lock_queues();
2907 
2908 	vm_page_wire_count -= pmap_wired_pages_freed;
2909 
2910 	delayed_unlock = 1;
2911 
2912 	/*
2913 	 *	Calculate the max number of referenced pages on the inactive
2914 	 *	queue that we will reactivate.
2915 	 */
2916 	reactivated_this_call = 0;
2917 	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
2918 	    vm_page_inactive_count);
2919 	inactive_reclaim_run = 0;
2920 
2921 	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2922 
2923 	/*
2924 	 *	We must limit the rate at which we send pages to the pagers
2925 	 *	so that we don't tie up too many pages in the I/O queues.
2926 	 *	We implement a throttling mechanism using the laundry count
2927 	 *      to limit the number of pages outstanding to the default
2928 	 *	and external pagers.  We can bypass the throttles and look
2929 	 *	for clean pages if the pageout queues don't drain in a timely
2930 	 *	fashion since this may indicate that the pageout paths are
2931 	 *	stalled waiting for memory, which only we can provide.
2932 	 */
2933 
2934 	vps_init_page_targets();
2935 	assert(object == NULL);
2936 	assert(delayed_unlock != 0);
2937 
2938 	for (;;) {
2939 		vm_page_t m;
2940 
2941 		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
2942 
2943 		if (lock_yield_check) {
2944 			lock_yield_check = FALSE;
2945 
2946 			if (delayed_unlock++ > delayed_unlock_limit) {
2947 				int freed = local_freed;
2948 
2949 				vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
2950 				    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2951 				if (freed == 0) {
2952 					lck_mtx_yield(&vm_page_queue_lock);
2953 				}
2954 			} else if (vm_pageout_scan_wants_object) {
2955 				vm_page_unlock_queues();
2956 				mutex_pause(0);
2957 				vm_page_lock_queues();
2958 			}
2959 		}
2960 
2961 		if (vm_upl_wait_for_pages < 0) {
2962 			vm_upl_wait_for_pages = 0;
2963 		}
2964 
2965 		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
2966 
2967 		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
2968 			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
2969 		}
2970 
2971 		vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
2972 
2973 		assert(delayed_unlock);
2974 
2975 		/*
2976 		 * maintain our balance
2977 		 */
2978 		vm_page_balance_inactive(1);
2979 
2980 
2981 		/**********************************************************************
2982 		* above this point we're playing with the active and secluded queues
2983 		* below this point we're playing with the throttling mechanisms
2984 		* and the inactive queue
2985 		**********************************************************************/
2986 
2987 		if (vm_page_free_count + local_freed >= vm_page_free_target) {
2988 			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2989 
2990 			vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
2991 			    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2992 			/*
2993 			 * make sure the pageout I/O threads are running
2994 			 * throttled in case there are still requests
2995 			 * in the laundry... since we have met our targets
2996 			 * we don't need the laundry to be cleaned in a timely
2997 			 * fashion... so let's avoid interfering with foreground
2998 			 * activity
2999 			 */
3000 			vm_pageout_adjust_eq_iothrottle(eq, TRUE);
3001 
3002 			lck_mtx_lock(&vm_page_queue_free_lock);
3003 
3004 			if ((vm_page_free_count >= vm_page_free_target) &&
3005 			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3006 				/*
3007 				 * done - we have met our target *and*
3008 				 * there is no one waiting for a page.
3009 				 */
3010 return_from_scan:
3011 				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3012 
3013 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3014 				    vm_pageout_state.vm_pageout_inactive,
3015 				    vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3016 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
3017 				    vm_pageout_vminfo.vm_pageout_freed_speculative,
3018 				    vm_pageout_state.vm_pageout_inactive_clean,
3019 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3020 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3021 
3022 				return;
3023 			}
3024 			lck_mtx_unlock(&vm_page_queue_free_lock);
3025 		}
3026 
3027 		/*
3028 		 * Before anything, we check if we have any ripe volatile
3029 		 * objects around. If so, try to purge the first object.
3030 		 * If the purge fails, fall through to reclaim a page instead.
3031 		 * If the purge succeeds, go back to the top and reevaluate
3032 		 * the new memory situation.
3033 		 */
3034 		retval = vps_purge_object();
3035 
3036 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3037 			/*
3038 			 * Success
3039 			 */
3040 			if (object != NULL) {
3041 				vm_object_unlock(object);
3042 				object = NULL;
3043 			}
3044 
3045 			lock_yield_check = FALSE;
3046 			continue;
3047 		}
3048 
3049 		/*
3050 		 * If our 'aged' queue is empty and we have some speculative pages
3051 		 * in the other queues, let's go through and see if we need to age
3052 		 * them.
3053 		 *
3054 		 * If we succeeded in aging a speculative Q or just that everything
3055 		 * looks normal w.r.t queue age and queue counts, we keep going onward.
3056 		 *
3057 		 * If, for some reason, we seem to have a mismatch between the spec.
3058 		 * page count and the page queues, we reset those variables and
3059 		 * restart the loop (LD TODO: Track this better?).
3060 		 */
3061 		if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3062 			retval = vps_age_speculative_queue(force_speculative_aging);
3063 
3064 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3065 				lock_yield_check = FALSE;
3066 				continue;
3067 			}
3068 		}
3069 		force_speculative_aging = FALSE;
3070 
3071 		/*
3072 		 * Check to see if we need to evict objects from the cache.
3073 		 *
3074 		 * Note: 'object' here doesn't have anything to do with
3075 		 * the eviction part. We just need to make sure we have dropped
3076 		 * any object lock we might be holding if we need to go down
3077 		 * into the eviction logic.
3078 		 */
3079 		retval = vps_object_cache_evict(&object);
3080 
3081 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3082 			lock_yield_check = FALSE;
3083 			continue;
3084 		}
3085 
3086 
3087 		/*
3088 		 * Calculate our filecache_min that will affect the loop
3089 		 * going forward.
3090 		 */
3091 		vps_calculate_filecache_min();
3092 
3093 		/*
3094 		 * LD TODO: Use a structure to hold all state variables for a single
3095 		 * vm_pageout_scan iteration and pass that structure to this function instead.
3096 		 */
3097 		retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3098 		    &delayed_unlock, &local_freeq, &local_freed,
3099 		    &vm_pageout_deadlock_target, inactive_burst_count);
3100 
3101 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3102 			if (loop_count >= vm_page_inactive_count) {
3103 				loop_count = 0;
3104 			}
3105 
3106 			inactive_burst_count = 0;
3107 
3108 			assert(object == NULL);
3109 			assert(delayed_unlock != 0);
3110 
3111 			lock_yield_check = FALSE;
3112 			continue;
3113 		} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3114 			goto return_from_scan;
3115 		}
3116 
3117 		flow_control.state = FCS_IDLE;
3118 
3119 		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3120 		    vm_pageout_inactive_external_forced_reactivate_limit);
3121 		loop_count++;
3122 		inactive_burst_count++;
3123 		vm_pageout_state.vm_pageout_inactive++;
3124 
3125 		/*
3126 		 * Choose a victim.
3127 		 */
3128 
3129 		m = NULL;
3130 		retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3131 
3132 		if (m == NULL) {
3133 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3134 				inactive_burst_count = 0;
3135 
3136 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3137 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3138 				}
3139 
3140 				lock_yield_check = TRUE;
3141 				continue;
3142 			}
3143 
3144 			/*
3145 			 * if we've gotten here, we have no victim page.
3146 			 * check to see if we've not finished balancing the queues
3147 			 * or we have a page on the aged speculative queue that we
3148 			 * skipped due to force_anonymous == TRUE.. or we have
3149 			 * speculative  pages that we can prematurely age... if
3150 			 * one of these cases we'll keep going, else panic
3151 			 */
3152 			force_anonymous = FALSE;
3153 			VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3154 
3155 			if (!vm_page_queue_empty(&sq->age_q)) {
3156 				lock_yield_check = TRUE;
3157 				continue;
3158 			}
3159 
3160 			if (vm_page_speculative_count) {
3161 				force_speculative_aging = TRUE;
3162 				lock_yield_check = TRUE;
3163 				continue;
3164 			}
3165 			panic("vm_pageout: no victim");
3166 
3167 			/* NOTREACHED */
3168 		}
3169 
3170 		assert(VM_PAGE_PAGEABLE(m));
3171 		m_object = VM_PAGE_OBJECT(m);
3172 		force_anonymous = FALSE;
3173 
3174 		page_prev_q_state = m->vmp_q_state;
3175 		/*
3176 		 * we just found this page on one of our queues...
3177 		 * it can't also be on the pageout queue, so safe
3178 		 * to call vm_page_queues_remove
3179 		 */
3180 		vm_page_queues_remove(m, TRUE);
3181 
3182 		assert(!m->vmp_laundry);
3183 		assert(!m->vmp_private);
3184 		assert(!m->vmp_fictitious);
3185 		assert(m_object != kernel_object);
3186 		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
3187 
3188 		vm_pageout_vminfo.vm_pageout_considered_page++;
3189 
3190 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3191 
3192 		/*
3193 		 * check to see if we currently are working
3194 		 * with the same object... if so, we've
3195 		 * already got the lock
3196 		 */
3197 		if (m_object != object) {
3198 			boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3199 
3200 			/*
3201 			 * vps_switch_object() will always drop the 'object' lock first
3202 			 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3203 			 * either 'm_object' or NULL.
3204 			 */
3205 			retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3206 
3207 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3208 				lock_yield_check = TRUE;
3209 				continue;
3210 			}
3211 		}
3212 		assert(m_object == object);
3213 		assert(VM_PAGE_OBJECT(m) == m_object);
3214 
3215 		if (m->vmp_busy) {
3216 			/*
3217 			 *	Somebody is already playing with this page.
3218 			 *	Put it back on the appropriate queue
3219 			 *
3220 			 */
3221 			VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3222 
3223 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3224 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3225 			}
3226 
3227 			vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3228 
3229 			lock_yield_check = TRUE;
3230 			continue;
3231 		}
3232 
3233 		/*
3234 		 *   if (m->vmp_cleaning && !m->vmp_free_when_done)
3235 		 *	If already cleaning this page in place
3236 		 *	just leave if off the paging queues.
3237 		 *	We can leave the page mapped, and upl_commit_range
3238 		 *	will put it on the clean queue.
3239 		 *
3240 		 *   if (m->vmp_free_when_done && !m->vmp_cleaning)
3241 		 *	an msync INVALIDATE is in progress...
3242 		 *	this page has been marked for destruction
3243 		 *      after it has been cleaned,
3244 		 *      but not yet gathered into a UPL
3245 		 *	where 'cleaning' will be set...
3246 		 *	just leave it off the paging queues
3247 		 *
3248 		 *   if (m->vmp_free_when_done && m->vmp_cleaning)
3249 		 *	an msync INVALIDATE is in progress
3250 		 *	and the UPL has already gathered this page...
3251 		 *	just leave it off the paging queues
3252 		 */
3253 		if (m->vmp_free_when_done || m->vmp_cleaning) {
3254 			lock_yield_check = TRUE;
3255 			continue;
3256 		}
3257 
3258 
3259 		/*
3260 		 *	If it's absent, in error or the object is no longer alive,
3261 		 *	we can reclaim the page... in the no longer alive case,
3262 		 *	there are 2 states the page can be in that preclude us
3263 		 *	from reclaiming it - busy or cleaning - that we've already
3264 		 *	dealt with
3265 		 */
3266 		if (m->vmp_absent || m->vmp_error || !object->alive) {
3267 			if (m->vmp_absent) {
3268 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3269 			} else if (!object->alive) {
3270 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3271 			} else {
3272 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3273 			}
3274 reclaim_page:
3275 			if (vm_pageout_deadlock_target) {
3276 				VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3277 				vm_pageout_deadlock_target--;
3278 			}
3279 
3280 			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3281 
3282 			if (object->internal) {
3283 				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3284 			} else {
3285 				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3286 			}
3287 			assert(!m->vmp_cleaning);
3288 			assert(!m->vmp_laundry);
3289 
3290 			if (!object->internal &&
3291 			    object->pager != NULL &&
3292 			    object->pager->mo_pager_ops == &shared_region_pager_ops) {
3293 				shared_region_pager_reclaimed++;
3294 			}
3295 
3296 			m->vmp_busy = TRUE;
3297 
3298 			/*
3299 			 * remove page from object here since we're already
3300 			 * behind the object lock... defer the rest of the work
3301 			 * we'd normally do in vm_page_free_prepare_object
3302 			 * until 'vm_page_free_list' is called
3303 			 */
3304 			if (m->vmp_tabled) {
3305 				vm_page_remove(m, TRUE);
3306 			}
3307 
3308 			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3309 			m->vmp_snext = local_freeq;
3310 			local_freeq = m;
3311 			local_freed++;
3312 
3313 			if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3314 				vm_pageout_vminfo.vm_pageout_freed_speculative++;
3315 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3316 				vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3317 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3318 				vm_pageout_vminfo.vm_pageout_freed_internal++;
3319 			} else {
3320 				vm_pageout_vminfo.vm_pageout_freed_external++;
3321 			}
3322 
3323 			inactive_burst_count = 0;
3324 
3325 			lock_yield_check = TRUE;
3326 			continue;
3327 		}
3328 		if (object->copy == VM_OBJECT_NULL) {
3329 			/*
3330 			 * No one else can have any interest in this page.
3331 			 * If this is an empty purgable object, the page can be
3332 			 * reclaimed even if dirty.
3333 			 * If the page belongs to a volatile purgable object, we
3334 			 * reactivate it if the compressor isn't active.
3335 			 */
3336 			if (object->purgable == VM_PURGABLE_EMPTY) {
3337 				if (m->vmp_pmapped == TRUE) {
3338 					/* unmap the page */
3339 					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3340 					if (refmod_state & VM_MEM_MODIFIED) {
3341 						SET_PAGE_DIRTY(m, FALSE);
3342 					}
3343 				}
3344 				if (m->vmp_dirty || m->vmp_precious) {
3345 					/* we saved the cost of cleaning this page ! */
3346 					vm_page_purged_count++;
3347 				}
3348 				goto reclaim_page;
3349 			}
3350 
3351 			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3352 				/*
3353 				 * With the VM compressor, the cost of
3354 				 * reclaiming a page is much lower (no I/O),
3355 				 * so if we find a "volatile" page, it's better
3356 				 * to let it get compressed rather than letting
3357 				 * it occupy a full page until it gets purged.
3358 				 * So no need to check for "volatile" here.
3359 				 */
3360 			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
3361 				/*
3362 				 * Avoid cleaning a "volatile" page which might
3363 				 * be purged soon.
3364 				 */
3365 
3366 				/* if it's wired, we can't put it on our queue */
3367 				assert(!VM_PAGE_WIRED(m));
3368 
3369 				/* just stick it back on! */
3370 				reactivated_this_call++;
3371 
3372 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3373 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3374 				}
3375 
3376 				goto reactivate_page;
3377 			}
3378 		}
3379 		/*
3380 		 *	If it's being used, reactivate.
3381 		 *	(Fictitious pages are either busy or absent.)
3382 		 *	First, update the reference and dirty bits
3383 		 *	to make sure the page is unreferenced.
3384 		 */
3385 		refmod_state = -1;
3386 
3387 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3388 			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3389 
3390 			if (refmod_state & VM_MEM_REFERENCED) {
3391 				m->vmp_reference = TRUE;
3392 			}
3393 			if (refmod_state & VM_MEM_MODIFIED) {
3394 				SET_PAGE_DIRTY(m, FALSE);
3395 			}
3396 		}
3397 
3398 		if (m->vmp_reference || m->vmp_dirty) {
3399 			/* deal with a rogue "reusable" page */
3400 			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3401 		}
3402 
3403 		if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3404 			vm_pageout_state.vm_page_xpmapped_min = 0;
3405 		} else {
3406 			vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
3407 		}
3408 
3409 		if (!m->vmp_no_cache &&
3410 		    page_from_bg_q == FALSE &&
3411 		    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3412 		    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3413 			/*
3414 			 * The page we pulled off the inactive list has
3415 			 * been referenced.  It is possible for other
3416 			 * processors to be touching pages faster than we
3417 			 * can clear the referenced bit and traverse the
3418 			 * inactive queue, so we limit the number of
3419 			 * reactivations.
3420 			 */
3421 			if (++reactivated_this_call >= reactivate_limit) {
3422 				vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3423 			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3424 				vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3425 			} else {
3426 				uint32_t isinuse;
3427 
3428 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3429 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3430 				}
3431 
3432 				vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3433 reactivate_page:
3434 				if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3435 				    vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3436 					/*
3437 					 * no explicit mappings of this object exist
3438 					 * and it's not open via the filesystem
3439 					 */
3440 					vm_page_deactivate(m);
3441 					VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3442 				} else {
3443 					/*
3444 					 * The page was/is being used, so put back on active list.
3445 					 */
3446 					vm_page_activate(m);
3447 					counter_inc(&vm_statistics_reactivations);
3448 					inactive_burst_count = 0;
3449 				}
3450 #if CONFIG_BACKGROUND_QUEUE
3451 #if DEVELOPMENT || DEBUG
3452 				if (page_from_bg_q == TRUE) {
3453 					if (m_object->internal) {
3454 						vm_pageout_rejected_bq_internal++;
3455 					} else {
3456 						vm_pageout_rejected_bq_external++;
3457 					}
3458 				}
3459 #endif /* DEVELOPMENT || DEBUG */
3460 #endif /* CONFIG_BACKGROUND_QUEUE */
3461 
3462 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3463 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3464 				}
3465 				vm_pageout_state.vm_pageout_inactive_used++;
3466 
3467 				lock_yield_check = TRUE;
3468 				continue;
3469 			}
3470 			/*
3471 			 * Make sure we call pmap_get_refmod() if it
3472 			 * wasn't already called just above, to update
3473 			 * the dirty bit.
3474 			 */
3475 			if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3476 				refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3477 				if (refmod_state & VM_MEM_MODIFIED) {
3478 					SET_PAGE_DIRTY(m, FALSE);
3479 				}
3480 			}
3481 		}
3482 
3483 		/*
3484 		 * we've got a candidate page to steal...
3485 		 *
3486 		 * m->vmp_dirty is up to date courtesy of the
3487 		 * preceding check for m->vmp_reference... if
3488 		 * we get here, then m->vmp_reference had to be
3489 		 * FALSE (or possibly "reactivate_limit" was
3490 		 * exceeded), but in either case we called
3491 		 * pmap_get_refmod() and updated both
3492 		 * m->vmp_reference and m->vmp_dirty
3493 		 *
3494 		 * if it's dirty or precious we need to
3495 		 * see if the target queue is throttled
3496 		 * if it is, we need to skip over it by moving it back
3497 		 * to the end of the inactive queue
3498 		 */
3499 
3500 		inactive_throttled = FALSE;
3501 
3502 		if (m->vmp_dirty || m->vmp_precious) {
3503 			if (object->internal) {
3504 				if (VM_PAGE_Q_THROTTLED(iq)) {
3505 					inactive_throttled = TRUE;
3506 				}
3507 			} else if (VM_PAGE_Q_THROTTLED(eq)) {
3508 				inactive_throttled = TRUE;
3509 			}
3510 		}
3511 throttle_inactive:
3512 		if (!VM_DYNAMIC_PAGING_ENABLED() &&
3513 		    object->internal && m->vmp_dirty &&
3514 		    (object->purgable == VM_PURGABLE_DENY ||
3515 		    object->purgable == VM_PURGABLE_NONVOLATILE ||
3516 		    object->purgable == VM_PURGABLE_VOLATILE)) {
3517 			vm_page_check_pageable_safe(m);
3518 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3519 			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3520 			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3521 			vm_page_throttled_count++;
3522 
3523 			VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3524 
3525 			inactive_burst_count = 0;
3526 
3527 			lock_yield_check = TRUE;
3528 			continue;
3529 		}
3530 		if (inactive_throttled == TRUE) {
3531 			vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3532 			    &delayed_unlock, &force_anonymous, page_from_bg_q);
3533 
3534 			inactive_burst_count = 0;
3535 
3536 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3537 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3538 			}
3539 
3540 			lock_yield_check = TRUE;
3541 			continue;
3542 		}
3543 
3544 		/*
3545 		 * we've got a page that we can steal...
3546 		 * eliminate all mappings and make sure
3547 		 * we have the up-to-date modified state
3548 		 *
3549 		 * if we need to do a pmap_disconnect then we
3550 		 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3551 		 * provides the true state atomically... the
3552 		 * page was still mapped up to the pmap_disconnect
3553 		 * and may have been dirtied at the last microsecond
3554 		 *
3555 		 * Note that if 'pmapped' is FALSE then the page is not
3556 		 * and has not been in any map, so there is no point calling
3557 		 * pmap_disconnect().  m->vmp_dirty could have been set in anticipation
3558 		 * of likely usage of the page.
3559 		 */
3560 		if (m->vmp_pmapped == TRUE) {
3561 			int pmap_options;
3562 
3563 			/*
3564 			 * Don't count this page as going into the compressor
3565 			 * if any of these are true:
3566 			 * 1) compressed pager isn't enabled
3567 			 * 2) Freezer enabled device with compressed pager
3568 			 *    backend (exclusive use) i.e. most of the VM system
3569 			 *    (including vm_pageout_scan) has no knowledge of
3570 			 *    the compressor
3571 			 * 3) This page belongs to a file and hence will not be
3572 			 *    sent into the compressor
3573 			 */
3574 			if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3575 			    object->internal == FALSE) {
3576 				pmap_options = 0;
3577 			} else if (m->vmp_dirty || m->vmp_precious) {
3578 				/*
3579 				 * VM knows that this page is dirty (or
3580 				 * precious) and needs to be compressed
3581 				 * rather than freed.
3582 				 * Tell the pmap layer to count this page
3583 				 * as "compressed".
3584 				 */
3585 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
3586 			} else {
3587 				/*
3588 				 * VM does not know if the page needs to
3589 				 * be preserved but the pmap layer might tell
3590 				 * us if any mapping has "modified" it.
3591 				 * Let's the pmap layer to count this page
3592 				 * as compressed if and only if it has been
3593 				 * modified.
3594 				 */
3595 				pmap_options =
3596 				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3597 			}
3598 			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3599 			    pmap_options,
3600 			    NULL);
3601 			if (refmod_state & VM_MEM_MODIFIED) {
3602 				SET_PAGE_DIRTY(m, FALSE);
3603 			}
3604 		}
3605 
3606 		/*
3607 		 * reset our count of pages that have been reclaimed
3608 		 * since the last page was 'stolen'
3609 		 */
3610 		inactive_reclaim_run = 0;
3611 
3612 		/*
3613 		 *	If it's clean and not precious, we can free the page.
3614 		 */
3615 		if (!m->vmp_dirty && !m->vmp_precious) {
3616 			vm_pageout_state.vm_pageout_inactive_clean++;
3617 
3618 			/*
3619 			 * OK, at this point we have found a page we are going to free.
3620 			 */
3621 #if CONFIG_PHANTOM_CACHE
3622 			if (!object->internal) {
3623 				vm_phantom_cache_add_ghost(m);
3624 			}
3625 #endif
3626 			goto reclaim_page;
3627 		}
3628 
3629 		/*
3630 		 * The page may have been dirtied since the last check
3631 		 * for a throttled target queue (which may have been skipped
3632 		 * if the page was clean then).  With the dirty page
3633 		 * disconnected here, we can make one final check.
3634 		 */
3635 		if (object->internal) {
3636 			if (VM_PAGE_Q_THROTTLED(iq)) {
3637 				inactive_throttled = TRUE;
3638 			}
3639 		} else if (VM_PAGE_Q_THROTTLED(eq)) {
3640 			inactive_throttled = TRUE;
3641 		}
3642 
3643 		if (inactive_throttled == TRUE) {
3644 			goto throttle_inactive;
3645 		}
3646 
3647 #if VM_PRESSURE_EVENTS
3648 #if CONFIG_JETSAM
3649 
3650 		/*
3651 		 * If Jetsam is enabled, then the sending
3652 		 * of memory pressure notifications is handled
3653 		 * from the same thread that takes care of high-water
3654 		 * and other jetsams i.e. the memorystatus_thread.
3655 		 */
3656 
3657 #else /* CONFIG_JETSAM */
3658 
3659 		vm_pressure_response();
3660 
3661 #endif /* CONFIG_JETSAM */
3662 #endif /* VM_PRESSURE_EVENTS */
3663 
3664 		if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3665 			VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3666 		}
3667 
3668 		if (object->internal) {
3669 			vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3670 		} else {
3671 			vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3672 		}
3673 
3674 		/*
3675 		 * internal pages will go to the compressor...
3676 		 * external pages will go to the appropriate pager to be cleaned
3677 		 * and upon completion will end up on 'vm_page_queue_cleaned' which
3678 		 * is a preferred queue to steal from
3679 		 */
3680 		vm_pageout_cluster(m);
3681 		inactive_burst_count = 0;
3682 
3683 		/*
3684 		 * back to top of pageout scan loop
3685 		 */
3686 	}
3687 }
3688 
3689 
3690 void
vm_page_free_reserve(int pages)3691 vm_page_free_reserve(
3692 	int pages)
3693 {
3694 	int             free_after_reserve;
3695 
3696 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3697 		if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3698 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3699 		} else {
3700 			vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3701 		}
3702 	} else {
3703 		if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3704 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3705 		} else {
3706 			vm_page_free_reserved += pages;
3707 		}
3708 	}
3709 	free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3710 
3711 	vm_page_free_min = vm_page_free_reserved +
3712 	    VM_PAGE_FREE_MIN(free_after_reserve);
3713 
3714 	if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3715 		vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3716 	}
3717 
3718 	vm_page_free_target = vm_page_free_reserved +
3719 	    VM_PAGE_FREE_TARGET(free_after_reserve);
3720 
3721 	if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3722 		vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3723 	}
3724 
3725 	if (vm_page_free_target < vm_page_free_min + 5) {
3726 		vm_page_free_target = vm_page_free_min + 5;
3727 	}
3728 
3729 	vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3730 }
3731 
3732 /*
3733  *	vm_pageout is the high level pageout daemon.
3734  */
3735 
3736 void
vm_pageout_continue(void)3737 vm_pageout_continue(void)
3738 {
3739 	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3740 	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3741 
3742 	lck_mtx_lock(&vm_page_queue_free_lock);
3743 	vm_pageout_running = TRUE;
3744 	lck_mtx_unlock(&vm_page_queue_free_lock);
3745 
3746 	vm_pageout_scan();
3747 	/*
3748 	 * we hold both the vm_page_queue_free_lock
3749 	 * and the vm_page_queues_lock at this point
3750 	 */
3751 	assert(vm_page_free_wanted == 0);
3752 	assert(vm_page_free_wanted_privileged == 0);
3753 	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3754 
3755 	vm_pageout_running = FALSE;
3756 #if XNU_TARGET_OS_OSX
3757 	if (vm_pageout_waiter) {
3758 		vm_pageout_waiter = FALSE;
3759 		thread_wakeup((event_t)&vm_pageout_waiter);
3760 	}
3761 #endif /* XNU_TARGET_OS_OSX */
3762 
3763 	lck_mtx_unlock(&vm_page_queue_free_lock);
3764 	vm_page_unlock_queues();
3765 
3766 	thread_block((thread_continue_t)vm_pageout_continue);
3767 	/*NOTREACHED*/
3768 }
3769 
#if XNU_TARGET_OS_OSX
/*
 *	vm_pageout_wait:
 *
 *	Sleep, under the vm_page_queue_free_lock, until vm_pageout_running
 *	is observed clear.
 *
 *	Returns KERN_SUCCESS once vm_pageout_running is FALSE, or
 *	KERN_OPERATION_TIMED_OUT as soon as a sleep ends with anything
 *	other than THREAD_AWAKENED.
 */
kern_return_t
vm_pageout_wait(uint64_t deadline)
{
	kern_return_t   result = KERN_SUCCESS;

	lck_mtx_lock(&vm_page_queue_free_lock);

	while (vm_pageout_running) {
		wait_result_t   sleep_result;

		/* ask to be woken through &vm_pageout_waiter */
		vm_pageout_waiter = TRUE;

		sleep_result = lck_mtx_sleep_deadline(
			&vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
			(event_t) &vm_pageout_waiter, THREAD_UNINT, deadline);

		if (sleep_result != THREAD_AWAKENED) {
			result = KERN_OPERATION_TIMED_OUT;
			break;
		}
	}

	lck_mtx_unlock(&vm_page_queue_free_lock);

	return result;
}
#endif /* XNU_TARGET_OS_OSX */
3790 
3791 
3792 static void
vm_pageout_iothread_external_continue(struct vm_pageout_queue * q)3793 vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
3794 {
3795 	vm_page_t       m = NULL;
3796 	vm_object_t     object;
3797 	vm_object_offset_t offset;
3798 	memory_object_t pager;
3799 
3800 	/* On systems with a compressor, the external IO thread clears its
3801 	 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3802 	 * creation)
3803 	 */
3804 	if (vm_pageout_state.vm_pageout_internal_iothread != THREAD_NULL) {
3805 		current_thread()->options &= ~TH_OPT_VMPRIV;
3806 	}
3807 
3808 	vm_page_lockspin_queues();
3809 
3810 	while (!vm_page_queue_empty(&q->pgo_pending)) {
3811 		q->pgo_busy = TRUE;
3812 		vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3813 
3814 		assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3815 		VM_PAGE_CHECK(m);
3816 		/*
3817 		 * grab a snapshot of the object and offset this
3818 		 * page is tabled in so that we can relookup this
3819 		 * page after we've taken the object lock - these
3820 		 * fields are stable while we hold the page queues lock
3821 		 * but as soon as we drop it, there is nothing to keep
3822 		 * this page in this object... we hold an activity_in_progress
3823 		 * on this object which will keep it from terminating
3824 		 */
3825 		object = VM_PAGE_OBJECT(m);
3826 		offset = m->vmp_offset;
3827 
3828 		m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3829 		VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3830 
3831 		vm_page_unlock_queues();
3832 
3833 		vm_object_lock(object);
3834 
3835 		m = vm_page_lookup(object, offset);
3836 
3837 		if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
3838 		    !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
3839 			/*
3840 			 * it's either the same page that someone else has
3841 			 * started cleaning (or it's finished cleaning or
3842 			 * been put back on the pageout queue), or
3843 			 * the page has been freed or we have found a
3844 			 * new page at this offset... in all of these cases
3845 			 * we merely need to release the activity_in_progress
3846 			 * we took when we put the page on the pageout queue
3847 			 */
3848 			vm_object_activity_end(object);
3849 			vm_object_unlock(object);
3850 
3851 			vm_page_lockspin_queues();
3852 			continue;
3853 		}
3854 		pager = object->pager;
3855 
3856 		if (pager == MEMORY_OBJECT_NULL) {
3857 			/*
3858 			 * This pager has been destroyed by either
3859 			 * memory_object_destroy or vm_object_destroy, and
3860 			 * so there is nowhere for the page to go.
3861 			 */
3862 			if (m->vmp_free_when_done) {
3863 				/*
3864 				 * Just free the page... VM_PAGE_FREE takes
3865 				 * care of cleaning up all the state...
3866 				 * including doing the vm_pageout_throttle_up
3867 				 */
3868 				VM_PAGE_FREE(m);
3869 			} else {
3870 				vm_page_lockspin_queues();
3871 
3872 				vm_pageout_throttle_up(m);
3873 				vm_page_activate(m);
3874 
3875 				vm_page_unlock_queues();
3876 
3877 				/*
3878 				 *	And we are done with it.
3879 				 */
3880 			}
3881 			vm_object_activity_end(object);
3882 			vm_object_unlock(object);
3883 
3884 			vm_page_lockspin_queues();
3885 			continue;
3886 		}
3887 #if 0
3888 		/*
3889 		 * we don't hold the page queue lock
3890 		 * so this check isn't safe to make
3891 		 */
3892 		VM_PAGE_CHECK(m);
3893 #endif
3894 		/*
3895 		 * give back the activity_in_progress reference we
3896 		 * took when we queued up this page and replace it
3897 		 * it with a paging_in_progress reference that will
3898 		 * also hold the paging offset from changing and
3899 		 * prevent the object from terminating
3900 		 */
3901 		vm_object_activity_end(object);
3902 		vm_object_paging_begin(object);
3903 		vm_object_unlock(object);
3904 
3905 		/*
3906 		 * Send the data to the pager.
3907 		 * any pageout clustering happens there
3908 		 */
3909 		memory_object_data_return(pager,
3910 		    m->vmp_offset + object->paging_offset,
3911 		    PAGE_SIZE,
3912 		    NULL,
3913 		    NULL,
3914 		    FALSE,
3915 		    FALSE,
3916 		    0);
3917 
3918 		vm_object_lock(object);
3919 		vm_object_paging_end(object);
3920 		vm_object_unlock(object);
3921 
3922 		vm_pageout_io_throttle();
3923 
3924 		vm_page_lockspin_queues();
3925 	}
3926 	q->pgo_busy = FALSE;
3927 	q->pgo_idle = TRUE;
3928 
3929 	assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
3930 	vm_page_unlock_queues();
3931 
3932 	thread_block_parameter((thread_continue_t)vm_pageout_iothread_external_continue, (void *) q);
3933 	/*NOTREACHED*/
3934 }
3935 
3936 
3937 #define         MAX_FREE_BATCH          32
3938 uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
3939                                      * this thread.
3940                                      */
3941 
3942 
3943 void
3944 vm_pageout_iothread_internal_continue(struct cq *);
3945 void
vm_pageout_iothread_internal_continue(struct cq * cq)3946 vm_pageout_iothread_internal_continue(struct cq *cq)
3947 {
3948 	struct vm_pageout_queue *q;
3949 	vm_page_t       m = NULL;
3950 	boolean_t       pgo_draining;
3951 	vm_page_t   local_q;
3952 	int         local_cnt;
3953 	vm_page_t   local_freeq = NULL;
3954 	int         local_freed = 0;
3955 	int         local_batch_size;
3956 #if DEVELOPMENT || DEBUG
3957 	int       ncomps = 0;
3958 	boolean_t marked_active = FALSE;
3959 #endif
3960 	KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
3961 
3962 	q = cq->q;
3963 #if __AMP__
3964 	if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
3965 		local_batch_size = (q->pgo_maxlaundry >> 3);
3966 		local_batch_size = MAX(local_batch_size, 16);
3967 	} else {
3968 		local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
3969 	}
3970 #else
3971 	local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
3972 #endif
3973 
3974 #if RECORD_THE_COMPRESSED_DATA
3975 	if (q->pgo_laundry) {
3976 		c_compressed_record_init();
3977 	}
3978 #endif
3979 	while (TRUE) {
3980 		int     pages_left_on_q = 0;
3981 
3982 		local_cnt = 0;
3983 		local_q = NULL;
3984 
3985 		KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
3986 
3987 		vm_page_lock_queues();
3988 #if DEVELOPMENT || DEBUG
3989 		if (marked_active == FALSE) {
3990 			vmct_active++;
3991 			vmct_state[cq->id] = VMCT_ACTIVE;
3992 			marked_active = TRUE;
3993 			if (vmct_active == 1) {
3994 				vm_compressor_epoch_start = mach_absolute_time();
3995 			}
3996 		}
3997 #endif
3998 		KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
3999 
4000 		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
4001 
4002 		while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4003 			vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4004 			assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4005 			VM_PAGE_CHECK(m);
4006 
4007 			m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4008 			VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4009 			m->vmp_laundry = FALSE;
4010 
4011 			m->vmp_snext = local_q;
4012 			local_q = m;
4013 			local_cnt++;
4014 		}
4015 		if (local_q == NULL) {
4016 			break;
4017 		}
4018 
4019 		q->pgo_busy = TRUE;
4020 
4021 		if ((pgo_draining = q->pgo_draining) == FALSE) {
4022 			vm_pageout_throttle_up_batch(q, local_cnt);
4023 			pages_left_on_q = q->pgo_laundry;
4024 		} else {
4025 			pages_left_on_q = q->pgo_laundry - local_cnt;
4026 		}
4027 
4028 		vm_page_unlock_queues();
4029 
4030 #if !RECORD_THE_COMPRESSED_DATA
4031 		if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4032 			thread_wakeup((event_t) ((uintptr_t)&q->pgo_pending + cq->id + 1));
4033 		}
4034 #endif
4035 		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
4036 
4037 		while (local_q) {
4038 			KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
4039 
4040 			m = local_q;
4041 			local_q = m->vmp_snext;
4042 			m->vmp_snext = NULL;
4043 
4044 			if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4045 #if DEVELOPMENT || DEBUG
4046 				ncomps++;
4047 #endif
4048 				KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);
4049 
4050 				m->vmp_snext = local_freeq;
4051 				local_freeq = m;
4052 				local_freed++;
4053 
4054 				if (local_freed >= MAX_FREE_BATCH) {
4055 					OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4056 
4057 					vm_page_free_list(local_freeq, TRUE);
4058 
4059 					local_freeq = NULL;
4060 					local_freed = 0;
4061 				}
4062 			}
4063 #if !CONFIG_JETSAM
4064 			while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4065 				kern_return_t   wait_result;
4066 				int             need_wakeup = 0;
4067 
4068 				if (local_freeq) {
4069 					OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4070 
4071 					vm_page_free_list(local_freeq, TRUE);
4072 					local_freeq = NULL;
4073 					local_freed = 0;
4074 
4075 					continue;
4076 				}
4077 				lck_mtx_lock_spin(&vm_page_queue_free_lock);
4078 
4079 				if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4080 					if (vm_page_free_wanted_privileged++ == 0) {
4081 						need_wakeup = 1;
4082 					}
4083 					wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4084 
4085 					lck_mtx_unlock(&vm_page_queue_free_lock);
4086 
4087 					if (need_wakeup) {
4088 						thread_wakeup((event_t)&vm_page_free_wanted);
4089 					}
4090 
4091 					if (wait_result == THREAD_WAITING) {
4092 						thread_block(THREAD_CONTINUE_NULL);
4093 					}
4094 				} else {
4095 					lck_mtx_unlock(&vm_page_queue_free_lock);
4096 				}
4097 			}
4098 #endif
4099 		}
4100 		if (local_freeq) {
4101 			OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4102 
4103 			vm_page_free_list(local_freeq, TRUE);
4104 			local_freeq = NULL;
4105 			local_freed = 0;
4106 		}
4107 		if (pgo_draining == TRUE) {
4108 			vm_page_lockspin_queues();
4109 			vm_pageout_throttle_up_batch(q, local_cnt);
4110 			vm_page_unlock_queues();
4111 		}
4112 	}
4113 	KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
4114 
4115 	/*
4116 	 * queue lock is held and our q is empty
4117 	 */
4118 	q->pgo_busy = FALSE;
4119 	q->pgo_idle = TRUE;
4120 
4121 	assert_wait((event_t) ((uintptr_t)&q->pgo_pending + cq->id), THREAD_UNINT);
4122 #if DEVELOPMENT || DEBUG
4123 	if (marked_active == TRUE) {
4124 		vmct_active--;
4125 		vmct_state[cq->id] = VMCT_IDLE;
4126 
4127 		if (vmct_active == 0) {
4128 			vm_compressor_epoch_stop = mach_absolute_time();
4129 			assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4130 			    "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4131 			    vm_compressor_epoch_start, vm_compressor_epoch_stop);
4132 			/* This interval includes intervals where one or more
4133 			 * compressor threads were pre-empted
4134 			 */
4135 			vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4136 		}
4137 	}
4138 #endif
4139 	vm_page_unlock_queues();
4140 #if DEVELOPMENT || DEBUG
4141 	if (__improbable(vm_compressor_time_thread)) {
4142 		vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
4143 		vmct_stats.vmct_pages[cq->id] += ncomps;
4144 		vmct_stats.vmct_iterations[cq->id]++;
4145 		if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
4146 			vmct_stats.vmct_maxpages[cq->id] = ncomps;
4147 		}
4148 		if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
4149 			vmct_stats.vmct_minpages[cq->id] = ncomps;
4150 		}
4151 	}
4152 #endif
4153 
4154 	KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4155 
4156 	thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4157 	/*NOTREACHED*/
4158 }
4159 
4160 
4161 kern_return_t
vm_pageout_compress_page(void ** current_chead,char * scratch_buf,vm_page_t m)4162 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4163 {
4164 	vm_object_t     object;
4165 	memory_object_t pager;
4166 	int             compressed_count_delta;
4167 	kern_return_t   retval;
4168 
4169 	object = VM_PAGE_OBJECT(m);
4170 
4171 	assert(!m->vmp_free_when_done);
4172 	assert(!m->vmp_laundry);
4173 
4174 	pager = object->pager;
4175 
4176 	if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4177 		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
4178 
4179 		vm_object_lock(object);
4180 
4181 		/*
4182 		 * If there is no memory object for the page, create
4183 		 * one and hand it to the compression pager.
4184 		 */
4185 
4186 		if (!object->pager_initialized) {
4187 			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4188 		}
4189 		if (!object->pager_initialized) {
4190 			vm_object_compressor_pager_create(object);
4191 		}
4192 
4193 		pager = object->pager;
4194 
4195 		if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4196 			/*
4197 			 * Still no pager for the object,
4198 			 * or the pager has been destroyed.
4199 			 * Reactivate the page.
4200 			 *
4201 			 * Should only happen if there is no
4202 			 * compression pager
4203 			 */
4204 			PAGE_WAKEUP_DONE(m);
4205 
4206 			vm_page_lockspin_queues();
4207 			vm_page_activate(m);
4208 			VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4209 			vm_page_unlock_queues();
4210 
4211 			/*
4212 			 *	And we are done with it.
4213 			 */
4214 			vm_object_activity_end(object);
4215 			vm_object_unlock(object);
4216 
4217 			return KERN_FAILURE;
4218 		}
4219 		vm_object_unlock(object);
4220 
4221 		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
4222 	}
4223 	assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4224 	assert(object->activity_in_progress > 0);
4225 
4226 	retval = vm_compressor_pager_put(
4227 		pager,
4228 		m->vmp_offset + object->paging_offset,
4229 		VM_PAGE_GET_PHYS_PAGE(m),
4230 		current_chead,
4231 		scratch_buf,
4232 		&compressed_count_delta);
4233 
4234 	vm_object_lock(object);
4235 
4236 	assert(object->activity_in_progress > 0);
4237 	assert(VM_PAGE_OBJECT(m) == object);
4238 	assert( !VM_PAGE_WIRED(m));
4239 
4240 	vm_compressor_pager_count(pager,
4241 	    compressed_count_delta,
4242 	    FALSE,                       /* shared_lock */
4243 	    object);
4244 
4245 	if (retval == KERN_SUCCESS) {
4246 		/*
4247 		 * If the object is purgeable, its owner's
4248 		 * purgeable ledgers will be updated in
4249 		 * vm_page_remove() but the page still
4250 		 * contributes to the owner's memory footprint,
4251 		 * so account for it as such.
4252 		 */
4253 		if ((object->purgable != VM_PURGABLE_DENY ||
4254 		    object->vo_ledger_tag) &&
4255 		    object->vo_owner != NULL) {
4256 			/* one more compressed purgeable/tagged page */
4257 			vm_object_owner_compressed_update(object,
4258 			    +1);
4259 		}
4260 		counter_inc(&vm_statistics_compressions);
4261 
4262 		if (m->vmp_tabled) {
4263 			vm_page_remove(m, TRUE);
4264 		}
4265 	} else {
4266 		PAGE_WAKEUP_DONE(m);
4267 
4268 		vm_page_lockspin_queues();
4269 
4270 		vm_page_activate(m);
4271 		vm_pageout_vminfo.vm_compressor_failed++;
4272 
4273 		vm_page_unlock_queues();
4274 	}
4275 	vm_object_activity_end(object);
4276 	vm_object_unlock(object);
4277 
4278 	return retval;
4279 }
4280 
4281 
4282 static void
vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue * eq,boolean_t req_lowpriority)4283 vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *eq, boolean_t req_lowpriority)
4284 {
4285 	uint32_t        policy;
4286 
4287 	if (hibernate_cleaning_in_progress == TRUE) {
4288 		req_lowpriority = FALSE;
4289 	}
4290 
4291 	if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority) {
4292 		vm_page_unlock_queues();
4293 
4294 		if (req_lowpriority == TRUE) {
4295 			policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4296 			DTRACE_VM(laundrythrottle);
4297 		} else {
4298 			policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4299 			DTRACE_VM(laundryunthrottle);
4300 		}
4301 		proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid,
4302 		    TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4303 
4304 		vm_page_lock_queues();
4305 		eq->pgo_lowpriority = req_lowpriority;
4306 	}
4307 }
4308 
4309 
4310 static void
vm_pageout_iothread_external(void)4311 vm_pageout_iothread_external(void)
4312 {
4313 	thread_t        self = current_thread();
4314 
4315 	self->options |= TH_OPT_VMPRIV;
4316 
4317 	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4318 
4319 	proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4320 	    TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4321 
4322 	vm_page_lock_queues();
4323 
4324 	vm_pageout_queue_external.pgo_tid = self->thread_id;
4325 	vm_pageout_queue_external.pgo_lowpriority = TRUE;
4326 	vm_pageout_queue_external.pgo_inited = TRUE;
4327 
4328 	vm_page_unlock_queues();
4329 
4330 #if CONFIG_THREAD_GROUPS
4331 	thread_group_vm_add();
4332 #endif /* CONFIG_THREAD_GROUPS */
4333 
4334 	vm_pageout_iothread_external_continue(&vm_pageout_queue_external);
4335 
4336 	/*NOTREACHED*/
4337 }
4338 
4339 
4340 static void
vm_pageout_iothread_internal(struct cq * cq)4341 vm_pageout_iothread_internal(struct cq *cq)
4342 {
4343 	thread_t        self = current_thread();
4344 
4345 	self->options |= TH_OPT_VMPRIV;
4346 
4347 	vm_page_lock_queues();
4348 
4349 	vm_pageout_queue_internal.pgo_tid = self->thread_id;
4350 	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4351 	vm_pageout_queue_internal.pgo_inited = TRUE;
4352 
4353 	vm_page_unlock_queues();
4354 
4355 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4356 		thread_vm_bind_group_add();
4357 	}
4358 
4359 #if CONFIG_THREAD_GROUPS
4360 	thread_group_vm_add();
4361 #endif /* CONFIG_THREAD_GROUPS */
4362 
4363 #if __AMP__
4364 	if (vm_compressor_ebound) {
4365 		/*
4366 		 * Use the soft bound option for vm_compressor to allow it to run on
4367 		 * P-cores if E-cluster is unavailable.
4368 		 */
4369 		thread_bind_cluster_type(self, 'E', true);
4370 	}
4371 #endif /* __AMP__ */
4372 
4373 	thread_set_thread_name(current_thread(), "VM_compressor");
4374 #if DEVELOPMENT || DEBUG
4375 	vmct_stats.vmct_minpages[cq->id] = INT32_MAX;
4376 #endif
4377 	vm_pageout_iothread_internal_continue(cq);
4378 
4379 	/*NOTREACHED*/
4380 }
4381 
4382 kern_return_t
vm_set_buffer_cleanup_callout(boolean_t (* func)(int))4383 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4384 {
4385 	if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4386 		return KERN_SUCCESS;
4387 	} else {
4388 		return KERN_FAILURE; /* Already set */
4389 	}
4390 }
4391 
4392 extern boolean_t        memorystatus_manual_testing_on;
4393 extern unsigned int     memorystatus_level;
4394 
4395 
#if VM_PRESSURE_EVENTS

boolean_t vm_pressure_events_enabled = FALSE;

extern uint64_t next_warning_notification_sent_at_ts;
extern uint64_t next_critical_notification_sent_at_ts;

#define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS    (30)    /* 30 minutes. */

/*
 * Absolute time of the last change in pressure level, or of the last
 * check that was forced because the system was stuck in a non-normal
 * pressure level.
 */
uint64_t  vm_pressure_last_level_transition_abs = 0;

/*
 * How long (in minutes) the system waits 'stuck' in an unchanged
 * non-normal pressure level before notifications for that level are
 * sent out again.
 */
int  vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;

/*
 *	vm_pressure_response:
 *
 *	Recompute memorystatus_level from the currently available memory
 *	and walk the pressure-level state machine.  When the level
 *	changes, or when a non-normal level has lasted at least
 *	vm_pressure_level_transition_threshold minutes, the new level is
 *	stored in memorystatus_vm_pressure_level and the pressure thread
 *	and any waiters on vm_pressure_changed are woken.
 *
 *	Does nothing while vm_pressure_events_enabled is FALSE.  Only
 *	memorystatus_level (and, on macOS, memorystatus_available_pages)
 *	is refreshed while memorystatus_manual_testing_on is set.
 */
void
vm_pressure_response(void)
{
	vm_pressure_level_t     prev_level;
	int                     next_level = -1;        /* -1: no level selected */
	unsigned int            total_pages;
	uint64_t                available_memory;
	uint64_t                now_abs, abs_since_transition, ns_since_transition;
	bool                    force_check;
	int                     mins_since_transition;

	if (vm_pressure_events_enabled == FALSE) {
		return;
	}

#if !XNU_TARGET_OS_OSX

	available_memory = (uint64_t) memorystatus_available_pages;

#else /* !XNU_TARGET_OS_OSX */

	available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
	memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;

#endif /* !XNU_TARGET_OS_OSX */

	total_pages = (unsigned int) atop_64(max_mem);
#if CONFIG_SECLUDED_MEMORY
	total_pages -= vm_page_secluded_count;
#endif /* CONFIG_SECLUDED_MEMORY */

	/* available memory as a percentage of total pages */
	memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);

	if (memorystatus_manual_testing_on) {
		return;
	}

	/* how many whole minutes have passed since the last recorded transition? */
	now_abs = mach_absolute_time();
	abs_since_transition = now_abs - vm_pressure_last_level_transition_abs;

	absolutetime_to_nanoseconds(abs_since_transition, &ns_since_transition);
	mins_since_transition = (int) ((ns_since_transition / NSEC_PER_SEC) / 60);
	force_check = (mins_since_transition >= vm_pressure_level_transition_threshold);

	prev_level = memorystatus_vm_pressure_level;

	switch (memorystatus_vm_pressure_level) {
	case kVMPressureNormal:
		if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			next_level = kVMPressureCritical;
		} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
			next_level = kVMPressureWarning;
		}
		break;

	case kVMPressureWarning:
	case kVMPressureUrgent:
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			next_level = kVMPressureNormal;
		} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			next_level = kVMPressureCritical;
		} else if (force_check) {
			/* stuck at this level: re-issue the warning */
			next_level = kVMPressureWarning;
			next_warning_notification_sent_at_ts = now_abs;
		}
		break;

	case kVMPressureCritical:
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			next_level = kVMPressureNormal;
		} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
			next_level = kVMPressureWarning;
		} else if (force_check) {
			/* stuck at this level: re-issue the critical notification */
			next_level = kVMPressureCritical;
			next_critical_notification_sent_at_ts = now_abs;
		}
		break;

	default:
		return;
	}

	if (next_level == -1 && !force_check) {
		/* no level selected and no forced check: nothing to report */
		return;
	}

	if (next_level == -1) {
		/* forced check with no level selected: trace it only */
		VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
		    next_level, prev_level, force_check, 0);
	} else {
		memorystatus_vm_pressure_level = (vm_pressure_level_t) next_level;

		if (next_level != (int) prev_level) {
			VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
			    next_level, prev_level, 0, 0);
		}
	}

	if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
		/*
		 * We don't want to schedule a wakeup while hibernation is in progress
		 * because that could collide with checks for non-monotonicity in the scheduler.
		 * We do however do all the updates to memorystatus_vm_pressure_level because
		 * we _might_ want to use that for decisions regarding which pages or how
		 * many pages we want to dump in hibernation.
		 */
		return;
	}

	if ((memorystatus_vm_pressure_level == kVMPressureNormal) && (prev_level == memorystatus_vm_pressure_level) && !force_check) {
		/* still normal, unchanged and not forced: no wakeups needed */
		return;
	}

	if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
		thread_wakeup(&vm_pressure_thread);
	}

	if (prev_level != memorystatus_vm_pressure_level) {
		thread_wakeup(&vm_pageout_state.vm_pressure_changed);
	}

	/* renew the window of observation for a stuck pressure level */
	vm_pressure_last_level_transition_abs = now_abs;
}
#endif /* VM_PRESSURE_EVENTS */
4542 
4543 /*
4544  * Function called by a kernel thread to either get the current pressure level or
4545  * wait until memory pressure changes from a given level.
4546  */
4547 kern_return_t
mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure,__unused unsigned int * pressure_level)4548 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
4549 {
4550 #if !VM_PRESSURE_EVENTS
4551 
4552 	return KERN_FAILURE;
4553 
4554 #else /* VM_PRESSURE_EVENTS */
4555 
4556 	wait_result_t       wr = 0;
4557 	vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4558 
4559 	if (pressure_level == NULL) {
4560 		return KERN_INVALID_ARGUMENT;
4561 	}
4562 
4563 	if (*pressure_level == kVMPressureJetsam) {
4564 		if (!wait_for_pressure) {
4565 			return KERN_INVALID_ARGUMENT;
4566 		}
4567 
4568 		lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
4569 		wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
4570 		    THREAD_INTERRUPTIBLE);
4571 		if (wr == THREAD_WAITING) {
4572 			++memorystatus_jetsam_fg_band_waiters;
4573 			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4574 			wr = thread_block(THREAD_CONTINUE_NULL);
4575 		} else {
4576 			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4577 		}
4578 		if (wr != THREAD_AWAKENED) {
4579 			return KERN_ABORTED;
4580 		}
4581 		*pressure_level = kVMPressureJetsam;
4582 		return KERN_SUCCESS;
4583 	}
4584 
4585 	if (wait_for_pressure == TRUE) {
4586 		while (old_level == *pressure_level) {
4587 			wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4588 			    THREAD_INTERRUPTIBLE);
4589 			if (wr == THREAD_WAITING) {
4590 				wr = thread_block(THREAD_CONTINUE_NULL);
4591 			}
4592 			if (wr == THREAD_INTERRUPTED) {
4593 				return KERN_ABORTED;
4594 			}
4595 
4596 			if (wr == THREAD_AWAKENED) {
4597 				old_level = memorystatus_vm_pressure_level;
4598 			}
4599 		}
4600 	}
4601 
4602 	*pressure_level = old_level;
4603 	return KERN_SUCCESS;
4604 #endif /* VM_PRESSURE_EVENTS */
4605 }
4606 
4607 #if VM_PRESSURE_EVENTS
4608 void
vm_pressure_thread(void)4609 vm_pressure_thread(void)
4610 {
4611 	static boolean_t thread_initialized = FALSE;
4612 
4613 	if (thread_initialized == TRUE) {
4614 		vm_pageout_state.vm_pressure_thread_running = TRUE;
4615 		consider_vm_pressure_events();
4616 		vm_pageout_state.vm_pressure_thread_running = FALSE;
4617 	}
4618 
4619 #if CONFIG_THREAD_GROUPS
4620 	thread_group_vm_add();
4621 #endif /* CONFIG_THREAD_GROUPS */
4622 
4623 	thread_set_thread_name(current_thread(), "VM_pressure");
4624 	thread_initialized = TRUE;
4625 	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4626 	thread_block((thread_continue_t)vm_pressure_thread);
4627 }
4628 #endif /* VM_PRESSURE_EVENTS */
4629 
4630 
4631 /*
4632  * called once per-second via "compute_averages"
4633  */
4634 void
compute_pageout_gc_throttle(__unused void * arg)4635 compute_pageout_gc_throttle(__unused void *arg)
4636 {
4637 	if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4638 		vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4639 
4640 		thread_wakeup(VM_PAGEOUT_GC_EVENT);
4641 	}
4642 }
4643 
4644 /*
4645  * vm_pageout_garbage_collect can also be called when the zone allocator needs
4646  * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4647  * jetsams. We need to check if the zone map size is above its jetsam limit to
4648  * decide if this was indeed the case.
4649  *
4650  * We need to do this on a different thread because of the following reasons:
4651  *
4652  * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4653  * itself causing the system to hang. We perform synchronous jetsams if we're
4654  * leaking in the VM map entries zone, so the leaking process could be doing a
4655  * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4656  * jetsam itself. We also need the vm_map lock on the process termination path,
4657  * which would now lead the dying process to deadlock against itself.
4658  *
4659  * 2. The jetsam path might need to allocate zone memory itself. We could try
4660  * using the non-blocking variant of zalloc for this path, but we can still
4661  * end up trying to do a kernel_memory_allocate when the zone maps are almost
4662  * full.
4663  */
4664 __dead2
4665 void
vm_pageout_garbage_collect(void * step,wait_result_t wr __unused)4666 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4667 {
4668 	assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4669 
4670 	if (step == VM_PAGEOUT_GC_INIT) {
4671 		/* first time being called is not about GC */
4672 #if CONFIG_THREAD_GROUPS
4673 		thread_group_vm_add();
4674 #endif /* CONFIG_THREAD_GROUPS */
4675 	} else if (zone_map_nearing_exhaustion()) {
4676 		/*
4677 		 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4678 		 *
4679 		 * Bail out after calling zone_gc (which triggers the
4680 		 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4681 		 * operations that clear out a bunch of caches might allocate zone
4682 		 * memory themselves (for eg. vm_map operations would need VM map
4683 		 * entries). Since the zone map is almost full at this point, we
4684 		 * could end up with a panic. We just need to quickly jetsam a
4685 		 * process and exit here.
4686 		 *
4687 		 * It could so happen that we were woken up to relieve memory
4688 		 * pressure and the zone map also happened to be near its limit at
4689 		 * the time, in which case we'll skip out early. But that should be
4690 		 * ok; if memory pressure persists, the thread will simply be woken
4691 		 * up again.
4692 		 */
4693 		zone_gc(ZONE_GC_JETSAM);
4694 	} else {
4695 		/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4696 		boolean_t buf_large_zfree = FALSE;
4697 		boolean_t first_try = TRUE;
4698 
4699 		stack_collect();
4700 
4701 		consider_machine_collect();
4702 		mbuf_drain(FALSE);
4703 
4704 		do {
4705 			if (consider_buffer_cache_collect != NULL) {
4706 				buf_large_zfree = (*consider_buffer_cache_collect)(0);
4707 			}
4708 			if (first_try == TRUE || buf_large_zfree == TRUE) {
4709 				/*
4710 				 * zone_gc should be last, because the other operations
4711 				 * might return memory to zones.
4712 				 */
4713 				zone_gc(ZONE_GC_TRIM);
4714 			}
4715 			first_try = FALSE;
4716 		} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
4717 
4718 		consider_machine_adjust();
4719 	}
4720 
4721 	assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);
4722 
4723 	thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
4724 	__builtin_unreachable();
4725 }
4726 
4727 
4728 #if VM_PAGE_BUCKETS_CHECK
4729 #if VM_PAGE_FAKE_BUCKETS
4730 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4731 #endif /* VM_PAGE_FAKE_BUCKETS */
4732 #endif /* VM_PAGE_BUCKETS_CHECK */
4733 
4734 
4735 
4736 void
vm_set_restrictions(unsigned int num_cpus)4737 vm_set_restrictions(unsigned int num_cpus)
4738 {
4739 	int vm_restricted_to_single_processor = 0;
4740 
4741 	if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
4742 		kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
4743 		vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
4744 	} else {
4745 		assert(num_cpus > 0);
4746 
4747 		if (num_cpus <= 3) {
4748 			/*
4749 			 * on systems with a limited number of CPUS, bind the
4750 			 * 4 major threads that can free memory and that tend to use
4751 			 * a fair bit of CPU under pressured conditions to a single processor.
4752 			 * This insures that these threads don't hog all of the available CPUs
4753 			 * (important for camera launch), while allowing them to run independently
4754 			 * w/r to locks... the 4 threads are
4755 			 * vm_pageout_scan,  vm_pageout_iothread_internal (compressor),
4756 			 * vm_compressor_swap_trigger_thread (minor and major compactions),
4757 			 * memorystatus_thread (jetsams).
4758 			 *
4759 			 * the first time the thread is run, it is responsible for checking the
4760 			 * state of vm_restricted_to_single_processor, and if TRUE it calls
4761 			 * thread_bind_master...  someday this should be replaced with a group
4762 			 * scheduling mechanism and KPI.
4763 			 */
4764 			vm_pageout_state.vm_restricted_to_single_processor = TRUE;
4765 		} else {
4766 			vm_pageout_state.vm_restricted_to_single_processor = FALSE;
4767 		}
4768 	}
4769 }
4770 
4771 /*
4772  * Set up vm_config based on the vm_compressor_mode.
4773  * Must run BEFORE the pageout thread starts up.
4774  */
4775 __startup_func
4776 void
vm_config_init(void)4777 vm_config_init(void)
4778 {
4779 	bzero(&vm_config, sizeof(vm_config));
4780 
4781 	switch (vm_compressor_mode) {
4782 	case VM_PAGER_DEFAULT:
4783 		printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
4784 		OS_FALLTHROUGH;
4785 
4786 	case VM_PAGER_COMPRESSOR_WITH_SWAP:
4787 		vm_config.compressor_is_present = TRUE;
4788 		vm_config.swap_is_present = TRUE;
4789 		vm_config.compressor_is_active = TRUE;
4790 		vm_config.swap_is_active = TRUE;
4791 		break;
4792 
4793 	case VM_PAGER_COMPRESSOR_NO_SWAP:
4794 		vm_config.compressor_is_present = TRUE;
4795 		vm_config.swap_is_present = TRUE;
4796 		vm_config.compressor_is_active = TRUE;
4797 		break;
4798 
4799 	case VM_PAGER_FREEZER_DEFAULT:
4800 		printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
4801 		OS_FALLTHROUGH;
4802 
4803 	case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
4804 		vm_config.compressor_is_present = TRUE;
4805 		vm_config.swap_is_present = TRUE;
4806 		break;
4807 
4808 	case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
4809 		vm_config.compressor_is_present = TRUE;
4810 		vm_config.swap_is_present = TRUE;
4811 		vm_config.compressor_is_active = TRUE;
4812 		vm_config.freezer_swap_is_active = TRUE;
4813 		break;
4814 
4815 	case VM_PAGER_NOT_CONFIGURED:
4816 		break;
4817 
4818 	default:
4819 		printf("unknown compressor mode - %x\n", vm_compressor_mode);
4820 		break;
4821 	}
4822 }
4823 
4824 __startup_func
4825 static void
vm_pageout_create_gc_thread(void)4826 vm_pageout_create_gc_thread(void)
4827 {
4828 	thread_t thread;
4829 
4830 	if (kernel_thread_create(vm_pageout_garbage_collect,
4831 	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
4832 		panic("vm_pageout_garbage_collect: create failed");
4833 	}
4834 	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
4835 	if (thread->reserved_stack == 0) {
4836 		assert(thread->kernel_stack);
4837 		thread->reserved_stack = thread->kernel_stack;
4838 	}
4839 
4840 	/* thread is started in vm_pageout() */
4841 	vm_pageout_gc_thread = thread;
4842 }
4843 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
4844 
4845 void
vm_pageout(void)4846 vm_pageout(void)
4847 {
4848 	thread_t        self = current_thread();
4849 	thread_t        thread;
4850 	kern_return_t   result;
4851 	spl_t           s;
4852 
4853 	/*
4854 	 * Set thread privileges.
4855 	 */
4856 	s = splsched();
4857 
4858 #if CONFIG_VPS_DYNAMIC_PRIO
4859 
4860 	int             vps_dynprio_bootarg = 0;
4861 
4862 	if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) {
4863 		vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? TRUE : FALSE);
4864 		kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled);
4865 	} else {
4866 		if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4867 			vps_dynamic_priority_enabled = TRUE;
4868 		} else {
4869 			vps_dynamic_priority_enabled = FALSE;
4870 		}
4871 	}
4872 
4873 	if (vps_dynamic_priority_enabled) {
4874 		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
4875 		thread_set_eager_preempt(self);
4876 	} else {
4877 		sched_set_kernel_thread_priority(self, BASEPRI_VM);
4878 	}
4879 
4880 #else /* CONFIG_VPS_DYNAMIC_PRIO */
4881 
4882 	vps_dynamic_priority_enabled = FALSE;
4883 	sched_set_kernel_thread_priority(self, BASEPRI_VM);
4884 
4885 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
4886 
4887 	thread_lock(self);
4888 	self->options |= TH_OPT_VMPRIV;
4889 	thread_unlock(self);
4890 
4891 	if (!self->reserved_stack) {
4892 		self->reserved_stack = self->kernel_stack;
4893 	}
4894 
4895 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
4896 	    vps_dynamic_priority_enabled == FALSE) {
4897 		thread_vm_bind_group_add();
4898 	}
4899 
4900 
4901 #if CONFIG_THREAD_GROUPS
4902 	thread_group_vm_add();
4903 #endif /* CONFIG_THREAD_GROUPS */
4904 
4905 #if __AMP__
4906 	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
4907 	if (vm_pgo_pbound) {
4908 		/*
4909 		 * Use the soft bound option for vm pageout to allow it to run on
4910 		 * E-cores if P-cluster is unavailable.
4911 		 */
4912 		thread_bind_cluster_type(self, 'P', true);
4913 	}
4914 #endif /* __AMP__ */
4915 
4916 	splx(s);
4917 
4918 	thread_set_thread_name(current_thread(), "VM_pageout_scan");
4919 
4920 	/*
4921 	 *	Initialize some paging parameters.
4922 	 */
4923 
4924 	vm_pageout_state.vm_pressure_thread_running = FALSE;
4925 	vm_pageout_state.vm_pressure_changed = FALSE;
4926 	vm_pageout_state.memorystatus_purge_on_warning = 2;
4927 	vm_pageout_state.memorystatus_purge_on_urgent = 5;
4928 	vm_pageout_state.memorystatus_purge_on_critical = 8;
4929 	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
4930 	vm_pageout_state.vm_page_speculative_percentage = 5;
4931 	vm_pageout_state.vm_page_speculative_target = 0;
4932 
4933 	vm_pageout_state.vm_pageout_external_iothread = THREAD_NULL;
4934 	vm_pageout_state.vm_pageout_internal_iothread = THREAD_NULL;
4935 
4936 	vm_pageout_state.vm_pageout_swap_wait = 0;
4937 	vm_pageout_state.vm_pageout_idle_wait = 0;
4938 	vm_pageout_state.vm_pageout_empty_wait = 0;
4939 	vm_pageout_state.vm_pageout_burst_wait = 0;
4940 	vm_pageout_state.vm_pageout_deadlock_wait = 0;
4941 	vm_pageout_state.vm_pageout_deadlock_relief = 0;
4942 	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
4943 
4944 	vm_pageout_state.vm_pageout_inactive = 0;
4945 	vm_pageout_state.vm_pageout_inactive_used = 0;
4946 	vm_pageout_state.vm_pageout_inactive_clean = 0;
4947 
4948 	vm_pageout_state.vm_memory_pressure = 0;
4949 	vm_pageout_state.vm_page_filecache_min = 0;
4950 #if CONFIG_JETSAM
4951 	vm_pageout_state.vm_page_filecache_min_divisor = 70;
4952 	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
4953 #else
4954 	vm_pageout_state.vm_page_filecache_min_divisor = 27;
4955 	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
4956 #endif
4957 	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
4958 
4959 	vm_pageout_state.vm_pageout_considered_page_last = 0;
4960 
4961 	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
4962 		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
4963 	}
4964 
4965 	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
4966 		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
4967 	}
4968 
4969 	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
4970 		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
4971 	}
4972 
4973 	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
4974 		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
4975 	}
4976 
4977 	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
4978 		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
4979 	}
4980 
4981 	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
4982 		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
4983 	}
4984 
4985 	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
4986 		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
4987 	}
4988 	/*
4989 	 * even if we've already called vm_page_free_reserve
4990 	 * call it again here to insure that the targets are
4991 	 * accurately calculated (it uses vm_page_free_count_init)
4992 	 * calling it with an arg of 0 will not change the reserve
4993 	 * but will re-calculate free_min and free_target
4994 	 */
4995 	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
4996 		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
4997 	} else {
4998 		vm_page_free_reserve(0);
4999 	}
5000 
5001 
5002 	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5003 	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5004 	vm_pageout_queue_external.pgo_laundry = 0;
5005 	vm_pageout_queue_external.pgo_idle = FALSE;
5006 	vm_pageout_queue_external.pgo_busy = FALSE;
5007 	vm_pageout_queue_external.pgo_throttled = FALSE;
5008 	vm_pageout_queue_external.pgo_draining = FALSE;
5009 	vm_pageout_queue_external.pgo_lowpriority = FALSE;
5010 	vm_pageout_queue_external.pgo_tid = -1;
5011 	vm_pageout_queue_external.pgo_inited = FALSE;
5012 
5013 	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5014 	vm_pageout_queue_internal.pgo_maxlaundry = 0;
5015 	vm_pageout_queue_internal.pgo_laundry = 0;
5016 	vm_pageout_queue_internal.pgo_idle = FALSE;
5017 	vm_pageout_queue_internal.pgo_busy = FALSE;
5018 	vm_pageout_queue_internal.pgo_throttled = FALSE;
5019 	vm_pageout_queue_internal.pgo_draining = FALSE;
5020 	vm_pageout_queue_internal.pgo_lowpriority = FALSE;
5021 	vm_pageout_queue_internal.pgo_tid = -1;
5022 	vm_pageout_queue_internal.pgo_inited = FALSE;
5023 
5024 	/* internal pageout thread started when default pager registered first time */
5025 	/* external pageout and garbage collection threads started here */
5026 
5027 	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
5028 	    BASEPRI_VM,
5029 	    &vm_pageout_state.vm_pageout_external_iothread);
5030 	if (result != KERN_SUCCESS) {
5031 		panic("vm_pageout_iothread_external: create failed");
5032 	}
5033 	thread_set_thread_name(vm_pageout_state.vm_pageout_external_iothread, "VM_pageout_external_iothread");
5034 	thread_deallocate(vm_pageout_state.vm_pageout_external_iothread);
5035 
5036 	thread_mtx_lock(vm_pageout_gc_thread );
5037 	thread_start(vm_pageout_gc_thread );
5038 	thread_mtx_unlock(vm_pageout_gc_thread);
5039 
5040 #if VM_PRESSURE_EVENTS
5041 	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5042 	    BASEPRI_DEFAULT,
5043 	    &thread);
5044 
5045 	if (result != KERN_SUCCESS) {
5046 		panic("vm_pressure_thread: create failed");
5047 	}
5048 
5049 	thread_deallocate(thread);
5050 #endif
5051 
5052 	vm_object_reaper_init();
5053 
5054 
5055 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5056 		vm_compressor_init();
5057 	}
5058 
5059 #if VM_PRESSURE_EVENTS
5060 	vm_pressure_events_enabled = TRUE;
5061 #endif /* VM_PRESSURE_EVENTS */
5062 
5063 #if CONFIG_PHANTOM_CACHE
5064 	vm_phantom_cache_init();
5065 #endif
5066 #if VM_PAGE_BUCKETS_CHECK
5067 #if VM_PAGE_FAKE_BUCKETS
5068 	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5069 	    (uint64_t) vm_page_fake_buckets_start,
5070 	    (uint64_t) vm_page_fake_buckets_end);
5071 	pmap_protect(kernel_pmap,
5072 	    vm_page_fake_buckets_start,
5073 	    vm_page_fake_buckets_end,
5074 	    VM_PROT_READ);
5075 //	*(char *) vm_page_fake_buckets_start = 'x';	/* panic! */
5076 #endif /* VM_PAGE_FAKE_BUCKETS */
5077 #endif /* VM_PAGE_BUCKETS_CHECK */
5078 
5079 #if VM_OBJECT_TRACKING
5080 	vm_object_tracking_init();
5081 #endif /* VM_OBJECT_TRACKING */
5082 
5083 #if __arm64__
5084 //	vm_tests();
5085 #endif /* __arm64__ */
5086 
5087 	vm_pageout_continue();
5088 
5089 	/*
5090 	 * Unreached code!
5091 	 *
5092 	 * The vm_pageout_continue() call above never returns, so the code below is never
5093 	 * executed.  We take advantage of this to declare several DTrace VM related probe
5094 	 * points that our kernel doesn't have an analog for.  These are probe points that
5095 	 * exist in Solaris and are in the DTrace documentation, so people may have written
5096 	 * scripts that use them.  Declaring the probe points here means their scripts will
5097 	 * compile and execute which we want for portability of the scripts, but since this
5098 	 * section of code is never reached, the probe points will simply never fire.  Yes,
5099 	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
5100 	 * Solaris specific VM events in mind, not portability to different VM implementations.
5101 	 */
5102 
5103 	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5104 	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5105 	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5106 	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5107 	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5108 	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5109 	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5110 	/*NOTREACHED*/
5111 }
5112 
5113 
5114 
5115 kern_return_t
vm_pageout_internal_start(void)5116 vm_pageout_internal_start(void)
5117 {
5118 	kern_return_t   result = KERN_SUCCESS;
5119 	host_basic_info_data_t hinfo;
5120 	vm_offset_t     buf, bufsize;
5121 
5122 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5123 
5124 	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5125 #define BSD_HOST 1
5126 	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5127 
5128 	assert(hinfo.max_cpus > 0);
5129 
5130 #if !XNU_TARGET_OS_OSX
5131 	vm_pageout_state.vm_compressor_thread_count = 1;
5132 #else /* !XNU_TARGET_OS_OSX */
5133 	if (hinfo.max_cpus > 4) {
5134 		vm_pageout_state.vm_compressor_thread_count = 2;
5135 	} else {
5136 		vm_pageout_state.vm_compressor_thread_count = 1;
5137 	}
5138 #endif /* !XNU_TARGET_OS_OSX */
5139 	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5140 	    sizeof(vm_pageout_state.vm_compressor_thread_count));
5141 
5142 #if     __AMP__
5143 	PE_parse_boot_argn("vmcomp_ecluster", &vm_compressor_ebound, sizeof(vm_compressor_ebound));
5144 	if (vm_compressor_ebound) {
5145 		vm_pageout_state.vm_compressor_thread_count = 2;
5146 	}
5147 #endif
5148 	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5149 		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5150 	}
5151 	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5152 		vm_pageout_state.vm_compressor_thread_count = 1;
5153 	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5154 		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5155 	}
5156 
5157 	vm_pageout_queue_internal.pgo_maxlaundry =
5158 	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5159 
5160 	PE_parse_boot_argn("vmpgoi_maxlaundry",
5161 	    &vm_pageout_queue_internal.pgo_maxlaundry,
5162 	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5163 
5164 	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5165 	if (kernel_memory_allocate(kernel_map, &buf,
5166 	    bufsize * vm_pageout_state.vm_compressor_thread_count,
5167 	    0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_COMPRESSOR)) {
5168 		panic("vm_pageout_internal_start: Unable to allocate %zd bytes",
5169 		    (size_t)(bufsize * vm_pageout_state.vm_compressor_thread_count));
5170 	}
5171 
5172 	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5173 		ciq[i].id = i;
5174 		ciq[i].q = &vm_pageout_queue_internal;
5175 		ciq[i].current_chead = NULL;
5176 		ciq[i].scratch_buf = (char *)(buf + i * bufsize);
5177 
5178 		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5179 		    (void *)&ciq[i], BASEPRI_VM,
5180 		    &vm_pageout_state.vm_pageout_internal_iothread);
5181 
5182 		if (result == KERN_SUCCESS) {
5183 			thread_deallocate(vm_pageout_state.vm_pageout_internal_iothread);
5184 		} else {
5185 			break;
5186 		}
5187 	}
5188 	return result;
5189 }
5190 
5191 #if CONFIG_IOSCHED
5192 /*
5193  * To support I/O Expedite for compressed files we mark the upls with special flags.
5194  * The way decmpfs works is that we create a big upl which marks all the pages needed to
5195  * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5196  * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5197  * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5198  * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5199  * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5200  * by the req upl lock (the reverse link doesnt need synch. since we never inspect this link
5201  * unless the real I/O upl is being destroyed).
5202  */
5203 
5204 
5205 static void
upl_set_decmp_info(upl_t upl,upl_t src_upl)5206 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5207 {
5208 	assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5209 
5210 	upl_lock(src_upl);
5211 	if (src_upl->decmp_io_upl) {
5212 		/*
5213 		 * If there is already an alive real I/O UPL, ignore this new UPL.
5214 		 * This case should rarely happen and even if it does, it just means
5215 		 * that we might issue a spurious expedite which the driver is expected
5216 		 * to handle.
5217 		 */
5218 		upl_unlock(src_upl);
5219 		return;
5220 	}
5221 	src_upl->decmp_io_upl = (void *)upl;
5222 	src_upl->ref_count++;
5223 
5224 	upl->flags |= UPL_DECMP_REAL_IO;
5225 	upl->decmp_io_upl = (void *)src_upl;
5226 	upl_unlock(src_upl);
5227 }
5228 #endif /* CONFIG_IOSCHED */
5229 
5230 #if UPL_DEBUG
5231 int     upl_debug_enabled = 1;
5232 #else
5233 int     upl_debug_enabled = 0;
5234 #endif
5235 
5236 static upl_t
upl_create(int type,int flags,upl_size_t size)5237 upl_create(int type, int flags, upl_size_t size)
5238 {
5239 	upl_t   upl;
5240 	vm_size_t       page_field_size = 0;
5241 	int     upl_flags = 0;
5242 	vm_size_t       upl_size  = sizeof(struct upl);
5243 
5244 	assert(page_aligned(size));
5245 
5246 	size = round_page_32(size);
5247 
5248 	if (type & UPL_CREATE_LITE) {
5249 		page_field_size = (atop(size) + 7) >> 3;
5250 		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
5251 
5252 		upl_flags |= UPL_LITE;
5253 	}
5254 	if (type & UPL_CREATE_INTERNAL) {
5255 		upl_size += sizeof(struct upl_page_info) * atop(size);
5256 
5257 		upl_flags |= UPL_INTERNAL;
5258 	}
5259 	upl = (upl_t)kheap_alloc(KHEAP_DEFAULT, upl_size + page_field_size, Z_WAITOK | Z_ZERO);
5260 
5261 	upl->flags = upl_flags | flags;
5262 	upl->ref_count = 1;
5263 	upl_lock_init(upl);
5264 #if CONFIG_IOSCHED
5265 	if (type & UPL_CREATE_IO_TRACKING) {
5266 		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5267 	}
5268 
5269 	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5270 		/* Only support expedite on internal UPLs */
5271 		thread_t        curthread = current_thread();
5272 		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * atop(size), Z_WAITOK | Z_ZERO);
5273 		upl->flags |= UPL_EXPEDITE_SUPPORTED;
5274 		if (curthread->decmp_upl != NULL) {
5275 			upl_set_decmp_info(upl, curthread->decmp_upl);
5276 		}
5277 	}
5278 #endif
5279 #if CONFIG_IOSCHED || UPL_DEBUG
5280 	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5281 		upl->upl_creator = current_thread();
5282 		upl->flags |= UPL_TRACKED_BY_OBJECT;
5283 	}
5284 #endif
5285 
5286 #if UPL_DEBUG
5287 	(void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
5288 #endif /* UPL_DEBUG */
5289 
5290 	return upl;
5291 }
5292 
5293 static void
upl_destroy(upl_t upl)5294 upl_destroy(upl_t upl)
5295 {
5296 	int     page_field_size;  /* bit field in word size buf */
5297 	int     size;
5298 
5299 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5300 
5301 	if (upl->ext_ref_count) {
5302 		panic("upl(%p) ext_ref_count", upl);
5303 	}
5304 
5305 #if CONFIG_IOSCHED
5306 	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5307 		upl_t src_upl;
5308 		src_upl = upl->decmp_io_upl;
5309 		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5310 		upl_lock(src_upl);
5311 		src_upl->decmp_io_upl = NULL;
5312 		upl_unlock(src_upl);
5313 		upl_deallocate(src_upl);
5314 	}
5315 #endif /* CONFIG_IOSCHED */
5316 
5317 #if CONFIG_IOSCHED || UPL_DEBUG
5318 	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5319 	    !(upl->flags & UPL_VECTOR)) {
5320 		vm_object_t     object;
5321 
5322 		if (upl->flags & UPL_SHADOWED) {
5323 			object = upl->map_object->shadow;
5324 		} else {
5325 			object = upl->map_object;
5326 		}
5327 
5328 		vm_object_lock(object);
5329 		queue_remove(&object->uplq, upl, upl_t, uplq);
5330 		vm_object_activity_end(object);
5331 		vm_object_collapse(object, 0, TRUE);
5332 		vm_object_unlock(object);
5333 	}
5334 #endif
5335 	/*
5336 	 * drop a reference on the map_object whether or
5337 	 * not a pageout object is inserted
5338 	 */
5339 	if (upl->flags & UPL_SHADOWED) {
5340 		vm_object_deallocate(upl->map_object);
5341 	}
5342 
5343 	if (upl->flags & UPL_DEVICE_MEMORY) {
5344 		size = PAGE_SIZE;
5345 	} else {
5346 		size = upl_adjusted_size(upl, PAGE_MASK);
5347 	}
5348 	page_field_size = 0;
5349 
5350 	if (upl->flags & UPL_LITE) {
5351 		page_field_size = ((size / PAGE_SIZE) + 7) >> 3;
5352 		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
5353 	}
5354 	upl_lock_destroy(upl);
5355 	upl->vector_upl = (vector_upl_t) 0xfeedbeef;
5356 
5357 #if CONFIG_IOSCHED
5358 	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5359 		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * (size / PAGE_SIZE));
5360 	}
5361 #endif
5362 
5363 	if (upl->flags & UPL_INTERNAL) {
5364 		kheap_free(KHEAP_DEFAULT, upl,
5365 		    sizeof(struct upl) +
5366 		    (sizeof(struct upl_page_info) * (size / PAGE_SIZE))
5367 		    + page_field_size);
5368 	} else {
5369 		kheap_free(KHEAP_DEFAULT, upl, sizeof(struct upl) + page_field_size);
5370 	}
5371 }
5372 
5373 void
upl_deallocate(upl_t upl)5374 upl_deallocate(upl_t upl)
5375 {
5376 	upl_lock(upl);
5377 
5378 	if (--upl->ref_count == 0) {
5379 		if (vector_upl_is_valid(upl)) {
5380 			vector_upl_deallocate(upl);
5381 		}
5382 		upl_unlock(upl);
5383 
5384 		if (upl->upl_iodone) {
5385 			upl_callout_iodone(upl);
5386 		}
5387 
5388 		upl_destroy(upl);
5389 	} else {
5390 		upl_unlock(upl);
5391 	}
5392 }
5393 
5394 #if CONFIG_IOSCHED
5395 void
upl_mark_decmp(upl_t upl)5396 upl_mark_decmp(upl_t upl)
5397 {
5398 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5399 		upl->flags |= UPL_DECMP_REQ;
5400 		upl->upl_creator->decmp_upl = (void *)upl;
5401 	}
5402 }
5403 
5404 void
upl_unmark_decmp(upl_t upl)5405 upl_unmark_decmp(upl_t upl)
5406 {
5407 	if (upl && (upl->flags & UPL_DECMP_REQ)) {
5408 		upl->upl_creator->decmp_upl = NULL;
5409 	}
5410 }
5411 
5412 #endif /* CONFIG_IOSCHED */
5413 
/*
 * True when the laundry count of pageout queue "q" has reached at least
 * 8/10 of that queue's pgo_maxlaundry (integer arithmetic).
 */
5414 #define VM_PAGE_Q_BACKING_UP(q)         \
5415 	((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5416 
5417 boolean_t must_throttle_writes(void);
5418 
5419 boolean_t
must_throttle_writes()5420 must_throttle_writes()
5421 {
5422 	if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5423 	    vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5424 		return TRUE;
5425 	}
5426 
5427 	return FALSE;
5428 }
5429 
/* Element count handed to zone_fill_initially() when dw_ctx_zone is set up. */
5430 #define MIN_DELAYED_WORK_CTX_ALLOCATED  (16)
/* Limit handed to zone_set_exhaustible() for dw_ctx_zone. */
5431 #define MAX_DELAYED_WORK_CTX_ALLOCATED  (512)
5432 
/*
 * Bumped every time vm_page_delayed_work_get_ctx() fails to obtain a
 * context from dw_ctx_zone (plain, non-atomic increment).
 */
5433 int vm_page_delayed_work_ctx_needed = 0;
/* Zone of struct vm_page_delayed_work_ctx; created by vm_page_delayed_work_init_ctx(). */
5434 SECURITY_READ_ONLY_LATE(zone_t) dw_ctx_zone;
5435 
5436 void
vm_page_delayed_work_init_ctx(void)5437 vm_page_delayed_work_init_ctx(void)
5438 {
5439 	size_t elem_size = sizeof(struct vm_page_delayed_work_ctx);
5440 
5441 	dw_ctx_zone = zone_create_ext("delayed-work-ctx", elem_size,
5442 	    ZC_NOGC, ZONE_ID_ANY, ^(zone_t z) {
5443 		zone_set_exhaustible(z, MAX_DELAYED_WORK_CTX_ALLOCATED);
5444 	});
5445 
5446 	zone_fill_initially(dw_ctx_zone, MIN_DELAYED_WORK_CTX_ALLOCATED);
5447 }
5448 
5449 struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)5450 vm_page_delayed_work_get_ctx(void)
5451 {
5452 	struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5453 
5454 	dw_ctx = (struct vm_page_delayed_work_ctx*) zalloc_noblock(dw_ctx_zone);
5455 
5456 	if (dw_ctx) {
5457 		dw_ctx->delayed_owner = current_thread();
5458 	} else {
5459 		vm_page_delayed_work_ctx_needed++;
5460 	}
5461 	return dw_ctx ? dw_ctx->dwp : NULL;
5462 }
5463 
5464 void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work * dwp)5465 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5466 {
5467 	struct  vm_page_delayed_work_ctx *ldw_ctx;
5468 
5469 	ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5470 	ldw_ctx->delayed_owner = NULL;
5471 
5472 	zfree(dw_ctx_zone, ldw_ctx);
5473 }
5474 
5475 /*
5476  *	Routine:	vm_object_upl_request
5477  *	Purpose:
5478  *		Cause the population of a portion of a vm_object.
5479  *		Depending on the nature of the request, the pages
5480  *		returned may contain valid data or be uninitialized.
5481  *		A page list structure, listing the physical pages
5482  *		will be returned upon request.
5483  *		This function is called by the file system or any other
5484  *		supplier of backing store to a pager.
5485  *		IMPORTANT NOTE: The caller must still respect the relationship
5486  *		between the vm_object and its backing memory object.  The
5487  *		caller MUST NOT substitute changes in the backing file
5488  *		without first doing a memory_object_lock_request on the
5489  *		target range unless it is known that the pages are not
5490  *		shared with another entity at the pager level.
5491  *		Copy_in_to:
5492  *			if a page list structure is present
5493  *			return the mapped physical pages, where a
5494  *			page is not present, return a non-initialized
5495  *			one.  If the no_sync bit is turned on, don't
5496  *			call the pager unlock to synchronize with other
5497  *			possible copies of the page. Leave pages busy
5498  *			in the original object, if a page list structure
5499  *			was specified.  When a commit of the page list
5500  *			pages is done, the dirty bit will be set for each one.
5501  *		Copy_out_from:
5502  *			If a page list structure is present, return
5503  *			all mapped pages.  Where a page does not exist
5504  *			map a zero filled one. Leave pages busy in
5505  *			the original object.  If a page list structure
5506  *			is not specified, this call is a no-op.
5507  *
5508  *		Note:  access of default pager objects has a rather interesting
5509  *		twist.  The caller of this routine, presumably the file system
5510  *		page cache handling code, will never actually make a request
5511  *		against a default pager backed object.  Only the default
5512  *		pager will make requests on backing store related vm_objects
5513  *		In this way the default pager can maintain the relationship
5514  *		between backing store files (abstract memory objects) and
5515  *		the vm_objects (cache objects), they support.
5516  *
5517  */
5518 
/*
 * Summary of the interface as implemented below:
 *
 *	object		object to gather pages from; the routine panics if
 *			it is phys_contiguous, or if it is external with a
 *			non-zero paging_offset.
 *	offset, size	range within the object; both are asserted to be
 *			page aligned, and size is clamped to
 *			MAX_UPL_SIZE_BYTES.
 *	upl_ptr		receives the newly created UPL.
 *	user_page_list	per-page info array to fill in, if non-NULL.  With
 *			UPL_SET_INTERNAL the caller's value is ignored and
 *			the array embedded after the upl header is used.
 *	page_list_count	optional in/out count; set to 0 on return for
 *			internal UPLs, otherwise lowered to the number of
 *			entries walked if it was larger.
 *	cntrl_flags	UPL_* control flags; any bit outside UPL_VALID_FLAGS
 *			makes the call fail with KERN_INVALID_VALUE.
 *	tag		handed to vm_page_do_delayed_work().
 *
 * Returns KERN_INVALID_VALUE for unknown flags, KERN_SUCCESS otherwise.
 */
5519 __private_extern__ kern_return_t
vm_object_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)5520 vm_object_upl_request(
5521 	vm_object_t             object,
5522 	vm_object_offset_t      offset,
5523 	upl_size_t              size,
5524 	upl_t                   *upl_ptr,
5525 	upl_page_info_array_t   user_page_list,
5526 	unsigned int            *page_list_count,
5527 	upl_control_flags_t     cntrl_flags,
5528 	vm_tag_t                tag)
5529 {
5530 	vm_page_t               dst_page = VM_PAGE_NULL;
5531 	vm_object_offset_t      dst_offset;
5532 	upl_size_t              xfer_size;
5533 	unsigned int            size_in_pages;
5534 	boolean_t               dirty;
5535 	boolean_t               hw_dirty;
5536 	upl_t                   upl = NULL;
5537 	unsigned int            entry;
5538 	vm_page_t               alias_page = NULL;
5539 	int                     refmod_state = 0;
5540 	wpl_array_t             lite_list = NULL;
5541 	vm_object_t             last_copy_object;
5542 	struct  vm_page_delayed_work    dw_array;
5543 	struct  vm_page_delayed_work    *dwp, *dwp_start;
5544 	bool                    dwp_finish_ctx = TRUE;
5545 	int                     dw_count;
5546 	int                     dw_limit;
5547 	int                     io_tracking_flag = 0;
5548 	int                     grab_options;
5549 	int                     page_grab_count = 0;
5550 	ppnum_t                 phys_page;
5551 	pmap_flush_context      pmap_flush_context_storage;
5552 	boolean_t               pmap_flushes_delayed = FALSE;
5553 #if DEVELOPMENT || DEBUG
5554 	task_t                  task = current_task();
5555 #endif /* DEVELOPMENT || DEBUG */
5556 
5557 	dwp_start = dwp = NULL;
5558 
5559 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
5560 		/*
5561 		 * For forward compatibility's sake,
5562 		 * reject any unknown flag.
5563 		 */
5564 		return KERN_INVALID_VALUE;
5565 	}
5566 	if ((!object->internal) && (object->paging_offset != 0)) {
5567 		panic("vm_object_upl_request: external object with non-zero paging offset");
5568 	}
5569 	if (object->phys_contiguous) {
5570 		panic("vm_object_upl_request: contiguous object specified");
5571 	}
5572 
5573 	assertf(page_aligned(offset) && page_aligned(size),
5574 	    "offset 0x%llx size 0x%x",
5575 	    offset, size);
5576 
5577 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5578 
	/*
	 * Try to get a zone-backed delayed-work array.  If none is
	 * available, fall back to the single on-stack entry and limit
	 * the batch to one item, so work is flushed after every page.
	 */
5579 	dw_count = 0;
5580 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5581 	dwp_start = vm_page_delayed_work_get_ctx();
5582 	if (dwp_start == NULL) {
5583 		dwp_start = &dw_array;
5584 		dw_limit = 1;
5585 		dwp_finish_ctx = FALSE;
5586 	}
5587 
5588 	dwp = dwp_start;
5589 
5590 	if (size > MAX_UPL_SIZE_BYTES) {
5591 		size = MAX_UPL_SIZE_BYTES;
5592 	}
5593 
	/* provisional value for internal UPLs; it is reset to 0 before returning */
5594 	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5595 		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5596 	}
5597 
5598 #if CONFIG_IOSCHED || UPL_DEBUG
5599 	if (object->io_tracking || upl_debug_enabled) {
5600 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5601 	}
5602 #endif
5603 #if CONFIG_IOSCHED
5604 	if (object->io_tracking) {
5605 		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5606 	}
5607 #endif
5608 
	/*
	 * Create the UPL.  For an internal UPL the page info array is
	 * located directly after the upl header, and for an internal
	 * lite UPL the lite bitmap follows that array.  For an external
	 * lite UPL the bitmap directly follows the upl header.
	 */
5609 	if (cntrl_flags & UPL_SET_INTERNAL) {
5610 		if (cntrl_flags & UPL_SET_LITE) {
5611 			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5612 
5613 			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5614 			lite_list = (wpl_array_t)
5615 			    (((uintptr_t)user_page_list) +
5616 			    ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
5617 			if (size == 0) {
5618 				user_page_list = NULL;
5619 				lite_list = NULL;
5620 			}
5621 		} else {
5622 			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5623 
5624 			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5625 			if (size == 0) {
5626 				user_page_list = NULL;
5627 			}
5628 		}
5629 	} else {
5630 		if (cntrl_flags & UPL_SET_LITE) {
5631 			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5632 
5633 			lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5634 			if (size == 0) {
5635 				lite_list = NULL;
5636 			}
5637 		} else {
5638 			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5639 		}
5640 	}
5641 	*upl_ptr = upl;
5642 
5643 	if (user_page_list) {
5644 		user_page_list[0].device = FALSE;
5645 	}
5646 
	/*
	 * A lite UPL refers to the object directly; otherwise a new
	 * pageout object shadowing "object" at "offset" is allocated
	 * and the UPL is marked UPL_SHADOWED.
	 */
5647 	if (cntrl_flags & UPL_SET_LITE) {
5648 		upl->map_object = object;
5649 	} else {
5650 		upl->map_object = vm_object_allocate(size);
5651 		/*
5652 		 * No need to lock the new object: nobody else knows
5653 		 * about it yet, so it's all ours so far.
5654 		 */
5655 		upl->map_object->shadow = object;
5656 		upl->map_object->pageout = TRUE;
5657 		upl->map_object->can_persist = FALSE;
5658 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5659 		upl->map_object->vo_shadow_offset = offset;
5660 		upl->map_object->wimg_bits = object->wimg_bits;
5661 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
5662 		    "object %p shadow_offset 0x%llx",
5663 		    upl->map_object, upl->map_object->vo_shadow_offset);
5664 
5665 		alias_page = vm_page_grab_fictitious(TRUE);
5666 
5667 		upl->flags |= UPL_SHADOWED;
5668 	}
5669 	if (cntrl_flags & UPL_FOR_PAGEOUT) {
5670 		upl->flags |= UPL_PAGEOUT;
5671 	}
5672 
5673 	vm_object_lock(object);
5674 	vm_object_activity_begin(object);
5675 
5676 	grab_options = 0;
5677 #if CONFIG_SECLUDED_MEMORY
5678 	if (object->can_grab_secluded) {
5679 		grab_options |= VM_PAGE_GRAB_SECLUDED;
5680 	}
5681 #endif /* CONFIG_SECLUDED_MEMORY */
5682 
5683 	/*
5684 	 * we can lock in the paging_offset once paging_in_progress is set
5685 	 */
5686 	upl->u_size = size;
5687 	upl->u_offset = offset + object->paging_offset;
5688 
5689 #if CONFIG_IOSCHED || UPL_DEBUG
5690 	if (object->io_tracking || upl_debug_enabled) {
5691 		vm_object_activity_begin(object);
5692 		queue_enter(&object->uplq, upl, upl_t, uplq);
5693 	}
5694 #endif
5695 	if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
5696 		/*
5697 		 * Honor copy-on-write obligations
5698 		 *
5699 		 * The caller is gathering these pages and
5700 		 * might modify their contents.  We need to
5701 		 * make sure that the copy object has its own
5702 		 * private copies of these pages before we let
5703 		 * the caller modify them.
5704 		 */
5705 		vm_object_update(object,
5706 		    offset,
5707 		    size,
5708 		    NULL,
5709 		    NULL,
5710 		    FALSE,              /* should_return */
5711 		    MEMORY_OBJECT_COPY_SYNC,
5712 		    VM_PROT_NO_CHANGE);
5713 
5714 		VM_PAGEOUT_DEBUG(upl_cow, 1);
5715 		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
5716 	}
5717 	/*
5718 	 * remember which copy object we synchronized with
5719 	 */
5720 	last_copy_object = object->copy;
5721 	entry = 0;
5722 
5723 	xfer_size = size;
5724 	dst_offset = offset;
5725 	size_in_pages = size / PAGE_SIZE;
5726 
5727 	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5728 	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
5729 		object->scan_collisions = 0;
5730 	}
5731 
	/*
	 * If writes must be throttled, stall here with the object lock
	 * dropped.  The delay argument scales with the number of pages
	 * requested and is five times larger when the backing store is
	 * not reported as an SSD.
	 */
5732 	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5733 		boolean_t       isSSD = FALSE;
5734 
5735 #if !XNU_TARGET_OS_OSX
5736 		isSSD = TRUE;
5737 #else /* !XNU_TARGET_OS_OSX */
5738 		vnode_pager_get_isSSD(object->pager, &isSSD);
5739 #endif /* !XNU_TARGET_OS_OSX */
5740 		vm_object_unlock(object);
5741 
5742 		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5743 
5744 		if (isSSD == TRUE) {
5745 			delay(1000 * size_in_pages);
5746 		} else {
5747 			delay(5000 * size_in_pages);
5748 		}
5749 		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5750 
5751 		vm_object_lock(object);
5752 	}
5753 
	/*
	 * Walk the range one page at a time.  Paths that "continue"
	 * retry the same offset; paths that jump to try_next_page skip
	 * the page and advance.
	 */
5754 	while (xfer_size) {
5755 		dwp->dw_mask = 0;
5756 
5757 		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
5758 			vm_object_unlock(object);
5759 			alias_page = vm_page_grab_fictitious(TRUE);
5760 			vm_object_lock(object);
5761 		}
5762 		if (cntrl_flags & UPL_COPYOUT_FROM) {
5763 			upl->flags |= UPL_PAGE_SYNC_DONE;
5764 
5765 			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
5766 			    dst_page->vmp_fictitious ||
5767 			    dst_page->vmp_absent ||
5768 			    dst_page->vmp_error ||
5769 			    dst_page->vmp_cleaning ||
5770 			    (VM_PAGE_WIRED(dst_page))) {
5771 				if (user_page_list) {
5772 					user_page_list[entry].phys_addr = 0;
5773 				}
5774 
5775 				goto try_next_page;
5776 			}
5777 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
5778 
5779 			/*
5780 			 * grab this up front...
5781 			 * a high percentage of the time we're going to
5782 			 * need the hardware modification state a bit later
5783 			 * anyway... so we can eliminate an extra call into
5784 			 * the pmap layer by grabbing it here and recording it
5785 			 */
5786 			if (dst_page->vmp_pmapped) {
5787 				refmod_state = pmap_get_refmod(phys_page);
5788 			} else {
5789 				refmod_state = 0;
5790 			}
5791 
5792 			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
5793 				/*
5794 				 * page is on inactive list and referenced...
5795 				 * reactivate it now... this gets it out of the
5796 				 * way of vm_pageout_scan which would have to
5797 				 * reactivate it upon tripping over it
5798 				 */
5799 				dwp->dw_mask |= DW_vm_page_activate;
5800 			}
5801 			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
5802 				/*
5803 				 * we're only asking for DIRTY pages to be returned
5804 				 */
5805 				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
5806 					/*
5807 					 * if we were the page stolen by vm_pageout_scan to be
5808 					 * cleaned (as opposed to a buddy being clustered in)
5809 					 * or this request is not being driven by a PAGEOUT cluster
5810 					 * then we only need to check for the page being dirty or
5811 					 * precious to decide whether to return it
5812 					 */
5813 					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
5814 						goto check_busy;
5815 					}
5816 					goto dont_return;
5817 				}
5818 				/*
5819 				 * this is a request for a PAGEOUT cluster and this page
5820 				 * is merely along for the ride as a 'buddy'... not only
5821 				 * does it have to be dirty to be returned, but it also
5822 				 * can't have been referenced recently...
5823 				 */
5824 				if ((hibernate_cleaning_in_progress == TRUE ||
5825 				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
5826 				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
5827 				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
5828 					goto check_busy;
5829 				}
5830 dont_return:
5831 				/*
5832 				 * if we reach here, we're not to return
5833 				 * the page... go on to the next one
5834 				 */
5835 				if (dst_page->vmp_laundry == TRUE) {
5836 					/*
5837 					 * if we get here, the page is not 'cleaning' (filtered out above).
5838 					 * since it has been referenced, remove it from the laundry
5839 					 * so we don't pay the cost of an I/O to clean a page
5840 					 * we're just going to take back
5841 					 */
5842 					vm_page_lockspin_queues();
5843 
5844 					vm_pageout_steal_laundry(dst_page, TRUE);
5845 					vm_page_activate(dst_page);
5846 
5847 					vm_page_unlock_queues();
5848 				}
5849 				if (user_page_list) {
5850 					user_page_list[entry].phys_addr = 0;
5851 				}
5852 
5853 				goto try_next_page;
5854 			}
5855 check_busy:
5856 			if (dst_page->vmp_busy) {
5857 				if (cntrl_flags & UPL_NOBLOCK) {
5858 					if (user_page_list) {
5859 						user_page_list[entry].phys_addr = 0;
5860 					}
					/* drop any deferred work recorded for this page: it is being skipped */
5861 					dwp->dw_mask = 0;
5862 
5863 					goto try_next_page;
5864 				}
5865 				/*
5866 				 * someone else is playing with the
5867 				 * page.  We will have to wait.
5868 				 */
5869 				PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5870 
5871 				continue;
5872 			}
5873 			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
5874 				vm_page_lockspin_queues();
5875 
				/* re-check the queue state now that the page queues lock is held */
5876 				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
5877 					/*
5878 					 * we've buddied up a page for a clustered pageout
5879 					 * that has already been moved to the pageout
5880 					 * queue by pageout_scan... we need to remove
5881 					 * it from the queue and drop the laundry count
5882 					 * on that queue
5883 					 */
5884 					vm_pageout_throttle_up(dst_page);
5885 				}
5886 				vm_page_unlock_queues();
5887 			}
5888 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
5889 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
5890 
5891 			if (phys_page > upl->highest_page) {
5892 				upl->highest_page = phys_page;
5893 			}
5894 
5895 			assert(!pmap_is_noencrypt(phys_page));
5896 
5897 			if (cntrl_flags & UPL_SET_LITE) {
5898 				unsigned int    pg_num;
5899 
				/* the lite bitmap holds one bit per page, 32 pages per word */
5900 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
5901 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
5902 				lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
5903 
5904 				if (hw_dirty) {
					/*
					 * clear the modified state with PMAP_OPTIONS_NOFLUSH;
					 * pmap_flush() is called once after the loop
					 */
5905 					if (pmap_flushes_delayed == FALSE) {
5906 						pmap_flush_context_init(&pmap_flush_context_storage);
5907 						pmap_flushes_delayed = TRUE;
5908 					}
5909 					pmap_clear_refmod_options(phys_page,
5910 					    VM_MEM_MODIFIED,
5911 					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
5912 					    &pmap_flush_context_storage);
5913 				}
5914 
5915 				/*
5916 				 * Mark original page as cleaning
5917 				 * in place.
5918 				 */
5919 				dst_page->vmp_cleaning = TRUE;
5920 				dst_page->vmp_precious = FALSE;
5921 			} else {
5922 				/*
5923 				 * use pageclean setup, it is more
5924 				 * convenient even for the pageout
5925 				 * cases here
5926 				 */
5927 				vm_object_lock(upl->map_object);
5928 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
5929 				vm_object_unlock(upl->map_object);
5930 
				/* the alias page has been consumed; a new one is grabbed on the next pass */
5931 				alias_page->vmp_absent = FALSE;
5932 				alias_page = NULL;
5933 			}
5934 			if (dirty) {
5935 				SET_PAGE_DIRTY(dst_page, FALSE);
5936 			} else {
5937 				dst_page->vmp_dirty = FALSE;
5938 			}
5939 
5940 			if (!dirty) {
5941 				dst_page->vmp_precious = TRUE;
5942 			}
5943 
5944 			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
5945 				if (!VM_PAGE_WIRED(dst_page)) {
5946 					dst_page->vmp_free_when_done = TRUE;
5947 				}
5948 			}
5949 		} else {
5950 			if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
5951 				/*
5952 				 * Honor copy-on-write obligations
5953 				 *
5954 				 * The copy object has changed since we
5955 				 * last synchronized for copy-on-write.
5956 				 * Another copy object might have been
5957 				 * inserted while we released the object's
5958 				 * lock.  Since someone could have seen the
5959 				 * original contents of the remaining pages
5960 				 * through that new object, we have to
5961 				 * synchronize with it again for the remaining
5962 				 * pages only.  The previous pages are "busy"
5963 				 * so they can not be seen through the new
5964 				 * mapping.  The new mapping will see our
5965 				 * upcoming changes for those previous pages,
5966 				 * but that's OK since they couldn't see what
5967 				 * was there before.  It's just a race anyway
5968 				 * and there's no guarantee of consistency or
5969 				 * atomicity.  We just don't want new mappings
5970 				 * to see both the *before* and *after* pages.
5971 				 */
5972 				if (object->copy != VM_OBJECT_NULL) {
5973 					vm_object_update(
5974 						object,
5975 						dst_offset,/* current offset */
5976 						xfer_size, /* remaining size */
5977 						NULL,
5978 						NULL,
5979 						FALSE,     /* should_return */
5980 						MEMORY_OBJECT_COPY_SYNC,
5981 						VM_PROT_NO_CHANGE);
5982 
5983 					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
5984 					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
5985 				}
5986 				/*
5987 				 * remember the copy object we synced with
5988 				 */
5989 				last_copy_object = object->copy;
5990 			}
5991 			dst_page = vm_page_lookup(object, dst_offset);
5992 
5993 			if (dst_page != VM_PAGE_NULL) {
5994 				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
5995 					/*
5996 					 * skip over pages already present in the cache
5997 					 */
5998 					if (user_page_list) {
5999 						user_page_list[entry].phys_addr = 0;
6000 					}
6001 
6002 					goto try_next_page;
6003 				}
6004 				if (dst_page->vmp_fictitious) {
6005 					panic("need corner case for fictitious page");
6006 				}
6007 
6008 				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6009 					/*
6010 					 * someone else is playing with the
6011 					 * page.  We will have to wait.
6012 					 */
6013 					PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6014 
6015 					continue;
6016 				}
6017 				if (dst_page->vmp_laundry) {
6018 					vm_pageout_steal_laundry(dst_page, FALSE);
6019 				}
6020 			} else {
6021 				if (object->private) {
6022 					/*
6023 					 * This is a nasty wrinkle for users
6024 					 * of upl who encounter device or
6025 					 * private memory however, it is
6026 					 * unavoidable, only a fault can
6027 					 * resolve the actual backing
6028 					 * physical page by asking the
6029 					 * backing device.
6030 					 */
6031 					if (user_page_list) {
6032 						user_page_list[entry].phys_addr = 0;
6033 					}
6034 
6035 					goto try_next_page;
6036 				}
6037 				if (object->scan_collisions) {
6038 					/*
6039 					 * the pageout_scan thread is trying to steal
6040 					 * pages from this object, but has run into our
6041 					 * lock... grab 2 pages from the head of the object...
6042 					 * the first is freed on behalf of pageout_scan, the
6043 					 * 2nd is for our own use... we use vm_object_page_grab
6044 					 * in both cases to avoid taking pages from the free
6045 					 * list since we are under memory pressure and our
6046 					 * lock on this object is getting in the way of
6047 					 * relieving it
6048 					 */
6049 					dst_page = vm_object_page_grab(object);
6050 
6051 					if (dst_page != VM_PAGE_NULL) {
6052 						vm_page_release(dst_page,
6053 						    FALSE);
6054 					}
6055 
6056 					dst_page = vm_object_page_grab(object);
6057 				}
6058 				if (dst_page == VM_PAGE_NULL) {
6059 					/*
6060 					 * need to allocate a page
6061 					 */
6062 					dst_page = vm_page_grab_options(grab_options);
6063 					if (dst_page != VM_PAGE_NULL) {
6064 						page_grab_count++;
6065 					}
6066 				}
6067 				if (dst_page == VM_PAGE_NULL) {
6068 					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6069 						/*
6070 						 * we don't want to stall waiting for pages to come onto the free list
6071 						 * while we're already holding absent pages in this UPL
6072 						 * the caller will deal with the empty slots
6073 						 */
6074 						if (user_page_list) {
6075 							user_page_list[entry].phys_addr = 0;
6076 						}
6077 
6078 						goto try_next_page;
6079 					}
6080 					/*
6081 					 * no pages available... wait
6082 					 * then try again for the same
6083 					 * offset...
6084 					 */
6085 					vm_object_unlock(object);
6086 
6087 					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6088 
6089 					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6090 
6091 					VM_PAGE_WAIT();
6092 					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6093 
6094 					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6095 
6096 					vm_object_lock(object);
6097 
6098 					continue;
6099 				}
6100 				vm_page_insert(dst_page, object, dst_offset);
6101 
6102 				dst_page->vmp_absent = TRUE;
6103 				dst_page->vmp_busy = FALSE;
6104 
6105 				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6106 					/*
6107 					 * if UPL_RET_ONLY_ABSENT was specified,
6108 					 * then we're definitely setting up a
6109 					 * upl for a clustered read/pagein
6110 					 * operation... mark the pages as clustered
6111 					 * so upl_commit_range can put them on the
6112 					 * speculative list
6113 					 */
6114 					dst_page->vmp_clustered = TRUE;
6115 
6116 					if (!(cntrl_flags & UPL_FILE_IO)) {
6117 						counter_inc(&vm_statistics_pageins);
6118 					}
6119 				}
6120 			}
6121 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6122 
6123 			dst_page->vmp_overwriting = TRUE;
6124 
6125 			if (dst_page->vmp_pmapped) {
6126 				if (!(cntrl_flags & UPL_FILE_IO)) {
6127 					/*
6128 					 * eliminate all mappings from the
6129 					 * original object and its progeny
6130 					 */
6131 					refmod_state = pmap_disconnect(phys_page);
6132 				} else {
6133 					refmod_state = pmap_get_refmod(phys_page);
6134 				}
6135 			} else {
6136 				refmod_state = 0;
6137 			}
6138 
6139 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6140 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6141 
6142 			if (cntrl_flags & UPL_SET_LITE) {
6143 				unsigned int    pg_num;
6144 
				/* the lite bitmap holds one bit per page, 32 pages per word */
6145 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6146 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6147 				lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
6148 
6149 				if (hw_dirty) {
6150 					pmap_clear_modify(phys_page);
6151 				}
6152 
6153 				/*
6154 				 * Mark original page as cleaning
6155 				 * in place.
6156 				 */
6157 				dst_page->vmp_cleaning = TRUE;
6158 				dst_page->vmp_precious = FALSE;
6159 			} else {
6160 				/*
6161 				 * use pageclean setup, it is more
6162 				 * convenient even for the pageout
6163 				 * cases here
6164 				 */
6165 				vm_object_lock(upl->map_object);
6166 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6167 				vm_object_unlock(upl->map_object);
6168 
				/* the alias page has been consumed; a new one is grabbed on the next pass */
6169 				alias_page->vmp_absent = FALSE;
6170 				alias_page = NULL;
6171 			}
6172 
6173 			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6174 				upl->flags &= ~UPL_CLEAR_DIRTY;
6175 				upl->flags |= UPL_SET_DIRTY;
6176 				dirty = TRUE;
6177 				/*
6178 				 * Page belonging to a code-signed object is about to
6179 				 * be written. Mark it tainted and disconnect it from
6180 				 * all pmaps so processes have to fault it back in and
6181 				 * deal with the tainted bit.
6182 				 */
6183 				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6184 					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6185 					vm_page_upl_tainted++;
6186 					if (dst_page->vmp_pmapped) {
6187 						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6188 						if (refmod_state & VM_MEM_REFERENCED) {
6189 							dst_page->vmp_reference = TRUE;
6190 						}
6191 					}
6192 				}
6193 			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6194 				/*
6195 				 * clean in place for read implies
6196 				 * that a write will be done on all
6197 				 * the pages that are dirty before
6198 				 * a upl commit is done.  The caller
6199 				 * is obligated to preserve the
6200 				 * contents of all pages marked dirty
6201 				 */
6202 				upl->flags |= UPL_CLEAR_DIRTY;
6203 			}
6204 			dst_page->vmp_dirty = dirty;
6205 
6206 			if (!dirty) {
6207 				dst_page->vmp_precious = TRUE;
6208 			}
6209 
6210 			if (!VM_PAGE_WIRED(dst_page)) {
6211 				/*
6212 				 * deny access to the target page while
6213 				 * it is being worked on
6214 				 */
6215 				dst_page->vmp_busy = TRUE;
6216 			} else {
6217 				dwp->dw_mask |= DW_vm_page_wire;
6218 			}
6219 
6220 			/*
6221 			 * We might be about to satisfy a fault which has been
6222 			 * requested. So no need for the "restart" bit.
6223 			 */
6224 			dst_page->vmp_restart = FALSE;
6225 			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6226 				/*
6227 				 * expect the page to be used
6228 				 */
6229 				dwp->dw_mask |= DW_set_reference;
6230 			}
6231 			if (cntrl_flags & UPL_PRECIOUS) {
6232 				if (object->internal) {
6233 					SET_PAGE_DIRTY(dst_page, FALSE);
6234 					dst_page->vmp_precious = FALSE;
6235 				} else {
6236 					dst_page->vmp_precious = TRUE;
6237 				}
6238 			} else {
6239 				dst_page->vmp_precious = FALSE;
6240 			}
6241 		}
6242 		if (dst_page->vmp_busy) {
6243 			upl->flags |= UPL_HAS_BUSY;
6244 		}
6245 
6246 		if (phys_page > upl->highest_page) {
6247 			upl->highest_page = phys_page;
6248 		}
6249 		assert(!pmap_is_noencrypt(phys_page));
		/* publish this page's state in the caller-visible page info entry */
6250 		if (user_page_list) {
6251 			user_page_list[entry].phys_addr = phys_page;
6252 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
6253 			user_page_list[entry].absent    = dst_page->vmp_absent;
6254 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
6255 			user_page_list[entry].precious  = dst_page->vmp_precious;
6256 			user_page_list[entry].device    = FALSE;
6257 			user_page_list[entry].needed    = FALSE;
6258 			if (dst_page->vmp_clustered == TRUE) {
6259 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6260 			} else {
6261 				user_page_list[entry].speculative = FALSE;
6262 			}
6263 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6264 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6265 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6266 			user_page_list[entry].mark      = FALSE;
6267 		}
6268 		/*
6269 		 * if UPL_RET_ONLY_ABSENT is set, then
6270 		 * we are working with a fresh page and we've
6271 		 * just set the clustered flag on it to
6272 		 * indicate that it was dragged in as part of a
6273 		 * speculative cluster... so leave it alone
6274 		 */
6275 		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6276 			/*
6277 			 * someone is explicitly grabbing this page...
6278 			 * update clustered and speculative state
6279 			 *
6280 			 */
6281 			if (dst_page->vmp_clustered) {
6282 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
6283 			}
6284 		}
6285 try_next_page:
		/*
		 * Queue any deferred operations recorded for this page and
		 * flush the batch once it reaches dw_limit, then advance to
		 * the next page of the range.
		 */
6286 		if (dwp->dw_mask) {
6287 			if (dwp->dw_mask & DW_vm_page_activate) {
6288 				counter_inc(&vm_statistics_reactivations);
6289 			}
6290 
6291 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6292 
6293 			if (dw_count >= dw_limit) {
6294 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6295 
6296 				dwp = dwp_start;
6297 				dw_count = 0;
6298 			}
6299 		}
6300 		entry++;
6301 		dst_offset += PAGE_SIZE_64;
6302 		xfer_size -= PAGE_SIZE;
6303 	}
	/* flush whatever delayed work is still pending */
6304 	if (dw_count) {
6305 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6306 		dwp = dwp_start;
6307 		dw_count = 0;
6308 	}
6309 
	/* release the fictitious alias page if the last one grabbed went unused */
6310 	if (alias_page != NULL) {
6311 		VM_PAGE_FREE(alias_page);
6312 	}
6313 	if (pmap_flushes_delayed == TRUE) {
6314 		pmap_flush(&pmap_flush_context_storage);
6315 	}
6316 
6317 	if (page_list_count != NULL) {
6318 		if (upl->flags & UPL_INTERNAL) {
6319 			*page_list_count = 0;
6320 		} else if (*page_list_count > entry) {
6321 			*page_list_count = entry;
6322 		}
6323 	}
6324 #if UPL_DEBUG
6325 	upl->upl_state = 1;
6326 #endif
6327 	vm_object_unlock(object);
6328 
6329 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6330 #if DEVELOPMENT || DEBUG
6331 	if (task != NULL) {
6332 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
6333 	}
6334 #endif /* DEVELOPMENT || DEBUG */
6335 
	/* hand the zone-backed delayed-work array back, unless the on-stack fallback was used */
6336 	if (dwp_start && dwp_finish_ctx) {
6337 		vm_page_delayed_work_finish_ctx(dwp_start);
6338 		dwp_start = dwp = NULL;
6339 	}
6340 
6341 	return KERN_SUCCESS;
6342 }
6343 
6344 /*
6345  *	Routine:	vm_object_super_upl_request
6346  *	Purpose:
6347  *		Cause the population of a portion of a vm_object
6348  *		in much the same way as memory_object_upl_request.
6349  *		Depending on the nature of the request, the pages
6350  *		returned may be contain valid data or be uninitialized.
6351  *		However, the region may be expanded up to the super
6352  *		cluster size provided.
6353  */
6354 
6355 __private_extern__ kern_return_t
vm_object_super_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_size_t super_cluster,upl_t * upl,upl_page_info_t * user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)6356 vm_object_super_upl_request(
6357 	vm_object_t object,
6358 	vm_object_offset_t      offset,
6359 	upl_size_t              size,
6360 	upl_size_t              super_cluster,
6361 	upl_t                   *upl,
6362 	upl_page_info_t         *user_page_list,
6363 	unsigned int            *page_list_count,
6364 	upl_control_flags_t     cntrl_flags,
6365 	vm_tag_t                tag)
6366 {
6367 	if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
6368 		return KERN_FAILURE;
6369 	}
6370 
6371 	assert(object->paging_in_progress);
6372 	offset = offset - object->paging_offset;
6373 
6374 	if (super_cluster > size) {
6375 		vm_object_offset_t      base_offset;
6376 		upl_size_t              super_size;
6377 		vm_object_size_t        super_size_64;
6378 
6379 		base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6380 		super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
6381 		super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6382 		super_size = (upl_size_t) super_size_64;
6383 		assert(super_size == super_size_64);
6384 
6385 		if (offset > (base_offset + super_size)) {
6386 			panic("vm_object_super_upl_request: Missed target pageout"
6387 			    " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6388 			    offset, base_offset, super_size, super_cluster,
6389 			    size, object->paging_offset);
6390 		}
6391 		/*
6392 		 * apparently there is a case where the vm requests a
6393 		 * page to be written out who's offset is beyond the
6394 		 * object size
6395 		 */
6396 		if ((offset + size) > (base_offset + super_size)) {
6397 			super_size_64 = (offset + size) - base_offset;
6398 			super_size = (upl_size_t) super_size_64;
6399 			assert(super_size == super_size_64);
6400 		}
6401 
6402 		offset = base_offset;
6403 		size = super_size;
6404 	}
6405 	return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6406 }
6407 
/*
 * Number of times vm_map_create_upl() has built a UPL over a map entry
 * whose protection includes VM_PROT_EXECUTE.
 */
int cs_executable_create_upl = 0;
/*
 * Defined elsewhere; used by vm_map_create_upl() to identify the current
 * process in its MACH_ASSERT diagnostic message.
 */
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
6411 
6412 kern_return_t
vm_map_create_upl(vm_map_t map,vm_map_address_t offset,upl_size_t * upl_size,upl_t * upl,upl_page_info_array_t page_list,unsigned int * count,upl_control_flags_t * flags,vm_tag_t tag)6413 vm_map_create_upl(
6414 	vm_map_t                map,
6415 	vm_map_address_t        offset,
6416 	upl_size_t              *upl_size,
6417 	upl_t                   *upl,
6418 	upl_page_info_array_t   page_list,
6419 	unsigned int            *count,
6420 	upl_control_flags_t     *flags,
6421 	vm_tag_t                tag)
6422 {
6423 	vm_map_entry_t          entry;
6424 	upl_control_flags_t     caller_flags;
6425 	int                     force_data_sync;
6426 	int                     sync_cow_data;
6427 	vm_object_t             local_object;
6428 	vm_map_offset_t         local_offset;
6429 	vm_map_offset_t         local_start;
6430 	kern_return_t           ret;
6431 	vm_map_address_t        original_offset;
6432 	vm_map_size_t           original_size, adjusted_size;
6433 	vm_map_offset_t         local_entry_start;
6434 	vm_object_offset_t      local_entry_offset;
6435 	vm_object_offset_t      offset_in_mapped_page;
6436 	boolean_t               release_map = FALSE;
6437 
6438 start_with_map:
6439 
6440 	original_offset = offset;
6441 	original_size = *upl_size;
6442 	adjusted_size = original_size;
6443 
6444 	caller_flags = *flags;
6445 
6446 	if (caller_flags & ~UPL_VALID_FLAGS) {
6447 		/*
6448 		 * For forward compatibility's sake,
6449 		 * reject any unknown flag.
6450 		 */
6451 		ret = KERN_INVALID_VALUE;
6452 		goto done;
6453 	}
6454 	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6455 	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6456 
6457 	if (upl == NULL) {
6458 		ret = KERN_INVALID_ARGUMENT;
6459 		goto done;
6460 	}
6461 
6462 REDISCOVER_ENTRY:
6463 	vm_map_lock_read(map);
6464 
6465 	if (!vm_map_lookup_entry(map, offset, &entry)) {
6466 		vm_map_unlock_read(map);
6467 		ret = KERN_FAILURE;
6468 		goto done;
6469 	}
6470 
6471 	local_entry_start = entry->vme_start;
6472 	local_entry_offset = VME_OFFSET(entry);
6473 
6474 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6475 		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6476 	}
6477 
6478 	if (entry->vme_end - original_offset < adjusted_size) {
6479 		adjusted_size = entry->vme_end - original_offset;
6480 		assert(adjusted_size > 0);
6481 		*upl_size = (upl_size_t) adjusted_size;
6482 		assert(*upl_size == adjusted_size);
6483 	}
6484 
6485 	if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6486 		*flags = 0;
6487 
6488 		if (!entry->is_sub_map &&
6489 		    VME_OBJECT(entry) != VM_OBJECT_NULL) {
6490 			if (VME_OBJECT(entry)->private) {
6491 				*flags = UPL_DEV_MEMORY;
6492 			}
6493 
6494 			if (VME_OBJECT(entry)->phys_contiguous) {
6495 				*flags |= UPL_PHYS_CONTIG;
6496 			}
6497 		}
6498 		vm_map_unlock_read(map);
6499 		ret = KERN_SUCCESS;
6500 		goto done;
6501 	}
6502 
6503 	offset_in_mapped_page = 0;
6504 	if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6505 		offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6506 		*upl_size = (upl_size_t)
6507 		    (vm_map_round_page(original_offset + adjusted_size,
6508 		    VM_MAP_PAGE_MASK(map))
6509 		    - offset);
6510 
6511 		offset_in_mapped_page = original_offset - offset;
6512 		assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6513 
6514 		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6515 	}
6516 
6517 	if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6518 	    !VME_OBJECT(entry)->phys_contiguous) {
6519 		if (*upl_size > MAX_UPL_SIZE_BYTES) {
6520 			*upl_size = MAX_UPL_SIZE_BYTES;
6521 		}
6522 	}
6523 
6524 	/*
6525 	 *      Create an object if necessary.
6526 	 */
6527 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6528 		if (vm_map_lock_read_to_write(map)) {
6529 			goto REDISCOVER_ENTRY;
6530 		}
6531 
6532 		VME_OBJECT_SET(entry,
6533 		    vm_object_allocate((vm_size_t)
6534 		    vm_object_round_page((entry->vme_end - entry->vme_start))));
6535 		VME_OFFSET_SET(entry, 0);
6536 		assert(entry->use_pmap);
6537 
6538 		vm_map_lock_write_to_read(map);
6539 	}
6540 
6541 	if (!(caller_flags & UPL_COPYOUT_FROM) &&
6542 	    !entry->is_sub_map &&
6543 	    !(entry->protection & VM_PROT_WRITE)) {
6544 		vm_map_unlock_read(map);
6545 		ret = KERN_PROTECTION_FAILURE;
6546 		goto done;
6547 	}
6548 
6549 #if !XNU_TARGET_OS_OSX
6550 	if (map->pmap != kernel_pmap &&
6551 	    (caller_flags & UPL_COPYOUT_FROM) &&
6552 	    (entry->protection & VM_PROT_EXECUTE) &&
6553 	    !(entry->protection & VM_PROT_WRITE)) {
6554 		vm_offset_t     kaddr;
6555 		vm_size_t       ksize;
6556 
6557 		/*
6558 		 * We're about to create a read-only UPL backed by
6559 		 * memory from an executable mapping.
6560 		 * Wiring the pages would result in the pages being copied
6561 		 * (due to the "MAP_PRIVATE" mapping) and no longer
6562 		 * code-signed, so no longer eligible for execution.
6563 		 * Instead, let's copy the data into a kernel buffer and
6564 		 * create the UPL from this kernel buffer.
6565 		 * The kernel buffer is then freed, leaving the UPL holding
6566 		 * the last reference on the VM object, so the memory will
6567 		 * be released when the UPL is committed.
6568 		 */
6569 
6570 		vm_map_unlock_read(map);
6571 		entry = VM_MAP_ENTRY_NULL;
6572 		/* allocate kernel buffer */
6573 		ksize = round_page(*upl_size);
6574 		kaddr = 0;
6575 		ret = kmem_alloc_pageable(kernel_map,
6576 		    &kaddr,
6577 		    ksize,
6578 		    tag);
6579 		if (ret == KERN_SUCCESS) {
6580 			/* copyin the user data */
6581 			ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6582 		}
6583 		if (ret == KERN_SUCCESS) {
6584 			if (ksize > *upl_size) {
6585 				/* zero out the extra space in kernel buffer */
6586 				memset((void *)(kaddr + *upl_size),
6587 				    0,
6588 				    ksize - *upl_size);
6589 			}
6590 			/* create the UPL from the kernel buffer */
6591 			vm_object_offset_t      offset_in_object;
6592 			vm_object_offset_t      offset_in_object_page;
6593 
6594 			offset_in_object = offset - local_entry_start + local_entry_offset;
6595 			offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6596 			assert(offset_in_object_page < PAGE_SIZE);
6597 			assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6598 			*upl_size -= offset_in_object_page + offset_in_mapped_page;
6599 			ret = vm_map_create_upl(kernel_map,
6600 			    (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6601 			    upl_size, upl, page_list, count, flags, tag);
6602 		}
6603 		if (kaddr != 0) {
6604 			/* free the kernel buffer */
6605 			kmem_free(kernel_map, kaddr, ksize);
6606 			kaddr = 0;
6607 			ksize = 0;
6608 		}
6609 #if DEVELOPMENT || DEBUG
6610 		DTRACE_VM4(create_upl_from_executable,
6611 		    vm_map_t, map,
6612 		    vm_map_address_t, offset,
6613 		    upl_size_t, *upl_size,
6614 		    kern_return_t, ret);
6615 #endif /* DEVELOPMENT || DEBUG */
6616 		goto done;
6617 	}
6618 #endif /* !XNU_TARGET_OS_OSX */
6619 
6620 	local_object = VME_OBJECT(entry);
6621 	assert(local_object != VM_OBJECT_NULL);
6622 
6623 	if (!entry->is_sub_map &&
6624 	    !entry->needs_copy &&
6625 	    *upl_size != 0 &&
6626 	    local_object->vo_size > *upl_size && /* partial UPL */
6627 	    entry->wired_count == 0 && /* No COW for entries that are wired */
6628 	    (map->pmap != kernel_pmap) && /* alias checks */
6629 	    (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6630 	    ||
6631 	    ( /* case 2 */
6632 		    local_object->internal &&
6633 		    (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6634 		    local_object->ref_count > 1))) {
6635 		vm_prot_t       prot;
6636 
6637 		/*
6638 		 * Case 1:
6639 		 * Set up the targeted range for copy-on-write to avoid
6640 		 * applying true_share/copy_delay to the entire object.
6641 		 *
6642 		 * Case 2:
6643 		 * This map entry covers only part of an internal
6644 		 * object.  There could be other map entries covering
6645 		 * other areas of this object and some of these map
6646 		 * entries could be marked as "needs_copy", which
6647 		 * assumes that the object is COPY_SYMMETRIC.
6648 		 * To avoid marking this object as COPY_DELAY and
6649 		 * "true_share", let's shadow it and mark the new
6650 		 * (smaller) object as "true_share" and COPY_DELAY.
6651 		 */
6652 
6653 		if (vm_map_lock_read_to_write(map)) {
6654 			goto REDISCOVER_ENTRY;
6655 		}
6656 		vm_map_lock_assert_exclusive(map);
6657 		assert(VME_OBJECT(entry) == local_object);
6658 
6659 		vm_map_clip_start(map,
6660 		    entry,
6661 		    vm_map_trunc_page(offset,
6662 		    VM_MAP_PAGE_MASK(map)));
6663 		vm_map_clip_end(map,
6664 		    entry,
6665 		    vm_map_round_page(offset + *upl_size,
6666 		    VM_MAP_PAGE_MASK(map)));
6667 		if ((entry->vme_end - offset) < *upl_size) {
6668 			*upl_size = (upl_size_t) (entry->vme_end - offset);
6669 			assert(*upl_size == entry->vme_end - offset);
6670 		}
6671 
6672 		prot = entry->protection & ~VM_PROT_WRITE;
6673 		if (override_nx(map, VME_ALIAS(entry)) && prot) {
6674 			prot |= VM_PROT_EXECUTE;
6675 		}
6676 		vm_object_pmap_protect(local_object,
6677 		    VME_OFFSET(entry),
6678 		    entry->vme_end - entry->vme_start,
6679 		    ((entry->is_shared ||
6680 		    map->mapped_in_other_pmaps)
6681 		    ? PMAP_NULL
6682 		    : map->pmap),
6683 		    VM_MAP_PAGE_SIZE(map),
6684 		    entry->vme_start,
6685 		    prot);
6686 
6687 		assert(entry->wired_count == 0);
6688 
6689 		/*
6690 		 * Lock the VM object and re-check its status: if it's mapped
6691 		 * in another address space, we could still be racing with
6692 		 * another thread holding that other VM map exclusively.
6693 		 */
6694 		vm_object_lock(local_object);
6695 		if (local_object->true_share) {
6696 			/* object is already in proper state: no COW needed */
6697 			assert(local_object->copy_strategy !=
6698 			    MEMORY_OBJECT_COPY_SYMMETRIC);
6699 		} else {
6700 			/* not true_share: ask for copy-on-write below */
6701 			assert(local_object->copy_strategy ==
6702 			    MEMORY_OBJECT_COPY_SYMMETRIC);
6703 			entry->needs_copy = TRUE;
6704 		}
6705 		vm_object_unlock(local_object);
6706 
6707 		vm_map_lock_write_to_read(map);
6708 	}
6709 
6710 	if (entry->needs_copy) {
6711 		/*
6712 		 * Honor copy-on-write for COPY_SYMMETRIC
6713 		 * strategy.
6714 		 */
6715 		vm_map_t                local_map;
6716 		vm_object_t             object;
6717 		vm_object_offset_t      new_offset;
6718 		vm_prot_t               prot;
6719 		boolean_t               wired;
6720 		vm_map_version_t        version;
6721 		vm_map_t                real_map;
6722 		vm_prot_t               fault_type;
6723 
6724 		local_map = map;
6725 
6726 		if (caller_flags & UPL_COPYOUT_FROM) {
6727 			fault_type = VM_PROT_READ | VM_PROT_COPY;
6728 			vm_counters.create_upl_extra_cow++;
6729 			vm_counters.create_upl_extra_cow_pages +=
6730 			    (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6731 		} else {
6732 			fault_type = VM_PROT_WRITE;
6733 		}
6734 		if (vm_map_lookup_locked(&local_map,
6735 		    offset, fault_type,
6736 		    OBJECT_LOCK_EXCLUSIVE,
6737 		    &version, &object,
6738 		    &new_offset, &prot, &wired,
6739 		    NULL,
6740 		    &real_map, NULL) != KERN_SUCCESS) {
6741 			if (fault_type == VM_PROT_WRITE) {
6742 				vm_counters.create_upl_lookup_failure_write++;
6743 			} else {
6744 				vm_counters.create_upl_lookup_failure_copy++;
6745 			}
6746 			vm_map_unlock_read(local_map);
6747 			ret = KERN_FAILURE;
6748 			goto done;
6749 		}
6750 		if (real_map != local_map) {
6751 			vm_map_unlock(real_map);
6752 		}
6753 		vm_map_unlock_read(local_map);
6754 
6755 		vm_object_unlock(object);
6756 
6757 		goto REDISCOVER_ENTRY;
6758 	}
6759 
6760 	if (entry->is_sub_map) {
6761 		vm_map_t        submap;
6762 
6763 		submap = VME_SUBMAP(entry);
6764 		local_start = entry->vme_start;
6765 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6766 
6767 		vm_map_reference(submap);
6768 		vm_map_unlock_read(map);
6769 
6770 		DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
6771 		offset += offset_in_mapped_page;
6772 		*upl_size -= offset_in_mapped_page;
6773 
6774 		if (release_map) {
6775 			vm_map_deallocate(map);
6776 		}
6777 		map = submap;
6778 		release_map = TRUE;
6779 		offset = local_offset + (offset - local_start);
6780 		goto start_with_map;
6781 	}
6782 
6783 	if (sync_cow_data &&
6784 	    (VME_OBJECT(entry)->shadow ||
6785 	    VME_OBJECT(entry)->copy)) {
6786 		local_object = VME_OBJECT(entry);
6787 		local_start = entry->vme_start;
6788 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6789 
6790 		vm_object_reference(local_object);
6791 		vm_map_unlock_read(map);
6792 
6793 		if (local_object->shadow && local_object->copy) {
6794 			vm_object_lock_request(local_object->shadow,
6795 			    ((vm_object_offset_t)
6796 			    ((offset - local_start) +
6797 			    local_offset) +
6798 			    local_object->vo_shadow_offset),
6799 			    *upl_size, FALSE,
6800 			    MEMORY_OBJECT_DATA_SYNC,
6801 			    VM_PROT_NO_CHANGE);
6802 		}
6803 		sync_cow_data = FALSE;
6804 		vm_object_deallocate(local_object);
6805 
6806 		goto REDISCOVER_ENTRY;
6807 	}
6808 	if (force_data_sync) {
6809 		local_object = VME_OBJECT(entry);
6810 		local_start = entry->vme_start;
6811 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6812 
6813 		vm_object_reference(local_object);
6814 		vm_map_unlock_read(map);
6815 
6816 		vm_object_lock_request(local_object,
6817 		    ((vm_object_offset_t)
6818 		    ((offset - local_start) +
6819 		    local_offset)),
6820 		    (vm_object_size_t)*upl_size,
6821 		    FALSE,
6822 		    MEMORY_OBJECT_DATA_SYNC,
6823 		    VM_PROT_NO_CHANGE);
6824 
6825 		force_data_sync = FALSE;
6826 		vm_object_deallocate(local_object);
6827 
6828 		goto REDISCOVER_ENTRY;
6829 	}
6830 	if (VME_OBJECT(entry)->private) {
6831 		*flags = UPL_DEV_MEMORY;
6832 	} else {
6833 		*flags = 0;
6834 	}
6835 
6836 	if (VME_OBJECT(entry)->phys_contiguous) {
6837 		*flags |= UPL_PHYS_CONTIG;
6838 	}
6839 
6840 	local_object = VME_OBJECT(entry);
6841 	local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6842 	local_start = entry->vme_start;
6843 
6844 	/*
6845 	 * Wiring will copy the pages to the shadow object.
6846 	 * The shadow object will not be code-signed so
6847 	 * attempting to execute code from these copied pages
6848 	 * would trigger a code-signing violation.
6849 	 */
6850 	if (entry->protection & VM_PROT_EXECUTE) {
6851 #if MACH_ASSERT
6852 		printf("pid %d[%s] create_upl out of executable range from "
6853 		    "0x%llx to 0x%llx: side effects may include "
6854 		    "code-signing violations later on\n",
6855 		    proc_selfpid(),
6856 		    (current_task()->bsd_info
6857 		    ? proc_name_address(current_task()->bsd_info)
6858 		    : "?"),
6859 		    (uint64_t) entry->vme_start,
6860 		    (uint64_t) entry->vme_end);
6861 #endif /* MACH_ASSERT */
6862 		DTRACE_VM2(cs_executable_create_upl,
6863 		    uint64_t, (uint64_t)entry->vme_start,
6864 		    uint64_t, (uint64_t)entry->vme_end);
6865 		cs_executable_create_upl++;
6866 	}
6867 
6868 	vm_object_lock(local_object);
6869 
6870 	/*
6871 	 * Ensure that this object is "true_share" and "copy_delay" now,
6872 	 * while we're still holding the VM map lock.  After we unlock the map,
6873 	 * anything could happen to that mapping, including some copy-on-write
6874 	 * activity.  We need to make sure that the IOPL will point at the
6875 	 * same memory as the mapping.
6876 	 */
6877 	if (local_object->true_share) {
6878 		assert(local_object->copy_strategy !=
6879 		    MEMORY_OBJECT_COPY_SYMMETRIC);
6880 	} else if (local_object != kernel_object &&
6881 	    local_object != compressor_object &&
6882 	    !local_object->phys_contiguous) {
6883 #if VM_OBJECT_TRACKING_OP_TRUESHARE
6884 		if (!local_object->true_share &&
6885 		    vm_object_tracking_btlog) {
6886 			btlog_record(vm_object_tracking_btlog, local_object,
6887 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
6888 			    btref_get(__builtin_frame_address(0), 0));
6889 		}
6890 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
6891 		local_object->true_share = TRUE;
6892 		if (local_object->copy_strategy ==
6893 		    MEMORY_OBJECT_COPY_SYMMETRIC) {
6894 			local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6895 		}
6896 	}
6897 
6898 	vm_object_reference_locked(local_object);
6899 	vm_object_unlock(local_object);
6900 
6901 	vm_map_unlock_read(map);
6902 
6903 	offset += offset_in_mapped_page;
6904 	assert(*upl_size > offset_in_mapped_page);
6905 	*upl_size -= offset_in_mapped_page;
6906 
6907 	ret = vm_object_iopl_request(local_object,
6908 	    ((vm_object_offset_t)
6909 	    ((offset - local_start) + local_offset)),
6910 	    *upl_size,
6911 	    upl,
6912 	    page_list,
6913 	    count,
6914 	    caller_flags,
6915 	    tag);
6916 	vm_object_deallocate(local_object);
6917 
6918 done:
6919 	if (release_map) {
6920 		vm_map_deallocate(map);
6921 	}
6922 
6923 	return ret;
6924 }
6925 
6926 /*
6927  * Internal routine to enter a UPL into a VM map.
6928  *
6929  * JMM - This should just be doable through the standard
6930  * vm_map_enter() API.
6931  */
6932 kern_return_t
vm_map_enter_upl_range(vm_map_t map,upl_t upl,vm_object_offset_t offset_to_map,upl_size_t size_to_map,vm_prot_t prot_to_map,vm_map_offset_t * dst_addr)6933 vm_map_enter_upl_range(
6934 	vm_map_t                map,
6935 	upl_t                   upl,
6936 	vm_object_offset_t      offset_to_map,
6937 	upl_size_t              size_to_map,
6938 	vm_prot_t               prot_to_map,
6939 	vm_map_offset_t         *dst_addr)
6940 {
6941 	vm_map_size_t           size;
6942 	vm_object_offset_t      offset;
6943 	vm_map_offset_t         addr;
6944 	vm_page_t               m;
6945 	kern_return_t           kr;
6946 	int                     isVectorUPL = 0, curr_upl = 0;
6947 	upl_t                   vector_upl = NULL;
6948 	vm_offset_t             vector_upl_dst_addr = 0;
6949 	vm_map_t                vector_upl_submap = NULL;
6950 	upl_offset_t            subupl_offset = 0;
6951 	upl_size_t              subupl_size = 0;
6952 
6953 	if (upl == UPL_NULL) {
6954 		return KERN_INVALID_ARGUMENT;
6955 	}
6956 
6957 	DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
6958 	assert(map == kernel_map);
6959 
6960 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
6961 		int mapped = 0, valid_upls = 0;
6962 		vector_upl = upl;
6963 
6964 		upl_lock(vector_upl);
6965 		for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
6966 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
6967 			if (upl == NULL) {
6968 				continue;
6969 			}
6970 			valid_upls++;
6971 			if (UPL_PAGE_LIST_MAPPED & upl->flags) {
6972 				mapped++;
6973 			}
6974 		}
6975 
6976 		if (mapped) {
6977 			if (mapped != valid_upls) {
6978 				panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
6979 			} else {
6980 				upl_unlock(vector_upl);
6981 				return KERN_FAILURE;
6982 			}
6983 		}
6984 
6985 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
6986 			panic("TODO4K: vector UPL not implemented");
6987 		}
6988 
6989 		kr = kmem_suballoc(map, &vector_upl_dst_addr,
6990 		    vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
6991 		    VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE,
6992 		    &vector_upl_submap);
6993 		if (kr != KERN_SUCCESS) {
6994 			panic("Vector UPL submap allocation failed");
6995 		}
6996 		map = vector_upl_submap;
6997 		vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
6998 		curr_upl = 0;
6999 	} else {
7000 		upl_lock(upl);
7001 	}
7002 
7003 process_upl_to_enter:
7004 	if (isVectorUPL) {
7005 		if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
7006 			*dst_addr = vector_upl_dst_addr;
7007 			upl_unlock(vector_upl);
7008 			return KERN_SUCCESS;
7009 		}
7010 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7011 		if (upl == NULL) {
7012 			goto process_upl_to_enter;
7013 		}
7014 
7015 		vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7016 		*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7017 	} else {
7018 		/*
7019 		 * check to see if already mapped
7020 		 */
7021 		if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7022 			upl_unlock(upl);
7023 			return KERN_FAILURE;
7024 		}
7025 	}
7026 
7027 	if ((!(upl->flags & UPL_SHADOWED)) &&
7028 	    ((upl->flags & UPL_HAS_BUSY) ||
7029 	    !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7030 		vm_object_t             object;
7031 		vm_page_t               alias_page;
7032 		vm_object_offset_t      new_offset;
7033 		unsigned int            pg_num;
7034 		wpl_array_t             lite_list;
7035 
7036 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7037 		if (upl->flags & UPL_INTERNAL) {
7038 			lite_list = (wpl_array_t)
7039 			    ((((uintptr_t)upl) + sizeof(struct upl))
7040 			    + ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
7041 		} else {
7042 			lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
7043 		}
7044 		object = upl->map_object;
7045 		upl->map_object = vm_object_allocate(vm_object_round_page(size));
7046 
7047 		vm_object_lock(upl->map_object);
7048 
7049 		upl->map_object->shadow = object;
7050 		upl->map_object->pageout = TRUE;
7051 		upl->map_object->can_persist = FALSE;
7052 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7053 		upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7054 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
7055 		    "object %p shadow_offset 0x%llx",
7056 		    upl->map_object,
7057 		    (uint64_t)upl->map_object->vo_shadow_offset);
7058 		upl->map_object->wimg_bits = object->wimg_bits;
7059 		offset = upl->map_object->vo_shadow_offset;
7060 		new_offset = 0;
7061 
7062 		upl->flags |= UPL_SHADOWED;
7063 
7064 		while (size) {
7065 			pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7066 			assert(pg_num == new_offset / PAGE_SIZE);
7067 
7068 			if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
7069 				alias_page = vm_page_grab_fictitious(TRUE);
7070 
7071 				vm_object_lock(object);
7072 
7073 				m = vm_page_lookup(object, offset);
7074 				if (m == VM_PAGE_NULL) {
7075 					panic("vm_upl_map: page missing");
7076 				}
7077 
7078 				/*
7079 				 * Convert the fictitious page to a private
7080 				 * shadow of the real page.
7081 				 */
7082 				assert(alias_page->vmp_fictitious);
7083 				alias_page->vmp_fictitious = FALSE;
7084 				alias_page->vmp_private = TRUE;
7085 				alias_page->vmp_free_when_done = TRUE;
7086 				/*
7087 				 * since m is a page in the upl it must
7088 				 * already be wired or BUSY, so it's
7089 				 * safe to assign the underlying physical
7090 				 * page to the alias
7091 				 */
7092 				VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7093 
7094 				vm_object_unlock(object);
7095 
7096 				vm_page_lockspin_queues();
7097 				vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7098 				vm_page_unlock_queues();
7099 
7100 				vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7101 
7102 				assert(!alias_page->vmp_wanted);
7103 				alias_page->vmp_busy = FALSE;
7104 				alias_page->vmp_absent = FALSE;
7105 			}
7106 			size -= PAGE_SIZE;
7107 			offset += PAGE_SIZE_64;
7108 			new_offset += PAGE_SIZE_64;
7109 		}
7110 		vm_object_unlock(upl->map_object);
7111 	}
7112 	if (upl->flags & UPL_SHADOWED) {
7113 		if (isVectorUPL) {
7114 			offset = 0;
7115 		} else {
7116 			offset = offset_to_map;
7117 		}
7118 	} else {
7119 		offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7120 		if (!isVectorUPL) {
7121 			offset += offset_to_map;
7122 		}
7123 	}
7124 
7125 	if (isVectorUPL) {
7126 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7127 	} else {
7128 		size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7129 	}
7130 
7131 	vm_object_reference(upl->map_object);
7132 
7133 	if (!isVectorUPL) {
7134 		*dst_addr = 0;
7135 		/*
7136 		 * NEED A UPL_MAP ALIAS
7137 		 */
7138 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7139 		    VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_OSFMK,
7140 		    upl->map_object, offset, FALSE,
7141 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7142 
7143 		if (kr != KERN_SUCCESS) {
7144 			vm_object_deallocate(upl->map_object);
7145 			upl_unlock(upl);
7146 			return kr;
7147 		}
7148 	} else {
7149 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7150 		    VM_FLAGS_FIXED, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_OSFMK,
7151 		    upl->map_object, offset, FALSE,
7152 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7153 		if (kr) {
7154 			panic("vm_map_enter failed for a Vector UPL");
7155 		}
7156 	}
7157 	upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7158 	                                        /* this will have to be an increment rather than */
7159 	                                        /* an assignment. */
7160 	vm_object_lock(upl->map_object);
7161 
7162 	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7163 		m = vm_page_lookup(upl->map_object, offset);
7164 
7165 		if (m) {
7166 			m->vmp_pmapped = TRUE;
7167 
7168 			/* CODE SIGNING ENFORCEMENT: page has been wpmapped,
7169 			 * but only in kernel space. If this was on a user map,
7170 			 * we'd have to set the wpmapped bit. */
7171 			/* m->vmp_wpmapped = TRUE; */
7172 			assert(map->pmap == kernel_pmap);
7173 
7174 			PMAP_ENTER(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE, kr);
7175 
7176 			assert(kr == KERN_SUCCESS);
7177 #if KASAN
7178 			kasan_notify_address(addr, PAGE_SIZE_64);
7179 #endif
7180 		}
7181 		offset += PAGE_SIZE_64;
7182 	}
7183 	vm_object_unlock(upl->map_object);
7184 
7185 	/*
7186 	 * hold a reference for the mapping
7187 	 */
7188 	upl->ref_count++;
7189 	upl->flags |= UPL_PAGE_LIST_MAPPED;
7190 	upl->kaddr = (vm_offset_t) *dst_addr;
7191 	assert(upl->kaddr == *dst_addr);
7192 
7193 	if (isVectorUPL) {
7194 		goto process_upl_to_enter;
7195 	}
7196 
7197 	if (!isVectorUPL) {
7198 		vm_map_offset_t addr_adjustment;
7199 
7200 		addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7201 		if (addr_adjustment) {
7202 			assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7203 			DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7204 			*dst_addr += addr_adjustment;
7205 		}
7206 	}
7207 
7208 	upl_unlock(upl);
7209 
7210 	return KERN_SUCCESS;
7211 }
7212 
7213 kern_return_t
vm_map_enter_upl(vm_map_t map,upl_t upl,vm_map_offset_t * dst_addr)7214 vm_map_enter_upl(
7215 	vm_map_t                map,
7216 	upl_t                   upl,
7217 	vm_map_offset_t         *dst_addr)
7218 {
7219 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7220 	return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7221 }
7222 
7223 /*
7224  * Internal routine to remove a UPL mapping from a VM map.
7225  *
7226  * XXX - This should just be doable through a standard
7227  * vm_map_remove() operation.  Otherwise, implicit clean-up
7228  * of the target map won't be able to correctly remove
7229  * these (and release the reference on the UPL).  Having
7230  * to do this means we can't map these into user-space
7231  * maps yet.
7232  */
7233 kern_return_t
vm_map_remove_upl_range(vm_map_t map,upl_t upl,__unused vm_object_offset_t offset_to_unmap,__unused upl_size_t size_to_unmap)7234 vm_map_remove_upl_range(
7235 	vm_map_t        map,
7236 	upl_t           upl,
7237 	__unused vm_object_offset_t    offset_to_unmap,
7238 	__unused upl_size_t      size_to_unmap)
7239 {
7240 	vm_address_t    addr;
7241 	upl_size_t      size;
7242 	int             isVectorUPL = 0, curr_upl = 0;
7243 	upl_t           vector_upl = NULL;
7244 
7245 	if (upl == UPL_NULL) {
7246 		return KERN_INVALID_ARGUMENT;
7247 	}
7248 
7249 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7250 		int     unmapped = 0, valid_upls = 0;
7251 		vector_upl = upl;
7252 		upl_lock(vector_upl);
7253 		for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
7254 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7255 			if (upl == NULL) {
7256 				continue;
7257 			}
7258 			valid_upls++;
7259 			if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7260 				unmapped++;
7261 			}
7262 		}
7263 
7264 		if (unmapped) {
7265 			if (unmapped != valid_upls) {
7266 				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
7267 			} else {
7268 				upl_unlock(vector_upl);
7269 				return KERN_FAILURE;
7270 			}
7271 		}
7272 		curr_upl = 0;
7273 	} else {
7274 		upl_lock(upl);
7275 	}
7276 
7277 process_upl_to_remove:
7278 	if (isVectorUPL) {
7279 		if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
7280 			vm_map_t v_upl_submap;
7281 			vm_offset_t v_upl_submap_dst_addr;
7282 			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7283 
7284 			vm_map_remove(map, v_upl_submap_dst_addr,
7285 			    v_upl_submap_dst_addr + vector_upl->u_size,
7286 			    VM_MAP_REMOVE_NO_FLAGS);
7287 			vm_map_deallocate(v_upl_submap);
7288 			upl_unlock(vector_upl);
7289 			return KERN_SUCCESS;
7290 		}
7291 
7292 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7293 		if (upl == NULL) {
7294 			goto process_upl_to_remove;
7295 		}
7296 	}
7297 
7298 	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7299 		addr = upl->kaddr;
7300 		size = upl->u_mapped_size;
7301 
7302 		assert(upl->ref_count > 1);
7303 		upl->ref_count--;               /* removing mapping ref */
7304 
7305 		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7306 		upl->kaddr = (vm_offset_t) 0;
7307 		upl->u_mapped_size = 0;
7308 
7309 		if (!isVectorUPL) {
7310 			upl_unlock(upl);
7311 
7312 			vm_map_remove(
7313 				map,
7314 				vm_map_trunc_page(addr,
7315 				VM_MAP_PAGE_MASK(map)),
7316 				vm_map_round_page(addr + size,
7317 				VM_MAP_PAGE_MASK(map)),
7318 				VM_MAP_REMOVE_NO_FLAGS);
7319 			return KERN_SUCCESS;
7320 		} else {
7321 			/*
7322 			 * If it's a Vectored UPL, we'll be removing the entire
7323 			 * submap anyways, so no need to remove individual UPL
7324 			 * element mappings from within the submap
7325 			 */
7326 			goto process_upl_to_remove;
7327 		}
7328 	}
7329 	upl_unlock(upl);
7330 
7331 	return KERN_FAILURE;
7332 }
7333 
7334 kern_return_t
vm_map_remove_upl(vm_map_t map,upl_t upl)7335 vm_map_remove_upl(
7336 	vm_map_t        map,
7337 	upl_t           upl)
7338 {
7339 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7340 	return vm_map_remove_upl_range(map, upl, 0, upl_size);
7341 }
7342 
7343 kern_return_t
upl_commit_range(upl_t upl,upl_offset_t offset,upl_size_t size,int flags,upl_page_info_t * page_list,mach_msg_type_number_t count,boolean_t * empty)7344 upl_commit_range(
7345 	upl_t                   upl,
7346 	upl_offset_t            offset,
7347 	upl_size_t              size,
7348 	int                     flags,
7349 	upl_page_info_t         *page_list,
7350 	mach_msg_type_number_t  count,
7351 	boolean_t               *empty)
7352 {
7353 	upl_size_t              xfer_size, subupl_size;
7354 	vm_object_t             shadow_object;
7355 	vm_object_t             object;
7356 	vm_object_t             m_object;
7357 	vm_object_offset_t      target_offset;
7358 	upl_offset_t            subupl_offset = offset;
7359 	int                     entry;
7360 	wpl_array_t             lite_list;
7361 	int                     occupied;
7362 	int                     clear_refmod = 0;
7363 	int                     pgpgout_count = 0;
7364 	struct  vm_page_delayed_work    dw_array;
7365 	struct  vm_page_delayed_work    *dwp, *dwp_start;
7366 	bool                    dwp_finish_ctx = TRUE;
7367 	int                     dw_count;
7368 	int                     dw_limit;
7369 	int                     isVectorUPL = 0;
7370 	upl_t                   vector_upl = NULL;
7371 	boolean_t               should_be_throttled = FALSE;
7372 
7373 	vm_page_t               nxt_page = VM_PAGE_NULL;
7374 	int                     fast_path_possible = 0;
7375 	int                     fast_path_full_commit = 0;
7376 	int                     throttle_page = 0;
7377 	int                     unwired_count = 0;
7378 	int                     local_queue_count = 0;
7379 	vm_page_t               first_local, last_local;
7380 	vm_object_offset_t      obj_start, obj_end, obj_offset;
7381 	kern_return_t           kr = KERN_SUCCESS;
7382 
7383 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7384 
7385 	dwp_start = dwp = NULL;
7386 
7387 	subupl_size = size;
7388 	*empty = FALSE;
7389 
7390 	if (upl == UPL_NULL) {
7391 		return KERN_INVALID_ARGUMENT;
7392 	}
7393 
7394 	dw_count = 0;
7395 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7396 	dwp_start = vm_page_delayed_work_get_ctx();
7397 	if (dwp_start == NULL) {
7398 		dwp_start = &dw_array;
7399 		dw_limit = 1;
7400 		dwp_finish_ctx = FALSE;
7401 	}
7402 
7403 	dwp = dwp_start;
7404 
7405 	if (count == 0) {
7406 		page_list = NULL;
7407 	}
7408 
7409 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7410 		vector_upl = upl;
7411 		upl_lock(vector_upl);
7412 	} else {
7413 		upl_lock(upl);
7414 	}
7415 
7416 process_upl_to_commit:
7417 
7418 	if (isVectorUPL) {
7419 		size = subupl_size;
7420 		offset = subupl_offset;
7421 		if (size == 0) {
7422 			upl_unlock(vector_upl);
7423 			kr = KERN_SUCCESS;
7424 			goto done;
7425 		}
7426 		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7427 		if (upl == NULL) {
7428 			upl_unlock(vector_upl);
7429 			kr = KERN_FAILURE;
7430 			goto done;
7431 		}
7432 		page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
7433 		subupl_size -= size;
7434 		subupl_offset += size;
7435 	}
7436 
7437 #if UPL_DEBUG
7438 	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7439 		(void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7440 
7441 		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7442 		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7443 
7444 		upl->upl_commit_index++;
7445 	}
7446 #endif
7447 	if (upl->flags & UPL_DEVICE_MEMORY) {
7448 		xfer_size = 0;
7449 	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7450 		xfer_size = size;
7451 	} else {
7452 		if (!isVectorUPL) {
7453 			upl_unlock(upl);
7454 		} else {
7455 			upl_unlock(vector_upl);
7456 		}
7457 		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7458 		kr = KERN_FAILURE;
7459 		goto done;
7460 	}
7461 	if (upl->flags & UPL_SET_DIRTY) {
7462 		flags |= UPL_COMMIT_SET_DIRTY;
7463 	}
7464 	if (upl->flags & UPL_CLEAR_DIRTY) {
7465 		flags |= UPL_COMMIT_CLEAR_DIRTY;
7466 	}
7467 
7468 	if (upl->flags & UPL_INTERNAL) {
7469 		lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
7470 		    + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));
7471 	} else {
7472 		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
7473 	}
7474 
7475 	object = upl->map_object;
7476 
7477 	if (upl->flags & UPL_SHADOWED) {
7478 		vm_object_lock(object);
7479 		shadow_object = object->shadow;
7480 	} else {
7481 		shadow_object = object;
7482 	}
7483 	entry = offset / PAGE_SIZE;
7484 	target_offset = (vm_object_offset_t)offset;
7485 
7486 	if (upl->flags & UPL_KERNEL_OBJECT) {
7487 		vm_object_lock_shared(shadow_object);
7488 	} else {
7489 		vm_object_lock(shadow_object);
7490 	}
7491 
7492 	VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7493 
7494 	if (upl->flags & UPL_ACCESS_BLOCKED) {
7495 		assert(shadow_object->blocked_access);
7496 		shadow_object->blocked_access = FALSE;
7497 		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7498 	}
7499 
7500 	if (shadow_object->code_signed) {
7501 		/*
7502 		 * CODE SIGNING:
7503 		 * If the object is code-signed, do not let this UPL tell
7504 		 * us if the pages are valid or not.  Let the pages be
7505 		 * validated by VM the normal way (when they get mapped or
7506 		 * copied).
7507 		 */
7508 		flags &= ~UPL_COMMIT_CS_VALIDATED;
7509 	}
7510 	if (!page_list) {
7511 		/*
7512 		 * No page list to get the code-signing info from !?
7513 		 */
7514 		flags &= ~UPL_COMMIT_CS_VALIDATED;
7515 	}
7516 	if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7517 		should_be_throttled = TRUE;
7518 	}
7519 
7520 	if ((upl->flags & UPL_IO_WIRE) &&
7521 	    !(flags & UPL_COMMIT_FREE_ABSENT) &&
7522 	    !isVectorUPL &&
7523 	    shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7524 	    shadow_object->purgable != VM_PURGABLE_EMPTY) {
7525 		if (!vm_page_queue_empty(&shadow_object->memq)) {
7526 			if (size == shadow_object->vo_size) {
7527 				nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7528 				fast_path_full_commit = 1;
7529 			}
7530 			fast_path_possible = 1;
7531 
7532 			if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7533 			    (shadow_object->purgable == VM_PURGABLE_DENY ||
7534 			    shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7535 			    shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7536 				throttle_page = 1;
7537 			}
7538 		}
7539 	}
7540 	first_local = VM_PAGE_NULL;
7541 	last_local = VM_PAGE_NULL;
7542 
7543 	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7544 	obj_end = obj_start + xfer_size;
7545 	obj_start = vm_object_trunc_page(obj_start);
7546 	obj_end = vm_object_round_page(obj_end);
7547 	for (obj_offset = obj_start;
7548 	    obj_offset < obj_end;
7549 	    obj_offset += PAGE_SIZE) {
7550 		vm_page_t       t, m;
7551 
7552 		dwp->dw_mask = 0;
7553 		clear_refmod = 0;
7554 
7555 		m = VM_PAGE_NULL;
7556 
7557 		if (upl->flags & UPL_LITE) {
7558 			unsigned int    pg_num;
7559 
7560 			if (nxt_page != VM_PAGE_NULL) {
7561 				m = nxt_page;
7562 				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7563 				target_offset = m->vmp_offset;
7564 			}
7565 			pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7566 			assert(pg_num == target_offset / PAGE_SIZE);
7567 
7568 			if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
7569 				lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
7570 
7571 				if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7572 					m = vm_page_lookup(shadow_object, obj_offset);
7573 				}
7574 			} else {
7575 				m = NULL;
7576 			}
7577 		}
7578 		if (upl->flags & UPL_SHADOWED) {
7579 			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7580 				t->vmp_free_when_done = FALSE;
7581 
7582 				VM_PAGE_FREE(t);
7583 
7584 				if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7585 					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7586 				}
7587 			}
7588 		}
7589 		if (m == VM_PAGE_NULL) {
7590 			goto commit_next_page;
7591 		}
7592 
7593 		m_object = VM_PAGE_OBJECT(m);
7594 
7595 		if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7596 			assert(m->vmp_busy);
7597 
7598 			dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7599 			goto commit_next_page;
7600 		}
7601 
7602 		if (flags & UPL_COMMIT_CS_VALIDATED) {
7603 			/*
7604 			 * CODE SIGNING:
7605 			 * Set the code signing bits according to
7606 			 * what the UPL says they should be.
7607 			 */
7608 			m->vmp_cs_validated |= page_list[entry].cs_validated;
7609 			m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7610 			m->vmp_cs_nx |= page_list[entry].cs_nx;
7611 		}
7612 		if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7613 			m->vmp_written_by_kernel = TRUE;
7614 		}
7615 
7616 		if (upl->flags & UPL_IO_WIRE) {
7617 			if (page_list) {
7618 				page_list[entry].phys_addr = 0;
7619 			}
7620 
7621 			if (flags & UPL_COMMIT_SET_DIRTY) {
7622 				SET_PAGE_DIRTY(m, FALSE);
7623 			} else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7624 				m->vmp_dirty = FALSE;
7625 
7626 				if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7627 				    m->vmp_cs_validated &&
7628 				    m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7629 					/*
7630 					 * CODE SIGNING:
7631 					 * This page is no longer dirty
7632 					 * but could have been modified,
7633 					 * so it will need to be
7634 					 * re-validated.
7635 					 */
7636 					m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7637 
7638 					VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7639 
7640 					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7641 				}
7642 				clear_refmod |= VM_MEM_MODIFIED;
7643 			}
7644 			if (upl->flags & UPL_ACCESS_BLOCKED) {
7645 				/*
7646 				 * We blocked access to the pages in this UPL.
7647 				 * Clear the "busy" bit and wake up any waiter
7648 				 * for this page.
7649 				 */
7650 				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7651 			}
7652 			if (fast_path_possible) {
7653 				assert(m_object->purgable != VM_PURGABLE_EMPTY);
7654 				assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7655 				if (m->vmp_absent) {
7656 					assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7657 					assert(m->vmp_wire_count == 0);
7658 					assert(m->vmp_busy);
7659 
7660 					m->vmp_absent = FALSE;
7661 					dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7662 				} else {
7663 					if (m->vmp_wire_count == 0) {
7664 						panic("wire_count == 0, m = %p, obj = %p", m, shadow_object);
7665 					}
7666 					assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7667 
7668 					/*
7669 					 * XXX FBDP need to update some other
7670 					 * counters here (purgeable_wired_count)
7671 					 * (ledgers), ...
7672 					 */
7673 					assert(m->vmp_wire_count > 0);
7674 					m->vmp_wire_count--;
7675 
7676 					if (m->vmp_wire_count == 0) {
7677 						m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7678 						unwired_count++;
7679 					}
7680 				}
7681 				if (m->vmp_wire_count == 0) {
7682 					assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7683 
7684 					if (last_local == VM_PAGE_NULL) {
7685 						assert(first_local == VM_PAGE_NULL);
7686 
7687 						last_local = m;
7688 						first_local = m;
7689 					} else {
7690 						assert(first_local != VM_PAGE_NULL);
7691 
7692 						m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7693 						first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7694 						first_local = m;
7695 					}
7696 					local_queue_count++;
7697 
7698 					if (throttle_page) {
7699 						m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7700 					} else {
7701 						if (flags & UPL_COMMIT_INACTIVATE) {
7702 							if (shadow_object->internal) {
7703 								m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7704 							} else {
7705 								m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7706 							}
7707 						} else {
7708 							m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7709 						}
7710 					}
7711 				}
7712 			} else {
7713 				if (flags & UPL_COMMIT_INACTIVATE) {
7714 					dwp->dw_mask |= DW_vm_page_deactivate_internal;
7715 					clear_refmod |= VM_MEM_REFERENCED;
7716 				}
7717 				if (m->vmp_absent) {
7718 					if (flags & UPL_COMMIT_FREE_ABSENT) {
7719 						dwp->dw_mask |= DW_vm_page_free;
7720 					} else {
7721 						m->vmp_absent = FALSE;
7722 						dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7723 
7724 						if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7725 							dwp->dw_mask |= DW_vm_page_activate;
7726 						}
7727 					}
7728 				} else {
7729 					dwp->dw_mask |= DW_vm_page_unwire;
7730 				}
7731 			}
7732 			goto commit_next_page;
7733 		}
7734 		assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7735 
7736 		if (page_list) {
7737 			page_list[entry].phys_addr = 0;
7738 		}
7739 
7740 		/*
7741 		 * make sure to clear the hardware
7742 		 * modify or reference bits before
7743 		 * releasing the BUSY bit on this page
7744 		 * otherwise we risk losing a legitimate
7745 		 * change of state
7746 		 */
7747 		if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7748 			m->vmp_dirty = FALSE;
7749 
7750 			clear_refmod |= VM_MEM_MODIFIED;
7751 		}
7752 		if (m->vmp_laundry) {
7753 			dwp->dw_mask |= DW_vm_pageout_throttle_up;
7754 		}
7755 
7756 		if (VM_PAGE_WIRED(m)) {
7757 			m->vmp_free_when_done = FALSE;
7758 		}
7759 
7760 		if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7761 		    m->vmp_cs_validated &&
7762 		    m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7763 			/*
7764 			 * CODE SIGNING:
7765 			 * This page is no longer dirty
7766 			 * but could have been modified,
7767 			 * so it will need to be
7768 			 * re-validated.
7769 			 */
7770 			m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7771 
7772 			VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7773 
7774 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7775 		}
7776 		if (m->vmp_overwriting) {
7777 			/*
7778 			 * the (COPY_OUT_FROM == FALSE) request_page_list case
7779 			 */
7780 			if (m->vmp_busy) {
7781 #if CONFIG_PHANTOM_CACHE
7782 				if (m->vmp_absent && !m_object->internal) {
7783 					dwp->dw_mask |= DW_vm_phantom_cache_update;
7784 				}
7785 #endif
7786 				m->vmp_absent = FALSE;
7787 
7788 				dwp->dw_mask |= DW_clear_busy;
7789 			} else {
7790 				/*
7791 				 * alternate (COPY_OUT_FROM == FALSE) page_list case
7792 				 * Occurs when the original page was wired
7793 				 * at the time of the list request
7794 				 */
7795 				assert(VM_PAGE_WIRED(m));
7796 
7797 				dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
7798 			}
7799 			m->vmp_overwriting = FALSE;
7800 		}
7801 		m->vmp_cleaning = FALSE;
7802 
7803 		if (m->vmp_free_when_done) {
7804 			/*
7805 			 * With the clean queue enabled, UPL_PAGEOUT should
7806 			 * no longer set the pageout bit. Its pages now go
7807 			 * to the clean queue.
7808 			 *
7809 			 * We don't use the cleaned Q anymore and so this
7810 			 * assert isn't correct. The code for the clean Q
7811 			 * still exists and might be used in the future. If we
7812 			 * go back to the cleaned Q, we will re-enable this
7813 			 * assert.
7814 			 *
7815 			 * assert(!(upl->flags & UPL_PAGEOUT));
7816 			 */
7817 			assert(!m_object->internal);
7818 
7819 			m->vmp_free_when_done = FALSE;
7820 
7821 			if ((flags & UPL_COMMIT_SET_DIRTY) ||
7822 			    (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
7823 				/*
7824 				 * page was re-dirtied after we started
7825 				 * the pageout... reactivate it since
7826 				 * we don't know whether the on-disk
7827 				 * copy matches what is now in memory
7828 				 */
7829 				SET_PAGE_DIRTY(m, FALSE);
7830 
7831 				dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
7832 
7833 				if (upl->flags & UPL_PAGEOUT) {
7834 					counter_inc(&vm_statistics_reactivations);
7835 					DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
7836 				}
7837 			} else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) {
7838 				/*
7839 				 * Someone else might still be handling this
7840 				 * page (vm_fault() for example), so let's not
7841 				 * free it or "un-busy" it!
7842 				 * Put that page in the "speculative" queue
7843 				 * for now (since we would otherwise have freed
7844 				 * it) and let whoever is keeping the page
7845 				 * "busy" move it if needed when they're done
7846 				 * with it.
7847 				 */
7848 				dwp->dw_mask |= DW_vm_page_speculate;
7849 			} else {
7850 				/*
7851 				 * page has been successfully cleaned
7852 				 * go ahead and free it for other use
7853 				 */
7854 				if (m_object->internal) {
7855 					DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
7856 				} else {
7857 					DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
7858 				}
7859 				m->vmp_dirty = FALSE;
7860 				if (!(upl->flags & UPL_HAS_BUSY)) {
7861 					assert(!m->vmp_busy);
7862 				}
7863 				m->vmp_busy = TRUE;
7864 
7865 				dwp->dw_mask |= DW_vm_page_free;
7866 			}
7867 			goto commit_next_page;
7868 		}
7869 		/*
7870 		 * It is a part of the semantic of COPYOUT_FROM
7871 		 * UPLs that a commit implies cache sync
7872 		 * between the vm page and the backing store
7873 		 * this can be used to strip the precious bit
7874 		 * as well as clean
7875 		 */
7876 		if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
7877 			m->vmp_precious = FALSE;
7878 		}
7879 
7880 		if (flags & UPL_COMMIT_SET_DIRTY) {
7881 			SET_PAGE_DIRTY(m, FALSE);
7882 		} else {
7883 			m->vmp_dirty = FALSE;
7884 		}
7885 
7886 		/* with the clean queue on, move *all* cleaned pages to the clean queue */
7887 		if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
7888 			pgpgout_count++;
7889 
7890 			counter_inc(&vm_statistics_pageouts);
7891 			DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
7892 
7893 			dwp->dw_mask |= DW_enqueue_cleaned;
7894 		} else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
7895 			/*
7896 			 * page coming back in from being 'frozen'...
7897 			 * it was dirty before it was frozen, so keep it so
7898 			 * the vm_page_activate will notice that it really belongs
7899 			 * on the throttle queue and put it there
7900 			 */
7901 			SET_PAGE_DIRTY(m, FALSE);
7902 			dwp->dw_mask |= DW_vm_page_activate;
7903 		} else {
7904 			if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
7905 				dwp->dw_mask |= DW_vm_page_deactivate_internal;
7906 				clear_refmod |= VM_MEM_REFERENCED;
7907 			} else if (!VM_PAGE_PAGEABLE(m)) {
7908 				if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
7909 					dwp->dw_mask |= DW_vm_page_speculate;
7910 				} else if (m->vmp_reference) {
7911 					dwp->dw_mask |= DW_vm_page_activate;
7912 				} else {
7913 					dwp->dw_mask |= DW_vm_page_deactivate_internal;
7914 					clear_refmod |= VM_MEM_REFERENCED;
7915 				}
7916 			}
7917 		}
7918 		if (upl->flags & UPL_ACCESS_BLOCKED) {
7919 			/*
7920 			 * We blocked access to the pages in this URL.
7921 			 * Clear the "busy" bit on this page before we
7922 			 * wake up any waiter.
7923 			 */
7924 			dwp->dw_mask |= DW_clear_busy;
7925 		}
7926 		/*
7927 		 * Wakeup any thread waiting for the page to be un-cleaning.
7928 		 */
7929 		dwp->dw_mask |= DW_PAGE_WAKEUP;
7930 
7931 commit_next_page:
7932 		if (clear_refmod) {
7933 			pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
7934 		}
7935 
7936 		target_offset += PAGE_SIZE_64;
7937 		xfer_size -= PAGE_SIZE;
7938 		entry++;
7939 
7940 		if (dwp->dw_mask) {
7941 			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
7942 				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
7943 
7944 				if (dw_count >= dw_limit) {
7945 					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
7946 
7947 					dwp = dwp_start;
7948 					dw_count = 0;
7949 				}
7950 			} else {
7951 				if (dwp->dw_mask & DW_clear_busy) {
7952 					m->vmp_busy = FALSE;
7953 				}
7954 
7955 				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
7956 					PAGE_WAKEUP(m);
7957 				}
7958 			}
7959 		}
7960 	}
7961 	if (dw_count) {
7962 		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
7963 		dwp = dwp_start;
7964 		dw_count = 0;
7965 	}
7966 
7967 	if (fast_path_possible) {
7968 		assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
7969 		assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
7970 
7971 		if (local_queue_count || unwired_count) {
7972 			if (local_queue_count) {
7973 				vm_page_t       first_target;
7974 				vm_page_queue_head_t    *target_queue;
7975 
7976 				if (throttle_page) {
7977 					target_queue = &vm_page_queue_throttled;
7978 				} else {
7979 					if (flags & UPL_COMMIT_INACTIVATE) {
7980 						if (shadow_object->internal) {
7981 							target_queue = &vm_page_queue_anonymous;
7982 						} else {
7983 							target_queue = &vm_page_queue_inactive;
7984 						}
7985 					} else {
7986 						target_queue = &vm_page_queue_active;
7987 					}
7988 				}
7989 				/*
7990 				 * Transfer the entire local queue to a regular LRU page queues.
7991 				 */
7992 				vm_page_lockspin_queues();
7993 
7994 				first_target = (vm_page_t) vm_page_queue_first(target_queue);
7995 
7996 				if (vm_page_queue_empty(target_queue)) {
7997 					target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
7998 				} else {
7999 					first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8000 				}
8001 
8002 				target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
8003 				first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
8004 				last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
8005 
8006 				/*
8007 				 * Adjust the global page counts.
8008 				 */
8009 				if (throttle_page) {
8010 					vm_page_throttled_count += local_queue_count;
8011 				} else {
8012 					if (flags & UPL_COMMIT_INACTIVATE) {
8013 						if (shadow_object->internal) {
8014 							vm_page_anonymous_count += local_queue_count;
8015 						}
8016 						vm_page_inactive_count += local_queue_count;
8017 
8018 						token_new_pagecount += local_queue_count;
8019 					} else {
8020 						vm_page_active_count += local_queue_count;
8021 					}
8022 
8023 					if (shadow_object->internal) {
8024 						vm_page_pageable_internal_count += local_queue_count;
8025 					} else {
8026 						vm_page_pageable_external_count += local_queue_count;
8027 					}
8028 				}
8029 			} else {
8030 				vm_page_lockspin_queues();
8031 			}
8032 			if (unwired_count) {
8033 				vm_page_wire_count -= unwired_count;
8034 				VM_CHECK_MEMORYSTATUS;
8035 			}
8036 			vm_page_unlock_queues();
8037 
8038 			VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
8039 		}
8040 	}
8041 	occupied = 1;
8042 
8043 	if (upl->flags & UPL_DEVICE_MEMORY) {
8044 		occupied = 0;
8045 	} else if (upl->flags & UPL_LITE) {
8046 		int     pg_num;
8047 		int     i;
8048 
8049 		occupied = 0;
8050 
8051 		if (!fast_path_full_commit) {
8052 			pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8053 			pg_num = (pg_num + 31) >> 5;
8054 
8055 			for (i = 0; i < pg_num; i++) {
8056 				if (lite_list[i] != 0) {
8057 					occupied = 1;
8058 					break;
8059 				}
8060 			}
8061 		}
8062 	} else {
8063 		if (vm_page_queue_empty(&upl->map_object->memq)) {
8064 			occupied = 0;
8065 		}
8066 	}
8067 	if (occupied == 0) {
8068 		/*
8069 		 * If this UPL element belongs to a Vector UPL and is
8070 		 * empty, then this is the right function to deallocate
8071 		 * it. So go ahead set the *empty variable. The flag
8072 		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8073 		 * should be considered relevant for the Vector UPL and not
8074 		 * the internal UPLs.
8075 		 */
8076 		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8077 			*empty = TRUE;
8078 		}
8079 
8080 		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8081 			/*
8082 			 * this is not a paging object
8083 			 * so we need to drop the paging reference
8084 			 * that was taken when we created the UPL
8085 			 * against this object
8086 			 */
8087 			vm_object_activity_end(shadow_object);
8088 			vm_object_collapse(shadow_object, 0, TRUE);
8089 		} else {
8090 			/*
8091 			 * we dontated the paging reference to
8092 			 * the map object... vm_pageout_object_terminate
8093 			 * will drop this reference
8094 			 */
8095 		}
8096 	}
8097 	VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
8098 	vm_object_unlock(shadow_object);
8099 	if (object != shadow_object) {
8100 		vm_object_unlock(object);
8101 	}
8102 
8103 	if (!isVectorUPL) {
8104 		upl_unlock(upl);
8105 	} else {
8106 		/*
8107 		 * If we completed our operations on an UPL that is
8108 		 * part of a Vectored UPL and if empty is TRUE, then
8109 		 * we should go ahead and deallocate this UPL element.
8110 		 * Then we check if this was the last of the UPL elements
8111 		 * within that Vectored UPL. If so, set empty to TRUE
8112 		 * so that in ubc_upl_commit_range or ubc_upl_commit, we
8113 		 * can go ahead and deallocate the Vector UPL too.
8114 		 */
8115 		if (*empty == TRUE) {
8116 			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
8117 			upl_deallocate(upl);
8118 		}
8119 		goto process_upl_to_commit;
8120 	}
8121 	if (pgpgout_count) {
8122 		DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
8123 	}
8124 
8125 	kr = KERN_SUCCESS;
8126 done:
8127 	if (dwp_start && dwp_finish_ctx) {
8128 		vm_page_delayed_work_finish_ctx(dwp_start);
8129 		dwp_start = dwp = NULL;
8130 	}
8131 
8132 	return kr;
8133 }
8134 
8135 kern_return_t
upl_abort_range(upl_t upl,upl_offset_t offset,upl_size_t size,int error,boolean_t * empty)8136 upl_abort_range(
8137 	upl_t                   upl,
8138 	upl_offset_t            offset,
8139 	upl_size_t              size,
8140 	int                     error,
8141 	boolean_t               *empty)
8142 {
8143 	upl_page_info_t         *user_page_list = NULL;
8144 	upl_size_t              xfer_size, subupl_size;
8145 	vm_object_t             shadow_object;
8146 	vm_object_t             object;
8147 	vm_object_offset_t      target_offset;
8148 	upl_offset_t            subupl_offset = offset;
8149 	int                     entry;
8150 	wpl_array_t             lite_list;
8151 	int                     occupied;
8152 	struct  vm_page_delayed_work    dw_array;
8153 	struct  vm_page_delayed_work    *dwp, *dwp_start;
8154 	bool                    dwp_finish_ctx = TRUE;
8155 	int                     dw_count;
8156 	int                     dw_limit;
8157 	int                     isVectorUPL = 0;
8158 	upl_t                   vector_upl = NULL;
8159 	vm_object_offset_t      obj_start, obj_end, obj_offset;
8160 	kern_return_t           kr = KERN_SUCCESS;
8161 
8162 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);
8163 
8164 	dwp_start = dwp = NULL;
8165 
8166 	subupl_size = size;
8167 	*empty = FALSE;
8168 
8169 	if (upl == UPL_NULL) {
8170 		return KERN_INVALID_ARGUMENT;
8171 	}
8172 
8173 	if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
8174 		return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
8175 	}
8176 
8177 	dw_count = 0;
8178 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8179 	dwp_start = vm_page_delayed_work_get_ctx();
8180 	if (dwp_start == NULL) {
8181 		dwp_start = &dw_array;
8182 		dw_limit = 1;
8183 		dwp_finish_ctx = FALSE;
8184 	}
8185 
8186 	dwp = dwp_start;
8187 
8188 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
8189 		vector_upl = upl;
8190 		upl_lock(vector_upl);
8191 	} else {
8192 		upl_lock(upl);
8193 	}
8194 
8195 process_upl_to_abort:
8196 	if (isVectorUPL) {
8197 		size = subupl_size;
8198 		offset = subupl_offset;
8199 		if (size == 0) {
8200 			upl_unlock(vector_upl);
8201 			kr = KERN_SUCCESS;
8202 			goto done;
8203 		}
8204 		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
8205 		if (upl == NULL) {
8206 			upl_unlock(vector_upl);
8207 			kr = KERN_FAILURE;
8208 			goto done;
8209 		}
8210 		subupl_size -= size;
8211 		subupl_offset += size;
8212 	}
8213 
8214 	*empty = FALSE;
8215 
8216 #if UPL_DEBUG
8217 	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
8218 		(void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
8219 
8220 		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
8221 		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
8222 		upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
8223 
8224 		upl->upl_commit_index++;
8225 	}
8226 #endif
8227 	if (upl->flags & UPL_DEVICE_MEMORY) {
8228 		xfer_size = 0;
8229 	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
8230 		xfer_size = size;
8231 	} else {
8232 		if (!isVectorUPL) {
8233 			upl_unlock(upl);
8234 		} else {
8235 			upl_unlock(vector_upl);
8236 		}
8237 		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
8238 		kr = KERN_FAILURE;
8239 		goto done;
8240 	}
8241 	if (upl->flags & UPL_INTERNAL) {
8242 		lite_list = (wpl_array_t)
8243 		    ((((uintptr_t)upl) + sizeof(struct upl))
8244 		    + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));
8245 
8246 		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
8247 	} else {
8248 		lite_list = (wpl_array_t)
8249 		    (((uintptr_t)upl) + sizeof(struct upl));
8250 	}
8251 	object = upl->map_object;
8252 
8253 	if (upl->flags & UPL_SHADOWED) {
8254 		vm_object_lock(object);
8255 		shadow_object = object->shadow;
8256 	} else {
8257 		shadow_object = object;
8258 	}
8259 
8260 	entry = offset / PAGE_SIZE;
8261 	target_offset = (vm_object_offset_t)offset;
8262 
8263 	if (upl->flags & UPL_KERNEL_OBJECT) {
8264 		vm_object_lock_shared(shadow_object);
8265 	} else {
8266 		vm_object_lock(shadow_object);
8267 	}
8268 
8269 	if (upl->flags & UPL_ACCESS_BLOCKED) {
8270 		assert(shadow_object->blocked_access);
8271 		shadow_object->blocked_access = FALSE;
8272 		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
8273 	}
8274 
8275 	if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
8276 		panic("upl_abort_range: kernel_object being DUMPED");
8277 	}
8278 
8279 	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
8280 	obj_end = obj_start + xfer_size;
8281 	obj_start = vm_object_trunc_page(obj_start);
8282 	obj_end = vm_object_round_page(obj_end);
8283 	for (obj_offset = obj_start;
8284 	    obj_offset < obj_end;
8285 	    obj_offset += PAGE_SIZE) {
8286 		vm_page_t       t, m;
8287 		unsigned int    pg_num;
8288 		boolean_t       needed;
8289 
8290 		pg_num = (unsigned int) (target_offset / PAGE_SIZE);
8291 		assert(pg_num == target_offset / PAGE_SIZE);
8292 
8293 		needed = FALSE;
8294 
8295 		if (user_page_list) {
8296 			needed = user_page_list[pg_num].needed;
8297 		}
8298 
8299 		dwp->dw_mask = 0;
8300 		m = VM_PAGE_NULL;
8301 
8302 		if (upl->flags & UPL_LITE) {
8303 			if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
8304 				lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
8305 
8306 				if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8307 					m = vm_page_lookup(shadow_object, obj_offset);
8308 				}
8309 			}
8310 		}
8311 		if (upl->flags & UPL_SHADOWED) {
8312 			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
8313 				t->vmp_free_when_done = FALSE;
8314 
8315 				VM_PAGE_FREE(t);
8316 
8317 				if (m == VM_PAGE_NULL) {
8318 					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
8319 				}
8320 			}
8321 		}
8322 		if ((upl->flags & UPL_KERNEL_OBJECT)) {
8323 			goto abort_next_page;
8324 		}
8325 
8326 		if (m != VM_PAGE_NULL) {
8327 			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
8328 
8329 			if (m->vmp_absent) {
8330 				boolean_t must_free = TRUE;
8331 
8332 				/*
8333 				 * COPYOUT = FALSE case
8334 				 * check for error conditions which must
8335 				 * be passed back to the pages customer
8336 				 */
8337 				if (error & UPL_ABORT_RESTART) {
8338 					m->vmp_restart = TRUE;
8339 					m->vmp_absent = FALSE;
8340 					m->vmp_unusual = TRUE;
8341 					must_free = FALSE;
8342 				} else if (error & UPL_ABORT_UNAVAILABLE) {
8343 					m->vmp_restart = FALSE;
8344 					m->vmp_unusual = TRUE;
8345 					must_free = FALSE;
8346 				} else if (error & UPL_ABORT_ERROR) {
8347 					m->vmp_restart = FALSE;
8348 					m->vmp_absent = FALSE;
8349 					m->vmp_error = TRUE;
8350 					m->vmp_unusual = TRUE;
8351 					must_free = FALSE;
8352 				}
8353 				if (m->vmp_clustered && needed == FALSE) {
8354 					/*
8355 					 * This page was a part of a speculative
8356 					 * read-ahead initiated by the kernel
8357 					 * itself.  No one is expecting this
8358 					 * page and no one will clean up its
8359 					 * error state if it ever becomes valid
8360 					 * in the future.
8361 					 * We have to free it here.
8362 					 */
8363 					must_free = TRUE;
8364 				}
8365 				m->vmp_cleaning = FALSE;
8366 
8367 				if (m->vmp_overwriting && !m->vmp_busy) {
8368 					/*
8369 					 * this shouldn't happen since
8370 					 * this is an 'absent' page, but
8371 					 * it doesn't hurt to check for
8372 					 * the 'alternate' method of
8373 					 * stabilizing the page...
8374 					 * we will mark 'busy' to be cleared
8375 					 * in the following code which will
8376 					 * take care of the primary stabilzation
8377 					 * method (i.e. setting 'busy' to TRUE)
8378 					 */
8379 					dwp->dw_mask |= DW_vm_page_unwire;
8380 				}
8381 				m->vmp_overwriting = FALSE;
8382 
8383 				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
8384 
8385 				if (must_free == TRUE) {
8386 					dwp->dw_mask |= DW_vm_page_free;
8387 				} else {
8388 					dwp->dw_mask |= DW_vm_page_activate;
8389 				}
8390 			} else {
8391 				/*
8392 				 * Handle the trusted pager throttle.
8393 				 */
8394 				if (m->vmp_laundry) {
8395 					dwp->dw_mask |= DW_vm_pageout_throttle_up;
8396 				}
8397 
8398 				if (upl->flags & UPL_ACCESS_BLOCKED) {
8399 					/*
8400 					 * We blocked access to the pages in this UPL.
8401 					 * Clear the "busy" bit and wake up any waiter
8402 					 * for this page.
8403 					 */
8404 					dwp->dw_mask |= DW_clear_busy;
8405 				}
8406 				if (m->vmp_overwriting) {
8407 					if (m->vmp_busy) {
8408 						dwp->dw_mask |= DW_clear_busy;
8409 					} else {
8410 						/*
8411 						 * deal with the 'alternate' method
8412 						 * of stabilizing the page...
8413 						 * we will either free the page
8414 						 * or mark 'busy' to be cleared
8415 						 * in the following code which will
8416 						 * take care of the primary stabilzation
8417 						 * method (i.e. setting 'busy' to TRUE)
8418 						 */
8419 						dwp->dw_mask |= DW_vm_page_unwire;
8420 					}
8421 					m->vmp_overwriting = FALSE;
8422 				}
8423 				m->vmp_free_when_done = FALSE;
8424 				m->vmp_cleaning = FALSE;
8425 
8426 				if (error & UPL_ABORT_DUMP_PAGES) {
8427 					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8428 
8429 					dwp->dw_mask |= DW_vm_page_free;
8430 				} else {
8431 					if (!(dwp->dw_mask & DW_vm_page_unwire)) {
8432 						if (error & UPL_ABORT_REFERENCE) {
8433 							/*
8434 							 * we've been told to explictly
8435 							 * reference this page... for
8436 							 * file I/O, this is done by
8437 							 * implementing an LRU on the inactive q
8438 							 */
8439 							dwp->dw_mask |= DW_vm_page_lru;
8440 						} else if (!VM_PAGE_PAGEABLE(m)) {
8441 							dwp->dw_mask |= DW_vm_page_deactivate_internal;
8442 						}
8443 					}
8444 					dwp->dw_mask |= DW_PAGE_WAKEUP;
8445 				}
8446 			}
8447 		}
8448 abort_next_page:
8449 		target_offset += PAGE_SIZE_64;
8450 		xfer_size -= PAGE_SIZE;
8451 		entry++;
8452 
8453 		if (dwp->dw_mask) {
8454 			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8455 				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8456 
8457 				if (dw_count >= dw_limit) {
8458 					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8459 
8460 					dwp = dwp_start;
8461 					dw_count = 0;
8462 				}
8463 			} else {
8464 				if (dwp->dw_mask & DW_clear_busy) {
8465 					m->vmp_busy = FALSE;
8466 				}
8467 
8468 				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8469 					PAGE_WAKEUP(m);
8470 				}
8471 			}
8472 		}
8473 	}
8474 	if (dw_count) {
8475 		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8476 		dwp = dwp_start;
8477 		dw_count = 0;
8478 	}
8479 
8480 	occupied = 1;
8481 
8482 	if (upl->flags & UPL_DEVICE_MEMORY) {
8483 		occupied = 0;
8484 	} else if (upl->flags & UPL_LITE) {
8485 		int     pg_num;
8486 		int     i;
8487 
8488 		pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8489 		pg_num = (pg_num + 31) >> 5;
8490 		occupied = 0;
8491 
8492 		for (i = 0; i < pg_num; i++) {
8493 			if (lite_list[i] != 0) {
8494 				occupied = 1;
8495 				break;
8496 			}
8497 		}
8498 	} else {
8499 		if (vm_page_queue_empty(&upl->map_object->memq)) {
8500 			occupied = 0;
8501 		}
8502 	}
8503 	if (occupied == 0) {
8504 		/*
8505 		 * If this UPL element belongs to a Vector UPL and is
8506 		 * empty, then this is the right function to deallocate
8507 		 * it. So go ahead set the *empty variable. The flag
8508 		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8509 		 * should be considered relevant for the Vector UPL and
8510 		 * not the internal UPLs.
8511 		 */
8512 		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8513 			*empty = TRUE;
8514 		}
8515 
8516 		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8517 			/*
8518 			 * this is not a paging object
8519 			 * so we need to drop the paging reference
8520 			 * that was taken when we created the UPL
8521 			 * against this object
8522 			 */
8523 			vm_object_activity_end(shadow_object);
8524 			vm_object_collapse(shadow_object, 0, TRUE);
8525 		} else {
8526 			/*
8527 			 * we dontated the paging reference to
8528 			 * the map object... vm_pageout_object_terminate
8529 			 * will drop this reference
8530 			 */
8531 		}
8532 	}
8533 	vm_object_unlock(shadow_object);
8534 	if (object != shadow_object) {
8535 		vm_object_unlock(object);
8536 	}
8537 
8538 	if (!isVectorUPL) {
8539 		upl_unlock(upl);
8540 	} else {
8541 		/*
8542 		 * If we completed our operations on an UPL that is
8543 		 * part of a Vectored UPL and if empty is TRUE, then
8544 		 * we should go ahead and deallocate this UPL element.
8545 		 * Then we check if this was the last of the UPL elements
8546 		 * within that Vectored UPL. If so, set empty to TRUE
8547 		 * so that in ubc_upl_abort_range or ubc_upl_abort, we
8548 		 * can go ahead and deallocate the Vector UPL too.
8549 		 */
8550 		if (*empty == TRUE) {
8551 			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
8552 			upl_deallocate(upl);
8553 		}
8554 		goto process_upl_to_abort;
8555 	}
8556 
8557 	kr = KERN_SUCCESS;
8558 
8559 done:
8560 	if (dwp_start && dwp_finish_ctx) {
8561 		vm_page_delayed_work_finish_ctx(dwp_start);
8562 		dwp_start = dwp = NULL;
8563 	}
8564 
8565 	return kr;
8566 }
8567 
8568 
8569 kern_return_t
upl_abort(upl_t upl,int error)8570 upl_abort(
8571 	upl_t   upl,
8572 	int     error)
8573 {
8574 	boolean_t       empty;
8575 
8576 	if (upl == UPL_NULL) {
8577 		return KERN_INVALID_ARGUMENT;
8578 	}
8579 
8580 	return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8581 }
8582 
8583 
8584 /* an option on commit should be wire */
8585 kern_return_t
upl_commit(upl_t upl,upl_page_info_t * page_list,mach_msg_type_number_t count)8586 upl_commit(
8587 	upl_t                   upl,
8588 	upl_page_info_t         *page_list,
8589 	mach_msg_type_number_t  count)
8590 {
8591 	boolean_t       empty;
8592 
8593 	if (upl == UPL_NULL) {
8594 		return KERN_INVALID_ARGUMENT;
8595 	}
8596 
8597 	return upl_commit_range(upl, 0, upl->u_size, 0,
8598 	           page_list, count, &empty);
8599 }
8600 
8601 
8602 void
iopl_valid_data(upl_t upl,vm_tag_t tag)8603 iopl_valid_data(
8604 	upl_t    upl,
8605 	vm_tag_t tag)
8606 {
8607 	vm_object_t     object;
8608 	vm_offset_t     offset;
8609 	vm_page_t       m, nxt_page = VM_PAGE_NULL;
8610 	upl_size_t      size;
8611 	int             wired_count = 0;
8612 
8613 	if (upl == NULL) {
8614 		panic("iopl_valid_data: NULL upl");
8615 	}
8616 	if (vector_upl_is_valid(upl)) {
8617 		panic("iopl_valid_data: vector upl");
8618 	}
8619 	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
8620 		panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
8621 	}
8622 
8623 	object = upl->map_object;
8624 
8625 	if (object == kernel_object || object == compressor_object) {
8626 		panic("iopl_valid_data: object == kernel or compressor");
8627 	}
8628 
8629 	if (object->purgable == VM_PURGABLE_VOLATILE ||
8630 	    object->purgable == VM_PURGABLE_EMPTY) {
8631 		panic("iopl_valid_data: object %p purgable %d",
8632 		    object, object->purgable);
8633 	}
8634 
8635 	size = upl_adjusted_size(upl, PAGE_MASK);
8636 
8637 	vm_object_lock(object);
8638 	VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
8639 
8640 	bool whole_object;
8641 
8642 	if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
8643 		nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
8644 		whole_object = true;
8645 	} else {
8646 		offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
8647 		whole_object = false;
8648 	}
8649 
8650 	while (size) {
8651 		if (whole_object) {
8652 			if (nxt_page != VM_PAGE_NULL) {
8653 				m = nxt_page;
8654 				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
8655 			}
8656 		} else {
8657 			m = vm_page_lookup(object, offset);
8658 			offset += PAGE_SIZE;
8659 
8660 			if (m == VM_PAGE_NULL) {
8661 				panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
8662 			}
8663 		}
8664 		if (m->vmp_busy) {
8665 			if (!m->vmp_absent) {
8666 				panic("iopl_valid_data: busy page w/o absent");
8667 			}
8668 
8669 			if (m->vmp_pageq.next || m->vmp_pageq.prev) {
8670 				panic("iopl_valid_data: busy+absent page on page queue");
8671 			}
8672 			if (m->vmp_reusable) {
8673 				panic("iopl_valid_data: %p is reusable", m);
8674 			}
8675 
8676 			m->vmp_absent = FALSE;
8677 			m->vmp_dirty = TRUE;
8678 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
8679 			assert(m->vmp_wire_count == 0);
8680 			m->vmp_wire_count++;
8681 			assert(m->vmp_wire_count);
8682 			if (m->vmp_wire_count == 1) {
8683 				m->vmp_q_state = VM_PAGE_IS_WIRED;
8684 				wired_count++;
8685 			} else {
8686 				panic("iopl_valid_data: %p already wired", m);
8687 			}
8688 
8689 			PAGE_WAKEUP_DONE(m);
8690 		}
8691 		size -= PAGE_SIZE;
8692 	}
8693 	if (wired_count) {
8694 		VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
8695 		assert(object->resident_page_count >= object->wired_page_count);
8696 
8697 		/* no need to adjust purgeable accounting for this object: */
8698 		assert(object->purgable != VM_PURGABLE_VOLATILE);
8699 		assert(object->purgable != VM_PURGABLE_EMPTY);
8700 
8701 		vm_page_lockspin_queues();
8702 		vm_page_wire_count += wired_count;
8703 		vm_page_unlock_queues();
8704 	}
8705 	VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
8706 	vm_object_unlock(object);
8707 }
8708 
8709 
8710 void
vm_object_set_pmap_cache_attr(vm_object_t object,upl_page_info_array_t user_page_list,unsigned int num_pages,boolean_t batch_pmap_op)8711 vm_object_set_pmap_cache_attr(
8712 	vm_object_t             object,
8713 	upl_page_info_array_t   user_page_list,
8714 	unsigned int            num_pages,
8715 	boolean_t               batch_pmap_op)
8716 {
8717 	unsigned int    cache_attr = 0;
8718 
8719 	cache_attr = object->wimg_bits & VM_WIMG_MASK;
8720 	assert(user_page_list);
8721 	if (cache_attr != VM_WIMG_USE_DEFAULT) {
8722 		PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8723 	}
8724 }
8725 
8726 
8727 boolean_t       vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t);
8728 kern_return_t   vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t, vm_object_offset_t *, int, int*);
8729 
8730 
8731 
8732 boolean_t
vm_object_iopl_wire_full(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,wpl_array_t lite_list,upl_control_flags_t cntrl_flags,vm_tag_t tag)8733 vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8734     wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag)
8735 {
8736 	vm_page_t       dst_page;
8737 	unsigned int    entry;
8738 	int             page_count;
8739 	int             delayed_unlock = 0;
8740 	boolean_t       retval = TRUE;
8741 	ppnum_t         phys_page;
8742 
8743 	vm_object_lock_assert_exclusive(object);
8744 	assert(object->purgable != VM_PURGABLE_VOLATILE);
8745 	assert(object->purgable != VM_PURGABLE_EMPTY);
8746 	assert(object->pager == NULL);
8747 	assert(object->copy == NULL);
8748 	assert(object->shadow == NULL);
8749 
8750 	page_count = object->resident_page_count;
8751 	dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
8752 
8753 	vm_page_lock_queues();
8754 
8755 	while (page_count--) {
8756 		if (dst_page->vmp_busy ||
8757 		    dst_page->vmp_fictitious ||
8758 		    dst_page->vmp_absent ||
8759 		    dst_page->vmp_error ||
8760 		    dst_page->vmp_cleaning ||
8761 		    dst_page->vmp_restart ||
8762 		    dst_page->vmp_laundry) {
8763 			retval = FALSE;
8764 			goto done;
8765 		}
8766 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8767 			retval = FALSE;
8768 			goto done;
8769 		}
8770 		dst_page->vmp_reference = TRUE;
8771 
8772 		vm_page_wire(dst_page, tag, FALSE);
8773 
8774 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8775 			SET_PAGE_DIRTY(dst_page, FALSE);
8776 		}
8777 		entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
8778 		assert(entry >= 0 && entry < object->resident_page_count);
8779 		lite_list[entry >> 5] |= 1U << (entry & 31);
8780 
8781 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8782 
8783 		if (phys_page > upl->highest_page) {
8784 			upl->highest_page = phys_page;
8785 		}
8786 
8787 		if (user_page_list) {
8788 			user_page_list[entry].phys_addr = phys_page;
8789 			user_page_list[entry].absent    = dst_page->vmp_absent;
8790 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
8791 			user_page_list[entry].free_when_done   = dst_page->vmp_free_when_done;
8792 			user_page_list[entry].precious  = dst_page->vmp_precious;
8793 			user_page_list[entry].device    = FALSE;
8794 			user_page_list[entry].speculative = FALSE;
8795 			user_page_list[entry].cs_validated = FALSE;
8796 			user_page_list[entry].cs_tainted = FALSE;
8797 			user_page_list[entry].cs_nx     = FALSE;
8798 			user_page_list[entry].needed    = FALSE;
8799 			user_page_list[entry].mark      = FALSE;
8800 		}
8801 		if (delayed_unlock++ > 256) {
8802 			delayed_unlock = 0;
8803 			lck_mtx_yield(&vm_page_queue_lock);
8804 
8805 			VM_CHECK_MEMORYSTATUS;
8806 		}
8807 		dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
8808 	}
8809 done:
8810 	vm_page_unlock_queues();
8811 
8812 	VM_CHECK_MEMORYSTATUS;
8813 
8814 	return retval;
8815 }
8816 
8817 
8818 kern_return_t
vm_object_iopl_wire_empty(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,wpl_array_t lite_list,upl_control_flags_t cntrl_flags,vm_tag_t tag,vm_object_offset_t * dst_offset,int page_count,int * page_grab_count)8819 vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8820     wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag, vm_object_offset_t *dst_offset,
8821     int page_count, int* page_grab_count)
8822 {
8823 	vm_page_t       dst_page;
8824 	boolean_t       no_zero_fill = FALSE;
8825 	int             interruptible;
8826 	int             pages_wired = 0;
8827 	int             pages_inserted = 0;
8828 	int             entry = 0;
8829 	uint64_t        delayed_ledger_update = 0;
8830 	kern_return_t   ret = KERN_SUCCESS;
8831 	int             grab_options;
8832 	ppnum_t         phys_page;
8833 
8834 	vm_object_lock_assert_exclusive(object);
8835 	assert(object->purgable != VM_PURGABLE_VOLATILE);
8836 	assert(object->purgable != VM_PURGABLE_EMPTY);
8837 	assert(object->pager == NULL);
8838 	assert(object->copy == NULL);
8839 	assert(object->shadow == NULL);
8840 
8841 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
8842 		interruptible = THREAD_ABORTSAFE;
8843 	} else {
8844 		interruptible = THREAD_UNINT;
8845 	}
8846 
8847 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8848 		no_zero_fill = TRUE;
8849 	}
8850 
8851 	grab_options = 0;
8852 #if CONFIG_SECLUDED_MEMORY
8853 	if (object->can_grab_secluded) {
8854 		grab_options |= VM_PAGE_GRAB_SECLUDED;
8855 	}
8856 #endif /* CONFIG_SECLUDED_MEMORY */
8857 
8858 	while (page_count--) {
8859 		while ((dst_page = vm_page_grab_options(grab_options))
8860 		    == VM_PAGE_NULL) {
8861 			OSAddAtomic(page_count, &vm_upl_wait_for_pages);
8862 
8863 			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8864 
8865 			if (vm_page_wait(interruptible) == FALSE) {
8866 				/*
8867 				 * interrupted case
8868 				 */
8869 				OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8870 
8871 				VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8872 
8873 				ret = MACH_SEND_INTERRUPTED;
8874 				goto done;
8875 			}
8876 			OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8877 
8878 			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8879 		}
8880 		if (no_zero_fill == FALSE) {
8881 			vm_page_zero_fill(dst_page);
8882 		} else {
8883 			dst_page->vmp_absent = TRUE;
8884 		}
8885 
8886 		dst_page->vmp_reference = TRUE;
8887 
8888 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8889 			SET_PAGE_DIRTY(dst_page, FALSE);
8890 		}
8891 		if (dst_page->vmp_absent == FALSE) {
8892 			assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
8893 			assert(dst_page->vmp_wire_count == 0);
8894 			dst_page->vmp_wire_count++;
8895 			dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
8896 			assert(dst_page->vmp_wire_count);
8897 			pages_wired++;
8898 			PAGE_WAKEUP_DONE(dst_page);
8899 		}
8900 		pages_inserted++;
8901 
8902 		vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
8903 
8904 		lite_list[entry >> 5] |= 1U << (entry & 31);
8905 
8906 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8907 
8908 		if (phys_page > upl->highest_page) {
8909 			upl->highest_page = phys_page;
8910 		}
8911 
8912 		if (user_page_list) {
8913 			user_page_list[entry].phys_addr = phys_page;
8914 			user_page_list[entry].absent    = dst_page->vmp_absent;
8915 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
8916 			user_page_list[entry].free_when_done    = FALSE;
8917 			user_page_list[entry].precious  = FALSE;
8918 			user_page_list[entry].device    = FALSE;
8919 			user_page_list[entry].speculative = FALSE;
8920 			user_page_list[entry].cs_validated = FALSE;
8921 			user_page_list[entry].cs_tainted = FALSE;
8922 			user_page_list[entry].cs_nx     = FALSE;
8923 			user_page_list[entry].needed    = FALSE;
8924 			user_page_list[entry].mark      = FALSE;
8925 		}
8926 		entry++;
8927 		*dst_offset += PAGE_SIZE_64;
8928 	}
8929 done:
8930 	if (pages_wired) {
8931 		vm_page_lockspin_queues();
8932 		vm_page_wire_count += pages_wired;
8933 		vm_page_unlock_queues();
8934 	}
8935 	if (pages_inserted) {
8936 		if (object->internal) {
8937 			OSAddAtomic(pages_inserted, &vm_page_internal_count);
8938 		} else {
8939 			OSAddAtomic(pages_inserted, &vm_page_external_count);
8940 		}
8941 	}
8942 	if (delayed_ledger_update) {
8943 		task_t          owner;
8944 		int             ledger_idx_volatile;
8945 		int             ledger_idx_nonvolatile;
8946 		int             ledger_idx_volatile_compressed;
8947 		int             ledger_idx_nonvolatile_compressed;
8948 		boolean_t       do_footprint;
8949 
8950 		owner = VM_OBJECT_OWNER(object);
8951 		assert(owner);
8952 
8953 		vm_object_ledger_tag_ledgers(object,
8954 		    &ledger_idx_volatile,
8955 		    &ledger_idx_nonvolatile,
8956 		    &ledger_idx_volatile_compressed,
8957 		    &ledger_idx_nonvolatile_compressed,
8958 		    &do_footprint);
8959 
8960 		/* more non-volatile bytes */
8961 		ledger_credit(owner->ledger,
8962 		    ledger_idx_nonvolatile,
8963 		    delayed_ledger_update);
8964 		if (do_footprint) {
8965 			/* more footprint */
8966 			ledger_credit(owner->ledger,
8967 			    task_ledgers.phys_footprint,
8968 			    delayed_ledger_update);
8969 		}
8970 	}
8971 
8972 	assert(page_grab_count);
8973 	*page_grab_count = pages_inserted;
8974 
8975 	return ret;
8976 }
8977 
8978 
8979 
8980 kern_return_t
vm_object_iopl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)8981 vm_object_iopl_request(
8982 	vm_object_t             object,
8983 	vm_object_offset_t      offset,
8984 	upl_size_t              size,
8985 	upl_t                   *upl_ptr,
8986 	upl_page_info_array_t   user_page_list,
8987 	unsigned int            *page_list_count,
8988 	upl_control_flags_t     cntrl_flags,
8989 	vm_tag_t                tag)
8990 {
8991 	vm_page_t               dst_page;
8992 	vm_object_offset_t      dst_offset;
8993 	upl_size_t              xfer_size;
8994 	upl_t                   upl = NULL;
8995 	unsigned int            entry;
8996 	wpl_array_t             lite_list = NULL;
8997 	int                     no_zero_fill = FALSE;
8998 	unsigned int            size_in_pages;
8999 	int                     page_grab_count = 0;
9000 	u_int32_t               psize;
9001 	kern_return_t           ret;
9002 	vm_prot_t               prot;
9003 	struct vm_object_fault_info fault_info = {};
9004 	struct  vm_page_delayed_work    dw_array;
9005 	struct  vm_page_delayed_work    *dwp, *dwp_start;
9006 	bool                    dwp_finish_ctx = TRUE;
9007 	int                     dw_count;
9008 	int                     dw_limit;
9009 	int                     dw_index;
9010 	boolean_t               caller_lookup;
9011 	int                     io_tracking_flag = 0;
9012 	int                     interruptible;
9013 	ppnum_t                 phys_page;
9014 
9015 	boolean_t               set_cache_attr_needed = FALSE;
9016 	boolean_t               free_wired_pages = FALSE;
9017 	boolean_t               fast_path_empty_req = FALSE;
9018 	boolean_t               fast_path_full_req = FALSE;
9019 
9020 #if DEVELOPMENT || DEBUG
9021 	task_t                  task = current_task();
9022 #endif /* DEVELOPMENT || DEBUG */
9023 
9024 	dwp_start = dwp = NULL;
9025 
9026 	vm_object_offset_t original_offset = offset;
9027 	upl_size_t original_size = size;
9028 
9029 //	DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
9030 
9031 	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
9032 	offset = vm_object_trunc_page(offset);
9033 	if (size != original_size || offset != original_offset) {
9034 		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
9035 	}
9036 
9037 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
9038 		/*
9039 		 * For forward compatibility's sake,
9040 		 * reject any unknown flag.
9041 		 */
9042 		return KERN_INVALID_VALUE;
9043 	}
9044 	if (vm_lopage_needed == FALSE) {
9045 		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
9046 	}
9047 
9048 	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
9049 		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
9050 			return KERN_INVALID_VALUE;
9051 		}
9052 
9053 		if (object->phys_contiguous) {
9054 			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
9055 				return KERN_INVALID_ADDRESS;
9056 			}
9057 
9058 			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
9059 				return KERN_INVALID_ADDRESS;
9060 			}
9061 		}
9062 	}
9063 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9064 		no_zero_fill = TRUE;
9065 	}
9066 
9067 	if (cntrl_flags & UPL_COPYOUT_FROM) {
9068 		prot = VM_PROT_READ;
9069 	} else {
9070 		prot = VM_PROT_READ | VM_PROT_WRITE;
9071 	}
9072 
9073 	if ((!object->internal) && (object->paging_offset != 0)) {
9074 		panic("vm_object_iopl_request: external object with non-zero paging offset");
9075 	}
9076 
9077 
9078 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
9079 
9080 #if CONFIG_IOSCHED || UPL_DEBUG
9081 	if ((object->io_tracking && object != kernel_object) || upl_debug_enabled) {
9082 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
9083 	}
9084 #endif
9085 
9086 #if CONFIG_IOSCHED
9087 	if (object->io_tracking) {
9088 		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
9089 		if (object != kernel_object) {
9090 			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
9091 		}
9092 	}
9093 #endif
9094 
9095 	if (object->phys_contiguous) {
9096 		psize = PAGE_SIZE;
9097 	} else {
9098 		psize = size;
9099 
9100 		dw_count = 0;
9101 		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
9102 		dwp_start = vm_page_delayed_work_get_ctx();
9103 		if (dwp_start == NULL) {
9104 			dwp_start = &dw_array;
9105 			dw_limit = 1;
9106 			dwp_finish_ctx = FALSE;
9107 		}
9108 
9109 		dwp = dwp_start;
9110 	}
9111 
9112 	if (cntrl_flags & UPL_SET_INTERNAL) {
9113 		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9114 
9115 		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
9116 		lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
9117 		    ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
9118 		if (size == 0) {
9119 			user_page_list = NULL;
9120 			lite_list = NULL;
9121 		}
9122 	} else {
9123 		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9124 
9125 		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
9126 		if (size == 0) {
9127 			lite_list = NULL;
9128 		}
9129 	}
9130 	if (user_page_list) {
9131 		user_page_list[0].device = FALSE;
9132 	}
9133 	*upl_ptr = upl;
9134 
9135 	if (cntrl_flags & UPL_NOZEROFILLIO) {
9136 		DTRACE_VM4(upl_nozerofillio,
9137 		    vm_object_t, object,
9138 		    vm_object_offset_t, offset,
9139 		    upl_size_t, size,
9140 		    upl_t, upl);
9141 	}
9142 
9143 	upl->map_object = object;
9144 	upl->u_offset = original_offset;
9145 	upl->u_size = original_size;
9146 
9147 	size_in_pages = size / PAGE_SIZE;
9148 
9149 	if (object == kernel_object &&
9150 	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
9151 		upl->flags |= UPL_KERNEL_OBJECT;
9152 #if UPL_DEBUG
9153 		vm_object_lock(object);
9154 #else
9155 		vm_object_lock_shared(object);
9156 #endif
9157 	} else {
9158 		vm_object_lock(object);
9159 		vm_object_activity_begin(object);
9160 	}
9161 	/*
9162 	 * paging in progress also protects the paging_offset
9163 	 */
9164 	upl->u_offset = original_offset + object->paging_offset;
9165 
9166 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
9167 		/*
9168 		 * The user requested that access to the pages in this UPL
9169 		 * be blocked until the UPL is commited or aborted.
9170 		 */
9171 		upl->flags |= UPL_ACCESS_BLOCKED;
9172 	}
9173 
9174 #if CONFIG_IOSCHED || UPL_DEBUG
9175 	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9176 		vm_object_activity_begin(object);
9177 		queue_enter(&object->uplq, upl, upl_t, uplq);
9178 	}
9179 #endif
9180 
9181 	if (object->phys_contiguous) {
9182 		if (upl->flags & UPL_ACCESS_BLOCKED) {
9183 			assert(!object->blocked_access);
9184 			object->blocked_access = TRUE;
9185 		}
9186 
9187 		vm_object_unlock(object);
9188 
9189 		/*
9190 		 * don't need any shadow mappings for this one
9191 		 * since it is already I/O memory
9192 		 */
9193 		upl->flags |= UPL_DEVICE_MEMORY;
9194 
9195 		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
9196 
9197 		if (user_page_list) {
9198 			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
9199 			user_page_list[0].device = TRUE;
9200 		}
9201 		if (page_list_count != NULL) {
9202 			if (upl->flags & UPL_INTERNAL) {
9203 				*page_list_count = 0;
9204 			} else {
9205 				*page_list_count = 1;
9206 			}
9207 		}
9208 
9209 		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9210 #if DEVELOPMENT || DEBUG
9211 		if (task != NULL) {
9212 			ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9213 		}
9214 #endif /* DEVELOPMENT || DEBUG */
9215 		return KERN_SUCCESS;
9216 	}
9217 	if (object != kernel_object && object != compressor_object) {
9218 		/*
9219 		 * Protect user space from future COW operations
9220 		 */
9221 #if VM_OBJECT_TRACKING_OP_TRUESHARE
9222 		if (!object->true_share &&
9223 		    vm_object_tracking_btlog) {
9224 			btlog_record(vm_object_tracking_btlog, object,
9225 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
9226 			    btref_get(__builtin_frame_address(0), 0));
9227 		}
9228 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
9229 
9230 		vm_object_lock_assert_exclusive(object);
9231 		object->true_share = TRUE;
9232 
9233 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
9234 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
9235 		}
9236 	}
9237 
9238 	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
9239 	    object->copy != VM_OBJECT_NULL) {
9240 		/*
9241 		 * Honor copy-on-write obligations
9242 		 *
9243 		 * The caller is gathering these pages and
9244 		 * might modify their contents.  We need to
9245 		 * make sure that the copy object has its own
9246 		 * private copies of these pages before we let
9247 		 * the caller modify them.
9248 		 *
9249 		 * NOTE: someone else could map the original object
9250 		 * after we've done this copy-on-write here, and they
9251 		 * could then see an inconsistent picture of the memory
9252 		 * while it's being modified via the UPL.  To prevent this,
9253 		 * we would have to block access to these pages until the
9254 		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
9255 		 * code path for that...
9256 		 */
9257 		vm_object_update(object,
9258 		    offset,
9259 		    size,
9260 		    NULL,
9261 		    NULL,
9262 		    FALSE,              /* should_return */
9263 		    MEMORY_OBJECT_COPY_SYNC,
9264 		    VM_PROT_NO_CHANGE);
9265 		VM_PAGEOUT_DEBUG(iopl_cow, 1);
9266 		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
9267 	}
9268 	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
9269 	    object->purgable != VM_PURGABLE_VOLATILE &&
9270 	    object->purgable != VM_PURGABLE_EMPTY &&
9271 	    object->copy == NULL &&
9272 	    size == object->vo_size &&
9273 	    offset == 0 &&
9274 	    object->shadow == NULL &&
9275 	    object->pager == NULL) {
9276 		if (object->resident_page_count == size_in_pages) {
9277 			assert(object != compressor_object);
9278 			assert(object != kernel_object);
9279 			fast_path_full_req = TRUE;
9280 		} else if (object->resident_page_count == 0) {
9281 			assert(object != compressor_object);
9282 			assert(object != kernel_object);
9283 			fast_path_empty_req = TRUE;
9284 			set_cache_attr_needed = TRUE;
9285 		}
9286 	}
9287 
9288 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9289 		interruptible = THREAD_ABORTSAFE;
9290 	} else {
9291 		interruptible = THREAD_UNINT;
9292 	}
9293 
9294 	entry = 0;
9295 
9296 	xfer_size = size;
9297 	dst_offset = offset;
9298 
9299 	if (fast_path_full_req) {
9300 		if (vm_object_iopl_wire_full(object, upl, user_page_list, lite_list, cntrl_flags, tag) == TRUE) {
9301 			goto finish;
9302 		}
9303 		/*
9304 		 * we couldn't complete the processing of this request on the fast path
9305 		 * so fall through to the slow path and finish up
9306 		 */
9307 	} else if (fast_path_empty_req) {
9308 		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9309 			ret = KERN_MEMORY_ERROR;
9310 			goto return_err;
9311 		}
9312 		ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
9313 
9314 		if (ret) {
9315 			free_wired_pages = TRUE;
9316 			goto return_err;
9317 		}
9318 		goto finish;
9319 	}
9320 
9321 	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
9322 	fault_info.lo_offset = offset;
9323 	fault_info.hi_offset = offset + xfer_size;
9324 	fault_info.mark_zf_absent = TRUE;
9325 	fault_info.interruptible = interruptible;
9326 	fault_info.batch_pmap_op = TRUE;
9327 
9328 	while (xfer_size) {
9329 		vm_fault_return_t       result;
9330 
9331 		dwp->dw_mask = 0;
9332 
9333 		if (fast_path_full_req) {
9334 			/*
9335 			 * if we get here, it means that we ran into a page
9336 			 * state we couldn't handle in the fast path and
9337 			 * bailed out to the slow path... since the order
9338 			 * we look at pages is different between the 2 paths,
9339 			 * the following check is needed to determine whether
9340 			 * this page was already processed in the fast path
9341 			 */
9342 			if (lite_list[entry >> 5] & (1 << (entry & 31))) {
9343 				goto skip_page;
9344 			}
9345 		}
9346 		dst_page = vm_page_lookup(object, dst_offset);
9347 
9348 		if (dst_page == VM_PAGE_NULL ||
9349 		    dst_page->vmp_busy ||
9350 		    dst_page->vmp_error ||
9351 		    dst_page->vmp_restart ||
9352 		    dst_page->vmp_absent ||
9353 		    dst_page->vmp_fictitious) {
9354 			if (object == kernel_object) {
9355 				panic("vm_object_iopl_request: missing/bad page in kernel object");
9356 			}
9357 			if (object == compressor_object) {
9358 				panic("vm_object_iopl_request: missing/bad page in compressor object");
9359 			}
9360 
9361 			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9362 				ret = KERN_MEMORY_ERROR;
9363 				goto return_err;
9364 			}
9365 			set_cache_attr_needed = TRUE;
9366 
9367 			/*
9368 			 * We just looked up the page and the result remains valid
9369 			 * until the object lock is release, so send it to
9370 			 * vm_fault_page() (as "dst_page"), to avoid having to
9371 			 * look it up again there.
9372 			 */
9373 			caller_lookup = TRUE;
9374 
9375 			do {
9376 				vm_page_t       top_page;
9377 				kern_return_t   error_code;
9378 
9379 				fault_info.cluster_size = xfer_size;
9380 
9381 				vm_object_paging_begin(object);
9382 
9383 				result = vm_fault_page(object, dst_offset,
9384 				    prot | VM_PROT_WRITE, FALSE,
9385 				    caller_lookup,
9386 				    &prot, &dst_page, &top_page,
9387 				    (int *)0,
9388 				    &error_code, no_zero_fill,
9389 				    FALSE, &fault_info);
9390 
9391 				/* our lookup is no longer valid at this point */
9392 				caller_lookup = FALSE;
9393 
9394 				switch (result) {
9395 				case VM_FAULT_SUCCESS:
9396 					page_grab_count++;
9397 
9398 					if (!dst_page->vmp_absent) {
9399 						PAGE_WAKEUP_DONE(dst_page);
9400 					} else {
9401 						/*
9402 						 * we only get back an absent page if we
9403 						 * requested that it not be zero-filled
9404 						 * because we are about to fill it via I/O
9405 						 *
9406 						 * absent pages should be left BUSY
9407 						 * to prevent them from being faulted
9408 						 * into an address space before we've
9409 						 * had a chance to complete the I/O on
9410 						 * them since they may contain info that
9411 						 * shouldn't be seen by the faulting task
9412 						 */
9413 					}
9414 					/*
9415 					 *	Release paging references and
9416 					 *	top-level placeholder page, if any.
9417 					 */
9418 					if (top_page != VM_PAGE_NULL) {
9419 						vm_object_t local_object;
9420 
9421 						local_object = VM_PAGE_OBJECT(top_page);
9422 
9423 						/*
9424 						 * comparing 2 packed pointers
9425 						 */
9426 						if (top_page->vmp_object != dst_page->vmp_object) {
9427 							vm_object_lock(local_object);
9428 							VM_PAGE_FREE(top_page);
9429 							vm_object_paging_end(local_object);
9430 							vm_object_unlock(local_object);
9431 						} else {
9432 							VM_PAGE_FREE(top_page);
9433 							vm_object_paging_end(local_object);
9434 						}
9435 					}
9436 					vm_object_paging_end(object);
9437 					break;
9438 
9439 				case VM_FAULT_RETRY:
9440 					vm_object_lock(object);
9441 					break;
9442 
9443 				case VM_FAULT_MEMORY_SHORTAGE:
9444 					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
9445 
9446 					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9447 
9448 					if (vm_page_wait(interruptible)) {
9449 						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9450 
9451 						VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9452 						vm_object_lock(object);
9453 
9454 						break;
9455 					}
9456 					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9457 
9458 					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9459 
9460 					OS_FALLTHROUGH;
9461 
9462 				case VM_FAULT_INTERRUPTED:
9463 					error_code = MACH_SEND_INTERRUPTED;
9464 					OS_FALLTHROUGH;
9465 				case VM_FAULT_MEMORY_ERROR:
9466 memory_error:
9467 					ret = (error_code ? error_code: KERN_MEMORY_ERROR);
9468 
9469 					vm_object_lock(object);
9470 					goto return_err;
9471 
9472 				case VM_FAULT_SUCCESS_NO_VM_PAGE:
9473 					/* success but no page: fail */
9474 					vm_object_paging_end(object);
9475 					vm_object_unlock(object);
9476 					goto memory_error;
9477 
9478 				default:
9479 					panic("vm_object_iopl_request: unexpected error"
9480 					    " 0x%x from vm_fault_page()\n", result);
9481 				}
9482 			} while (result != VM_FAULT_SUCCESS);
9483 		}
9484 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9485 
9486 		if (upl->flags & UPL_KERNEL_OBJECT) {
9487 			goto record_phys_addr;
9488 		}
9489 
9490 		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
9491 			dst_page->vmp_busy = TRUE;
9492 			goto record_phys_addr;
9493 		}
9494 
9495 		if (dst_page->vmp_cleaning) {
9496 			/*
9497 			 * Someone else is cleaning this page in place.
9498 			 * In theory, we should be able to  proceed and use this
9499 			 * page but they'll probably end up clearing the "busy"
9500 			 * bit on it in upl_commit_range() but they didn't set
9501 			 * it, so they would clear our "busy" bit and open
9502 			 * us to race conditions.
9503 			 * We'd better wait for the cleaning to complete and
9504 			 * then try again.
9505 			 */
9506 			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
9507 			PAGE_SLEEP(object, dst_page, THREAD_UNINT);
9508 			continue;
9509 		}
9510 		if (dst_page->vmp_laundry) {
9511 			vm_pageout_steal_laundry(dst_page, FALSE);
9512 		}
9513 
9514 		if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
9515 		    phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
9516 			vm_page_t       low_page;
9517 			int             refmod;
9518 
9519 			/*
9520 			 * support devices that can't DMA above 32 bits
9521 			 * by substituting pages from a pool of low address
9522 			 * memory for any pages we find above the 4G mark
9523 			 * can't substitute if the page is already wired because
9524 			 * we don't know whether that physical address has been
9525 			 * handed out to some other 64 bit capable DMA device to use
9526 			 */
9527 			if (VM_PAGE_WIRED(dst_page)) {
9528 				ret = KERN_PROTECTION_FAILURE;
9529 				goto return_err;
9530 			}
9531 			low_page = vm_page_grablo();
9532 
9533 			if (low_page == VM_PAGE_NULL) {
9534 				ret = KERN_RESOURCE_SHORTAGE;
9535 				goto return_err;
9536 			}
9537 			/*
9538 			 * from here until the vm_page_replace completes
9539 			 * we musn't drop the object lock... we don't
9540 			 * want anyone refaulting this page in and using
9541 			 * it after we disconnect it... we want the fault
9542 			 * to find the new page being substituted.
9543 			 */
9544 			if (dst_page->vmp_pmapped) {
9545 				refmod = pmap_disconnect(phys_page);
9546 			} else {
9547 				refmod = 0;
9548 			}
9549 
9550 			if (!dst_page->vmp_absent) {
9551 				vm_page_copy(dst_page, low_page);
9552 			}
9553 
9554 			low_page->vmp_reference = dst_page->vmp_reference;
9555 			low_page->vmp_dirty     = dst_page->vmp_dirty;
9556 			low_page->vmp_absent    = dst_page->vmp_absent;
9557 
9558 			if (refmod & VM_MEM_REFERENCED) {
9559 				low_page->vmp_reference = TRUE;
9560 			}
9561 			if (refmod & VM_MEM_MODIFIED) {
9562 				SET_PAGE_DIRTY(low_page, FALSE);
9563 			}
9564 
9565 			vm_page_replace(low_page, object, dst_offset);
9566 
9567 			dst_page = low_page;
9568 			/*
9569 			 * vm_page_grablo returned the page marked
9570 			 * BUSY... we don't need a PAGE_WAKEUP_DONE
9571 			 * here, because we've never dropped the object lock
9572 			 */
9573 			if (!dst_page->vmp_absent) {
9574 				dst_page->vmp_busy = FALSE;
9575 			}
9576 
9577 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9578 		}
9579 		if (!dst_page->vmp_busy) {
9580 			dwp->dw_mask |= DW_vm_page_wire;
9581 		}
9582 
9583 		if (cntrl_flags & UPL_BLOCK_ACCESS) {
9584 			/*
9585 			 * Mark the page "busy" to block any future page fault
9586 			 * on this page in addition to wiring it.
9587 			 * We'll also remove the mapping
9588 			 * of all these pages before leaving this routine.
9589 			 */
9590 			assert(!dst_page->vmp_fictitious);
9591 			dst_page->vmp_busy = TRUE;
9592 		}
9593 		/*
9594 		 * expect the page to be used
9595 		 * page queues lock must be held to set 'reference'
9596 		 */
9597 		dwp->dw_mask |= DW_set_reference;
9598 
9599 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9600 			SET_PAGE_DIRTY(dst_page, TRUE);
9601 			/*
9602 			 * Page belonging to a code-signed object is about to
9603 			 * be written. Mark it tainted and disconnect it from
9604 			 * all pmaps so processes have to fault it back in and
9605 			 * deal with the tainted bit.
9606 			 */
9607 			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
9608 				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
9609 				vm_page_iopl_tainted++;
9610 				if (dst_page->vmp_pmapped) {
9611 					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
9612 					if (refmod & VM_MEM_REFERENCED) {
9613 						dst_page->vmp_reference = TRUE;
9614 					}
9615 				}
9616 			}
9617 		}
9618 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
9619 			pmap_sync_page_attributes_phys(phys_page);
9620 			dst_page->vmp_written_by_kernel = FALSE;
9621 		}
9622 
9623 record_phys_addr:
9624 		if (dst_page->vmp_busy) {
9625 			upl->flags |= UPL_HAS_BUSY;
9626 		}
9627 
9628 		lite_list[entry >> 5] |= 1U << (entry & 31);
9629 
9630 		if (phys_page > upl->highest_page) {
9631 			upl->highest_page = phys_page;
9632 		}
9633 
9634 		if (user_page_list) {
9635 			user_page_list[entry].phys_addr = phys_page;
9636 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
9637 			user_page_list[entry].absent    = dst_page->vmp_absent;
9638 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
9639 			user_page_list[entry].precious  = dst_page->vmp_precious;
9640 			user_page_list[entry].device    = FALSE;
9641 			user_page_list[entry].needed    = FALSE;
9642 			if (dst_page->vmp_clustered == TRUE) {
9643 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
9644 			} else {
9645 				user_page_list[entry].speculative = FALSE;
9646 			}
9647 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
9648 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
9649 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
9650 			user_page_list[entry].mark      = FALSE;
9651 		}
9652 		if (object != kernel_object && object != compressor_object) {
9653 			/*
9654 			 * someone is explicitly grabbing this page...
9655 			 * update clustered and speculative state
9656 			 *
9657 			 */
9658 			if (dst_page->vmp_clustered) {
9659 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
9660 			}
9661 		}
9662 skip_page:
9663 		entry++;
9664 		dst_offset += PAGE_SIZE_64;
9665 		xfer_size -= PAGE_SIZE;
9666 
9667 		if (dwp->dw_mask) {
9668 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
9669 
9670 			if (dw_count >= dw_limit) {
9671 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9672 
9673 				dwp = dwp_start;
9674 				dw_count = 0;
9675 			}
9676 		}
9677 	}
9678 	assert(entry == size_in_pages);
9679 
9680 	if (dw_count) {
9681 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9682 		dwp = dwp_start;
9683 		dw_count = 0;
9684 	}
9685 finish:
9686 	if (user_page_list && set_cache_attr_needed == TRUE) {
9687 		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
9688 	}
9689 
9690 	if (page_list_count != NULL) {
9691 		if (upl->flags & UPL_INTERNAL) {
9692 			*page_list_count = 0;
9693 		} else if (*page_list_count > size_in_pages) {
9694 			*page_list_count = size_in_pages;
9695 		}
9696 	}
9697 	vm_object_unlock(object);
9698 
9699 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
9700 		/*
9701 		 * We've marked all the pages "busy" so that future
9702 		 * page faults will block.
9703 		 * Now remove the mapping for these pages, so that they
9704 		 * can't be accessed without causing a page fault.
9705 		 */
9706 		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
9707 		    PMAP_NULL,
9708 		    PAGE_SIZE,
9709 		    0, VM_PROT_NONE);
9710 		assert(!object->blocked_access);
9711 		object->blocked_access = TRUE;
9712 	}
9713 
9714 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9715 #if DEVELOPMENT || DEBUG
9716 	if (task != NULL) {
9717 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9718 	}
9719 #endif /* DEVELOPMENT || DEBUG */
9720 
9721 	if (dwp_start && dwp_finish_ctx) {
9722 		vm_page_delayed_work_finish_ctx(dwp_start);
9723 		dwp_start = dwp = NULL;
9724 	}
9725 
9726 	return KERN_SUCCESS;
9727 
9728 return_err:
9729 	dw_index = 0;
9730 
9731 	for (; offset < dst_offset; offset += PAGE_SIZE) {
9732 		boolean_t need_unwire;
9733 
9734 		dst_page = vm_page_lookup(object, offset);
9735 
9736 		if (dst_page == VM_PAGE_NULL) {
9737 			panic("vm_object_iopl_request: Wired page missing.");
9738 		}
9739 
9740 		/*
9741 		 * if we've already processed this page in an earlier
9742 		 * dw_do_work, we need to undo the wiring... we will
9743 		 * leave the dirty and reference bits on if they
9744 		 * were set, since we don't have a good way of knowing
9745 		 * what the previous state was and we won't get here
9746 		 * under any normal circumstances...  we will always
9747 		 * clear BUSY and wakeup any waiters via vm_page_free
9748 		 * or PAGE_WAKEUP_DONE
9749 		 */
9750 		need_unwire = TRUE;
9751 
9752 		if (dw_count) {
9753 			if ((dwp_start)[dw_index].dw_m == dst_page) {
9754 				/*
9755 				 * still in the deferred work list
9756 				 * which means we haven't yet called
9757 				 * vm_page_wire on this page
9758 				 */
9759 				need_unwire = FALSE;
9760 
9761 				dw_index++;
9762 				dw_count--;
9763 			}
9764 		}
9765 		vm_page_lock_queues();
9766 
9767 		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
9768 			vm_page_free(dst_page);
9769 
9770 			need_unwire = FALSE;
9771 		} else {
9772 			if (need_unwire == TRUE) {
9773 				vm_page_unwire(dst_page, TRUE);
9774 			}
9775 
9776 			PAGE_WAKEUP_DONE(dst_page);
9777 		}
9778 		vm_page_unlock_queues();
9779 
9780 		if (need_unwire == TRUE) {
9781 			counter_inc(&vm_statistics_reactivations);
9782 		}
9783 	}
9784 #if UPL_DEBUG
9785 	upl->upl_state = 2;
9786 #endif
9787 	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
9788 		vm_object_activity_end(object);
9789 		vm_object_collapse(object, 0, TRUE);
9790 	}
9791 	vm_object_unlock(object);
9792 	upl_destroy(upl);
9793 
9794 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
9795 #if DEVELOPMENT || DEBUG
9796 	if (task != NULL) {
9797 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9798 	}
9799 #endif /* DEVELOPMENT || DEBUG */
9800 
9801 	if (dwp_start && dwp_finish_ctx) {
9802 		vm_page_delayed_work_finish_ctx(dwp_start);
9803 		dwp_start = dwp = NULL;
9804 	}
9805 	return ret;
9806 }
9807 
9808 kern_return_t
upl_transpose(upl_t upl1,upl_t upl2)9809 upl_transpose(
9810 	upl_t           upl1,
9811 	upl_t           upl2)
9812 {
9813 	kern_return_t           retval;
9814 	boolean_t               upls_locked;
9815 	vm_object_t             object1, object2;
9816 
9817 	/* LD: Should mapped UPLs be eligible for a transpose? */
9818 	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
9819 		return KERN_INVALID_ARGUMENT;
9820 	}
9821 
9822 	upls_locked = FALSE;
9823 
9824 	/*
9825 	 * Since we need to lock both UPLs at the same time,
9826 	 * avoid deadlocks by always taking locks in the same order.
9827 	 */
9828 	if (upl1 < upl2) {
9829 		upl_lock(upl1);
9830 		upl_lock(upl2);
9831 	} else {
9832 		upl_lock(upl2);
9833 		upl_lock(upl1);
9834 	}
9835 	upls_locked = TRUE;     /* the UPLs will need to be unlocked */
9836 
9837 	object1 = upl1->map_object;
9838 	object2 = upl2->map_object;
9839 
9840 	if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
9841 	    upl1->u_size != upl2->u_size) {
9842 		/*
9843 		 * We deal only with full objects, not subsets.
9844 		 * That's because we exchange the entire backing store info
9845 		 * for the objects: pager, resident pages, etc...  We can't do
9846 		 * only part of it.
9847 		 */
9848 		retval = KERN_INVALID_VALUE;
9849 		goto done;
9850 	}
9851 
9852 	/*
9853 	 * Tranpose the VM objects' backing store.
9854 	 */
9855 	retval = vm_object_transpose(object1, object2,
9856 	    upl_adjusted_size(upl1, PAGE_MASK));
9857 
9858 	if (retval == KERN_SUCCESS) {
9859 		/*
9860 		 * Make each UPL point to the correct VM object, i.e. the
9861 		 * object holding the pages that the UPL refers to...
9862 		 */
9863 #if CONFIG_IOSCHED || UPL_DEBUG
9864 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9865 			vm_object_lock(object1);
9866 			vm_object_lock(object2);
9867 		}
9868 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9869 			queue_remove(&object1->uplq, upl1, upl_t, uplq);
9870 		}
9871 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9872 			queue_remove(&object2->uplq, upl2, upl_t, uplq);
9873 		}
9874 #endif
9875 		upl1->map_object = object2;
9876 		upl2->map_object = object1;
9877 
9878 #if CONFIG_IOSCHED || UPL_DEBUG
9879 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9880 			queue_enter(&object2->uplq, upl1, upl_t, uplq);
9881 		}
9882 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9883 			queue_enter(&object1->uplq, upl2, upl_t, uplq);
9884 		}
9885 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9886 			vm_object_unlock(object2);
9887 			vm_object_unlock(object1);
9888 		}
9889 #endif
9890 	}
9891 
9892 done:
9893 	/*
9894 	 * Cleanup.
9895 	 */
9896 	if (upls_locked) {
9897 		upl_unlock(upl1);
9898 		upl_unlock(upl2);
9899 		upls_locked = FALSE;
9900 	}
9901 
9902 	return retval;
9903 }
9904 
9905 void
upl_range_needed(upl_t upl,int index,int count)9906 upl_range_needed(
9907 	upl_t           upl,
9908 	int             index,
9909 	int             count)
9910 {
9911 	upl_page_info_t *user_page_list;
9912 	int             size_in_pages;
9913 
9914 	if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
9915 		return;
9916 	}
9917 
9918 	size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
9919 
9920 	user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
9921 
9922 	while (count-- && index < size_in_pages) {
9923 		user_page_list[index++].needed = TRUE;
9924 	}
9925 }
9926 
9927 
9928 /*
9929  * Reserve of virtual addresses in the kernel address space.
9930  * We need to map the physical pages in the kernel, so that we
9931  * can call the code-signing or slide routines with a kernel
9932  * virtual address.  We keep this pool of pre-allocated kernel
9933  * virtual addresses so that we don't have to scan the kernel's
 * virtual address space each time we need to work with
9935  * a physical page.
9936  */
/* spin lock taken by vm_paging_map_object() around its use of the pool state below */
SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
/* number of page-sized slots that vm_paging_map_init() reserves in kernel_map */
#define VM_PAGING_NUM_PAGES     64
/* start of the reserved range; 0 until vm_paging_map_init() has set it */
vm_map_offset_t vm_paging_base_address = 0;
/* per-slot flag, set to TRUE when vm_paging_map_object() hands out slot i */
boolean_t       vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
/* highest slot index handed out so far */
int             vm_paging_max_index = 0;
/* incremented before a thread waits for a free slot, decremented once it resumes */
int             vm_paging_page_waiter = 0;
/* incremented every time a thread has to wait for a free slot */
int             vm_paging_page_waiter_total = 0;

/*
 * Counters maintained by vm_paging_map_object().
 */
/* requests that found no free pre-allocated slot */
unsigned long   vm_paging_no_kernel_page = 0;
/* mappings (and pages) entered at a pre-allocated slot */
unsigned long   vm_paging_objects_mapped = 0;
unsigned long   vm_paging_pages_mapped = 0;
/* mappings (and pages) set up through vm_map_enter() instead */
unsigned long   vm_paging_objects_mapped_slow = 0;
unsigned long   vm_paging_pages_mapped_slow = 0;
9950 
9951 __startup_func
9952 void
vm_paging_map_init(void)9953 vm_paging_map_init(void)
9954 {
9955 	kern_return_t   kr;
9956 	vm_map_offset_t page_map_offset;
9957 	vm_map_entry_t  map_entry;
9958 
9959 	assert(vm_paging_base_address == 0);
9960 
9961 	/*
9962 	 * Initialize our pool of pre-allocated kernel
9963 	 * virtual addresses.
9964 	 */
9965 	page_map_offset = 0;
9966 	kr = vm_map_find_space(kernel_map,
9967 	    &page_map_offset,
9968 	    VM_PAGING_NUM_PAGES * PAGE_SIZE,
9969 	    0,
9970 	    VM_MAP_KERNEL_FLAGS_NONE,
9971 	    VM_KERN_MEMORY_NONE,
9972 	    &map_entry);
9973 	if (kr != KERN_SUCCESS) {
9974 		panic("vm_paging_map_init: kernel_map full");
9975 	}
9976 	VME_OBJECT_SET(map_entry, kernel_object);
9977 	VME_OFFSET_SET(map_entry, page_map_offset);
9978 	map_entry->protection = VM_PROT_NONE;
9979 	map_entry->max_protection = VM_PROT_NONE;
9980 	map_entry->permanent = TRUE;
9981 	vm_object_reference(kernel_object);
9982 	vm_map_unlock(kernel_map);
9983 
9984 	assert(vm_paging_base_address == 0);
9985 	vm_paging_base_address = page_map_offset;
9986 }
9987 
9988 /*
9989  * vm_paging_map_object:
9990  *	Maps part of a VM object's pages in the kernel
9991  *      virtual address space, using the pre-allocated
9992  *	kernel virtual addresses, if possible.
9993  * Context:
9994  *      The VM object is locked.  This lock will get
9995  *      dropped and re-acquired though, so the caller
9996  *      must make sure the VM object is kept alive
9997  *	(by holding a VM map that has a reference
9998  *      on it, for example, or taking an extra reference).
9999  *      The page should also be kept busy to prevent
10000  *	it from being reclaimed.
10001  */
10002 kern_return_t
vm_paging_map_object(vm_page_t page,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection,boolean_t can_unlock_object,vm_map_size_t * size,vm_map_offset_t * address,boolean_t * need_unmap)10003 vm_paging_map_object(
10004 	vm_page_t               page,
10005 	vm_object_t             object,
10006 	vm_object_offset_t      offset,
10007 	vm_prot_t               protection,
10008 	boolean_t               can_unlock_object,
10009 	vm_map_size_t           *size,          /* IN/OUT */
10010 	vm_map_offset_t         *address,       /* OUT */
10011 	boolean_t               *need_unmap)    /* OUT */
10012 {
10013 	kern_return_t           kr;
10014 	vm_map_offset_t         page_map_offset;
10015 	vm_map_size_t           map_size;
10016 	vm_object_offset_t      object_offset;
10017 	int                     i;
10018 
10019 	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
10020 		/* use permanent 1-to-1 kernel mapping of physical memory ? */
10021 		*address = (vm_map_offset_t)
10022 		    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
10023 		*need_unmap = FALSE;
10024 		return KERN_SUCCESS;
10025 
10026 		assert(page->vmp_busy);
10027 		/*
10028 		 * Use one of the pre-allocated kernel virtual addresses
10029 		 * and just enter the VM page in the kernel address space
10030 		 * at that virtual address.
10031 		 */
10032 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10033 
10034 		/*
10035 		 * Try and find an available kernel virtual address
10036 		 * from our pre-allocated pool.
10037 		 */
10038 		page_map_offset = 0;
10039 		for (;;) {
10040 			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
10041 				if (vm_paging_page_inuse[i] == FALSE) {
10042 					page_map_offset =
10043 					    vm_paging_base_address +
10044 					    (i * PAGE_SIZE);
10045 					break;
10046 				}
10047 			}
10048 			if (page_map_offset != 0) {
10049 				/* found a space to map our page ! */
10050 				break;
10051 			}
10052 
10053 			if (can_unlock_object) {
10054 				/*
10055 				 * If we can afford to unlock the VM object,
10056 				 * let's take the slow path now...
10057 				 */
10058 				break;
10059 			}
10060 			/*
10061 			 * We can't afford to unlock the VM object, so
10062 			 * let's wait for a space to become available...
10063 			 */
10064 			vm_paging_page_waiter_total++;
10065 			vm_paging_page_waiter++;
10066 			kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
10067 			if (kr == THREAD_WAITING) {
10068 				simple_unlock(&vm_paging_lock);
10069 				kr = thread_block(THREAD_CONTINUE_NULL);
10070 				simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10071 			}
10072 			vm_paging_page_waiter--;
10073 			/* ... and try again */
10074 		}
10075 
10076 		if (page_map_offset != 0) {
10077 			/*
10078 			 * We found a kernel virtual address;
10079 			 * map the physical page to that virtual address.
10080 			 */
10081 			if (i > vm_paging_max_index) {
10082 				vm_paging_max_index = i;
10083 			}
10084 			vm_paging_page_inuse[i] = TRUE;
10085 			simple_unlock(&vm_paging_lock);
10086 
10087 			page->vmp_pmapped = TRUE;
10088 
10089 			/*
10090 			 * Keep the VM object locked over the PMAP_ENTER
10091 			 * and the actual use of the page by the kernel,
10092 			 * or this pmap mapping might get undone by a
10093 			 * vm_object_pmap_protect() call...
10094 			 */
10095 			PMAP_ENTER(kernel_pmap,
10096 			    page_map_offset,
10097 			    page,
10098 			    protection,
10099 			    VM_PROT_NONE,
10100 			    0,
10101 			    TRUE,
10102 			    kr);
10103 			assert(kr == KERN_SUCCESS);
10104 			vm_paging_objects_mapped++;
10105 			vm_paging_pages_mapped++;
10106 			*address = page_map_offset;
10107 			*need_unmap = TRUE;
10108 
10109 #if KASAN
10110 			kasan_notify_address(page_map_offset, PAGE_SIZE);
10111 #endif
10112 
10113 			/* all done and mapped, ready to use ! */
10114 			return KERN_SUCCESS;
10115 		}
10116 
10117 		/*
10118 		 * We ran out of pre-allocated kernel virtual
10119 		 * addresses.  Just map the page in the kernel
10120 		 * the slow and regular way.
10121 		 */
10122 		vm_paging_no_kernel_page++;
10123 		simple_unlock(&vm_paging_lock);
10124 	}
10125 
10126 	if (!can_unlock_object) {
10127 		*address = 0;
10128 		*size = 0;
10129 		*need_unmap = FALSE;
10130 		return KERN_NOT_SUPPORTED;
10131 	}
10132 
10133 	object_offset = vm_object_trunc_page(offset);
10134 	map_size = vm_map_round_page(*size,
10135 	    VM_MAP_PAGE_MASK(kernel_map));
10136 
10137 	/*
10138 	 * Try and map the required range of the object
10139 	 * in the kernel_map
10140 	 */
10141 
10142 	vm_object_reference_locked(object);     /* for the map entry */
10143 	vm_object_unlock(object);
10144 
10145 	kr = vm_map_enter(kernel_map,
10146 	    address,
10147 	    map_size,
10148 	    0,
10149 	    VM_FLAGS_ANYWHERE,
10150 	    VM_MAP_KERNEL_FLAGS_NONE,
10151 	    VM_KERN_MEMORY_NONE,
10152 	    object,
10153 	    object_offset,
10154 	    FALSE,
10155 	    protection,
10156 	    VM_PROT_ALL,
10157 	    VM_INHERIT_NONE);
10158 	if (kr != KERN_SUCCESS) {
10159 		*address = 0;
10160 		*size = 0;
10161 		*need_unmap = FALSE;
10162 		vm_object_deallocate(object);   /* for the map entry */
10163 		vm_object_lock(object);
10164 		return kr;
10165 	}
10166 
10167 	*size = map_size;
10168 
10169 	/*
10170 	 * Enter the mapped pages in the page table now.
10171 	 */
10172 	vm_object_lock(object);
10173 	/*
10174 	 * VM object must be kept locked from before PMAP_ENTER()
10175 	 * until after the kernel is done accessing the page(s).
10176 	 * Otherwise, the pmap mappings in the kernel could be
10177 	 * undone by a call to vm_object_pmap_protect().
10178 	 */
10179 
10180 	for (page_map_offset = 0;
10181 	    map_size != 0;
10182 	    map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
10183 		page = vm_page_lookup(object, offset + page_map_offset);
10184 		if (page == VM_PAGE_NULL) {
10185 			printf("vm_paging_map_object: no page !?");
10186 			vm_object_unlock(object);
10187 			kr = vm_map_remove(kernel_map, *address, *size,
10188 			    VM_MAP_REMOVE_NO_FLAGS);
10189 			assert(kr == KERN_SUCCESS);
10190 			*address = 0;
10191 			*size = 0;
10192 			*need_unmap = FALSE;
10193 			vm_object_lock(object);
10194 			return KERN_MEMORY_ERROR;
10195 		}
10196 		page->vmp_pmapped = TRUE;
10197 
10198 		PMAP_ENTER(kernel_pmap,
10199 		    *address + page_map_offset,
10200 		    page,
10201 		    protection,
10202 		    VM_PROT_NONE,
10203 		    0,
10204 		    TRUE,
10205 		    kr);
10206 		assert(kr == KERN_SUCCESS);
10207 #if KASAN
10208 		kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
10209 #endif
10210 	}
10211 
10212 	vm_paging_objects_mapped_slow++;
10213 	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
10214 
10215 	*need_unmap = TRUE;
10216 
10217 	return KERN_SUCCESS;
10218 }
10219 
10220 /*
10221  * vm_paging_unmap_object:
10222  *	Unmaps part of a VM object's pages from the kernel
10223  *      virtual address space.
10224  * Context:
10225  *      The VM object is locked.  This lock will get
10226  *      dropped and re-acquired though.
10227  */
10228 void
vm_paging_unmap_object(vm_object_t object,vm_map_offset_t start,vm_map_offset_t end)10229 vm_paging_unmap_object(
10230 	vm_object_t     object,
10231 	vm_map_offset_t start,
10232 	vm_map_offset_t end)
10233 {
10234 	kern_return_t   kr;
10235 	int             i;
10236 
10237 	if ((vm_paging_base_address == 0) ||
10238 	    (start < vm_paging_base_address) ||
10239 	    (end > (vm_paging_base_address
10240 	    + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
10241 		/*
10242 		 * We didn't use our pre-allocated pool of
10243 		 * kernel virtual address.  Deallocate the
10244 		 * virtual memory.
10245 		 */
10246 		if (object != VM_OBJECT_NULL) {
10247 			vm_object_unlock(object);
10248 		}
10249 		kr = vm_map_remove(kernel_map, start, end,
10250 		    VM_MAP_REMOVE_NO_FLAGS);
10251 		if (object != VM_OBJECT_NULL) {
10252 			vm_object_lock(object);
10253 		}
10254 		assert(kr == KERN_SUCCESS);
10255 	} else {
10256 		/*
10257 		 * We used a kernel virtual address from our
10258 		 * pre-allocated pool.  Put it back in the pool
10259 		 * for next time.
10260 		 */
10261 		assert(end - start == PAGE_SIZE);
10262 		i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
10263 		assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
10264 
10265 		/* undo the pmap mapping */
10266 		pmap_remove(kernel_pmap, start, end);
10267 
10268 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10269 		vm_paging_page_inuse[i] = FALSE;
10270 		if (vm_paging_page_waiter) {
10271 			thread_wakeup(&vm_paging_page_waiter);
10272 		}
10273 		simple_unlock(&vm_paging_lock);
10274 	}
10275 }
10276 
10277 
10278 /*
10279  * page->vmp_object must be locked
10280  */
10281 void
vm_pageout_steal_laundry(vm_page_t page,boolean_t queues_locked)10282 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10283 {
10284 	if (!queues_locked) {
10285 		vm_page_lockspin_queues();
10286 	}
10287 
10288 	page->vmp_free_when_done = FALSE;
10289 	/*
10290 	 * need to drop the laundry count...
10291 	 * we may also need to remove it
10292 	 * from the I/O paging queue...
10293 	 * vm_pageout_throttle_up handles both cases
10294 	 *
10295 	 * the laundry and pageout_queue flags are cleared...
10296 	 */
10297 	vm_pageout_throttle_up(page);
10298 
10299 	if (!queues_locked) {
10300 		vm_page_unlock_queues();
10301 	}
10302 }
10303 
10304 upl_t
vector_upl_create(vm_offset_t upl_offset)10305 vector_upl_create(vm_offset_t upl_offset)
10306 {
10307 	int i = 0;
10308 	upl_t   upl;
10309 	vector_upl_t vector_upl = kalloc_type(struct _vector_upl, Z_WAITOK);
10310 
10311 	upl = upl_create(0, UPL_VECTOR, 0);
10312 	upl->vector_upl = vector_upl;
10313 	upl->u_offset = upl_offset;
10314 	vector_upl->size = 0;
10315 	vector_upl->offset = upl_offset;
10316 	vector_upl->invalid_upls = 0;
10317 	vector_upl->num_upls = 0;
10318 	vector_upl->pagelist = NULL;
10319 
10320 	for (i = 0; i < MAX_VECTOR_UPL_ELEMENTS; i++) {
10321 		vector_upl->upl_iostates[i].size = 0;
10322 		vector_upl->upl_iostates[i].offset = 0;
10323 	}
10324 	return upl;
10325 }
10326 
10327 void
vector_upl_deallocate(upl_t upl)10328 vector_upl_deallocate(upl_t upl)
10329 {
10330 	if (upl) {
10331 		vector_upl_t vector_upl = upl->vector_upl;
10332 		if (vector_upl) {
10333 			if (vector_upl->invalid_upls != vector_upl->num_upls) {
10334 				panic("Deallocating non-empty Vectored UPL");
10335 			}
10336 			kfree_data(vector_upl->pagelist, sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE));
10337 			vector_upl->invalid_upls = 0;
10338 			vector_upl->num_upls = 0;
10339 			vector_upl->pagelist = NULL;
10340 			vector_upl->size = 0;
10341 			vector_upl->offset = 0;
10342 			kfree_type(struct _vector_upl, vector_upl);
10343 			vector_upl = (vector_upl_t)0xfeedfeed;
10344 		} else {
10345 			panic("vector_upl_deallocate was passed a non-vectored upl");
10346 		}
10347 	} else {
10348 		panic("vector_upl_deallocate was passed a NULL upl");
10349 	}
10350 }
10351 
10352 boolean_t
vector_upl_is_valid(upl_t upl)10353 vector_upl_is_valid(upl_t upl)
10354 {
10355 	if (upl && ((upl->flags & UPL_VECTOR) == UPL_VECTOR)) {
10356 		vector_upl_t vector_upl = upl->vector_upl;
10357 		if (vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef) {
10358 			return FALSE;
10359 		} else {
10360 			return TRUE;
10361 		}
10362 	}
10363 	return FALSE;
10364 }
10365 
10366 boolean_t
vector_upl_set_subupl(upl_t upl,upl_t subupl,uint32_t io_size)10367 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
10368 {
10369 	if (vector_upl_is_valid(upl)) {
10370 		vector_upl_t vector_upl = upl->vector_upl;
10371 
10372 		if (vector_upl) {
10373 			if (subupl) {
10374 				if (io_size) {
10375 					if (io_size < PAGE_SIZE) {
10376 						io_size = PAGE_SIZE;
10377 					}
10378 					subupl->vector_upl = (void*)vector_upl;
10379 					vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
10380 					vector_upl->size += io_size;
10381 					upl->u_size += io_size;
10382 				} else {
10383 					uint32_t i = 0, invalid_upls = 0;
10384 					for (i = 0; i < vector_upl->num_upls; i++) {
10385 						if (vector_upl->upl_elems[i] == subupl) {
10386 							break;
10387 						}
10388 					}
10389 					if (i == vector_upl->num_upls) {
10390 						panic("Trying to remove sub-upl when none exists");
10391 					}
10392 
10393 					vector_upl->upl_elems[i] = NULL;
10394 					invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
10395 					    relaxed);
10396 					if (invalid_upls == vector_upl->num_upls) {
10397 						return TRUE;
10398 					} else {
10399 						return FALSE;
10400 					}
10401 				}
10402 			} else {
10403 				panic("vector_upl_set_subupl was passed a NULL upl element");
10404 			}
10405 		} else {
10406 			panic("vector_upl_set_subupl was passed a non-vectored upl");
10407 		}
10408 	} else {
10409 		panic("vector_upl_set_subupl was passed a NULL upl");
10410 	}
10411 
10412 	return FALSE;
10413 }
10414 
10415 void
vector_upl_set_pagelist(upl_t upl)10416 vector_upl_set_pagelist(upl_t upl)
10417 {
10418 	if (vector_upl_is_valid(upl)) {
10419 		uint32_t i = 0;
10420 		vector_upl_t vector_upl = upl->vector_upl;
10421 
10422 		if (vector_upl) {
10423 			vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
10424 
10425 			vector_upl->pagelist = kalloc_data(sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE), Z_WAITOK);
10426 
10427 			for (i = 0; i < vector_upl->num_upls; i++) {
10428 				cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upl_elems[i], PAGE_MASK) / PAGE_SIZE;
10429 				bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10430 				pagelist_size += cur_upl_pagelist_size;
10431 				if (vector_upl->upl_elems[i]->highest_page > upl->highest_page) {
10432 					upl->highest_page = vector_upl->upl_elems[i]->highest_page;
10433 				}
10434 			}
10435 			assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10436 		} else {
10437 			panic("vector_upl_set_pagelist was passed a non-vectored upl");
10438 		}
10439 	} else {
10440 		panic("vector_upl_set_pagelist was passed a NULL upl");
10441 	}
10442 }
10443 
10444 upl_t
vector_upl_subupl_byindex(upl_t upl,uint32_t index)10445 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10446 {
10447 	if (vector_upl_is_valid(upl)) {
10448 		vector_upl_t vector_upl = upl->vector_upl;
10449 		if (vector_upl) {
10450 			if (index < vector_upl->num_upls) {
10451 				return vector_upl->upl_elems[index];
10452 			}
10453 		} else {
10454 			panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10455 		}
10456 	}
10457 	return NULL;
10458 }
10459 
10460 upl_t
vector_upl_subupl_byoffset(upl_t upl,upl_offset_t * upl_offset,upl_size_t * upl_size)10461 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
10462 {
10463 	if (vector_upl_is_valid(upl)) {
10464 		uint32_t i = 0;
10465 		vector_upl_t vector_upl = upl->vector_upl;
10466 
10467 		if (vector_upl) {
10468 			upl_t subupl = NULL;
10469 			vector_upl_iostates_t subupl_state;
10470 
10471 			for (i = 0; i < vector_upl->num_upls; i++) {
10472 				subupl = vector_upl->upl_elems[i];
10473 				subupl_state = vector_upl->upl_iostates[i];
10474 				if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
10475 					/* We could have been passed an offset/size pair that belongs
10476 					 * to an UPL element that has already been committed/aborted.
10477 					 * If so, return NULL.
10478 					 */
10479 					if (subupl == NULL) {
10480 						return NULL;
10481 					}
10482 					if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
10483 						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
10484 						if (*upl_size > subupl_state.size) {
10485 							*upl_size = subupl_state.size;
10486 						}
10487 					}
10488 					if (*upl_offset >= subupl_state.offset) {
10489 						*upl_offset -= subupl_state.offset;
10490 					} else if (i) {
10491 						panic("Vector UPL offset miscalculation");
10492 					}
10493 					return subupl;
10494 				}
10495 			}
10496 		} else {
10497 			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
10498 		}
10499 	}
10500 	return NULL;
10501 }
10502 
10503 void
vector_upl_get_submap(upl_t upl,vm_map_t * v_upl_submap,vm_offset_t * submap_dst_addr)10504 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10505 {
10506 	*v_upl_submap = NULL;
10507 
10508 	if (vector_upl_is_valid(upl)) {
10509 		vector_upl_t vector_upl = upl->vector_upl;
10510 		if (vector_upl) {
10511 			*v_upl_submap = vector_upl->submap;
10512 			*submap_dst_addr = vector_upl->submap_dst_addr;
10513 		} else {
10514 			panic("vector_upl_get_submap was passed a non-vectored UPL");
10515 		}
10516 	} else {
10517 		panic("vector_upl_get_submap was passed a null UPL");
10518 	}
10519 }
10520 
10521 void
vector_upl_set_submap(upl_t upl,vm_map_t submap,vm_offset_t submap_dst_addr)10522 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10523 {
10524 	if (vector_upl_is_valid(upl)) {
10525 		vector_upl_t vector_upl = upl->vector_upl;
10526 		if (vector_upl) {
10527 			vector_upl->submap = submap;
10528 			vector_upl->submap_dst_addr = submap_dst_addr;
10529 		} else {
10530 			panic("vector_upl_get_submap was passed a non-vectored UPL");
10531 		}
10532 	} else {
10533 		panic("vector_upl_get_submap was passed a NULL UPL");
10534 	}
10535 }
10536 
10537 void
vector_upl_set_iostate(upl_t upl,upl_t subupl,upl_offset_t offset,upl_size_t size)10538 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10539 {
10540 	if (vector_upl_is_valid(upl)) {
10541 		uint32_t i = 0;
10542 		vector_upl_t vector_upl = upl->vector_upl;
10543 
10544 		if (vector_upl) {
10545 			for (i = 0; i < vector_upl->num_upls; i++) {
10546 				if (vector_upl->upl_elems[i] == subupl) {
10547 					break;
10548 				}
10549 			}
10550 
10551 			if (i == vector_upl->num_upls) {
10552 				panic("setting sub-upl iostate when none exists");
10553 			}
10554 
10555 			vector_upl->upl_iostates[i].offset = offset;
10556 			if (size < PAGE_SIZE) {
10557 				size = PAGE_SIZE;
10558 			}
10559 			vector_upl->upl_iostates[i].size = size;
10560 		} else {
10561 			panic("vector_upl_set_iostate was passed a non-vectored UPL");
10562 		}
10563 	} else {
10564 		panic("vector_upl_set_iostate was passed a NULL UPL");
10565 	}
10566 }
10567 
10568 void
vector_upl_get_iostate(upl_t upl,upl_t subupl,upl_offset_t * offset,upl_size_t * size)10569 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10570 {
10571 	if (vector_upl_is_valid(upl)) {
10572 		uint32_t i = 0;
10573 		vector_upl_t vector_upl = upl->vector_upl;
10574 
10575 		if (vector_upl) {
10576 			for (i = 0; i < vector_upl->num_upls; i++) {
10577 				if (vector_upl->upl_elems[i] == subupl) {
10578 					break;
10579 				}
10580 			}
10581 
10582 			if (i == vector_upl->num_upls) {
10583 				panic("getting sub-upl iostate when none exists");
10584 			}
10585 
10586 			*offset = vector_upl->upl_iostates[i].offset;
10587 			*size = vector_upl->upl_iostates[i].size;
10588 		} else {
10589 			panic("vector_upl_get_iostate was passed a non-vectored UPL");
10590 		}
10591 	} else {
10592 		panic("vector_upl_get_iostate was passed a NULL UPL");
10593 	}
10594 }
10595 
10596 void
vector_upl_get_iostate_byindex(upl_t upl,uint32_t index,upl_offset_t * offset,upl_size_t * size)10597 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10598 {
10599 	if (vector_upl_is_valid(upl)) {
10600 		vector_upl_t vector_upl = upl->vector_upl;
10601 		if (vector_upl) {
10602 			if (index < vector_upl->num_upls) {
10603 				*offset = vector_upl->upl_iostates[index].offset;
10604 				*size = vector_upl->upl_iostates[index].size;
10605 			} else {
10606 				*offset = *size = 0;
10607 			}
10608 		} else {
10609 			panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
10610 		}
10611 	} else {
10612 		panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10613 	}
10614 }
10615 
10616 upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)10617 upl_get_internal_vectorupl_pagelist(upl_t upl)
10618 {
10619 	return ((vector_upl_t)(upl->vector_upl))->pagelist;
10620 }
10621 
10622 void *
upl_get_internal_vectorupl(upl_t upl)10623 upl_get_internal_vectorupl(upl_t upl)
10624 {
10625 	return upl->vector_upl;
10626 }
10627 
10628 vm_size_t
upl_get_internal_pagelist_offset(void)10629 upl_get_internal_pagelist_offset(void)
10630 {
10631 	return sizeof(struct upl);
10632 }
10633 
10634 void
upl_clear_dirty(upl_t upl,boolean_t value)10635 upl_clear_dirty(
10636 	upl_t           upl,
10637 	boolean_t       value)
10638 {
10639 	if (value) {
10640 		upl->flags |= UPL_CLEAR_DIRTY;
10641 	} else {
10642 		upl->flags &= ~UPL_CLEAR_DIRTY;
10643 	}
10644 }
10645 
10646 void
upl_set_referenced(upl_t upl,boolean_t value)10647 upl_set_referenced(
10648 	upl_t           upl,
10649 	boolean_t       value)
10650 {
10651 	upl_lock(upl);
10652 	if (value) {
10653 		upl->ext_ref_count++;
10654 	} else {
10655 		if (!upl->ext_ref_count) {
10656 			panic("upl_set_referenced not %p", upl);
10657 		}
10658 		upl->ext_ref_count--;
10659 	}
10660 	upl_unlock(upl);
10661 }
10662 
10663 #if CONFIG_IOSCHED
10664 void
upl_set_blkno(upl_t upl,vm_offset_t upl_offset,int io_size,int64_t blkno)10665 upl_set_blkno(
10666 	upl_t           upl,
10667 	vm_offset_t     upl_offset,
10668 	int             io_size,
10669 	int64_t         blkno)
10670 {
10671 	int i, j;
10672 	if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
10673 		return;
10674 	}
10675 
10676 	assert(upl->upl_reprio_info != 0);
10677 	for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
10678 		UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
10679 	}
10680 }
10681 #endif
10682 
10683 void inline
memoryshot(unsigned int event,unsigned int control)10684 memoryshot(unsigned int event, unsigned int control)
10685 {
10686 	if (vm_debug_events) {
10687 		KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10688 		    vm_page_active_count, vm_page_inactive_count,
10689 		    vm_page_free_count, vm_page_speculative_count,
10690 		    vm_page_throttled_count);
10691 	} else {
10692 		(void) event;
10693 		(void) control;
10694 	}
10695 }
10696 
10697 #ifdef MACH_BSD
10698 
10699 boolean_t
upl_device_page(upl_page_info_t * upl)10700 upl_device_page(upl_page_info_t *upl)
10701 {
10702 	return UPL_DEVICE_PAGE(upl);
10703 }
10704 boolean_t
upl_page_present(upl_page_info_t * upl,int index)10705 upl_page_present(upl_page_info_t *upl, int index)
10706 {
10707 	return UPL_PAGE_PRESENT(upl, index);
10708 }
10709 boolean_t
upl_speculative_page(upl_page_info_t * upl,int index)10710 upl_speculative_page(upl_page_info_t *upl, int index)
10711 {
10712 	return UPL_SPECULATIVE_PAGE(upl, index);
10713 }
10714 boolean_t
upl_dirty_page(upl_page_info_t * upl,int index)10715 upl_dirty_page(upl_page_info_t *upl, int index)
10716 {
10717 	return UPL_DIRTY_PAGE(upl, index);
10718 }
10719 boolean_t
upl_valid_page(upl_page_info_t * upl,int index)10720 upl_valid_page(upl_page_info_t *upl, int index)
10721 {
10722 	return UPL_VALID_PAGE(upl, index);
10723 }
10724 ppnum_t
upl_phys_page(upl_page_info_t * upl,int index)10725 upl_phys_page(upl_page_info_t *upl, int index)
10726 {
10727 	return UPL_PHYS_PAGE(upl, index);
10728 }
10729 
10730 void
upl_page_set_mark(upl_page_info_t * upl,int index,boolean_t v)10731 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
10732 {
10733 	upl[index].mark = v;
10734 }
10735 
10736 boolean_t
upl_page_get_mark(upl_page_info_t * upl,int index)10737 upl_page_get_mark(upl_page_info_t *upl, int index)
10738 {
10739 	return upl[index].mark;
10740 }
10741 
10742 void
vm_countdirtypages(void)10743 vm_countdirtypages(void)
10744 {
10745 	vm_page_t m;
10746 	int dpages;
10747 	int pgopages;
10748 	int precpages;
10749 
10750 
10751 	dpages = 0;
10752 	pgopages = 0;
10753 	precpages = 0;
10754 
10755 	vm_page_lock_queues();
10756 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10757 	do {
10758 		if (m == (vm_page_t)0) {
10759 			break;
10760 		}
10761 
10762 		if (m->vmp_dirty) {
10763 			dpages++;
10764 		}
10765 		if (m->vmp_free_when_done) {
10766 			pgopages++;
10767 		}
10768 		if (m->vmp_precious) {
10769 			precpages++;
10770 		}
10771 
10772 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10773 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10774 		if (m == (vm_page_t)0) {
10775 			break;
10776 		}
10777 	} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
10778 	vm_page_unlock_queues();
10779 
10780 	vm_page_lock_queues();
10781 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
10782 	do {
10783 		if (m == (vm_page_t)0) {
10784 			break;
10785 		}
10786 
10787 		dpages++;
10788 		assert(m->vmp_dirty);
10789 		assert(!m->vmp_free_when_done);
10790 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10791 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10792 		if (m == (vm_page_t)0) {
10793 			break;
10794 		}
10795 	} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
10796 	vm_page_unlock_queues();
10797 
10798 	vm_page_lock_queues();
10799 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10800 	do {
10801 		if (m == (vm_page_t)0) {
10802 			break;
10803 		}
10804 
10805 		if (m->vmp_dirty) {
10806 			dpages++;
10807 		}
10808 		if (m->vmp_free_when_done) {
10809 			pgopages++;
10810 		}
10811 		if (m->vmp_precious) {
10812 			precpages++;
10813 		}
10814 
10815 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10816 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10817 		if (m == (vm_page_t)0) {
10818 			break;
10819 		}
10820 	} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
10821 	vm_page_unlock_queues();
10822 
10823 	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10824 
10825 	dpages = 0;
10826 	pgopages = 0;
10827 	precpages = 0;
10828 
10829 	vm_page_lock_queues();
10830 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10831 
10832 	do {
10833 		if (m == (vm_page_t)0) {
10834 			break;
10835 		}
10836 		if (m->vmp_dirty) {
10837 			dpages++;
10838 		}
10839 		if (m->vmp_free_when_done) {
10840 			pgopages++;
10841 		}
10842 		if (m->vmp_precious) {
10843 			precpages++;
10844 		}
10845 
10846 		assert(VM_PAGE_OBJECT(m) != kernel_object);
10847 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10848 		if (m == (vm_page_t)0) {
10849 			break;
10850 		}
10851 	} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
10852 	vm_page_unlock_queues();
10853 
10854 	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
10855 }
10856 #endif /* MACH_BSD */
10857 
10858 
10859 #if CONFIG_IOSCHED
10860 int
upl_get_cached_tier(upl_t upl)10861 upl_get_cached_tier(upl_t  upl)
10862 {
10863 	assert(upl);
10864 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
10865 		return upl->upl_priority;
10866 	}
10867 	return -1;
10868 }
10869 #endif /* CONFIG_IOSCHED */
10870 
10871 
10872 void
upl_callout_iodone(upl_t upl)10873 upl_callout_iodone(upl_t upl)
10874 {
10875 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
10876 
10877 	if (upl_ctx) {
10878 		void    (*iodone_func)(void *, int) = upl_ctx->io_done;
10879 
10880 		assert(upl_ctx->io_done);
10881 
10882 		(*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
10883 	}
10884 }
10885 
10886 void
upl_set_iodone(upl_t upl,void * upl_iodone)10887 upl_set_iodone(upl_t upl, void *upl_iodone)
10888 {
10889 	upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
10890 }
10891 
10892 void
upl_set_iodone_error(upl_t upl,int error)10893 upl_set_iodone_error(upl_t upl, int error)
10894 {
10895 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
10896 
10897 	if (upl_ctx) {
10898 		upl_ctx->io_error = error;
10899 	}
10900 }
10901 
10902 
10903 ppnum_t
upl_get_highest_page(upl_t upl)10904 upl_get_highest_page(
10905 	upl_t                      upl)
10906 {
10907 	return upl->highest_page;
10908 }
10909 
10910 upl_size_t
upl_get_size(upl_t upl)10911 upl_get_size(
10912 	upl_t                      upl)
10913 {
10914 	return upl_adjusted_size(upl, PAGE_MASK);
10915 }
10916 
10917 upl_size_t
upl_adjusted_size(upl_t upl,vm_map_offset_t pgmask)10918 upl_adjusted_size(
10919 	upl_t upl,
10920 	vm_map_offset_t pgmask)
10921 {
10922 	vm_object_offset_t start_offset, end_offset;
10923 
10924 	start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
10925 	end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
10926 
10927 	return (upl_size_t)(end_offset - start_offset);
10928 }
10929 
10930 vm_object_offset_t
upl_adjusted_offset(upl_t upl,vm_map_offset_t pgmask)10931 upl_adjusted_offset(
10932 	upl_t upl,
10933 	vm_map_offset_t pgmask)
10934 {
10935 	return trunc_page_mask_64(upl->u_offset, pgmask);
10936 }
10937 
10938 vm_object_offset_t
upl_get_data_offset(upl_t upl)10939 upl_get_data_offset(
10940 	upl_t upl)
10941 {
10942 	return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
10943 }
10944 
10945 upl_t
upl_associated_upl(upl_t upl)10946 upl_associated_upl(upl_t upl)
10947 {
10948 	return upl->associated_upl;
10949 }
10950 
10951 void
upl_set_associated_upl(upl_t upl,upl_t associated_upl)10952 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
10953 {
10954 	upl->associated_upl = associated_upl;
10955 }
10956 
10957 struct vnode *
upl_lookup_vnode(upl_t upl)10958 upl_lookup_vnode(upl_t upl)
10959 {
10960 	if (!upl->map_object->internal) {
10961 		return vnode_pager_lookup_vnode(upl->map_object->pager);
10962 	} else {
10963 		return NULL;
10964 	}
10965 }
10966 
10967 #if UPL_DEBUG
10968 kern_return_t
upl_ubc_alias_set(upl_t upl,uintptr_t alias1,uintptr_t alias2)10969 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
10970 {
10971 	upl->ubc_alias1 = alias1;
10972 	upl->ubc_alias2 = alias2;
10973 	return KERN_SUCCESS;
10974 }
10975 int
upl_ubc_alias_get(upl_t upl,uintptr_t * al,uintptr_t * al2)10976 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
10977 {
10978 	if (al) {
10979 		*al = upl->ubc_alias1;
10980 	}
10981 	if (al2) {
10982 		*al2 = upl->ubc_alias2;
10983 	}
10984 	return KERN_SUCCESS;
10985 }
10986 #endif /* UPL_DEBUG */
10987 
10988 #if VM_PRESSURE_EVENTS
10989 /*
10990  * Upward trajectory.
10991  */
10992 extern boolean_t vm_compressor_low_on_space(void);
10993 
10994 boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)10995 VM_PRESSURE_NORMAL_TO_WARNING(void)
10996 {
10997 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10998 		/* Available pages below our threshold */
10999 		if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11000 			/* No frozen processes to kill */
11001 			if (memorystatus_frozen_count == 0) {
11002 				/* Not enough suspended processes available. */
11003 				if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11004 					return TRUE;
11005 				}
11006 			}
11007 		}
11008 		return FALSE;
11009 	} else {
11010 		return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
11011 	}
11012 }
11013 
11014 boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)11015 VM_PRESSURE_WARNING_TO_CRITICAL(void)
11016 {
11017 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11018 		/* Available pages below our threshold */
11019 		if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11020 			return TRUE;
11021 		}
11022 		return FALSE;
11023 	} else {
11024 		return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11025 	}
11026 }
11027 
11028 /*
11029  * Downward trajectory.
11030  */
11031 boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)11032 VM_PRESSURE_WARNING_TO_NORMAL(void)
11033 {
11034 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11035 		/* Available pages above our threshold */
11036 		unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
11037 		if (memorystatus_available_pages > target_threshold) {
11038 			return TRUE;
11039 		}
11040 		return FALSE;
11041 	} else {
11042 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
11043 	}
11044 }
11045 
11046 boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)11047 VM_PRESSURE_CRITICAL_TO_WARNING(void)
11048 {
11049 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11050 		/* Available pages above our threshold */
11051 		unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
11052 		if (memorystatus_available_pages > target_threshold) {
11053 			return TRUE;
11054 		}
11055 		return FALSE;
11056 	} else {
11057 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11058 	}
11059 }
11060 #endif /* VM_PRESSURE_EVENTS */
11061