/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *	Software Distribution Coordinator  or  [email protected]
 *	School of Computer Science
 *	Carnegie Mellon University
 *	Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	vm/vm_pageout.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *	Date:	1985
 *
 *	The proverbial page-out daemon.
 */

#include "mach/kern_return.h"
#include <stdint.h>
#include <ptrauth.h>

#include <debug.h>

#include <mach/mach_types.h>
#include <mach/memory_object.h>
#include <mach/mach_host_server.h>
#include <mach/upl.h>
#include <mach/vm_map.h>
#include <mach/vm_param.h>
#include <mach/vm_statistics.h>
#include <mach/sdt.h>

#include <kern/kern_types.h>
#include <kern/counter.h>
#include <kern/host_statistics.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/sched.h>
#include <kern/thread.h>
#include <kern/kalloc.h>
#include <kern/zalloc_internal.h>
#include <kern/policy_internal.h>
#include <kern/thread_group.h>

#include <os/log.h>

#include <sys/kdebug_triage.h>

#include <machine/vm_tuning.h>
#include <machine/commpage.h>

#include <vm/pmap.h>
#include <vm/vm_compressor_pager_internal.h>
#include <vm/vm_fault_internal.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_object_internal.h>
#include <vm/vm_page_internal.h>
#include <vm/vm_pageout_internal.h>
#include <vm/vm_protos_internal.h> /* must be last */
#include <vm/memory_object.h>
#include <vm/vm_purgeable_internal.h>
#include <vm/vm_shared_region.h>
#include <vm/vm_compressor_internal.h>
#include <vm/vm_kern_xnu.h>
#include <vm/vm_iokit.h>
#include <vm/vm_ubc.h>
#include <vm/vm_reclaim_xnu.h>

#include <san/kasan.h>
#include <sys/kern_memorystatus_xnu.h>

#if CONFIG_PHANTOM_CACHE
#include <vm/vm_phantom_cache_internal.h>
#endif


#if UPL_DEBUG
#include <libkern/OSDebug.h>
#endif

extern int cs_debug;

#if CONFIG_MBUF_MCACHE
extern void mbuf_drain(boolean_t);
#endif /* CONFIG_MBUF_MCACHE */

#if CONFIG_FREEZE
extern unsigned int memorystatus_frozen_count;
extern unsigned int memorystatus_suspended_count;
#endif /* CONFIG_FREEZE */
extern vm_pressure_level_t memorystatus_vm_pressure_level;

extern lck_mtx_t memorystatus_jetsam_broadcast_lock;
extern uint32_t memorystatus_jetsam_fg_band_waiters;
extern uint32_t memorystatus_jetsam_bg_band_waiters;

void vm_pressure_response(void);
extern void consider_vm_pressure_events(void);

#define MEMORYSTATUS_SUSPENDED_THRESHOLD 4

SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
sched_cond_atomic_t vm_pageout_gc_cond;
#if CONFIG_VPS_DYNAMIC_PRIO
TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
#else
const bool vps_dynamic_priority_enabled = false;
#endif
boolean_t vps_yield_for_pgqlockwaiters = TRUE;

#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
#if !XNU_TARGET_OS_OSX
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
#endif /* !XNU_TARGET_OS_OSX */
#endif

#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
#define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
#endif

#ifndef VM_PAGE_LAUNDRY_MAX
#define VM_PAGE_LAUNDRY_MAX 128UL       /* maximum pageouts on a given pageout queue */
#endif /* VM_PAGE_LAUNDRY_MAX */

#ifndef VM_PAGEOUT_BURST_WAIT
#define VM_PAGEOUT_BURST_WAIT 1         /* milliseconds */
#endif /* VM_PAGEOUT_BURST_WAIT */

#ifndef VM_PAGEOUT_EMPTY_WAIT
#define VM_PAGEOUT_EMPTY_WAIT 50        /* milliseconds */
#endif /* VM_PAGEOUT_EMPTY_WAIT */

#ifndef VM_PAGEOUT_DEADLOCK_WAIT
#define VM_PAGEOUT_DEADLOCK_WAIT 100    /* milliseconds */
#endif /* VM_PAGEOUT_DEADLOCK_WAIT */

#ifndef VM_PAGEOUT_IDLE_WAIT
#define VM_PAGEOUT_IDLE_WAIT 10         /* milliseconds */
#endif /* VM_PAGEOUT_IDLE_WAIT */

#ifndef VM_PAGEOUT_SWAP_WAIT
#define VM_PAGEOUT_SWAP_WAIT 10         /* milliseconds */
#endif /* VM_PAGEOUT_SWAP_WAIT */

/*
 * vm_page_max_speculative_age_q should be less than or equal to
 * VM_PAGE_RESERVED_SPECULATIVE_AGE_Q, which is the number of allocated
 * vm_page_queue_speculative entries.
 */

TUNABLE_DEV_WRITEABLE(unsigned int, vm_page_max_speculative_age_q, "vm_page_max_speculative_age_q", VM_PAGE_DEFAULT_MAX_SPECULATIVE_AGE_Q);
#ifndef VM_PAGE_SPECULATIVE_TARGET
#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
#endif /* VM_PAGE_SPECULATIVE_TARGET */
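
/*
 * Worked example (illustrative only): if
 * vm_pageout_state.vm_page_speculative_percentage were set to 3, the
 * divisor is computed with integer math as 100 / 3 == 33, so
 * VM_PAGE_SPECULATIVE_TARGET(total) == (total) / 33, i.e. roughly
 * 3.03% of "total" rather than exactly 3%.
 */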


/*
 * To obtain a reasonable LRU approximation, the inactive queue
 * needs to be large enough to give pages on it a chance to be
 * referenced a second time.  This macro defines the fraction
 * of active+inactive pages that should be inactive.
 * The pageout daemon uses it to update vm_page_inactive_target.
 *
 * If vm_page_free_count falls below vm_page_free_target and
 * vm_page_inactive_count is below vm_page_inactive_target,
 * then the pageout daemon starts running.
 */

#ifndef VM_PAGE_INACTIVE_TARGET
#define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
#endif /* VM_PAGE_INACTIVE_TARGET */

/*
 * Once the pageout daemon starts running, it keeps going
 * until vm_page_free_count meets or exceeds vm_page_free_target.
 */

#ifndef VM_PAGE_FREE_TARGET
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
#endif /* !XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_FREE_TARGET */
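
/*
 * Worked example (illustrative only): with 800,000 pages passed in,
 * the macOS target is 15 + 800000 / 80 == 10,015 pages (about 39 MiB
 * with 4 KiB pages), while the divisor of 100 used on other targets
 * yields 15 + 800000 / 100 == 8,015 pages.
 */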


/*
 * The pageout daemon always starts running once vm_page_free_count
 * falls below vm_page_free_min.
 */

#ifndef VM_PAGE_FREE_MIN
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
#endif /* !XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_FREE_MIN */

#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_RESERVED_LIMIT     100
#define VM_PAGE_FREE_MIN_LIMIT          1500
#define VM_PAGE_FREE_TARGET_LIMIT       2000
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_RESERVED_LIMIT     1700
#define VM_PAGE_FREE_MIN_LIMIT          3500
#define VM_PAGE_FREE_TARGET_LIMIT       4000
#endif /* !XNU_TARGET_OS_OSX */

/*
 * When vm_page_free_count falls below vm_page_free_reserved,
 * only vm-privileged threads can allocate pages.  vm-privilege
 * allows the pageout daemon and default pager (and any other
 * associated threads needed for default pageout) to continue
 * operation by dipping into the reserved pool of pages.
 */

#ifndef VM_PAGE_FREE_RESERVED
#define VM_PAGE_FREE_RESERVED(n)        \
	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
#endif /* VM_PAGE_FREE_RESERVED */
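
/*
 * Worked example (illustrative only): with VM_PAGE_LAUNDRY_MAX at its
 * default of 128, VM_PAGE_FREE_RESERVED(n) == 6 * 128 + n == 768 + n
 * pages, i.e. a floor of 3 MiB (with 4 KiB pages) plus whatever extra
 * headroom "n" the caller requests.
 */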

/*
 * When we dequeue pages from the inactive list, they are
 * reactivated (i.e., put back on the active queue) if referenced.
 * However, it is possible to starve the free list if other
 * processors are referencing pages faster than we can turn off
 * the referenced bit.  So we limit the number of reactivations
 * we will make per call of vm_pageout_scan().
 */
#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000

#ifndef VM_PAGE_REACTIVATE_LIMIT
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20, VM_PAGE_REACTIVATE_LIMIT_MAX))
#endif /* !XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_REACTIVATE_LIMIT */
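
/*
 * Worked example (illustrative only): with 1,000,000 "avail" pages,
 * the macOS limit is MAX(1000000 / 20, 20000) == 50,000 reactivations
 * per vm_pageout_scan() call, while the non-macOS limit is
 * VM_PAGE_INACTIVE_TARGET(1000000) / 2 == 250,000.
 */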
#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000

int vm_pageout_protect_realtime = true;

extern boolean_t hibernate_cleaning_in_progress;

struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
struct pgo_iothread_state pgo_iothread_external_state;

#if VM_PRESSURE_EVENTS
void vm_pressure_thread(void);

boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);

boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
#endif

static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);

extern void vm_pageout_continue(void);
extern void vm_pageout_scan(void);

boolean_t vm_pageout_running = FALSE;

uint32_t vm_page_upl_tainted = 0;
uint32_t vm_page_iopl_tainted = 0;

#if XNU_TARGET_OS_OSX
static boolean_t vm_pageout_waiter = FALSE;
#endif /* XNU_TARGET_OS_OSX */


#if DEVELOPMENT || DEBUG
struct vm_pageout_debug vm_pageout_debug;
#endif
struct vm_pageout_vminfo vm_pageout_vminfo;
struct vm_pageout_state vm_pageout_state;
struct vm_config vm_config;

struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
#if DEVELOPMENT || DEBUG
struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
#endif /* DEVELOPMENT || DEBUG */

int vm_upl_wait_for_pages = 0;
vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;

boolean_t (*volatile consider_buffer_cache_collect)(int) = NULL;

int vm_debug_events = 0;

LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");

#if CONFIG_MEMORYSTATUS
uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
#endif

#if __AMP__


/*
 * Bind compressor threads to e-cores unless there are multiple non-e clusters
 */
#if (MAX_CPU_CLUSTERS > 2)
#define VM_COMPRESSOR_EBOUND_DEFAULT false
#elif defined(XNU_TARGET_OS_XR)
#define VM_COMPRESSOR_EBOUND_DEFAULT false
#else
#define VM_COMPRESSOR_EBOUND_DEFAULT true
#endif

TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
int vm_pgo_pbound = 0;
extern void thread_soft_bind_cluster_type(thread_t, char);

#endif /* __AMP__ */


/*
 *	Routine:	vm_pageout_object_terminate
 *	Purpose:
 *		Destroy the pageout_object, and perform all of the
 *		required cleanup actions.
 *
 *	In/Out conditions:
 *		The object must be locked, and will be returned locked.
 */
void
vm_pageout_object_terminate(
	vm_object_t object)
{
	vm_object_t shadow_object;

	/*
	 * Deal with the deallocation (last reference) of a pageout object
	 * (used for cleaning-in-place) by dropping the paging references/
	 * freeing pages in the original object.
	 */

	assert(object->pageout);
	shadow_object = object->shadow;
	vm_object_lock(shadow_object);

	while (!vm_page_queue_empty(&object->memq)) {
		vm_page_t p, m;
		vm_object_offset_t offset;

		p = (vm_page_t) vm_page_queue_first(&object->memq);

		assert(vm_page_is_private(p));
		assert(p->vmp_free_when_done);
		p->vmp_free_when_done = FALSE;
		assert(!p->vmp_cleaning);
		assert(!p->vmp_laundry);

		offset = p->vmp_offset;
		VM_PAGE_FREE(p);
		p = VM_PAGE_NULL;

		m = vm_page_lookup(shadow_object,
		    offset + object->vo_shadow_offset);

		if (m == VM_PAGE_NULL) {
			continue;
		}

		assert((m->vmp_dirty) || (m->vmp_precious) ||
		    (m->vmp_busy && m->vmp_cleaning));

		/*
		 * Handle the trusted pager throttle.
		 * Also decrement the burst throttle (if external).
		 */
		vm_page_lock_queues();
		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
			vm_pageout_throttle_up(m);
		}

		/*
		 * Handle the "target" page(s).  These pages are to be freed if
		 * successfully cleaned.  Target pages are always busy, and are
		 * wired exactly once.  The initial target pages are not mapped,
		 * (so cannot be referenced or modified) but converted target
		 * pages may have been modified between the selection as an
		 * adjacent page and conversion to a target.
		 */
		if (m->vmp_free_when_done) {
			assert(m->vmp_busy);
			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
			assert(m->vmp_wire_count == 1);
			m->vmp_cleaning = FALSE;
			m->vmp_free_when_done = FALSE;
			/*
			 * Revoke all access to the page.  Since the object is
			 * locked, and the page is busy, this prevents the page
			 * from being dirtied after the pmap_disconnect() call
			 * returns.
			 *
			 * Since the page is left "dirty" but "not modified", we
			 * can detect whether the page was redirtied during
			 * pageout by checking the modify state.
			 */
			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			} else {
				m->vmp_dirty = FALSE;
			}

			if (m->vmp_dirty) {
				vm_page_unwire(m, TRUE);        /* reactivates */
				counter_inc(&vm_statistics_reactivations);
				vm_page_wakeup_done(object, m);
			} else {
				vm_page_free(m);        /* clears busy, etc. */
			}
			vm_page_unlock_queues();
			continue;
		}
		/*
		 * Handle the "adjacent" pages.  These pages were cleaned in
		 * place, and should be left alone.
		 * If prep_pin_count is nonzero, then someone is using the
		 * page, so make it active.
		 */
		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !vm_page_is_private(m)) {
			if (m->vmp_reference) {
				vm_page_activate(m);
			} else {
				vm_page_deactivate(m);
			}
		}
		if (m->vmp_overwriting) {
			/*
			 * the (COPY_OUT_FROM == FALSE) request_page_list case
			 */
			if (m->vmp_busy) {
				/*
				 * We do not re-set m->vmp_dirty !
				 * The page was busy so no extraneous activity
				 * could have occurred.  COPY_INTO is a read into the
				 * new pages.  CLEAN_IN_PLACE does actually write
				 * out the pages but handling outside of this code
				 * will take care of resetting dirty.  We clear the
				 * modify however for the Programmed I/O case.
				 */
				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

				m->vmp_busy = FALSE;
				m->vmp_absent = FALSE;
			} else {
				/*
				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
				 * Occurs when the original page was wired
				 * at the time of the list request
				 */
				assert(VM_PAGE_WIRED(m));
				vm_page_unwire(m, TRUE);        /* reactivates */
			}
			m->vmp_overwriting = FALSE;
		} else {
			m->vmp_dirty = FALSE;
		}
		m->vmp_cleaning = FALSE;

		/*
		 * Wakeup any thread waiting for the page to be un-cleaning.
		 */
		vm_page_wakeup(object, m);
		vm_page_unlock_queues();
	}
	/*
	 * Account for the paging reference taken in vm_paging_object_allocate.
	 */
	vm_object_activity_end(shadow_object);
	vm_object_unlock(shadow_object);

	assert(os_ref_get_count_raw(&object->ref_count) == 0);
	assert(object->paging_in_progress == 0);
	assert(object->activity_in_progress == 0);
	assert(object->resident_page_count == 0);
	return;
}

/*
 *	Routine:	vm_pageclean_setup
 *
 *	Purpose:	setup a page to be cleaned (made non-dirty), but not
 *			necessarily flushed from the VM page cache.
 *			This is accomplished by cleaning in place.
 *
 *			The page must not be busy, and new_object
 *			must be locked.
 *
 */
static void
vm_pageclean_setup(
	vm_page_t m,
	vm_page_t new_m,
	vm_object_t new_object,
	vm_object_offset_t new_offset)
{
	assert(!m->vmp_busy);
#if 0
	assert(!m->vmp_cleaning);
#endif

	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

	/*
	 * Mark original page as cleaning in place.
	 */
	m->vmp_cleaning = TRUE;
	SET_PAGE_DIRTY(m, FALSE);
	m->vmp_precious = FALSE;

	/*
	 * Convert the fictitious page to a private shadow of
	 * the real page.
	 */
	new_m->vmp_free_when_done = TRUE;

	vm_page_lockspin_queues();
	vm_page_make_private(new_m, VM_PAGE_GET_PHYS_PAGE(m));
	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
	vm_page_unlock_queues();

	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
	assert(!new_m->vmp_wanted);
	new_m->vmp_busy = FALSE;
}

/*
 *	Routine:	vm_pageout_initialize_page
 *	Purpose:
 *		Causes the specified page to be initialized in
 *		the appropriate memory object.  This routine is used to push
 *		pages into a copy-object when they are modified in the
 *		permanent object.
 *
 *		The page is moved to a temporary object and paged out.
 *
 *	In/out conditions:
 *		The page in question must not be on any pageout queues.
 *		The object to which it belongs must be locked.
 *		The page must be busy, but not hold a paging reference.
 *
 *	Implementation:
 *		Move this page to a completely new object.
 */
void
vm_pageout_initialize_page(
	vm_page_t m)
{
	vm_object_t object;
	vm_object_offset_t paging_offset;
	memory_object_t pager;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	object = VM_PAGE_OBJECT(m);

	assert(m->vmp_busy);
	assert(object->internal);

	/*
	 * Verify that we really want to clean this page
	 */
	assert(!m->vmp_absent);
	assert(m->vmp_dirty);

	/*
	 * Create a paging reference to let us play with the object.
	 */
	paging_offset = m->vmp_offset + object->paging_offset;

	if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
		panic("reservation without pageout?"); /* alan */

		VM_PAGE_FREE(m);
		vm_object_unlock(object);

		return;
	}

	/*
	 * If there's no pager, then we can't clean the page.  This should
	 * never happen since this should be a copy object and therefore not
	 * an external object, so the pager should always be there.
	 */

	pager = object->pager;

	if (pager == MEMORY_OBJECT_NULL) {
		panic("missing pager for copy object");

		VM_PAGE_FREE(m);
		return;
	}

	/*
	 * set the page for future call to vm_fault_list_request
	 */
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
	SET_PAGE_DIRTY(m, FALSE);

	/*
	 * keep the object from collapsing or terminating
	 */
	vm_object_paging_begin(object);
	vm_object_unlock(object);

	/*
	 * Write the data to its pager.
	 * Note that the data is passed by naming the new object,
	 * not a virtual address; the pager interface has been
	 * manipulated to use the "internal memory" data type.
	 * [The object reference from its allocation is donated
	 * to the eventual recipient.]
	 */
	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);

	vm_object_lock(object);
	vm_object_paging_end(object);
}


/*
 * vm_pageout_cluster:
 *
 * Given a page, queue it to the appropriate I/O thread,
 * which will page it out and attempt to clean adjacent pages
 * in the same operation.
 *
 * The object and queues must be locked.  We will take a
 * paging reference to prevent deallocation or collapse when we
 * release the object lock back at the call site.  The I/O thread
 * is responsible for consuming this reference.
 *
 * The page must not be on any pageout queue.
 */
#if DEVELOPMENT || DEBUG
vmct_stats_t vmct_stats;

int32_t vmct_active = 0;
uint64_t vm_compressor_epoch_start = 0;
uint64_t vm_compressor_epoch_stop = 0;

typedef enum vmct_state_t {
	VMCT_IDLE,
	VMCT_AWAKENED,
	VMCT_ACTIVE,
} vmct_state_t;
vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
#endif



static void
vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
{
	vm_object_t object = VM_PAGE_OBJECT(m);

	VM_PAGE_CHECK(m);
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	/*
	 * Make sure it's OK to page this out.
	 */
	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
	assert(!m->vmp_cleaning && !m->vmp_laundry);
	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);

	/*
	 * protect the object from collapse or termination
	 */
	vm_object_activity_begin(object);


	/*
	 * pgo_laundry count is tied to the laundry bit
	 */
	m->vmp_laundry = TRUE;
	q->pgo_laundry++;

	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);

	if (object->internal == TRUE) {
		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
		m->vmp_busy = TRUE;
#if DEVELOPMENT || DEBUG
		/*
		 * The benchmark queue will be woken up independently by the benchmark
		 * itself.
		 */
		if (q != &vm_pageout_queue_benchmark) {
#else /* DEVELOPMENT || DEBUG */
		if (true) {
#endif /* DEVELOPMENT || DEBUG */
			/*
			 * Wake up the first compressor thread.  It will wake subsequent
			 * threads if necessary.
			 */
			sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
			    pgo_iothread_internal_state[0].pgo_iothread);
		}
	} else {
		sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
	}
	VM_PAGE_CHECK(m);
}

void
vm_pageout_cluster(vm_page_t m)
{
	struct vm_pageout_queue *q;
	vm_object_t object = VM_PAGE_OBJECT(m);
	if (object->internal) {
		q = &vm_pageout_queue_internal;
	} else {
		q = &vm_pageout_queue_external;
	}
	vm_pageout_cluster_to_queue(m, q);
}


/*
 * A page is back from laundry or we are stealing it back from
 * the laundering state.  See if there are some pages waiting to
 * go to laundry and if we can let some of them go now.
 *
 * Object and page queues must be locked.
 */
void
vm_pageout_throttle_up(
	vm_page_t m)
{
	struct vm_pageout_queue *q;
	vm_object_t m_object;

	m_object = VM_PAGE_OBJECT(m);

	assert(m_object != VM_OBJECT_NULL);
	assert(!is_kernel_object(m_object));

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(m_object);

	if (m_object->internal == TRUE) {
		q = &vm_pageout_queue_internal;
	} else {
		q = &vm_pageout_queue_external;
	}

	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
		m->vmp_q_state = VM_PAGE_NOT_ON_Q;

		VM_PAGE_ZERO_PAGEQ_ENTRY(m);

		vm_object_activity_end(m_object);

		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
	}
	if (m->vmp_laundry == TRUE) {
		m->vmp_laundry = FALSE;
		q->pgo_laundry--;

		if (q->pgo_throttled == TRUE) {
			q->pgo_throttled = FALSE;
			thread_wakeup((event_t) &q->pgo_laundry);
		}
		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
			q->pgo_draining = FALSE;
			thread_wakeup((event_t) (&q->pgo_laundry + 1));
		}
		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
	}
}


static void
vm_pageout_throttle_up_batch(
	struct vm_pageout_queue *q,
	int batch_cnt)
{
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);

	q->pgo_laundry -= batch_cnt;

	if (q->pgo_throttled == TRUE) {
		q->pgo_throttled = FALSE;
		thread_wakeup((event_t) &q->pgo_laundry);
	}
	if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
		q->pgo_draining = FALSE;
		thread_wakeup((event_t) (&q->pgo_laundry + 1));
	}
}



/*
 * VM memory pressure monitoring.
 *
 * vm_pageout_scan() keeps track of the number of pages it considers and
 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
 *
 * compute_memory_pressure() is called from compute_averages() eight times
 * per second and moves "vm_pageout_stat_now" forward, to start accumulating
 * the number of reclaimed pages in a new vm_pageout_stat[] bucket.
 *
 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
 * The caller provides the number of seconds ("nsecs") worth of statistics
 * it wants, up to 30 seconds.
 * It computes the number of pages reclaimed in the past "nsecs" seconds and
 * also returns the number of pages the system still needs to reclaim at this
 * moment in time.
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif
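/*
 * Note on sizing (assuming the eight-per-second cadence described
 * above, which matches units_of_monitor == 8 * nsecs_monitored in
 * mach_vm_pressure_monitor() below): 30 * 8 buckets cover the
 * 30-second maximum monitoring window, plus one extra bucket that is
 * always the partially filled "now" slot.
 */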
struct vm_pageout_stat {
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	unsigned int pages_grabbed;
	unsigned int pages_freed;

	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;

	unsigned int vm_page_realtime_count;
	unsigned int forcereclaimed_sharedcache;
	unsigned int forcereclaimed_realtime;
	unsigned int protected_sharedcache;
	unsigned int protected_realtime;

} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];

unsigned int vm_pageout_stat_now = 0;

#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
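
/*
 * Minimal sketch (compiled out; illustrative only, not part of the
 * pageout path): summing recently freed pages by walking the stat
 * ring backwards from vm_pageout_stat_now with the
 * VM_PAGEOUT_STAT_BEFORE() macro, the same traversal
 * mach_vm_pressure_monitor() performs below.
 */
#if 0
static unsigned int
vm_pageout_stats_recent_freed(unsigned int buckets)
{
	unsigned int i = vm_pageout_stat_now;
	unsigned int total = 0;

	/* clamp to the ring size so we never wrap past "now" */
	if (buckets >= VM_PAGEOUT_STAT_SIZE) {
		buckets = VM_PAGEOUT_STAT_SIZE - 1;
	}
	while (buckets-- != 0) {
		i = VM_PAGEOUT_STAT_BEFORE(i);
		total += vm_pageout_stats[i].freed_speculative +
		    vm_pageout_stats[i].freed_cleaned +
		    vm_pageout_stats[i].freed_internal +
		    vm_pageout_stats[i].freed_external;
	}
	return total;
}
#endif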

#if VM_PAGE_BUCKETS_CHECK
int vm_page_buckets_check_interval = 80; /* in eighths of a second */
#endif /* VM_PAGE_BUCKETS_CHECK */


void
record_memory_pressure(void);
void
record_memory_pressure(void)
{
	unsigned int vm_pageout_next;

#if VM_PAGE_BUCKETS_CHECK
	/* check the consistency of VM page buckets at regular interval */
	static int counter = 0;
	if ((++counter % vm_page_buckets_check_interval) == 0) {
		vm_page_buckets_check();
	}
#endif /* VM_PAGE_BUCKETS_CHECK */

	vm_pageout_state.vm_memory_pressure =
	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;

	commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure);

	/* move "now" forward */
	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);

	bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));

	vm_pageout_stat_now = vm_pageout_next;
}


/*
 * IMPORTANT
 * mach_vm_ctl_page_free_wanted() is called indirectly, via
 * mach_vm_pressure_monitor(), when taking a stackshot.  Therefore,
 * it must be safe in the restricted stackshot context.  Locks and/or
 * blocking are not allowable.
 */
unsigned int
mach_vm_ctl_page_free_wanted(void)
{
	unsigned int page_free_target, page_free_count, page_free_wanted;

	page_free_target = vm_page_free_target;
	page_free_count = vm_page_free_count;
	if (page_free_target > page_free_count) {
		page_free_wanted = page_free_target - page_free_count;
	} else {
		page_free_wanted = 0;
	}

	return page_free_wanted;
}


/*
 * IMPORTANT:
 * mach_vm_pressure_monitor() is called when taking a stackshot, with
 * wait_for_pressure FALSE, so that code path must remain safe in the
 * restricted stackshot context.  No blocking or locks are allowable
 * on that code path.
 */

kern_return_t
mach_vm_pressure_monitor(
	boolean_t wait_for_pressure,
	unsigned int nsecs_monitored,
	unsigned int *pages_reclaimed_p,
	unsigned int *pages_wanted_p)
{
	wait_result_t wr;
	unsigned int vm_pageout_then, vm_pageout_now;
	unsigned int pages_reclaimed;
	unsigned int units_of_monitor;

	units_of_monitor = 8 * nsecs_monitored;
	/*
	 * We don't take the vm_page_queue_lock here because we don't want
	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
	 * thread when it's trying to reclaim memory.  We don't need fully
	 * accurate monitoring anyway...
	 */

	if (wait_for_pressure) {
		/* wait until there's memory pressure */
		while (vm_page_free_count >= vm_page_free_target) {
			wr = assert_wait((event_t) &vm_page_free_wanted,
			    THREAD_INTERRUPTIBLE);
			if (wr == THREAD_WAITING) {
				wr = thread_block(THREAD_CONTINUE_NULL);
			}
			if (wr == THREAD_INTERRUPTED) {
				return KERN_ABORTED;
			}
			if (wr == THREAD_AWAKENED) {
				/*
				 * The memory pressure might have already
				 * been relieved but let's not block again
				 * and let's report that there was memory
				 * pressure at some point.
				 */
				break;
			}
		}
	}

	/* provide the number of pages the system wants to reclaim */
	if (pages_wanted_p != NULL) {
		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
	}

	if (pages_reclaimed_p == NULL) {
		return KERN_SUCCESS;
	}

	/* provide number of pages reclaimed in the last "nsecs_monitored" */
	vm_pageout_now = vm_pageout_stat_now;
	pages_reclaimed = 0;
	for (vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
	    vm_pageout_then != vm_pageout_now &&
	    units_of_monitor-- != 0;
	    vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
	}
	*pages_reclaimed_p = pages_reclaimed;

	return KERN_SUCCESS;
}
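
/*
 * Usage sketch (compiled out; illustrative only, and the function name
 * is hypothetical): a non-blocking poll of the monitor, the way a
 * stackshot-safe caller must use it, since wait_for_pressure has to be
 * FALSE on that path.
 */
#if 0
static void
vm_pressure_monitor_poll_example(void)
{
	unsigned int pages_reclaimed = 0;
	unsigned int pages_wanted = 0;

	if (mach_vm_pressure_monitor(FALSE /* don't block */,
	    10 /* seconds of history */,
	    &pages_reclaimed, &pages_wanted) == KERN_SUCCESS) {
		printf("reclaimed %u pages in 10s, %u still wanted\n",
		    pages_reclaimed, pages_wanted);
	}
}
#endif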



#if DEVELOPMENT || DEBUG

static void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);

/*
 * condition variable used to make sure there is
 * only a single sweep going on at a time
 */
bool vm_pageout_disconnect_all_pages_active = false;

void
vm_pageout_disconnect_all_pages()
{
	vm_page_lock_queues();

	if (vm_pageout_disconnect_all_pages_active) {
		vm_page_unlock_queues();
		return;
	}
	vm_pageout_disconnect_all_pages_active = true;

	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled,
	    vm_page_throttled_count);
	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous,
	    vm_page_anonymous_count);
	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive,
	    (vm_page_inactive_count - vm_page_anonymous_count));
	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active,
	    vm_page_active_count);
#ifdef CONFIG_SECLUDED_MEMORY
	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded,
	    vm_page_secluded_count);
#endif /* CONFIG_SECLUDED_MEMORY */
	vm_page_unlock_queues();

	vm_pageout_disconnect_all_pages_active = false;
}

/* NB: assumes the page_queues lock is held on entry, returns with page queue lock held */
void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t m;
	vm_object_t t_object = NULL;
	vm_object_t l_object = NULL;
	vm_object_t m_object = NULL;
	int delayed_unlock = 0;
	int try_failed_count = 0;
	int disconnected_count = 0;
	int paused_count = 0;
	int object_locked_count = 0;

	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
	    DBG_FUNC_START),
	    q, qcount);

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		if (m_object == VM_OBJECT_NULL) {
			/*
			 * Bumped into a free page.  This should only happen on the
			 * secluded queue
			 */
#if CONFIG_SECLUDED_MEMORY
			assert(q == &vm_page_queue_secluded);
#endif /* CONFIG_SECLUDED_MEMORY */
			goto reenter_pg_on_q;
		}

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				paused_count++;

				t_object = m_object;
				continue;
			}
			object_locked_count++;

			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry ||
		    m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) ||
		    m->vmp_free_when_done) {
			/*
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

			disconnected_count++;
		}
reenter_pg_on_q:
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);

		qcount--;
		try_failed_count = 0;

		if (delayed_unlock++ > 128) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}

	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
	    DBG_FUNC_END),
	    q, disconnected_count, object_locked_count, paused_count);
}

extern const char *proc_best_name(struct proc *proc);

int
vm_toggle_task_selfdonate_pages(task_t task)
{
	int state = 0;
	if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
		printf("VM Donation mode is OFF on the system\n");
		return state;
	}
	if (task != kernel_task) {
		task_lock(task);
		if (!task->donates_own_pages) {
			printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
			task->donates_own_pages = true;
			state = 1;
		} else if (task->donates_own_pages) {
			printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
			task->donates_own_pages = false;
			state = 0;
		}
		task_unlock(task);
	}
	return state;
}
#endif /* DEVELOPMENT || DEBUG */

void
vm_task_set_selfdonate_pages(task_t task, bool donate)
{
	assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
	assert(task != kernel_task);

	task_lock(task);
	task->donates_own_pages = donate;
	task_unlock(task);
}



static size_t
vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);

/*
 * condition variable used to make sure there is
 * only a single sweep going on at a time
 */
boolean_t vm_pageout_anonymous_pages_active = FALSE;


kern_return_t
vm_pageout_anonymous_pages()
{
	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
		size_t throttled_pages_moved, anonymous_pages_moved, active_pages_moved;
		vm_page_lock_queues();

		if (vm_pageout_anonymous_pages_active == TRUE) {
			vm_page_unlock_queues();
			return KERN_RESOURCE_SHORTAGE;
		}
		vm_pageout_anonymous_pages_active = TRUE;
		vm_page_unlock_queues();

		throttled_pages_moved = vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
		anonymous_pages_moved = vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
		active_pages_moved = vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);

		os_log(OS_LOG_DEFAULT,
		    "%s: throttled pages moved: %zu, anonymous pages moved: %zu, active pages moved: %zu",
		    __func__, throttled_pages_moved, anonymous_pages_moved, active_pages_moved);

		if (VM_CONFIG_SWAP_IS_PRESENT) {
			vm_consider_swapping();
		}

		vm_page_lock_queues();
		vm_pageout_anonymous_pages_active = FALSE;
		vm_page_unlock_queues();
		return KERN_SUCCESS;
	} else {
		return KERN_NOT_SUPPORTED;
	}
}


size_t
vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
{
	vm_page_t m;
	vm_object_t t_object = NULL;
	vm_object_t l_object = NULL;
	vm_object_t m_object = NULL;
	int delayed_unlock = 0;
	int try_failed_count = 0;
	int refmod_state;
	int pmap_options;
	struct vm_pageout_queue *iq;
	ppnum_t phys_page;
	size_t pages_moved = 0;


	iq = &vm_pageout_queue_internal;

	vm_page_lock_queues();

#if DEVELOPMENT || DEBUG
	if (perf_test) {
		iq = &vm_pageout_queue_benchmark;
		// ensure the benchmark queue isn't throttled
		iq->pgo_maxlaundry = (unsigned int) qcount;
	}
#endif /* DEVELOPMENT || DEBUG */

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		if (VM_PAGE_Q_THROTTLED(iq)) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			iq->pgo_draining = TRUE;

			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
			vm_page_unlock_queues();

			thread_block(THREAD_CONTINUE_NULL);

			vm_page_lock_queues();
			delayed_unlock = 0;
			continue;
		}
		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			if (!m_object->internal) {
				goto reenter_pg_on_q;
			}

			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				t_object = m_object;
				continue;
			}
			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(m);

		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(phys_page);

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}
		if (m->vmp_reference == TRUE) {
			m->vmp_reference = FALSE;
			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			if (m->vmp_dirty || m->vmp_precious) {
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		if (!m->vmp_dirty && !m->vmp_precious) {
			vm_page_unlock_queues();
			VM_PAGE_FREE(m);
			vm_page_lock_queues();
			delayed_unlock = 0;

			goto next_pg;
		}
		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
			if (!m_object->pager_initialized) {
				vm_page_unlock_queues();

				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);

				if (!m_object->pager_initialized) {
					vm_object_compressor_pager_create(m_object);
				}

				vm_page_lock_queues();
				delayed_unlock = 0;
			}
			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
				/*
				 * We dropped the page queues lock above, so
				 * "m" might no longer be on this queue...
				 */
				if (m != (vm_page_t) vm_page_queue_first(q)) {
					continue;
				}
				goto reenter_pg_on_q;
			}
			/*
			 * vm_object_compressor_pager_create will drop the object lock
			 * which means 'm' may no longer be valid to use
			 */
			continue;
		}

		if (!perf_test) {
			/*
			 * we've already factored out pages in the laundry which
			 * means this page can't be on the pageout queue so it's
			 * safe to do the vm_page_queues_remove
			 */
			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			vm_page_queues_remove(m, TRUE);
			if (donate) {
				/*
				 * The compressor needs to see this bit to know
				 * where this page needs to land.  Also if stolen,
				 * this bit helps put the page back in the right
				 * special queue where it belongs.
				 */
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
			}
		} else {
			vm_page_queue_remove(q, m, vmp_pageq);
		}

		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		vm_pageout_cluster_to_queue(m, iq);

		pages_moved++;
		goto next_pg;

reenter_pg_on_q:
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);
next_pg:
		qcount--;
		try_failed_count = 0;

		if (delayed_unlock++ > 128) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();
	return pages_moved;
}



/*
 * function in BSD to apply I/O throttle to the pageout thread
 */
extern void vm_pageout_io_throttle(void);

#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)            \
	MACRO_BEGIN                                             \
	/*                                                      \
	 * If a "reusable" page somehow made it back into       \
	 * the active queue, it's been re-used and is not       \
	 * quite re-usable.                                     \
	 * If the VM object was "all_reusable", consider it     \
	 * as "all re-used" instead of converting it to         \
	 * "partially re-used", which could be expensive.       \
	 */                                                     \
	assert(VM_PAGE_OBJECT((m)) == (obj));                   \
	if ((m)->vmp_reusable ||                                \
	    (obj)->all_reusable) {                              \
	        vm_object_reuse_pages((obj),                    \
	            (m)->vmp_offset,                            \
	            (m)->vmp_offset + PAGE_SIZE_64,             \
	            FALSE);                                     \
	}                                                       \
	MACRO_END


#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT         64
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024

#define FCS_IDLE                0
#define FCS_DELAYED             1
#define FCS_DEADLOCK_DETECTED   2

struct flow_control {
	int state;
	mach_timespec_t ts;
};


uint64_t vm_pageout_rejected_bq_internal = 0;
uint64_t vm_pageout_rejected_bq_external = 0;
uint64_t vm_pageout_skipped_bq_internal = 0;
uint64_t vm_pageout_skipped_bq_external = 0;

#define ANONS_GRABBED_LIMIT     2


#if 0
static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
#endif
static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);

#define VM_PAGEOUT_PB_NO_ACTION                         0
#define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
#define VM_PAGEOUT_PB_THREAD_YIELD                      2


#if 0
static void
vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
{
	if (*local_freeq) {
		vm_page_unlock_queues();

		VM_DEBUG_CONSTANT_EVENT(
			vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_START,
			vm_page_free_count, 0, 0, 1);

		vm_page_free_list(*local_freeq, TRUE);

		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_END,
		    vm_page_free_count, *local_freed, 0, 1);

		*local_freeq = NULL;
		*local_freed = 0;

		vm_page_lock_queues();
	} else {
		lck_mtx_yield(&vm_page_queue_lock);
	}
	*delayed_unlock = 1;
}
#endif


static void
vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
    vm_page_t *local_freeq, int *local_freed, int action)
{
	vm_page_unlock_queues();

	if (*object != NULL) {
		vm_object_unlock(*object);
		*object = NULL;
	}
	if (*local_freeq) {
		vm_page_free_list(*local_freeq, TRUE);

		*local_freeq = NULL;
		*local_freed = 0;
	}
	*delayed_unlock = 1;

	switch (action) {
	case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
		vm_consider_waking_compactor_swapper();
		break;
	case VM_PAGEOUT_PB_THREAD_YIELD:
		thread_yield_internal(1);
		break;
	case VM_PAGEOUT_PB_NO_ACTION:
	default:
		break;
	}
	vm_page_lock_queues();
}


static struct vm_pageout_vminfo last;

uint64_t last_vm_page_pages_grabbed = 0;

extern uint32_t c_segment_pages_compressed;

extern uint64_t shared_region_pager_reclaimed;
extern struct memory_object_pager_ops shared_region_pager_ops;

void
update_vm_info(void)
{
	unsigned long tmp;
	uint64_t tmp64;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;

	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
	last.vm_pageout_considered_page = tmp;

	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
	last.vm_pageout_compressions = tmp64;

	tmp = vm_pageout_vminfo.vm_compressor_failed;
	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
	last.vm_compressor_failed = tmp;

	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
	last.vm_compressor_pages_grabbed = tmp64;

	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
	last.vm_phantom_cache_found_ghost = tmp;

	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
	last.vm_phantom_cache_added_ghost = tmp;

	tmp64 = counter_load(&vm_page_grab_count);
	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
	last_vm_page_pages_grabbed = tmp64;

	tmp = vm_pageout_vminfo.vm_page_pages_freed;
	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
	last.vm_page_pages_freed = tmp;


	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
		last.vm_pageout_pages_evicted = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
		last.vm_pageout_pages_purged = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
		last.vm_pageout_freed_speculative = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
		last.vm_pageout_freed_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
		last.vm_pageout_inactive_referenced = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
		last.vm_pageout_scan_inactive_throttled_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
		last.vm_pageout_inactive_dirty_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
		last.vm_pageout_freed_cleaned = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
		last.vm_pageout_inactive_nolock = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
		last.vm_pageout_scan_inactive_throttled_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
		last.vm_pageout_skipped_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
		last.vm_pageout_skipped_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
		last.vm_pageout_reactivation_limit_exceeded = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
		last.vm_pageout_inactive_force_reclaim = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
		last.vm_pageout_freed_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
		last.vm_pageout_considered_bq_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
		last.vm_pageout_considered_bq_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
		last.vm_pageout_filecache_min_reactivated = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
		last.vm_pageout_inactive_dirty_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
		last.vm_pageout_forcereclaimed_sharedcache = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
		last.vm_pageout_forcereclaimed_realtime = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
		vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
		last.vm_pageout_protected_sharedcache = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
		vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
		last.vm_pageout_protected_realtime = tmp;
	}

	KDBG((VMDBG_CODE(DBG_VM_INFO1)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count);

	KDBG((VMDBG_CODE(DBG_VM_INFO2)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count);

	KDBG((VMDBG_CODE(DBG_VM_INFO3)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count);
1849
1850 if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1851 vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1852 vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1853 KDBG((VMDBG_CODE(DBG_VM_INFO4)) | DBG_FUNC_NONE,
1854 vm_pageout_stats[vm_pageout_stat_now].considered,
1855 vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1856 vm_pageout_stats[vm_pageout_stat_now].freed_external,
1857 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced);
1858
1859 KDBG((VMDBG_CODE(DBG_VM_INFO5)) | DBG_FUNC_NONE,
1860 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1861 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1862 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1863 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock);
1864
1865 KDBG((VMDBG_CODE(DBG_VM_INFO6)) | DBG_FUNC_NONE,
1866 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1867 vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1868 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1869 vm_pageout_stats[vm_pageout_stat_now].skipped_external);
1870
1871 KDBG((VMDBG_CODE(DBG_VM_INFO7)) | DBG_FUNC_NONE,
1872 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1873 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1874 vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1875 vm_pageout_stats[vm_pageout_stat_now].freed_internal);
1876
1877 KDBG((VMDBG_CODE(DBG_VM_INFO8)) | DBG_FUNC_NONE,
1878 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1879 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1880 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1881 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal);
1882
1883 KDBG((VMDBG_CODE(DBG_VM_INFO10)) | DBG_FUNC_NONE,
1884 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1885 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1886 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1887 vm_pageout_stats[vm_pageout_stat_now].protected_realtime);
1888 }
1889 KDBG((VMDBG_CODE(DBG_VM_INFO9)) | DBG_FUNC_NONE,
1890 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1891 vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1892 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1893 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added);
1894
1895 record_memory_pressure();
1896 }
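
/*
 * Illustrative sketch (comment only, not compiled): every per-sample field
 * recorded above follows the same delta pattern against a monotonically
 * increasing lifetime counter. With a hypothetical counter 'lifetime_total'
 * and shadow 'last_total':
 *
 *	tmp = lifetime_total;                                  // snapshot the lifetime count
 *	stats[now].field = (unsigned int)(tmp - last_total);   // events in this sample window
 *	last_total = tmp;                                      // roll the shadow forward
 *
 * Unsigned subtraction keeps each per-sample delta correct even once the
 * wider lifetime counter no longer fits in 32 bits.
 */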
1897
1898 extern boolean_t hibernation_vmqueues_inspection;
1899
1900 /*
1901 * Return values for functions called by vm_pageout_scan
1902 * that control its flow.
1903 *
1904 * PROCEED -- vm_pageout_scan will keep making forward progress.
1905 * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1906 * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1907 */
1908
1909 #define VM_PAGEOUT_SCAN_PROCEED (0)
1910 #define VM_PAGEOUT_SCAN_DONE_RETURN (1)
1911 #define VM_PAGEOUT_SCAN_NEXT_ITERATION (2)
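
/*
 * A minimal sketch of how these values are consumed inside
 * vm_pageout_scan()'s main loop (mirrors the dispatch further below;
 * 'vps_some_helper' is a hypothetical stand-in for any vps_* helper):
 *
 *	retval = vps_some_helper(...);
 *	if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
 *		continue;               // restart the main 'for' loop
 *	} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
 *		goto return_from_scan;  // page demand satisfied
 *	}
 *	// VM_PAGEOUT_SCAN_PROCEED: fall through and keep scanning
 */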
1912
1913 /*
1914 * This function is called only from vm_pageout_scan and
1915 * it moves overflow secluded pages (one-at-a-time) to the
1916 * batched 'local' free Q or active Q.
1917 */
1918 static void
1919 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1920 {
1921 #if CONFIG_SECLUDED_MEMORY
1922 /*
1923 * Deal with secluded_q overflow.
1924 */
1925 if (vm_page_secluded_count > vm_page_secluded_target) {
1926 vm_page_t secluded_page;
1927
1928 /*
1929 * SECLUDED_AGING_BEFORE_ACTIVE:
1930 * Excess secluded pages go to the active queue and
1931 * will later go to the inactive queue.
1932 */
1933 assert((vm_page_secluded_count_free +
1934 vm_page_secluded_count_inuse) ==
1935 vm_page_secluded_count);
1936 secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1937 assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1938
1939 vm_page_queues_remove(secluded_page, FALSE);
1940 assert(!vm_page_is_fictitious(secluded_page));
1941 assert(!VM_PAGE_WIRED(secluded_page));
1942
1943 if (secluded_page->vmp_object == 0) {
1944 /* transfer to free queue */
1945 assert(secluded_page->vmp_busy);
1946 secluded_page->vmp_snext = *local_freeq;
1947 *local_freeq = secluded_page;
1948 *local_freed += 1;
1949 } else {
1950 /* transfer to head of active queue */
1951 vm_page_enqueue_active(secluded_page, FALSE);
1952 secluded_page = VM_PAGE_NULL;
1953 }
1954 }
1955 #else /* CONFIG_SECLUDED_MEMORY */
1956
1957 #pragma unused(local_freeq)
1958 #pragma unused(local_freed)
1959
1960 return;
1961
1962 #endif /* CONFIG_SECLUDED_MEMORY */
1963 }
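
/*
 * Note on the batched 'local' free queue used above (descriptive, derived
 * from the code): pages destined for the free list are pushed LIFO through
 * vmp_snext and handed over in one batch later, e.g.
 *
 *	page->vmp_snext = *local_freeq;   // link ahead of the current head
 *	*local_freeq = page;              // page becomes the new head
 *	*local_freed += 1;                // pages awaiting the batched free
 *
 * vm_pageout_prepare_to_block() is one consumer that drains this batch.
 */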
1964
1965
1966 /*
1967 * This function is called only from vm_pageout_scan and
1968 * it initializes the loop targets for vm_pageout_scan().
1969 */
1970 static void
1971 vps_init_page_targets(void)
1972 {
1973 /*
1974 * LD TODO: Other page targets should be calculated here too.
1975 */
1976 vm_page_anonymous_min = vm_page_inactive_target / 20;
1977
1978 if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1979 vm_pageout_state.vm_page_speculative_percentage = 50;
1980 } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1981 vm_pageout_state.vm_page_speculative_percentage = 1;
1982 }
1983
1984 vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1985 vm_page_inactive_count);
1986 }
1987
/*
 * This function is called only from vm_pageout_scan and
 * it purges a single VM object at a time and will either
 * make vm_pageout_scan() restart the loop or keep moving forward.
 */
1993 static int
1994 vps_purge_object()
1995 {
1996 int force_purge;
1997
1998 assert(available_for_purge >= 0);
1999 force_purge = 0; /* no force-purging */
2000
2001 #if VM_PRESSURE_EVENTS
2002 vm_pressure_level_t pressure_level;
2003
2004 pressure_level = memorystatus_vm_pressure_level;
2005
2006 if (pressure_level > kVMPressureNormal) {
2007 if (pressure_level >= kVMPressureCritical) {
2008 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
2009 } else if (pressure_level >= kVMPressureUrgent) {
2010 force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
2011 } else if (pressure_level >= kVMPressureWarning) {
2012 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
2013 }
2014 }
2015 #endif /* VM_PRESSURE_EVENTS */
2016
2017 if (available_for_purge || force_purge) {
2018 memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
2019
2020 VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
2021 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
2022 VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
2023 VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
2024 memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2025
2026 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2027 }
2028 VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2029 memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2030 }
2031
2032 return VM_PAGEOUT_SCAN_PROCEED;
2033 }
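
/*
 * Summary of the force-purge policy above (descriptive): the purge budget
 * scales with the memorystatus pressure level --
 *
 *	kVMPressureWarning  -> memorystatus_purge_on_warning
 *	kVMPressureUrgent   -> memorystatus_purge_on_urgent
 *	kVMPressureCritical -> memorystatus_purge_on_critical
 *
 * with higher levels taking precedence; absent pressure (or without
 * VM_PRESSURE_EVENTS), only objects already counted by available_for_purge
 * are eligible.
 */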
2034
2035 /*
2036 * This function is called only from vm_pageout_scan and
2037 * it will try to age the next speculative Q if the oldest
2038 * one is empty.
2039 */
2040 static int
2041 vps_age_speculative_queue(boolean_t force_speculative_aging)
2042 {
2043 #define DELAY_SPECULATIVE_AGE 1000
2044
2045 /*
2046 * try to pull pages from the aging bins...
2047 * see vm_page_internal.h for an explanation of how
2048 * this mechanism works
2049 */
2050 boolean_t can_steal = FALSE;
2051 int num_scanned_queues;
	static int delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop.*/
2053 mach_timespec_t ts;
2054 struct vm_speculative_age_q *aq;
2055 struct vm_speculative_age_q *sq;
2056
2057 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2058
2059 aq = &vm_page_queue_speculative[speculative_steal_index];
2060
2061 num_scanned_queues = 0;
2062 while (vm_page_queue_empty(&aq->age_q) &&
2063 num_scanned_queues++ != vm_page_max_speculative_age_q) {
2064 speculative_steal_index++;
2065
2066 if (speculative_steal_index > vm_page_max_speculative_age_q) {
2067 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2068 }
2069
2070 aq = &vm_page_queue_speculative[speculative_steal_index];
2071 }
2072
2073 if (num_scanned_queues == vm_page_max_speculative_age_q + 1) {
2074 /*
2075 * XXX We've scanned all the speculative
2076 * queues but still haven't found one
2077 * that is not empty, even though
2078 * vm_page_speculative_count is not 0.
2079 */
2080 if (!vm_page_queue_empty(&sq->age_q)) {
2081 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2082 }
2083 #if DEVELOPMENT || DEBUG
2084 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2085 #endif
2086 /* readjust... */
2087 vm_page_speculative_count = 0;
2088 /* ... and continue */
2089 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2090 }
2091
2092 if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
2093 can_steal = TRUE;
2094 } else {
2095 if (!delay_speculative_age) {
2096 mach_timespec_t ts_fully_aged;
2097
2098 ts_fully_aged.tv_sec = (vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
2099 ts_fully_aged.tv_nsec = ((vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
2100 * 1000 * NSEC_PER_USEC;
2101
2102 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2103
2104 clock_sec_t sec;
2105 clock_nsec_t nsec;
2106 clock_get_system_nanotime(&sec, &nsec);
2107 ts.tv_sec = (unsigned int) sec;
2108 ts.tv_nsec = nsec;
2109
2110 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
2111 can_steal = TRUE;
2112 } else {
2113 delay_speculative_age++;
2114 }
2115 } else {
2116 delay_speculative_age++;
2117 if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
2118 delay_speculative_age = 0;
2119 }
2120 }
2121 }
2122 if (can_steal == TRUE) {
2123 vm_page_speculate_ageit(aq);
2124 }
2125
2126 return VM_PAGEOUT_SCAN_PROCEED;
2127 }
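
/*
 * Note on DELAY_SPECULATIVE_AGE (an observation of the logic above, not a
 * separate contract): once the oldest non-empty bin is found not yet fully
 * aged, delay_speculative_age counts subsequent passes and only re-arms the
 * clock read + timespec comparison after wrapping at 1000, so the deadline
 * check runs at most once per thousand trips through this path.
 */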
2128
2129 /*
2130 * This function is called only from vm_pageout_scan and
2131 * it evicts a single VM object from the cache.
2132 */
static inline int
2134 vps_object_cache_evict(vm_object_t *object_to_unlock)
2135 {
2136 static int cache_evict_throttle = 0;
2137 struct vm_speculative_age_q *sq;
2138
2139 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2140
2141 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2142 int pages_evicted;
2143
2144 if (*object_to_unlock != NULL) {
2145 vm_object_unlock(*object_to_unlock);
2146 *object_to_unlock = NULL;
2147 }
2148 KDBG(0x13001ec | DBG_FUNC_START);
2149
2150 pages_evicted = vm_object_cache_evict(100, 10);
2151
2152 KDBG(0x13001ec | DBG_FUNC_END, pages_evicted);
2153
2154 if (pages_evicted) {
2155 vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2156
2157 VM_DEBUG_EVENT(vm_pageout_cache_evict, DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2158 vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2159 memoryshot(DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2160
2161 /*
2162 * we just freed up to 100 pages,
2163 * so go back to the top of the main loop
			 * and re-evaluate the memory situation
2165 */
2166 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2167 } else {
2168 cache_evict_throttle = 1000;
2169 }
2170 }
2171 if (cache_evict_throttle) {
2172 cache_evict_throttle--;
2173 }
2174
2175 return VM_PAGEOUT_SCAN_PROCEED;
2176 }
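
/*
 * Note (derived from the code above): cache_evict_throttle backs off for
 * 1000 calls after an eviction attempt that freed nothing, so an empty
 * object cache is not rescanned on every trip through vm_pageout_scan.
 * Per the call above, up to 100 pages are requested per attempt; the
 * second argument appears to bound how many objects are examined.
 */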
2177
2178
/*
 * This function is called only from vm_pageout_scan and
 * it calculates the filecache minimum that needs to be maintained
 * as we start to steal pages.
 */
2184 static void
2185 vps_calculate_filecache_min(void)
2186 {
2187 int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2188
2189 #if CONFIG_JETSAM
2190 /*
2191 * don't let the filecache_min fall below 15% of available memory
2192 * on systems with an active compressor that isn't nearing its
2193 * limits w/r to accepting new data
2194 *
2195 * on systems w/o the compressor/swapper, the filecache is always
2196 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2197 * since most (if not all) of the anonymous pages are in the
2198 * throttled queue (which isn't counted as available) which
2199 * effectively disables this filter
2200 */
2201 if (vm_compressor_low_on_space() || divisor == 0) {
2202 vm_pageout_state.vm_page_filecache_min = 0;
2203 } else {
2204 vm_pageout_state.vm_page_filecache_min =
2205 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2206 }
2207 #else
2208 if (vm_compressor_out_of_space() || divisor == 0) {
2209 vm_pageout_state.vm_page_filecache_min = 0;
2210 } else {
2211 /*
2212 * don't let the filecache_min fall below the specified critical level
2213 */
2214 vm_pageout_state.vm_page_filecache_min =
2215 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2216 }
2217 #endif
2218 if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2219 vm_pageout_state.vm_page_filecache_min = 0;
2220 }
2221 }
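
/*
 * Worked example (illustrative; the divisor is a tunable): the floor is
 * (AVAILABLE_NON_COMPRESSED_MEMORY * 10) / divisor, so a divisor of 66
 * yields roughly 15% of available non-compressed memory, in line with the
 * figure cited in the CONFIG_JETSAM comment above. A divisor of 0, a
 * compressor low on (or out of) space, or a critically low free count all
 * drop the floor to 0 so the filecache can be stolen from freely.
 */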
2222
2223 /*
2224 * This function is called only from vm_pageout_scan and
 * it updates the flow control time to detect if vm_pageout_scan
2226 * isn't making progress.
2227 */
2228 static void
2229 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2230 {
2231 mach_timespec_t ts;
2232 clock_sec_t sec;
2233 clock_nsec_t nsec;
2234
2235 ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2236 ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2237 clock_get_system_nanotime(&sec, &nsec);
2238 flow_control->ts.tv_sec = (unsigned int) sec;
2239 flow_control->ts.tv_nsec = nsec;
2240 ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2241
2242 flow_control->state = FCS_DELAYED;
2243
2244 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2245 }
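
/*
 * A minimal sketch of the deadline arithmetic above (illustrative):
 *
 *	ts.tv_sec  = wait_ms / 1000;                          // whole seconds
 *	ts.tv_nsec = (wait_ms % 1000) * 1000 * NSEC_PER_USEC; // remainder in ns
 *	deadline   = now + ts;                                // via ADD_MACH_TIMESPEC
 *
 * The FCS_DELAYED handler in vps_flow_control() later declares a potential
 * deadlock when CMP_MACH_TIMESPEC(&now, &deadline) >= 0.
 */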
2246
2247 /*
2248 * This function is called only from vm_pageout_scan and
2249 * it is the flow control logic of VM pageout scan which
 * controls whether it should block and for how long.
2251 * Any blocking of vm_pageout_scan happens ONLY in this function.
2252 */
2253 static int
2254 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2255 vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2256 {
2257 boolean_t exceeded_burst_throttle = FALSE;
2258 unsigned int msecs = 0;
2259 uint32_t inactive_external_count;
2260 mach_timespec_t ts;
2261 struct vm_pageout_queue *iq;
2262 struct vm_pageout_queue *eq;
2263 struct vm_speculative_age_q *sq;
2264
2265 iq = &vm_pageout_queue_internal;
2266 eq = &vm_pageout_queue_external;
2267 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2268
2269 /*
2270 * Sometimes we have to pause:
2271 * 1) No inactive pages - nothing to do.
2272 * 2) Loop control - no acceptable pages found on the inactive queue
2273 * within the last vm_pageout_burst_inactive_throttle iterations
2274 * 3) Flow control - default pageout queue is full
2275 */
2276 if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2277 vm_page_queue_empty(&vm_page_queue_anonymous) &&
2278 vm_page_queue_empty(&vm_page_queue_cleaned) &&
2279 vm_page_queue_empty(&sq->age_q)) {
2280 VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2281 msecs = vm_pageout_state.vm_pageout_empty_wait;
2282 } else if (inactive_burst_count >=
2283 MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2284 (vm_page_inactive_count +
2285 vm_page_speculative_count))) {
2286 VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2287 msecs = vm_pageout_state.vm_pageout_burst_wait;
2288
2289 exceeded_burst_throttle = TRUE;
2290 } else if (VM_PAGE_Q_THROTTLED(iq) &&
2291 VM_DYNAMIC_PAGING_ENABLED()) {
2292 clock_sec_t sec;
2293 clock_nsec_t nsec;
2294
2295 switch (flow_control->state) {
2296 case FCS_IDLE:
2297 if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2298 vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2299 /*
2300 * since the compressor is running independently of vm_pageout_scan
2301 * let's not wait for it just yet... as long as we have a healthy supply
2302 * of filecache pages to work with, let's keep stealing those.
2303 */
2304 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2305
2306 if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2307 (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2308 *anons_grabbed = ANONS_GRABBED_LIMIT;
2309 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2310 return VM_PAGEOUT_SCAN_PROCEED;
2311 }
2312 }
2313
2314 vps_flow_control_reset_deadlock_timer(flow_control);
2315 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2316
2317 break;
2318
2319 case FCS_DELAYED:
2320 clock_get_system_nanotime(&sec, &nsec);
2321 ts.tv_sec = (unsigned int) sec;
2322 ts.tv_nsec = nsec;
2323
2324 if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2325 /*
2326 * the pageout thread for the default pager is potentially
2327 * deadlocked since the
2328 * default pager queue has been throttled for more than the
2329 * allowable time... we need to move some clean pages or dirty
2330 * pages belonging to the external pagers if they aren't throttled
2331 * vm_page_free_wanted represents the number of threads currently
2332 * blocked waiting for pages... we'll move one page for each of
2333 * these plus a fixed amount to break the logjam... once we're done
				 * moving this number of pages, we'll re-enter the FCS_DELAYED state
2335 * with a new timeout target since we have no way of knowing
2336 * whether we've broken the deadlock except through observation
2337 * of the queue associated with the default pager... we need to
2338 * stop moving pages and allow the system to run to see what
2339 * state it settles into.
2340 */
2341
2342 *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2343 vm_page_free_wanted + vm_page_free_wanted_privileged;
2344 VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2345 flow_control->state = FCS_DEADLOCK_DETECTED;
2346 sched_cond_signal(&vm_pageout_gc_cond, vm_pageout_gc_thread);
2347 return VM_PAGEOUT_SCAN_PROCEED;
2348 }
2349 /*
2350 * just resniff instead of trying
2351 * to compute a new delay time... we're going to be
2352 * awakened immediately upon a laundry completion,
2353 * so we won't wait any longer than necessary
2354 */
2355 msecs = vm_pageout_state.vm_pageout_idle_wait;
2356 break;
2357
2358 case FCS_DEADLOCK_DETECTED:
2359 if (*vm_pageout_deadlock_target) {
2360 return VM_PAGEOUT_SCAN_PROCEED;
2361 }
2362
2363 vps_flow_control_reset_deadlock_timer(flow_control);
2364 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2365
2366 break;
2367 }
2368 } else {
2369 /*
2370 * No need to pause...
2371 */
2372 return VM_PAGEOUT_SCAN_PROCEED;
2373 }
2374
2375 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2376
2377 vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2378 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2379
2380 if (vm_page_free_count >= vm_page_free_target) {
2381 /*
2382 * we're here because
2383 * 1) someone else freed up some pages while we had
2384 * the queues unlocked above
2385 * and we've hit one of the 3 conditions that
2386 * cause us to pause the pageout scan thread
2387 *
2388 * since we already have enough free pages,
2389 * let's avoid stalling and return normally
2390 *
2391 * before we return, make sure the pageout I/O threads
2392 * are running throttled in case there are still requests
2393 * in the laundry... since we have enough free pages
2394 * we don't need the laundry to be cleaned in a timely
2395 * fashion... so let's avoid interfering with foreground
2396 * activity
2397 *
2398 * we don't want to hold vm_page_queue_free_lock when
2399 * calling vm_pageout_adjust_eq_iothrottle (since it
		 * may cause other locks to be taken), we do the initial
2401 * check outside of the lock. Once we take the lock,
2402 * we recheck the condition since it may have changed.
2403 * if it has, no problem, we will make the threads
2404 * non-throttled before actually blocking
2405 */
2406 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
2407 }
2408 vm_free_page_lock();
2409
2410 if (vm_page_free_count >= vm_page_free_target &&
2411 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2412 return VM_PAGEOUT_SCAN_DONE_RETURN;
2413 }
2414 vm_free_page_unlock();
2415
2416 if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2417 /*
2418 * we're most likely about to block due to one of
2419 * the 3 conditions that cause vm_pageout_scan to
2420 * not be able to make forward progress w/r
2421 * to providing new pages to the free queue,
2422 * so unthrottle the I/O threads in case we
2423 * have laundry to be cleaned... it needs
2424 * to be completed ASAP.
2425 *
2426 * even if we don't block, we want the io threads
2427 * running unthrottled since the sum of free +
2428 * clean pages is still under our free target
2429 */
2430 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2431 }
2432 if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2433 /*
2434 * if we get here we're below our free target and
2435 * we're stalling due to a full laundry queue or
		 * we don't have any inactive pages other than
2437 * those in the clean queue...
2438 * however, we have pages on the clean queue that
2439 * can be moved to the free queue, so let's not
2440 * stall the pageout scan
2441 */
2442 flow_control->state = FCS_IDLE;
2443 return VM_PAGEOUT_SCAN_PROCEED;
2444 }
2445 if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2446 flow_control->state = FCS_IDLE;
2447 return VM_PAGEOUT_SCAN_PROCEED;
2448 }
2449
2450 VM_CHECK_MEMORYSTATUS;
2451
2452 if (flow_control->state != FCS_IDLE) {
2453 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2454 }
2455
2456 iq->pgo_throttled = TRUE;
2457 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2458
2459 vm_page_unlock_queues();
2460
2461 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2462
2463 VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2464 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2465 memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2466
2467 thread_block(THREAD_CONTINUE_NULL);
2468
2469 VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2470 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2471 memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2472
2473 vm_page_lock_queues();
2474
2475 iq->pgo_throttled = FALSE;
2476
2477 vps_init_page_targets();
2478
2479 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2480 }
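
/*
 * State machine implemented above (descriptive summary, derived from the
 * code rather than a separate spec):
 *
 *	FCS_IDLE --(internal q throttled, no filecache relief)--> FCS_DELAYED
 *	FCS_DELAYED --(deadline passes, still throttled)--> FCS_DEADLOCK_DETECTED
 *	FCS_DEADLOCK_DETECTED --(relief target drained)--> FCS_DELAYED (re-armed)
 *	any state --(progress possible / queue unthrottled)--> FCS_IDLE
 */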
2481
2482 extern boolean_t vm_darkwake_mode;
2483 /*
2484 * This function is called only from vm_pageout_scan and
2485 * it will find and return the most appropriate page to be
2486 * reclaimed.
2487 */
2488 static int
2489 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2490 boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2491 {
2492 vm_page_t m = NULL;
2493 vm_object_t m_object = VM_OBJECT_NULL;
2494 uint32_t inactive_external_count;
2495 struct vm_speculative_age_q *sq;
2496 struct vm_pageout_queue *iq;
2497 int retval = VM_PAGEOUT_SCAN_PROCEED;
2498
2499 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2500 iq = &vm_pageout_queue_internal;
2501
2502 *is_page_from_bg_q = FALSE;
2503
2504 m = NULL;
2505 m_object = VM_OBJECT_NULL;
2506
2507 if (VM_DYNAMIC_PAGING_ENABLED()) {
2508 assert(vm_page_throttled_count == 0);
2509 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2510 }
2511
2512 /*
2513 * Try for a clean-queue inactive page.
2514 * These are pages that vm_pageout_scan tried to steal earlier, but
2515 * were dirty and had to be cleaned. Pick them up now that they are clean.
2516 */
2517 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2518 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2519
2520 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2521
2522 goto found_page;
2523 }
2524
2525 /*
2526 * The next most eligible pages are ones we paged in speculatively,
2527 * but which have not yet been touched and have been aged out.
2528 */
2529 if (!vm_page_queue_empty(&sq->age_q)) {
2530 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2531
2532 assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2533
2534 if (!m->vmp_dirty || force_anonymous == FALSE) {
2535 goto found_page;
2536 } else {
2537 m = NULL;
2538 }
2539 }
2540
2541 #if !CONFIG_JETSAM
2542 if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2543 if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2544 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2545 assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2546 goto found_page;
2547 }
2548 }
2549 #endif /* !CONFIG_JETSAM */
2550
2551 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2552 vm_object_t bg_m_object = NULL;
2553
2554 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2555
2556 bg_m_object = VM_PAGE_OBJECT(m);
2557
2558 if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2559 /*
2560 * This page is on the background queue
2561 * but not on a pageable queue OR is busy during
2562 * darkwake mode when the target is artificially lowered.
2563 * If it is busy during darkwake mode, and we don't skip it,
2564 * we will just swing back around and try again with the same
2565 * queue and might hit the same page or its neighbor in a
2566 * similar state. Both of these are transient states and will
2567 * get resolved, but, at this point let's ignore this page.
2568 */
2569 if (vm_darkwake_mode && m->vmp_busy) {
2570 if (bg_m_object->internal) {
2571 vm_pageout_skipped_bq_internal++;
2572 } else {
2573 vm_pageout_skipped_bq_external++;
2574 }
2575 }
2576 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2577 if (bg_m_object->internal &&
2578 (VM_PAGE_Q_THROTTLED(iq) ||
2579 vm_compressor_out_of_space() == TRUE ||
2580 vm_page_free_count < (vm_page_free_reserved / 4))) {
2581 vm_pageout_skipped_bq_internal++;
2582 } else {
2583 *is_page_from_bg_q = TRUE;
2584
2585 if (bg_m_object->internal) {
2586 vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2587 } else {
2588 vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2589 }
2590 goto found_page;
2591 }
2592 }
2593 }
2594
2595 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2596
2597 if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2598 (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2599 *grab_anonymous = TRUE;
2600 *anons_grabbed = 0;
2601
2602 if (VM_CONFIG_SWAP_IS_ACTIVE) {
2603 vm_pageout_vminfo.vm_pageout_skipped_external++;
2604 } else {
2605 if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2606 /*
2607 * No swap and we are in dangerously low levels of free memory.
2608 * If we keep going ahead with anonymous pages, we are going to run into a situation
2609 * where the compressor will be stuck waiting for free pages (if it isn't already).
2610 *
2611 * So, pick a file backed page...
2612 */
2613 *grab_anonymous = FALSE;
2614 *anons_grabbed = ANONS_GRABBED_LIMIT;
2615 vm_pageout_vminfo.vm_pageout_skipped_internal++;
2616 }
2617 }
2618 goto want_anonymous;
2619 }
2620 *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2621
2622 #if CONFIG_JETSAM
	/*
	 * If the file-backed pool has accumulated
	 * significantly more pages than the jetsam
	 * threshold, prefer to reclaim those
	 * inline to minimize compute overhead of reclaiming
	 * anonymous pages.
	 * This calculation does not account for the CPU-local
	 * external page queues, as those are expected to be
	 * much smaller relative to the global pools.
	 */
2632
2633 struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2634
2635 if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2636 if (vm_page_pageable_external_count >
2637 vm_pageout_state.vm_page_filecache_min) {
2638 if ((vm_page_pageable_external_count *
2639 vm_pageout_memorystatus_fb_factor_dr) >
2640 (memorystatus_get_critical_page_shortage_threshold() *
2641 vm_pageout_memorystatus_fb_factor_nr)) {
2642 *grab_anonymous = FALSE;
2643
2644 VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2645 }
2646 }
2647 if (*grab_anonymous) {
2648 VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2649 }
2650 }
2651 #endif /* CONFIG_JETSAM */
2652
2653 want_anonymous:
2654 if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2655 if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2656 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2657
2658 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2659 *anons_grabbed = 0;
2660
2661 if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2662 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2663 if ((++(*reactivated_this_call) % 100)) {
2664 vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2665
2666 vm_page_activate(m);
2667 counter_inc(&vm_statistics_reactivations);
2668 #if DEVELOPMENT || DEBUG
2669 if (*is_page_from_bg_q == TRUE) {
2670 if (m_object->internal) {
2671 vm_pageout_rejected_bq_internal++;
2672 } else {
2673 vm_pageout_rejected_bq_external++;
2674 }
2675 }
2676 #endif /* DEVELOPMENT || DEBUG */
2677 vm_pageout_state.vm_pageout_inactive_used++;
2678
2679 m = NULL;
2680 retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2681
2682 goto found_page;
2683 }
2684
2685 /*
2686 * steal 1 of the file backed pages even if
2687 * we are under the limit that has been set
2688 * for a healthy filecache
2689 */
2690 }
2691 }
2692 goto found_page;
2693 }
2694 }
2695 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2696 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2697
2698 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2699 *anons_grabbed += 1;
2700
2701 goto found_page;
2702 }
2703
2704 m = NULL;
2705
2706 found_page:
2707 *victim_page = m;
2708
2709 return retval;
2710 }
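
/*
 * Victim priority as implemented above (descriptive summary):
 *	1. cleaned queue          -- already laundered, cheapest to reclaim
 *	2. aged speculative queue -- prefetched but never referenced
 *	3. donate queue           -- !CONFIG_JETSAM only, when ripe
 *	4. background queue       -- when over the background target
 *	5. inactive (external)    -- unless the filecache floor or external
 *	                             inactive target forces anonymous pages
 *	6. anonymous queue        -- at most ANONS_GRABBED_LIMIT in a row
 */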
2711
2712 /*
2713 * This function is called only from vm_pageout_scan and
2714 * it will put a page back on the active/inactive queue
2715 * if we can't reclaim it for some reason.
2716 */
2717 static void
2718 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2719 {
2720 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2721 vm_page_enqueue_inactive(m, FALSE);
2722 } else {
2723 vm_page_activate(m);
2724 }
2725
2726 #if DEVELOPMENT || DEBUG
2727 vm_object_t m_object = VM_PAGE_OBJECT(m);
2728
2729 if (page_from_bg_q == TRUE) {
2730 if (m_object->internal) {
2731 vm_pageout_rejected_bq_internal++;
2732 } else {
2733 vm_pageout_rejected_bq_external++;
2734 }
2735 }
2736 #endif /* DEVELOPMENT || DEBUG */
2737 }
2738
2739 /*
2740 * This function is called only from vm_pageout_scan and
2741 * it will try to grab the victim page's VM object (m_object)
2742 * which differs from the previous victim page's object (object).
2743 */
2744 static int
2745 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2746 {
2747 struct vm_speculative_age_q *sq;
2748
2749 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2750
2751 /*
2752 * the object associated with candidate page is
2753 * different from the one we were just working
2754 * with... dump the lock if we still own it
2755 */
2756 if (*object != NULL) {
2757 vm_object_unlock(*object);
2758 *object = NULL;
2759 }
2760 /*
	 * Try to lock object; since we've already got the
2762 * page queues lock, we can only 'try' for this one.
2763 * if the 'try' fails, we need to do a mutex_pause
2764 * to allow the owner of the object lock a chance to
2765 * run... otherwise, we're likely to trip over this
2766 * object in the same state as we work our way through
2767 * the queue... clumps of pages associated with the same
2768 * object are fairly typical on the inactive and active queues
2769 */
2770 if (!vm_object_lock_try_scan(m_object)) {
2771 vm_page_t m_want = NULL;
2772
2773 vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2774
2775 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2776 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2777 }
2778
2779 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2780
2781 m->vmp_reference = FALSE;
2782
2783 if (!m_object->object_is_shared_cache) {
2784 /*
2785 * don't apply this optimization if this is the shared cache
2786 * object, it's too easy to get rid of very hot and important
2787 * pages...
2788 * m->vmp_object must be stable since we hold the page queues lock...
2789 * we can update the scan_collisions field sans the object lock
2790 * since it is a separate field and this is the only spot that does
2791 * a read-modify-write operation and it is never executed concurrently...
2792 * we can asynchronously set this field to 0 when creating a UPL, so it
			 * is possible for the value to be a bit non-deterministic, but that's ok
2794 * since it's only used as a hint
2795 */
2796 m_object->scan_collisions = 1;
2797 }
2798 if (page_from_bg_q) {
2799 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2800 } else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2801 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2802 } else if (!vm_page_queue_empty(&sq->age_q)) {
2803 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2804 } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2805 !vm_page_queue_empty(&vm_page_queue_inactive)) {
2806 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2807 } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2808 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2809 }
2810
2811 /*
2812 * this is the next object we're going to be interested in
		 * try to make sure it's available after the mutex_pause
2814 * returns control
2815 */
2816 if (m_want) {
2817 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2818 }
2819
2820 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2821
2822 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2823 } else {
2824 *object = m_object;
2825 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2826 }
2827
2828 return VM_PAGEOUT_SCAN_PROCEED;
2829 }
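
/*
 * Hedged sketch of the try-lock-with-hint pattern above (mirrors the code;
 * illustrative only). With the page queues lock held, the object lock can
 * only be trylock'd; on failure, the next object of interest is advertised
 * through vm_pageout_scan_wants_object so it is likely to be available once
 * the caller's mutex_pause() returns control:
 *
 *	if (!vm_object_lock_try_scan(m_object)) {
 *		vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
 *		return VM_PAGEOUT_SCAN_NEXT_ITERATION;  // caller pauses and retries
 *	}
 */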
2830
2831 /*
2832 * This function is called only from vm_pageout_scan and
2833 * it notices that pageout scan may be rendered ineffective
2834 * due to a FS deadlock and will jetsam a process if possible.
2835 * If jetsam isn't supported, it'll move the page to the active
2836 * queue to try and get some different pages pushed onwards so
2837 * we can try to get out of this scenario.
2838 */
2839 static void
2840 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2841 boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2842 {
2843 struct vm_pageout_queue *eq;
2844 vm_object_t cur_object = VM_OBJECT_NULL;
2845
2846 cur_object = *object;
2847
2848 eq = &vm_pageout_queue_external;
2849
2850 if (cur_object->internal == FALSE) {
2851 /*
2852 * we need to break up the following potential deadlock case...
2853 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2854 * b) The thread doing the writing is waiting for pages while holding the truncate lock
2855 * c) Most of the pages in the inactive queue belong to this file.
2856 *
2857 * we are potentially in this deadlock because...
2858 * a) the external pageout queue is throttled
2859 * b) we're done with the active queue and moved on to the inactive queue
2860 * c) we've got a dirty external page
2861 *
2862 * since we don't know the reason for the external pageout queue being throttled we
2863 * must suspect that we are deadlocked, so move the current page onto the active queue
2864 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2865 *
2866 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2867 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2868 * pool the next time we select a victim page... if we can make enough new free pages,
2869 * the deadlock will break, the external pageout queue will empty and it will no longer
2870 * be throttled
2871 *
2872 * if we have jetsam configured, keep a count of the pages reactivated this way so
2873 * that we can try to find clean pages in the active/inactive queues before
2874 * deciding to jetsam a process
2875 */
2876 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2877
2878 vm_page_check_pageable_safe(m);
2879 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2880 vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2881 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2882 vm_page_active_count++;
2883 vm_page_pageable_external_count++;
2884
2885 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2886
2887 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2888
2889 #pragma unused(force_anonymous)
2890
2891 *vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2892
2893 if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2894 *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2895 /*
2896 * Possible deadlock scenario so request jetsam action
2897 */
2898 memorystatus_kill_on_vps_starvation();
2899 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, DBG_VM_PAGEOUT_JETSAM, DBG_FUNC_NONE,
2900 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2901 }
2902 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2903
2904 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2905
2906 *force_anonymous = TRUE;
2907 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2908 } else {
2909 vm_page_activate(m);
2910 counter_inc(&vm_statistics_reactivations);
2911
2912 #if DEVELOPMENT || DEBUG
2913 if (is_page_from_bg_q == TRUE) {
2914 if (cur_object->internal) {
2915 vm_pageout_rejected_bq_internal++;
2916 } else {
2917 vm_pageout_rejected_bq_external++;
2918 }
2919 }
2920 #endif /* DEVELOPMENT || DEBUG */
2921
2922 vm_pageout_state.vm_pageout_inactive_used++;
2923 }
2924 }
2925
2926
2927 void
2928 vm_page_balance_inactive(int max_to_move)
2929 {
2930 vm_page_t m;
2931
2932 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2933
2934 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2935 /*
2936 * It is likely that the hibernation code path is
2937 * dealing with these very queues as we are about
2938 * to move pages around in/from them and completely
2939 * change the linkage of the pages.
2940 *
2941 * And so we skip the rebalancing of these queues.
2942 */
2943 return;
2944 }
2945 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2946 vm_page_inactive_count +
2947 vm_page_speculative_count);
2948
2949 while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2950 VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2951
2952 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2953
2954 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2955 assert(!m->vmp_laundry);
2956 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
2957 assert(!vm_page_is_guard(m));
2958
2959 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2960
2961 /*
2962 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2963 *
2964 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2965 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
		 * new reference happens. If no further references happen on the page after that remote TLB flush,
2967 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2968 * by pageout_scan, which is just fine since the last reference would have happened quite far
2969 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2970 * have happened before we moved the page
2971 */
2972 if (m->vmp_pmapped == TRUE) {
2973 /*
2974 * We might be holding the page queue lock as a
2975 * spin lock and clearing the "referenced" bit could
2976 * take a while if there are lots of mappings of
2977 * that page, so make sure we acquire the lock as
			 * a mutex to avoid a spinlock timeout.
2979 */
2980 vm_page_lockconvert_queues();
2981 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2982 }
2983
2984 /*
2985 * The page might be absent or busy,
2986 * but vm_page_deactivate can handle that.
2987 * FALSE indicates that we don't want a H/W clear reference
2988 */
2989 vm_page_deactivate_internal(m, FALSE);
2990 }
2991 }
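
/*
 * Usage note (descriptive): within this file, vm_pageout_scan()'s main loop
 * calls vm_page_balance_inactive(1) on every iteration, trickling pages from
 * the head of the active queue to the inactive queue until
 * inactive + speculative reaches the recomputed vm_page_inactive_target.
 */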
2992
2993 /*
2994 * vm_pageout_scan does the dirty work for the pageout daemon.
2995 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2996 * held and vm_page_free_wanted == 0.
2997 */
2998 void
2999 vm_pageout_scan(void)
3000 {
3001 unsigned int loop_count = 0;
3002 unsigned int inactive_burst_count = 0;
3003 unsigned int reactivated_this_call;
3004 unsigned int reactivate_limit;
3005 vm_page_t local_freeq = NULL;
3006 int local_freed = 0;
3007 int delayed_unlock;
3008 int delayed_unlock_limit = 0;
3009 int refmod_state = 0;
3010 int vm_pageout_deadlock_target = 0;
3011 struct vm_pageout_queue *iq;
3012 struct vm_pageout_queue *eq;
3013 struct vm_speculative_age_q *sq;
3014 struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
3015 boolean_t inactive_throttled = FALSE;
3016 vm_object_t object = NULL;
3017 uint32_t inactive_reclaim_run;
3018 boolean_t grab_anonymous = FALSE;
3019 boolean_t force_anonymous = FALSE;
3020 boolean_t force_speculative_aging = FALSE;
3021 int anons_grabbed = 0;
3022 int page_prev_q_state = 0;
3023 boolean_t page_from_bg_q = FALSE;
3024 uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
3025 vm_object_t m_object = VM_OBJECT_NULL;
3026 int retval = 0;
3027 boolean_t lock_yield_check = FALSE;
3028
3029
3030 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_START,
3031 vm_pageout_vminfo.vm_pageout_freed_speculative,
3032 vm_pageout_state.vm_pageout_inactive_clean,
3033 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3034 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3035
3036 flow_control.state = FCS_IDLE;
3037 iq = &vm_pageout_queue_internal;
3038 eq = &vm_pageout_queue_external;
3039 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3040
3041 /* Ask the pmap layer to return any pages it no longer needs. */
3042 pmap_release_pages_fast();
3043
3044 vm_page_lock_queues();
3045
3046 delayed_unlock = 1;
3047
3048 /*
3049 * Calculate the max number of referenced pages on the inactive
3050 * queue that we will reactivate.
3051 */
3052 reactivated_this_call = 0;
3053 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3054 vm_page_inactive_count);
3055 inactive_reclaim_run = 0;
3056
3057 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3058
3059 /*
3060 * We must limit the rate at which we send pages to the pagers
3061 * so that we don't tie up too many pages in the I/O queues.
3062 * We implement a throttling mechanism using the laundry count
3063 * to limit the number of pages outstanding to the default
3064 * and external pagers. We can bypass the throttles and look
3065 * for clean pages if the pageout queues don't drain in a timely
3066 * fashion since this may indicate that the pageout paths are
3067 * stalled waiting for memory, which only we can provide.
3068 */
3069
3070 vps_init_page_targets();
3071 assert(object == NULL);
3072 assert(delayed_unlock != 0);
3073
3074 for (;;) {
3075 vm_page_t m;
3076
3077 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3078
3079 if (lock_yield_check) {
3080 lock_yield_check = FALSE;
3081
3082 if (delayed_unlock++ > delayed_unlock_limit) {
3083 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3084 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3085 } else if (vm_pageout_scan_wants_object) {
3086 vm_page_unlock_queues();
3087 mutex_pause(0);
3088 vm_page_lock_queues();
3089 } else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3090 VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3091 }
3092 }
3093
3094 if (vm_upl_wait_for_pages < 0) {
3095 vm_upl_wait_for_pages = 0;
3096 }
3097
3098 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3099
3100 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3101 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3102 }
3103
3104 vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3105
3106 assert(delayed_unlock);
3107
3108 /*
3109 * maintain our balance
3110 */
3111 vm_page_balance_inactive(1);
3112
3113
3114 /**********************************************************************
3115 * above this point we're playing with the active and secluded queues
3116 * below this point we're playing with the throttling mechanisms
3117 * and the inactive queue
3118 **********************************************************************/
3119
3120 if (vm_page_free_count + local_freed >= vm_page_free_target) {
3121 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3122
3123 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3124 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3125 /*
3126 * make sure the pageout I/O threads are running
3127 * throttled in case there are still requests
3128 * in the laundry... since we have met our targets
3129 * we don't need the laundry to be cleaned in a timely
3130 * fashion... so let's avoid interfering with foreground
3131 * activity
3132 */
3133 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3134
3135 vm_free_page_lock();
3136
3137 if ((vm_page_free_count >= vm_page_free_target) &&
3138 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3139 /*
3140 * done - we have met our target *and*
3141 * there is no one waiting for a page.
3142 */
3143 return_from_scan:
3144 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3145
3146 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3147 vm_pageout_state.vm_pageout_inactive,
3148 vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3149 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_END,
3150 vm_pageout_vminfo.vm_pageout_freed_speculative,
3151 vm_pageout_state.vm_pageout_inactive_clean,
3152 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3153 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3154
3155 return;
3156 }
3157 vm_free_page_unlock();
3158 }
3159
3160 /*
3161 * Before anything, we check if we have any ripe volatile
3162 * objects around. If so, try to purge the first object.
3163 * If the purge fails, fall through to reclaim a page instead.
		 * If the purge succeeds, go back to the top and re-evaluate
3165 * the new memory situation.
3166 */
3167 retval = vps_purge_object();
3168
3169 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3170 /*
3171 * Success
3172 */
3173 if (object != NULL) {
3174 vm_object_unlock(object);
3175 object = NULL;
3176 }
3177
3178 lock_yield_check = FALSE;
3179 continue;
3180 }
3181
3182
3183 /*
3184 * If our 'aged' queue is empty and we have some speculative pages
3185 * in the other queues, let's go through and see if we need to age
3186 * them.
3187 *
		 * If we succeeded in aging a speculative Q, or everything
		 * looks normal w.r.t. queue age and queue counts, we keep going onward.
3190 *
3191 * If, for some reason, we seem to have a mismatch between the spec.
3192 * page count and the page queues, we reset those variables and
3193 * restart the loop (LD TODO: Track this better?).
3194 */
3195 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3196 retval = vps_age_speculative_queue(force_speculative_aging);
3197
3198 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3199 lock_yield_check = FALSE;
3200 continue;
3201 }
3202 }
3203 force_speculative_aging = FALSE;
3204
3205 /*
3206 * Check to see if we need to evict objects from the cache.
3207 *
3208 * Note: 'object' here doesn't have anything to do with
3209 * the eviction part. We just need to make sure we have dropped
3210 * any object lock we might be holding if we need to go down
3211 * into the eviction logic.
3212 */
3213 retval = vps_object_cache_evict(&object);
3214
3215 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3216 lock_yield_check = FALSE;
3217 continue;
3218 }
3219
3220
3221 /*
3222 * Calculate our filecache_min that will affect the loop
3223 * going forward.
3224 */
3225 vps_calculate_filecache_min();
3226
3227 /*
3228 * LD TODO: Use a structure to hold all state variables for a single
3229 * vm_pageout_scan iteration and pass that structure to this function instead.
3230 */
3231 retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3232 &delayed_unlock, &local_freeq, &local_freed,
3233 &vm_pageout_deadlock_target, inactive_burst_count);
3234
3235 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3236 if (loop_count >= vm_page_inactive_count) {
3237 loop_count = 0;
3238 }
3239
3240 inactive_burst_count = 0;
3241
3242 assert(object == NULL);
3243 assert(delayed_unlock != 0);
3244
3245 lock_yield_check = FALSE;
3246 continue;
3247 } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3248 goto return_from_scan;
3249 }
3250
3251 flow_control.state = FCS_IDLE;
3252
3253 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3254 vm_pageout_inactive_external_forced_reactivate_limit);
3255 loop_count++;
3256 inactive_burst_count++;
3257 vm_pageout_state.vm_pageout_inactive++;
3258
3259 /*
3260 * Choose a victim.
3261 */
3262
3263 m = NULL;
3264 retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3265
3266 if (m == NULL) {
3267 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3268 inactive_burst_count = 0;
3269
3270 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3271 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3272 }
3273
3274 lock_yield_check = TRUE;
3275 continue;
3276 }
3277
			/*
			 * if we've gotten here, we have no victim page.
			 * check to see if we haven't finished balancing the queues,
			 * or we have a page on the aged speculative queue that we
			 * skipped due to force_anonymous == TRUE... or we have
			 * speculative pages that we can prematurely age... if we're in
			 * one of these cases we'll keep going, else panic
			 */
3286 force_anonymous = FALSE;
3287 VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3288
3289 if (!vm_page_queue_empty(&sq->age_q)) {
3290 lock_yield_check = TRUE;
3291 continue;
3292 }
3293
3294 if (vm_page_speculative_count) {
3295 force_speculative_aging = TRUE;
3296 lock_yield_check = TRUE;
3297 continue;
3298 }
3299 panic("vm_pageout: no victim");
3300
3301 /* NOTREACHED */
3302 }
3303
3304 assert(VM_PAGE_PAGEABLE(m));
3305 m_object = VM_PAGE_OBJECT(m);
3306 force_anonymous = FALSE;
3307
3308 page_prev_q_state = m->vmp_q_state;
3309 /*
3310 * we just found this page on one of our queues...
3311 * it can't also be on the pageout queue, so safe
3312 * to call vm_page_queues_remove
3313 */
3314 bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3315 vm_page_queues_remove(m, TRUE);
3316 if (donate) {
3317 /*
3318 * The compressor needs to see this bit to know
3319 * where this page needs to land. Also if stolen,
3320 * this bit helps put the page back in the right
3321 * special queue where it belongs.
3322 */
3323 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3324 }
3325
3326 assert(!m->vmp_laundry);
3327 assert(vm_page_is_canonical(m));
3328 assert(!is_kernel_object(m_object));
3329
3330 vm_pageout_vminfo.vm_pageout_considered_page++;
3331
3332 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3333
3334 /*
3335 * check to see if we currently are working
3336 * with the same object... if so, we've
3337 * already got the lock
3338 */
3339 if (m_object != object) {
3340 boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3341
3342 /*
3343 * vps_switch_object() will always drop the 'object' lock first
3344 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3345 * either 'm_object' or NULL.
3346 */
3347 retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3348
3349 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3350 lock_yield_check = TRUE;
3351 continue;
3352 }
3353 }
3354 assert(m_object == object);
3355 assert(VM_PAGE_OBJECT(m) == m_object);
3356
3357 if (m->vmp_busy) {
/*
 * Somebody is already playing with this page.
 * Put it back on the appropriate queue.
 */
3363 VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3364
3365 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3366 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3367 }
3368
3369 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3370
3371 lock_yield_check = TRUE;
3372 continue;
3373 }
3374
/*
 * if (m->vmp_cleaning && !m->vmp_free_when_done)
 * If already cleaning this page in place
 * just leave it off the paging queues.
 * We can leave the page mapped, and upl_commit_range
 * will put it on the clean queue.
 *
 * if (m->vmp_free_when_done && !m->vmp_cleaning)
 * an msync INVALIDATE is in progress...
 * this page has been marked for destruction
 * after it has been cleaned,
 * but not yet gathered into a UPL
 * where 'cleaning' will be set...
 * just leave it off the paging queues
 *
 * if (m->vmp_free_when_done && m->vmp_cleaning)
 * an msync INVALIDATE is in progress
 * and the UPL has already gathered this page...
 * just leave it off the paging queues
 */
3395 if (m->vmp_free_when_done || m->vmp_cleaning) {
3396 lock_yield_check = TRUE;
3397 continue;
3398 }
3399
3400
/*
 * If it's absent, in error, or the object is no longer alive,
 * we can reclaim the page... in the no-longer-alive case,
 * the two states that would preclude us from reclaiming it -
 * busy or cleaning - have already been dealt with above
 */
3408 if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3409 (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3410 if (m->vmp_absent) {
3411 VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3412 } else if (!object->alive ||
3413 (!object->internal &&
3414 object->pager == MEMORY_OBJECT_NULL)) {
3415 VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3416 } else {
3417 VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3418 }
3419 if (m->vmp_pmapped) {
3420 int refmod;
3421
3422 /*
3423 * If this page was file-backed and wired while its pager
3424 * was lost (during a forced unmount, for example), there
3425 * could still be some pmap mappings that need to be
3426 * cleaned up before we can free the page.
3427 */
3428 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3429 if ((refmod & VM_MEM_MODIFIED) &&
3430 !m->vmp_dirty) {
3431 SET_PAGE_DIRTY(m, FALSE);
3432 }
3433 }
3434 reclaim_page:
3435 if (vm_pageout_deadlock_target) {
3436 VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3437 vm_pageout_deadlock_target--;
3438 }
3439
3440 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3441
3442 if (object->internal) {
3443 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3444 } else {
3445 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3446 }
3447 assert(!m->vmp_cleaning);
3448 assert(!m->vmp_laundry);
3449
3450 if (!object->internal &&
3451 object->pager != NULL &&
3452 object->pager->mo_pager_ops == &shared_region_pager_ops) {
3453 shared_region_pager_reclaimed++;
3454 }
3455
3456 m->vmp_busy = TRUE;
3457
3458 /*
3459 * remove page from object here since we're already
3460 * behind the object lock... defer the rest of the work
3461 * we'd normally do in vm_page_free_prepare_object
3462 * until 'vm_page_free_list' is called
3463 */
3464 if (m->vmp_tabled) {
3465 vm_page_remove(m, TRUE);
3466 }
3467
3468 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3469 m->vmp_snext = local_freeq;
3470 local_freeq = m;
3471 local_freed++;
3472
3473 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3474 vm_pageout_vminfo.vm_pageout_freed_speculative++;
3475 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3476 vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3477 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3478 vm_pageout_vminfo.vm_pageout_freed_internal++;
3479 } else {
3480 vm_pageout_vminfo.vm_pageout_freed_external++;
3481 }
3482
3483 inactive_burst_count = 0;
3484
3485 lock_yield_check = TRUE;
3486 continue;
3487 }
3488 if (object->vo_copy == VM_OBJECT_NULL) {
3489 /*
3490 * No one else can have any interest in this page.
3491 * If this is an empty purgable object, the page can be
3492 * reclaimed even if dirty.
3493 * If the page belongs to a volatile purgable object, we
3494 * reactivate it if the compressor isn't active.
3495 */
3496 if (object->purgable == VM_PURGABLE_EMPTY) {
3497 if (m->vmp_pmapped == TRUE) {
3498 /* unmap the page */
3499 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3500 if (refmod_state & VM_MEM_MODIFIED) {
3501 SET_PAGE_DIRTY(m, FALSE);
3502 }
3503 }
3504 if (m->vmp_dirty || m->vmp_precious) {
/* we saved the cost of cleaning this page! */
3506 vm_page_purged_count++;
3507 }
3508 goto reclaim_page;
3509 }
3510
3511 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3512 /*
3513 * With the VM compressor, the cost of
3514 * reclaiming a page is much lower (no I/O),
3515 * so if we find a "volatile" page, it's better
3516 * to let it get compressed rather than letting
3517 * it occupy a full page until it gets purged.
3518 * So no need to check for "volatile" here.
3519 */
3520 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
3521 /*
3522 * Avoid cleaning a "volatile" page which might
3523 * be purged soon.
3524 */
3525
3526 /* if it's wired, we can't put it on our queue */
3527 assert(!VM_PAGE_WIRED(m));
3528
3529 /* just stick it back on! */
3530 reactivated_this_call++;
3531
3532 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3533 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3534 }
3535
3536 goto reactivate_page;
3537 }
3538 } /* vo_copy NULL */
3539 /*
3540 * If it's being used, reactivate.
3541 * (Fictitious pages are either busy or absent.)
3542 * First, update the reference and dirty bits
3543 * to make sure the page is unreferenced.
3544 */
3545 refmod_state = -1;
3546
3547 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3548 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3549
3550 if (refmod_state & VM_MEM_REFERENCED) {
3551 m->vmp_reference = TRUE;
3552 }
3553 if (refmod_state & VM_MEM_MODIFIED) {
3554 SET_PAGE_DIRTY(m, FALSE);
3555 }
3556 }
3557
3558 if (m->vmp_reference || m->vmp_dirty) {
3559 /* deal with a rogue "reusable" page */
3560 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3561 }
3562
3563 if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3564 vm_pageout_state.vm_page_xpmapped_min = 0;
3565 } else {
3566 vm_pageout_state.vm_page_xpmapped_min = (vm_page_pageable_external_count * 10) /
3567 vm_pageout_state.vm_page_xpmapped_min_divisor;
3568 }
3569
3570 if (!m->vmp_no_cache &&
3571 page_from_bg_q == FALSE &&
3572 (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3573 (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3574 /*
3575 * The page we pulled off the inactive list has
3576 * been referenced. It is possible for other
3577 * processors to be touching pages faster than we
3578 * can clear the referenced bit and traverse the
3579 * inactive queue, so we limit the number of
3580 * reactivations.
3581 */
3582 if (++reactivated_this_call >= reactivate_limit &&
3583 !object->object_is_shared_cache &&
3584 !((m->vmp_realtime ||
3585 object->for_realtime) &&
3586 vm_pageout_protect_realtime)) {
3587 vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3588 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3589 vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3590 if (object->object_is_shared_cache) {
3591 vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3592 } else if (m->vmp_realtime ||
3593 object->for_realtime) {
3594 vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3595 }
3596 } else {
3597 uint32_t isinuse;
3598
3599 if (reactivated_this_call >= reactivate_limit) {
3600 if (object->object_is_shared_cache) {
3601 vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3602 } else if ((m->vmp_realtime ||
3603 object->for_realtime) &&
3604 vm_pageout_protect_realtime) {
3605 vm_pageout_vminfo.vm_pageout_protected_realtime++;
3606 }
3607 }
3608 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3609 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3610 }
3611
3612 vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3613 reactivate_page:
3614 if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3615 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
/*
 * no explicit mappings of this object exist
 * and it's not open via the filesystem
 */
3620 vm_page_deactivate(m);
3621 VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3622 } else {
3623 /*
3624 * The page was/is being used, so put back on active list.
3625 */
3626 vm_page_activate(m);
3627 counter_inc(&vm_statistics_reactivations);
3628 inactive_burst_count = 0;
3629 }
3630 #if DEVELOPMENT || DEBUG
3631 if (page_from_bg_q == TRUE) {
3632 if (m_object->internal) {
3633 vm_pageout_rejected_bq_internal++;
3634 } else {
3635 vm_pageout_rejected_bq_external++;
3636 }
3637 }
3638 #endif /* DEVELOPMENT || DEBUG */
3639
3640 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3641 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3642 }
3643 vm_pageout_state.vm_pageout_inactive_used++;
3644
3645 lock_yield_check = TRUE;
3646 continue;
3647 }
3648 /*
3649 * Make sure we call pmap_get_refmod() if it
3650 * wasn't already called just above, to update
3651 * the dirty bit.
3652 */
3653 if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3654 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3655 if (refmod_state & VM_MEM_MODIFIED) {
3656 SET_PAGE_DIRTY(m, FALSE);
3657 }
3658 }
3659 }
3660
/*
 * we've got a candidate page to steal...
 *
 * m->vmp_dirty is up to date courtesy of the
 * preceding check for m->vmp_reference... if
 * we get here, then m->vmp_reference had to be
 * FALSE (or possibly "reactivate_limit" was
 * exceeded), but in either case we called
 * pmap_get_refmod() and updated both
 * m->vmp_reference and m->vmp_dirty
 *
 * if it's dirty or precious we need to
 * see if the target queue is throttled...
 * if it is, we need to skip over this page by moving it back
 * to the end of the inactive queue
 */
3677
3678 inactive_throttled = FALSE;
3679
3680 if (m->vmp_dirty || m->vmp_precious) {
3681 if (object->internal) {
3682 if (VM_PAGE_Q_THROTTLED(iq)) {
3683 inactive_throttled = TRUE;
3684 }
3685 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3686 inactive_throttled = TRUE;
3687 }
3688 }
3689 throttle_inactive:
3690 if (!VM_DYNAMIC_PAGING_ENABLED() &&
3691 object->internal && m->vmp_dirty &&
3692 (object->purgable == VM_PURGABLE_DENY ||
3693 object->purgable == VM_PURGABLE_NONVOLATILE ||
3694 object->purgable == VM_PURGABLE_VOLATILE)) {
3695 vm_page_check_pageable_safe(m);
3696 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3697 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3698 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3699 vm_page_throttled_count++;
3700
3701 VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3702
3703 inactive_burst_count = 0;
3704
3705 lock_yield_check = TRUE;
3706 continue;
3707 }
3708 if (inactive_throttled == TRUE) {
3709 vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3710 &force_anonymous, page_from_bg_q);
3711
3712 inactive_burst_count = 0;
3713
3714 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3715 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3716 }
3717
3718 lock_yield_check = TRUE;
3719 continue;
3720 }
3721
3722 /*
3723 * we've got a page that we can steal...
3724 * eliminate all mappings and make sure
3725 * we have the up-to-date modified state
3726 *
3727 * if we need to do a pmap_disconnect then we
3728 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3729 * provides the true state atomically... the
3730 * page was still mapped up to the pmap_disconnect
3731 * and may have been dirtied at the last microsecond
3732 *
 * Note that if 'pmapped' is FALSE then the page is not,
 * and has not been, in any map, so there is no point calling
3735 * pmap_disconnect(). m->vmp_dirty could have been set in anticipation
3736 * of likely usage of the page.
3737 */
3738 if (m->vmp_pmapped == TRUE) {
3739 int pmap_options;
3740
3741 /*
3742 * Don't count this page as going into the compressor
3743 * if any of these are true:
3744 * 1) compressed pager isn't enabled
3745 * 2) Freezer enabled device with compressed pager
3746 * backend (exclusive use) i.e. most of the VM system
3747 * (including vm_pageout_scan) has no knowledge of
3748 * the compressor
3749 * 3) This page belongs to a file and hence will not be
3750 * sent into the compressor
3751 */
3752 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3753 object->internal == FALSE) {
3754 pmap_options = 0;
3755 } else if (m->vmp_dirty || m->vmp_precious) {
3756 /*
3757 * VM knows that this page is dirty (or
3758 * precious) and needs to be compressed
3759 * rather than freed.
3760 * Tell the pmap layer to count this page
3761 * as "compressed".
3762 */
3763 pmap_options = PMAP_OPTIONS_COMPRESSOR;
3764 } else {
3765 /*
3766 * VM does not know if the page needs to
3767 * be preserved but the pmap layer might tell
3768 * us if any mapping has "modified" it.
 * Let the pmap layer count this page
3770 * as compressed if and only if it has been
3771 * modified.
3772 */
3773 pmap_options =
3774 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3775 }
3776 refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3777 pmap_options,
3778 NULL);
3779 if (refmod_state & VM_MEM_MODIFIED) {
3780 SET_PAGE_DIRTY(m, FALSE);
3781 }
3782 }
3783
3784 /*
3785 * reset our count of pages that have been reclaimed
3786 * since the last page was 'stolen'
3787 */
3788 inactive_reclaim_run = 0;
3789
3790 /*
3791 * If it's clean and not precious, we can free the page.
3792 */
3793 if (!m->vmp_dirty && !m->vmp_precious) {
3794 vm_pageout_state.vm_pageout_inactive_clean++;
3795
3796 /*
3797 * OK, at this point we have found a page we are going to free.
3798 */
3799 #if CONFIG_PHANTOM_CACHE
3800 if (!object->internal) {
3801 vm_phantom_cache_add_ghost(m);
3802 }
3803 #endif
3804 goto reclaim_page;
3805 }
3806
3807 /*
3808 * The page may have been dirtied since the last check
3809 * for a throttled target queue (which may have been skipped
3810 * if the page was clean then). With the dirty page
3811 * disconnected here, we can make one final check.
3812 */
3813 if (object->internal) {
3814 if (VM_PAGE_Q_THROTTLED(iq)) {
3815 inactive_throttled = TRUE;
3816 }
3817 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3818 inactive_throttled = TRUE;
3819 }
3820
3821 if (inactive_throttled == TRUE) {
3822 goto throttle_inactive;
3823 }
3824 #if !CONFIG_JETSAM
3825 memorystatus_update_available_page_count(AVAILABLE_NON_COMPRESSED_MEMORY);
3826 #endif /* !CONFIG_JETSAM */
3827
3828 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3829 VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3830 }
3831
3832 if (object->internal) {
3833 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3834 } else {
3835 vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3836 }
3837
3838 /*
3839 * internal pages will go to the compressor...
3840 * external pages will go to the appropriate pager to be cleaned
3841 * and upon completion will end up on 'vm_page_queue_cleaned' which
3842 * is a preferred queue to steal from
3843 */
3844 vm_pageout_cluster(m);
3845 inactive_burst_count = 0;
3846
3847 /*
3848 * back to top of pageout scan loop
3849 */
3850 }
3851 }
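
/*
 * Editor's summary of the victim-handling ladder above, derived from the
 * code in this loop rather than from any separate specification:
 *
 *	busy                            -> requeue and move on
 *	cleaning or free_when_done      -> leave off the paging queues
 *	absent / error / dead object    -> reclaim_page: push onto local_freeq
 *	referenced (or hot xpmapped)    -> reactivate_page, up to reactivate_limit
 *	dirty/precious, queue throttled -> throttle_inactive: park or reactivate
 *	dirty/precious otherwise        -> vm_pageout_cluster: compressor or pager
 *	clean and not precious          -> reclaim_page: free it
 */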
3852
3853
3854 void
3855 vm_page_free_reserve(
3856 int pages)
3857 {
3858 int free_after_reserve;
3859
3860 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3861 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3862 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3863 } else {
3864 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3865 }
3866 } else {
3867 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3868 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3869 } else {
3870 vm_page_free_reserved += pages;
3871 }
3872 }
3873 free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3874
3875 vm_page_free_min = vm_page_free_reserved +
3876 VM_PAGE_FREE_MIN(free_after_reserve);
3877
3878 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3879 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3880 }
3881
3882 vm_page_free_target = vm_page_free_reserved +
3883 VM_PAGE_FREE_TARGET(free_after_reserve);
3884
3885 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3886 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3887 }
3888
3889 if (vm_page_free_target < vm_page_free_min + 5) {
3890 vm_page_free_target = vm_page_free_min + 5;
3891 }
3892
3893 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3894 }
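
/*
 * A sketch of the relationships established above (illustrative only;
 * the VM_PAGE_FREE_* macros are platform-tuned and not reproduced here).
 * Assuming free_after_reserve is non-negative:
 *
 *	vm_page_free_reserved <= vm_page_free_min <= vm_page_free_target
 *	vm_page_free_target   >= vm_page_free_min + 5
 *	vm_page_throttle_limit = vm_page_free_target - vm_page_free_target / 2
 *
 * For example, with a hypothetical free target of 2000 pages, the
 * throttle limit would land at 1000 pages, i.e. half the target.
 */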
3895
3896 /*
3897 * vm_pageout is the high level pageout daemon.
3898 */
3899
3900 void
3901 vm_pageout_continue(void)
3902 {
3903 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3904 VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3905
3906 vm_free_page_lock();
3907 vm_pageout_running = TRUE;
3908 vm_free_page_unlock();
3909
3910 vm_pageout_scan();
3911 /*
3912 * we hold both the vm_page_queue_free_lock
3913 * and the vm_page_queues_lock at this point
3914 */
3915 assert(vm_page_free_wanted == 0);
3916 assert(vm_page_free_wanted_privileged == 0);
3917 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3918
3919 vm_pageout_running = FALSE;
3920 #if XNU_TARGET_OS_OSX
3921 if (vm_pageout_waiter) {
3922 vm_pageout_waiter = FALSE;
3923 thread_wakeup((event_t)&vm_pageout_waiter);
3924 }
3925 #endif /* XNU_TARGET_OS_OSX */
3926
3927 vm_free_page_unlock();
3928 vm_page_unlock_queues();
3929
3930 thread_block((thread_continue_t)vm_pageout_continue);
3931 /*NOTREACHED*/
3932 }
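
/*
 * Note on the continuation pattern above: vm_pageout_continue() never
 * returns. Each pass runs vm_pageout_scan(), parks the daemon on the
 * &vm_page_free_wanted event, and names itself as the continuation, so
 * a wakeup restarts execution at the top of this function on a fresh
 * kernel stack instead of resuming a saved frame.
 */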
3933
3934 #if XNU_TARGET_OS_OSX
3935 kern_return_t
3936 vm_pageout_wait(uint64_t deadline)
3937 {
3938 kern_return_t kr;
3939
3940 vm_free_page_lock();
3941 for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
3942 vm_pageout_waiter = TRUE;
3943 if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3944 &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3945 (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3946 kr = KERN_OPERATION_TIMED_OUT;
3947 }
3948 }
3949 vm_free_page_unlock();
3950
3951 return kr;
3952 }
3953 #endif /* XNU_TARGET_OS_OSX */
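
/*
 * Illustrative caller sketch for vm_pageout_wait() (hypothetical, not an
 * existing call site): wait up to one second for the in-flight pageout
 * pass to finish.
 *
 *	uint64_t interval, deadline;
 *	nanoseconds_to_absolutetime(NSEC_PER_SEC, &interval);
 *	deadline = mach_absolute_time() + interval;
 *	if (vm_pageout_wait(deadline) == KERN_OPERATION_TIMED_OUT) {
 *		// pageout was still running when the deadline passed
 *	}
 */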
3954
3955 OS_NORETURN
3956 static void
3957 vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
3958 {
3959 vm_page_t m = NULL;
3960 vm_object_t object;
3961 vm_object_offset_t offset;
3962 memory_object_t pager;
3963 struct vm_pageout_queue *q = ethr->q;
3964
/*
 * On systems with a compressor, the external IO thread clears its
 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
 * creation).
 */
3969 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3970 current_thread()->options &= ~TH_OPT_VMPRIV;
3971 }
3972
3973 sched_cond_ack(&(ethr->pgo_wakeup));
3974
3975 while (true) {
3976 vm_page_lockspin_queues();
3977
3978 while (!vm_page_queue_empty(&q->pgo_pending)) {
3979 q->pgo_busy = TRUE;
3980 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3981
3982 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3983 VM_PAGE_CHECK(m);
3984 /*
3985 * grab a snapshot of the object and offset this
3986 * page is tabled in so that we can relookup this
3987 * page after we've taken the object lock - these
3988 * fields are stable while we hold the page queues lock
3989 * but as soon as we drop it, there is nothing to keep
3990 * this page in this object... we hold an activity_in_progress
3991 * on this object which will keep it from terminating
3992 */
3993 object = VM_PAGE_OBJECT(m);
3994 offset = m->vmp_offset;
3995
3996 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3997 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3998
3999 vm_page_unlock_queues();
4000
4001 vm_object_lock(object);
4002
4003 m = vm_page_lookup(object, offset);
4004
4005 if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
4006 !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
4007 /*
4008 * it's either the same page that someone else has
4009 * started cleaning (or it's finished cleaning or
4010 * been put back on the pageout queue), or
4011 * the page has been freed or we have found a
4012 * new page at this offset... in all of these cases
4013 * we merely need to release the activity_in_progress
4014 * we took when we put the page on the pageout queue
4015 */
4016 vm_object_activity_end(object);
4017 vm_object_unlock(object);
4018
4019 vm_page_lockspin_queues();
4020 continue;
4021 }
4022 pager = object->pager;
4023
4024 if (pager == MEMORY_OBJECT_NULL) {
4025 /*
4026 * This pager has been destroyed by either
4027 * memory_object_destroy or vm_object_destroy, and
4028 * so there is nowhere for the page to go.
4029 */
4030 if (m->vmp_free_when_done) {
4031 /*
4032 * Just free the page... VM_PAGE_FREE takes
4033 * care of cleaning up all the state...
4034 * including doing the vm_pageout_throttle_up
4035 */
4036 VM_PAGE_FREE(m);
4037 } else {
4038 vm_page_lockspin_queues();
4039
4040 vm_pageout_throttle_up(m);
4041 vm_page_activate(m);
4042
4043 vm_page_unlock_queues();
4044
4045 /*
4046 * And we are done with it.
4047 */
4048 }
4049 vm_object_activity_end(object);
4050 vm_object_unlock(object);
4051
4052 vm_page_lockspin_queues();
4053 continue;
4054 }
4055 #if 0
4056 /*
4057 * we don't hold the page queue lock
4058 * so this check isn't safe to make
4059 */
4060 VM_PAGE_CHECK(m);
4061 #endif
/*
 * give back the activity_in_progress reference we
 * took when we queued up this page and replace it
 * with a paging_in_progress reference that will
 * also keep the paging offset from changing and
 * prevent the object from terminating
 */
4069 vm_object_activity_end(object);
4070 vm_object_paging_begin(object);
4071 vm_object_unlock(object);
4072
4073 /*
4074 * Send the data to the pager.
4075 * any pageout clustering happens there
4076 */
4077 memory_object_data_return(pager,
4078 m->vmp_offset + object->paging_offset,
4079 PAGE_SIZE,
4080 NULL,
4081 NULL,
4082 FALSE,
4083 FALSE,
4084 0);
4085
4086 vm_object_lock(object);
4087 vm_object_paging_end(object);
4088 vm_object_unlock(object);
4089
4090 vm_pageout_io_throttle();
4091
4092 vm_page_lockspin_queues();
4093 }
4094 q->pgo_busy = FALSE;
4095
4096 vm_page_unlock_queues();
4097 sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
4098 }
4099 /*NOTREACHED*/
4100 }
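
/*
 * Locking shape of the loop above: the page queues lock is held only while
 * manipulating pgo_pending; the activity_in_progress reference taken when
 * the page was queued keeps the object alive across the unlocked window,
 * and is traded for a paging_in_progress reference for the duration of the
 * memory_object_data_return() call.
 */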
4101
4102 uint32_t vm_compressor_time_thread; /* Set via sysctl 'vm.compressor_timing_enabled' to record time accrued by this thread. */
4103
4104 #if DEVELOPMENT || DEBUG
4105 static void
4106 vm_pageout_record_thread_time(int cqid, int ncomps)
4107 {
4108 if (__improbable(vm_compressor_time_thread)) {
4109 vmct_stats.vmct_runtimes[cqid] = thread_get_runtime_self();
4110 vmct_stats.vmct_pages[cqid] += ncomps;
4111 vmct_stats.vmct_iterations[cqid]++;
4112 if (ncomps > vmct_stats.vmct_maxpages[cqid]) {
4113 vmct_stats.vmct_maxpages[cqid] = ncomps;
4114 }
4115 if (ncomps < vmct_stats.vmct_minpages[cqid]) {
4116 vmct_stats.vmct_minpages[cqid] = ncomps;
4117 }
4118 }
4119 }
4120 #endif
4121
4122 static void *
4123 vm_pageout_select_filling_chead(struct pgo_iothread_state *cq, vm_page_t m)
4124 {
4125 /*
4126 * Technically we need the pageq locks to manipulate the vmp_on_specialq field.
4127 * However, this page has been removed from all queues and is only
4128 * known to this compressor thread dealing with this local queue.
4129 *
4130 * TODO: Add a second localq that is the early localq and
4131 * put special pages like this one on that queue in the block above
4132 * under the pageq lock to avoid this 'works but not clean' logic.
4133 */
4134 void *donate_queue_head;
4135 #if XNU_TARGET_OS_OSX /* tag:DONATE */
4136 donate_queue_head = &cq->current_early_swapout_chead;
4137 #else /* XNU_TARGET_OS_OSX */
4138 donate_queue_head = &cq->current_late_swapout_chead;
4139 #endif /* XNU_TARGET_OS_OSX */
4140 if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4141 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4142 return donate_queue_head;
4143 } else {
4144 return &cq->current_regular_swapout_chead;
4145 }
4146 }
4147
4148 #define MAX_FREE_BATCH 32
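
/*
 * Pages compressed by the internal iothread below are collected on a local
 * freeq and returned with vm_page_free_list() in batches of up to
 * MAX_FREE_BATCH, so the free-page machinery is entered once per batch
 * rather than once per page.
 */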
4149
4150 OS_NORETURN
4151 static void
4152 vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
4153 {
4154 struct vm_pageout_queue *q;
4155 vm_page_t m = NULL;
4156 boolean_t pgo_draining;
4157 vm_page_t local_q;
4158 int local_cnt;
4159 vm_page_t local_freeq = NULL;
4160 int local_freed = 0;
4161 int local_batch_size;
4162 #if DEVELOPMENT || DEBUG
4163 int ncomps = 0;
4164 boolean_t marked_active = FALSE;
4165 int num_pages_processed = 0;
4166 #endif
4167 void *chead = NULL;
4168
4169 KDBG_FILTERED(0xe040000c | DBG_FUNC_END);
4170
4171 sched_cond_ack(&(cq->pgo_wakeup));
4172
4173 q = cq->q;
4174
while (true) { /* this top loop is for the compressor_running_perf_test running at full speed without blocking */
4176 #if DEVELOPMENT || DEBUG
4177 bool benchmark_accounting = false;
4178 /* If we're running the compressor perf test, only process the benchmark pages.
4179 * We'll get back to our regular queue once the benchmark is done */
4180 if (compressor_running_perf_test) {
4181 q = cq->benchmark_q;
4182 if (!vm_page_queue_empty(&q->pgo_pending)) {
4183 benchmark_accounting = true;
4184 } else {
4185 q = cq->q;
4186 benchmark_accounting = false;
4187 }
4188 }
4189 #endif /* DEVELOPMENT || DEBUG */
4190
4191 #if __AMP__
4192 if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
4193 local_batch_size = (q->pgo_maxlaundry >> 3);
4194 local_batch_size = MAX(local_batch_size, 16);
4195 } else {
4196 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4197 }
4198 #else
4199 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4200 #endif
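
/*
 * Illustrative arithmetic with hypothetical numbers: given a pgo_maxlaundry
 * of 128 pages and 2 compressor threads, the default path above yields
 * local_batch_size = 128 / (2 * 2) = 32; on an AMP system with the
 * compressor bound to the E-cluster and more than one compressor thread,
 * it would instead be MAX(128 >> 3, 16) = 16.
 */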
4201
4202 #if RECORD_THE_COMPRESSED_DATA
4203 if (q->pgo_laundry) {
4204 c_compressed_record_init();
4205 }
4206 #endif
while (true) { /* this loop is for working through all the pages in the pending queue */
4208 int pages_left_on_q = 0;
4209
4210 local_cnt = 0;
4211 local_q = NULL;
4212
4213 KDBG_FILTERED(0xe0400014 | DBG_FUNC_START);
4214
4215 vm_page_lock_queues();
4216 #if DEVELOPMENT || DEBUG
4217 if (marked_active == FALSE) {
4218 vmct_active++;
4219 vmct_state[cq->id] = VMCT_ACTIVE;
4220 marked_active = TRUE;
4221 if (vmct_active == 1) {
4222 vm_compressor_epoch_start = mach_absolute_time();
4223 }
4224 }
4225 #endif
4226 KDBG_FILTERED(0xe0400014 | DBG_FUNC_END);
4227
4228 KDBG_FILTERED(0xe0400018 | DBG_FUNC_START, q->pgo_laundry);
4229
4230 /* empty the entire content of the thread input q to local_q, but not more than local_batch_size pages */
4231 while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4232 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4233 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4234 VM_PAGE_CHECK(m);
4235
4236 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4237 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4238 m->vmp_laundry = FALSE;
4239
4240 m->vmp_snext = local_q;
4241 local_q = m;
4242 local_cnt++;
4243 }
4244 if (local_q == NULL) {
4245 break;
4246 }
4247
4248 q->pgo_busy = TRUE;
4249
4250 if ((pgo_draining = q->pgo_draining) == FALSE) {
4251 vm_pageout_throttle_up_batch(q, local_cnt);
4252 pages_left_on_q = q->pgo_laundry;
4253 } else {
4254 pages_left_on_q = q->pgo_laundry - local_cnt;
4255 }
4256
4257 vm_page_unlock_queues();
4258
4259 #if !RECORD_THE_COMPRESSED_DATA
/* if we have lots to compress, wake up the next compressor thread to help.
 * disabled when recording data, since the recorded data is not protected by a mutex and this could race */
4262 if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4263 // wake up the next compressor thread
4264 sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
4265 pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4266 }
4267 #endif
4268 KDBG_FILTERED(0xe0400018 | DBG_FUNC_END, q->pgo_laundry);
4269
4270 while (local_q) {
4271 KDBG_FILTERED(0xe0400024 | DBG_FUNC_START, local_cnt);
4272
4273 m = local_q;
4274 local_q = m->vmp_snext;
4275 m->vmp_snext = NULL;
4276
4277
4278 chead = vm_pageout_select_filling_chead(cq, m);
4279
4280 if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4281 #if DEVELOPMENT || DEBUG
4282 ncomps++;
4283 #endif
4284 KDBG_FILTERED(0xe0400024 | DBG_FUNC_END, local_cnt);
4285
4286 m->vmp_snext = local_freeq;
4287 local_freeq = m;
4288 local_freed++;
4289
4290 /* if we gathered enough free pages, free them now */
4291 if (local_freed >= MAX_FREE_BATCH) {
4292 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4293
4294 vm_page_free_list(local_freeq, TRUE);
4295
4296 local_freeq = NULL;
4297 local_freed = 0;
4298 }
4299 }
4300 #if DEVELOPMENT || DEBUG
4301 num_pages_processed++;
4302 #endif /* DEVELOPMENT || DEBUG */
#if !CONFIG_JETSAM /* if there's no jetsam, be more proactive in waking up anybody that needs free pages */
4304 while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4305 kern_return_t wait_result;
4306 int need_wakeup = 0;
4307
4308 if (local_freeq) {
4309 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4310
4311 vm_page_free_list(local_freeq, TRUE);
4312 local_freeq = NULL;
4313 local_freed = 0;
4314
4315 continue;
4316 }
4317 vm_free_page_lock_spin();
4318
4319 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4320 if (vm_page_free_wanted_privileged++ == 0) {
4321 need_wakeup = 1;
4322 }
4323 wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4324
4325 vm_free_page_unlock();
4326
4327 if (need_wakeup) {
4328 thread_wakeup((event_t)&vm_page_free_wanted);
4329 }
4330
4331 if (wait_result == THREAD_WAITING) {
4332 thread_block(THREAD_CONTINUE_NULL);
4333 }
4334 } else {
4335 vm_free_page_unlock();
4336 }
4337 }
4338 #endif
4339 } /* while (local_q) */
4340 /* free any leftovers in the freeq */
4341 if (local_freeq) {
4342 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4343
4344 vm_page_free_list(local_freeq, TRUE);
4345 local_freeq = NULL;
4346 local_freed = 0;
4347 }
4348 if (pgo_draining == TRUE) {
4349 vm_page_lockspin_queues();
4350 vm_pageout_throttle_up_batch(q, local_cnt);
4351 vm_page_unlock_queues();
4352 }
4353 }
4354 KDBG_FILTERED(0xe040000c | DBG_FUNC_START);
4355
4356 /*
4357 * queue lock is held and our q is empty
4358 */
4359 q->pgo_busy = FALSE;
4360 #if DEVELOPMENT || DEBUG
4361 if (marked_active == TRUE) {
4362 vmct_active--;
4363 vmct_state[cq->id] = VMCT_IDLE;
4364
4365 if (vmct_active == 0) {
4366 vm_compressor_epoch_stop = mach_absolute_time();
4367 assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4368 "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4369 vm_compressor_epoch_start, vm_compressor_epoch_stop);
4370 /* This interval includes intervals where one or more
4371 * compressor threads were pre-empted
4372 */
4373 vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4374 }
4375 }
4376 if (compressor_running_perf_test && benchmark_accounting) {
4377 /*
4378 * We could turn ON compressor_running_perf_test while still processing
4379 * regular non-benchmark pages. We shouldn't count them here else we
4380 * could overshoot. We might also still be populating that benchmark Q
4381 * and be under pressure. So we will go back to the regular queues. And
4382 * benchmark accounting will be off for that case too.
4383 */
4384 compressor_perf_test_pages_processed += num_pages_processed;
4385 thread_wakeup(&compressor_perf_test_pages_processed);
4386 }
4387 #endif
4388 vm_page_unlock_queues();
4389 #if DEVELOPMENT || DEBUG
4390 vm_pageout_record_thread_time(cq->id, ncomps);
4391 #endif
4392
4393 KDBG_FILTERED(0xe0400018 | DBG_FUNC_END);
4394 #if DEVELOPMENT || DEBUG
4395 if (compressor_running_perf_test && benchmark_accounting) {
4396 /*
4397 * We've been exclusively compressing pages from the benchmark queue,
4398 * do 1 pass over the internal queue before blocking.
4399 */
4400 continue;
4401 }
4402 #endif
4403
4404 sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4405 }
4406 /*NOTREACHED*/
4407 }
4408
/* resolves the pager and maintains stats in the pager and in the vm_object */
4410 kern_return_t
4411 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4412 {
4413 vm_object_t object;
4414 memory_object_t pager;
4415 int compressed_count_delta;
4416 kern_return_t retval;
4417
4418 object = VM_PAGE_OBJECT(m);
4419
4420 assert(!m->vmp_free_when_done);
4421 assert(!m->vmp_laundry);
4422
4423 pager = object->pager;
4424
4425 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4426 KDBG_FILTERED(0xe0400010 | DBG_FUNC_START, object, pager);
4427
4428 vm_object_lock(object);
4429
4430 /*
4431 * If there is no memory object for the page, create
4432 * one and hand it to the compression pager.
4433 */
4434
4435 if (!object->pager_initialized) {
4436 vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4437 }
4438 if (!object->pager_initialized) {
4439 vm_object_compressor_pager_create(object);
4440 }
4441
4442 pager = object->pager;
4443
4444 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4445 /*
4446 * Still no pager for the object,
4447 * or the pager has been destroyed.
4448 * Reactivate the page.
4449 *
4450 * Should only happen if there is no
4451 * compression pager
4452 */
4453 vm_page_wakeup_done(object, m);
4454
4455 vm_page_lockspin_queues();
4456 vm_page_activate(m);
4457 VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4458 vm_page_unlock_queues();
4459
4460 /*
4461 * And we are done with it.
4462 */
4463 vm_object_activity_end(object);
4464 vm_object_unlock(object);
4465
4466 return KERN_FAILURE;
4467 }
4468 vm_object_unlock(object);
4469
4470 KDBG_FILTERED(0xe0400010 | DBG_FUNC_END, object, pager);
4471 }
4472 assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4473 assert(object->activity_in_progress > 0);
4474
4475 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4476 if (m->vmp_unmodified_ro == true) {
4477 os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed);
4478 }
4479 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4480
4481 vm_compressor_options_t flags = 0;
4482
4483 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4484 if (m->vmp_unmodified_ro) {
4485 flags |= C_PAGE_UNMODIFIED;
4486 }
4487 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4488
4489
4490 retval = vm_compressor_pager_put(
4491 pager,
4492 m->vmp_offset + object->paging_offset,
4493 VM_PAGE_GET_PHYS_PAGE(m),
4494 current_chead,
4495 scratch_buf,
4496 &compressed_count_delta,
4497 flags);
4498
4499 vm_object_lock(object);
4500
4501 assert(object->activity_in_progress > 0);
4502 assert(VM_PAGE_OBJECT(m) == object);
4503 assert( !VM_PAGE_WIRED(m));
4504
4505 vm_compressor_pager_count(pager,
4506 compressed_count_delta,
4507 FALSE, /* shared_lock */
4508 object);
4509
4510 if (retval == KERN_SUCCESS) {
4511 /*
4512 * If the object is purgeable, its owner's
4513 * purgeable ledgers will be updated in
4514 * vm_page_remove() but the page still
4515 * contributes to the owner's memory footprint,
4516 * so account for it as such.
4517 */
4518 if (m->vmp_tabled) {
4519 vm_page_remove(m, TRUE);
4520 }
4521 if ((object->purgable != VM_PURGABLE_DENY ||
4522 object->vo_ledger_tag) &&
4523 object->vo_owner != NULL) {
4524 /* one more compressed purgeable/tagged page */
4525 vm_object_owner_compressed_update(object,
4526 compressed_count_delta);
4527 }
4528 counter_inc(&vm_statistics_compressions);
4529 } else {
4530 vm_page_wakeup_done(object, m);
4531
4532 vm_page_lockspin_queues();
4533
4534 vm_page_activate(m);
4535 vm_pageout_vminfo.vm_compressor_failed++;
4536
4537 vm_page_unlock_queues();
4538 }
4539 vm_object_activity_end(object);
4540 vm_object_unlock(object);
4541
4542 return retval;
4543 }
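
/*
 * Illustrative caller sketch (the real call site is the loop in
 * vm_pageout_iothread_internal_continue() above): the compressor thread
 * picks a compressed-chunk head for the page, and only frees the page
 * when the put succeeds.
 *
 *	chead = vm_pageout_select_filling_chead(cq, m);
 *	if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
 *		m->vmp_snext = local_freeq;
 *		local_freeq = m;
 *		local_freed++;
 *	}
 */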
4544
4545
4546 static void
4547 vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4548 {
4549 uint32_t policy;
4550
4551 if (hibernate_cleaning_in_progress == TRUE) {
4552 req_lowpriority = FALSE;
4553 }
4554
4555 if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4556 vm_page_unlock_queues();
4557
4558 if (req_lowpriority == TRUE) {
4559 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4560 DTRACE_VM(laundrythrottle);
4561 } else {
4562 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4563 DTRACE_VM(laundryunthrottle);
4564 }
4565 proc_set_thread_policy(ethr->pgo_iothread,
4566 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4567
4568 vm_page_lock_queues();
4569 ethr->q->pgo_lowpriority = req_lowpriority;
4570 }
4571 }
4572
4573 OS_NORETURN
4574 static void
4575 vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
4576 {
4577 thread_t self = current_thread();
4578
4579 self->options |= TH_OPT_VMPRIV;
4580
4581 DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4582
4583 proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4584 TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4585
4586 vm_page_lock_queues();
4587
4588 vm_pageout_queue_external.pgo_lowpriority = TRUE;
4589 vm_pageout_queue_external.pgo_inited = TRUE;
4590
4591 vm_page_unlock_queues();
4592
4593 #if CONFIG_THREAD_GROUPS
4594 thread_group_vm_add();
4595 #endif /* CONFIG_THREAD_GROUPS */
4596
4597 vm_pageout_iothread_external_continue(ethr, 0);
4598 /*NOTREACHED*/
4599 }
4600
4601
4602 OS_NORETURN
4603 static void
4604 vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
4605 {
4606 thread_t self = current_thread();
4607
4608 self->options |= TH_OPT_VMPRIV;
4609
4610 vm_page_lock_queues();
4611
4612 vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4613 vm_pageout_queue_internal.pgo_inited = TRUE;
4614
4615 #if DEVELOPMENT || DEBUG
4616 vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
4617 vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
4618 vm_pageout_queue_benchmark.pgo_busy = FALSE;
4619 #endif /* DEVELOPMENT || DEBUG */
4620
4621 vm_page_unlock_queues();
4622
4623 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4624 thread_vm_bind_group_add();
4625 }
4626
4627 #if CONFIG_THREAD_GROUPS
4628 thread_group_vm_add();
4629 #endif /* CONFIG_THREAD_GROUPS */
4630
4631 #if __AMP__
4632 if (vm_compressor_ebound) {
4633 /*
4634 * Use the soft bound option for vm_compressor to allow it to run on
4635 * P-cores if E-cluster is unavailable.
4636 */
4637 thread_soft_bind_cluster_type(self, 'E');
4638 }
4639 #endif /* __AMP__ */
4640
4641 thread_set_thread_name(current_thread(), "VM_compressor");
4642 #if DEVELOPMENT || DEBUG
4643 vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
4644 #endif
4645 vm_pageout_iothread_internal_continue(cthr, 0);
4646
4647 /*NOTREACHED*/
4648 }
4649
4650 kern_return_t
4651 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4652 {
4653 if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4654 return KERN_SUCCESS;
4655 } else {
4656 return KERN_FAILURE; /* Already set */
4657 }
4658 }
4659
4660 extern boolean_t memorystatus_manual_testing_on;
4661 extern unsigned int memorystatus_level;
4662
4663
4664 #if VM_PRESSURE_EVENTS
4665
4666 boolean_t vm_pressure_events_enabled = FALSE;
4667
4668 extern uint64_t next_warning_notification_sent_at_ts;
4669 extern uint64_t next_critical_notification_sent_at_ts;
4670
4671 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */
4672
/*
 * The last time there was a change in pressure level, OR the last time we
 * forced a check because the system was stuck in a non-normal pressure level.
 */
4677 uint64_t vm_pressure_last_level_transition_abs = 0;
4678
/*
 * This is how long the system waits 'stuck' in an unchanged non-normal pressure
 * level before re-sending notifications for that level.
 */
4683 int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
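
/*
 * Illustrative conversion, mirroring the math in vm_pressure_response()
 * below: the absolute-time delta since the last transition is converted to
 * nanoseconds, then to whole minutes, and compared against the threshold.
 * For example, a delta of 1.9e12 ns is ~1900 seconds, i.e. 31 whole
 * minutes, which exceeds the default 30-minute threshold and forces a
 * re-notification check.
 */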
4684
4685 void
4686 vm_pressure_response(void)
4687 {
4688 vm_pressure_level_t old_level = kVMPressureNormal;
4689 int new_level = -1;
4690 unsigned int total_pages;
4691 uint64_t available_memory = 0;
4692 uint64_t curr_ts, abs_time_since_level_transition, time_in_ns;
4693 bool force_check = false;
4694 int time_in_mins;
4695
4696
4697 if (vm_pressure_events_enabled == FALSE) {
4698 return;
4699 }
4700
4701 available_memory = (uint64_t) memorystatus_get_available_page_count();
4702
4703 total_pages = (unsigned int) atop_64(max_mem);
4704 #if CONFIG_SECLUDED_MEMORY
4705 total_pages -= vm_page_secluded_count;
4706 #endif /* CONFIG_SECLUDED_MEMORY */
4707 memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4708
4709 if (memorystatus_manual_testing_on) {
4710 return;
4711 }
4712
4713 curr_ts = mach_absolute_time();
4714 abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4715
4716 absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4717 time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4718 force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
4719
4720 old_level = memorystatus_vm_pressure_level;
4721
4722 switch (memorystatus_vm_pressure_level) {
4723 case kVMPressureNormal:
4724 {
4725 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4726 new_level = kVMPressureCritical;
4727 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4728 new_level = kVMPressureWarning;
4729 }
4730 break;
4731 }
4732
4733 case kVMPressureWarning:
4734 case kVMPressureUrgent:
4735 {
4736 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4737 new_level = kVMPressureNormal;
4738 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4739 new_level = kVMPressureCritical;
4740 } else if (force_check) {
4741 new_level = kVMPressureWarning;
4742 next_warning_notification_sent_at_ts = curr_ts;
4743 }
4744 break;
4745 }
4746
4747 case kVMPressureCritical:
4748 {
4749 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4750 new_level = kVMPressureNormal;
4751 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4752 new_level = kVMPressureWarning;
4753 } else if (force_check) {
4754 new_level = kVMPressureCritical;
4755 next_critical_notification_sent_at_ts = curr_ts;
4756 }
4757 break;
4758 }
4759
4760 default:
4761 return;
4762 }
4763
4764 if (new_level != -1 || force_check) {
4765 if (new_level != -1) {
4766 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4767
4768 if (new_level != (int) old_level) {
4769 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4770 new_level, old_level, 0, 0);
4771 }
4772 } else {
4773 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4774 new_level, old_level, force_check, 0);
4775 }
4776
4777 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
/*
 * We don't want to schedule a wakeup while hibernation is in progress
 * because that could collide with checks for non-monotonicity in the scheduler.
 * We do, however, perform all the updates to memorystatus_vm_pressure_level
 * because we _might_ want to use that for decisions regarding which pages, or
 * how many pages, we want to dump in hibernation.
 */
4785 return;
4786 }
4787
4788 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4789 if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4790 thread_wakeup(&vm_pressure_thread);
4791 }
4792
4793 if (old_level != memorystatus_vm_pressure_level) {
4794 thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4795 }
4796 vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4797 }
4798 }
4799 }
4800 #endif /* VM_PRESSURE_EVENTS */
4801
4802
4803 /**
4804 * Called by a kernel thread to ask if a number of pages may be wired.
4805 */
4806 kern_return_t
4807 mach_vm_wire_level_monitor(int64_t requested_pages)
4808 {
4809 if (requested_pages <= 0) {
4810 return KERN_INVALID_ARGUMENT;
4811 }
4812
4813 const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4814 /**
4815 * Available pages can be negative in the case where more system memory is
4816 * wired than the threshold, so we must use a signed integer.
4817 */
4818 const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4819
4820 if (requested_pages > available_pages) {
4821 return KERN_RESOURCE_SHORTAGE;
4822 }
4823 return KERN_SUCCESS;
4824 }
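
/*
 * Hypothetical usage sketch (not an existing call site): a caller that
 * wants to wire npages can probe the headroom first and back off on
 * KERN_RESOURCE_SHORTAGE.
 *
 *	while (npages > 0 &&
 *	    mach_vm_wire_level_monitor(npages) == KERN_RESOURCE_SHORTAGE) {
 *		npages /= 2;	// retry with a smaller working set
 *	}
 */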
4825
4826 /*
4827 * Function called by a kernel thread to either get the current pressure level or
4828 * wait until memory pressure changes from a given level.
4829 */
4830 kern_return_t
4831 mach_vm_pressure_level_monitor(boolean_t wait_for_pressure, unsigned int *pressure_level)
4832 {
4833 #if !VM_PRESSURE_EVENTS
4834 (void)wait_for_pressure;
4835 (void)pressure_level;
4836 return KERN_NOT_SUPPORTED;
4837 #else /* VM_PRESSURE_EVENTS */
4838
4839 uint32_t *waiters = NULL;
4840 wait_result_t wr = 0;
4841 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4842
4843 if (pressure_level == NULL) {
4844 return KERN_INVALID_ARGUMENT;
4845 }
4846 if (!wait_for_pressure && (*pressure_level == kVMPressureBackgroundJetsam ||
4847 *pressure_level == kVMPressureForegroundJetsam)) {
4848 return KERN_INVALID_ARGUMENT;
4849 }
4850
4851 if (wait_for_pressure) {
4852 switch (*pressure_level) {
4853 case kVMPressureForegroundJetsam:
4854 case kVMPressureBackgroundJetsam:
4855
4856 if (*pressure_level == kVMPressureForegroundJetsam) {
4857 waiters = &memorystatus_jetsam_fg_band_waiters;
4858 } else {
4859 /* kVMPressureBackgroundJetsam */
4860 waiters = &memorystatus_jetsam_bg_band_waiters;
4861 }
4862
4863 lck_mtx_lock(&memorystatus_jetsam_broadcast_lock);
4864 wr = assert_wait((event_t)waiters, THREAD_INTERRUPTIBLE);
4865 if (wr == THREAD_WAITING) {
4866 *waiters += 1;
4867 lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4868 wr = thread_block(THREAD_CONTINUE_NULL);
4869 } else {
4870 lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4871 }
4872
4873 if (wr != THREAD_AWAKENED) {
4874 return KERN_ABORTED;
4875 }
4876
4877 return KERN_SUCCESS;
4878 case kVMPressureNormal:
4879 case kVMPressureWarning:
4880 case kVMPressureUrgent:
4881 case kVMPressureCritical:
4882 while (old_level == *pressure_level) {
4883 wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4884 THREAD_INTERRUPTIBLE);
4885 if (wr == THREAD_WAITING) {
4886 wr = thread_block(THREAD_CONTINUE_NULL);
4887 }
4888 if (wr == THREAD_INTERRUPTED) {
4889 return KERN_ABORTED;
4890 }
4891
4892 if (wr == THREAD_AWAKENED) {
4893 old_level = memorystatus_vm_pressure_level;
4894 }
4895 }
4896 break;
4897 default:
4898 return KERN_INVALID_ARGUMENT;
4899 }
4900 }
4901
4902 *pressure_level = old_level;
4903 return KERN_SUCCESS;
4904 #endif /* VM_PRESSURE_EVENTS */
4905 }
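
/*
 * Hypothetical usage sketches (not existing call sites):
 *
 *	// poll the current level without blocking
 *	unsigned int level = kVMPressureNormal;
 *	kern_return_t kr = mach_vm_pressure_level_monitor(FALSE, &level);
 *
 *	// block until the level changes away from normal
 *	level = kVMPressureNormal;
 *	kr = mach_vm_pressure_level_monitor(TRUE, &level);
 */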
4906
4907 #if VM_PRESSURE_EVENTS
4908 void
4909 vm_pressure_thread(void)
4910 {
4911 static boolean_t thread_initialized = FALSE;
4912
4913 if (thread_initialized == TRUE) {
4914 vm_pageout_state.vm_pressure_thread_running = TRUE;
4915 consider_vm_pressure_events();
4916 vm_pageout_state.vm_pressure_thread_running = FALSE;
4917 }
4918
4919 #if CONFIG_THREAD_GROUPS
4920 thread_group_vm_add();
4921 #endif /* CONFIG_THREAD_GROUPS */
4922
4923 thread_set_thread_name(current_thread(), "VM_pressure");
4924 thread_initialized = TRUE;
4925 assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4926 thread_block((thread_continue_t)vm_pressure_thread);
4927 }
4928 #endif /* VM_PRESSURE_EVENTS */
4929
4930
/*
 * called once per second via "compute_averages"
 */
4934 void
4935 compute_pageout_gc_throttle(__unused void *arg)
4936 {
4937 if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4938 vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4939 sched_cond_signal(&vm_pageout_gc_cond, vm_pageout_gc_thread);
4940 }
4941 }
4942
4943 /*
4944 * vm_pageout_garbage_collect can also be called when the zone allocator needs
4945 * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4946 * jetsams. We need to check if the zone map size is above its jetsam limit to
4947 * decide if this was indeed the case.
4948 *
4949 * We need to do this on a different thread because of the following reasons:
4950 *
4951 * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
 * itself, causing the system to hang. We perform synchronous jetsams if we're
4953 * leaking in the VM map entries zone, so the leaking process could be doing a
4954 * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4955 * jetsam itself. We also need the vm_map lock on the process termination path,
4956 * which would now lead the dying process to deadlock against itself.
4957 *
4958 * 2. The jetsam path might need to allocate zone memory itself. We could try
4959 * using the non-blocking variant of zalloc for this path, but we can still
4960 * end up trying to do a kmem_alloc when the zone maps are almost full.
4961 */
4962 __dead2
4963 void
4964 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4965 {
4966 assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4967
4968 if (step != VM_PAGEOUT_GC_INIT) {
4969 sched_cond_ack(&vm_pageout_gc_cond);
4970 }
4971
4972 while (true) {
4973 if (step == VM_PAGEOUT_GC_INIT) {
4974 /* first time being called is not about GC */
4975 #if CONFIG_THREAD_GROUPS
4976 thread_group_vm_add();
4977 #endif /* CONFIG_THREAD_GROUPS */
4978 step = VM_PAGEOUT_GC_COLLECT;
4979 } else if (zone_map_nearing_exhaustion()) {
4980 /*
4981 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4982 *
4983 * Bail out after calling zone_gc (which triggers the
4984 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4985 * operations that clear out a bunch of caches might allocate zone
 * memory themselves (e.g. vm_map operations would need VM map
4987 * entries). Since the zone map is almost full at this point, we
4988 * could end up with a panic. We just need to quickly jetsam a
4989 * process and exit here.
4990 *
4991 * It could so happen that we were woken up to relieve memory
4992 * pressure and the zone map also happened to be near its limit at
4993 * the time, in which case we'll skip out early. But that should be
4994 * ok; if memory pressure persists, the thread will simply be woken
4995 * up again.
4996 */
4997
4998 zone_gc(ZONE_GC_JETSAM);
4999 } else {
5000 /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
5001 boolean_t buf_large_zfree = FALSE;
5002 boolean_t first_try = TRUE;
5003
5004 stack_collect();
5005
5006 consider_machine_collect();
5007 #if CONFIG_DEFERRED_RECLAIM
5008 vm_deferred_reclamation_gc(RECLAIM_GC_TRIM, RECLAIM_OPTIONS_NONE);
5009 #endif /* CONFIG_DEFERRED_RECLAIM */
5010 #if CONFIG_MBUF_MCACHE
5011 mbuf_drain(FALSE);
5012 #endif /* CONFIG_MBUF_MCACHE */
5013
5014 do {
5015 if (consider_buffer_cache_collect != NULL) {
5016 buf_large_zfree = (*consider_buffer_cache_collect)(0);
5017 }
5018 if (first_try == TRUE || buf_large_zfree == TRUE) {
5019 /*
5020 * zone_gc should be last, because the other operations
5021 * might return memory to zones.
5022 */
5023 zone_gc(ZONE_GC_TRIM);
5024 }
5025 first_try = FALSE;
5026 } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
5027
5028 consider_machine_adjust();
5029 }
5030
5031 sched_cond_wait_parameter(&vm_pageout_gc_cond, THREAD_UNINT, vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
5032 }
5033 __builtin_unreachable();
5034 }
5035
5036
5037 #if VM_PAGE_BUCKETS_CHECK
5038 #if VM_PAGE_FAKE_BUCKETS
5039 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
5040 #endif /* VM_PAGE_FAKE_BUCKETS */
5041 #endif /* VM_PAGE_BUCKETS_CHECK */
5042
5043
5044
5045 void
5046 vm_set_restrictions(unsigned int num_cpus)
5047 {
5048 int vm_restricted_to_single_processor = 0;
5049
5050 if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
5051 kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
5052 vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
5053 } else {
5054 assert(num_cpus > 0);
5055
5056 if (num_cpus <= 3) {
/*
 * On systems with a limited number of CPUs, bind the
 * 4 major threads that can free memory and that tend to use
 * a fair bit of CPU under pressured conditions to a single processor.
 * This ensures that these threads don't hog all of the available CPUs
 * (important for camera launch), while allowing them to run independently
 * with respect to locks... the 4 threads are
 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
 * vm_compressor_swap_trigger_thread (minor and major compactions),
 * memorystatus_thread (jetsams).
 *
 * The first time each thread runs, it is responsible for checking the
 * state of vm_restricted_to_single_processor, and if TRUE it calls
 * thread_bind_master... someday this should be replaced with a group
 * scheduling mechanism and KPI.
 */
5073 vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5074 } else {
5075 vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5076 }
5077 }
5078 }
5079
5080 /*
5081 * Set up vm_config based on the vm_compressor_mode.
5082 * Must run BEFORE the pageout thread starts up.
5083 */
5084 __startup_func
5085 void
5086 vm_config_init(void)
5087 {
5088 bzero(&vm_config, sizeof(vm_config));
5089
5090 switch (vm_compressor_mode) {
5091 case VM_PAGER_DEFAULT:
5092 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
5093 OS_FALLTHROUGH;
5094
5095 case VM_PAGER_COMPRESSOR_WITH_SWAP:
5096 vm_config.compressor_is_present = TRUE;
5097 vm_config.swap_is_present = TRUE;
5098 vm_config.compressor_is_active = TRUE;
5099 vm_config.swap_is_active = TRUE;
5100 break;
5101
5102 case VM_PAGER_COMPRESSOR_NO_SWAP:
5103 vm_config.compressor_is_present = TRUE;
5104 vm_config.swap_is_present = TRUE;
5105 vm_config.compressor_is_active = TRUE;
5106 break;
5107
5108 case VM_PAGER_FREEZER_DEFAULT:
5109 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5110 OS_FALLTHROUGH;
5111
5112 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5113 vm_config.compressor_is_present = TRUE;
5114 vm_config.swap_is_present = TRUE;
5115 break;
5116
5117 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5118 vm_config.compressor_is_present = TRUE;
5119 vm_config.swap_is_present = TRUE;
5120 vm_config.compressor_is_active = TRUE;
5121 vm_config.freezer_swap_is_active = TRUE;
5122 break;
5123
5124 case VM_PAGER_NOT_CONFIGURED:
5125 break;
5126
5127 default:
5128 printf("unknown compressor mode - %x\n", vm_compressor_mode);
5129 break;
5130 }
5131 }
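
/*
 * Summary of the mode -> vm_config mapping above, derived directly from
 * the switch ("present" means the code is configured in, "active" means
 * it is used for regular paging):
 *
 *	COMPRESSOR_WITH_SWAP           compressor present+active, swap present+active
 *	COMPRESSOR_NO_SWAP             compressor present+active, swap present
 *	FREEZER_COMPRESSOR_NO_SWAP     compressor present,        swap present
 *	NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP
 *	                               compressor present+active, freezer swap active
 *	NOT_CONFIGURED                 nothing set
 */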

__startup_func
static void
vm_pageout_create_gc_thread(void)
{
	thread_t thread;

	sched_cond_init(&vm_pageout_gc_cond);
	if (kernel_thread_create(vm_pageout_garbage_collect,
	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
		panic("vm_pageout_garbage_collect: create failed");
	}
	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
	if (thread->reserved_stack == 0) {
		assert(thread->kernel_stack);
		thread->reserved_stack = thread->kernel_stack;
	}

	/* thread is started in vm_pageout() */
	vm_pageout_gc_thread = thread;
}
STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
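
/*
 * A hypothetical subsystem needing the same ordering guarantee (thread
 * object created during EARLY_BOOT, started later) could follow the
 * identical pattern; sketch only, my_early_init is not a real symbol:
 *
 *	__startup_func
 *	static void
 *	my_early_init(void)
 *	{
 *		...
 *	}
 *	STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, my_early_init);
 */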

void
vm_pageout(void)
{
	thread_t self = current_thread();
	thread_t thread;
	kern_return_t result;
	spl_t s;

	/*
	 * Set thread privileges.
	 */
	s = splsched();

#if CONFIG_VPS_DYNAMIC_PRIO
	if (vps_dynamic_priority_enabled) {
		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
		thread_set_eager_preempt(self);
	} else {
		sched_set_kernel_thread_priority(self, BASEPRI_VM);
	}
#else /* CONFIG_VPS_DYNAMIC_PRIO */
	sched_set_kernel_thread_priority(self, BASEPRI_VM);
#endif /* CONFIG_VPS_DYNAMIC_PRIO */

	thread_lock(self);
	self->options |= TH_OPT_VMPRIV;
	thread_unlock(self);

	if (!self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
	    !vps_dynamic_priority_enabled) {
		thread_vm_bind_group_add();
	}


#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
	if (vm_pgo_pbound) {
		/*
		 * Use the soft bound option for vm pageout to allow it to run on
		 * E-cores if P-cluster is unavailable.
		 */
		thread_soft_bind_cluster_type(self, 'P');
	}
#endif /* __AMP__ */

	PE_parse_boot_argn("vmpgo_protect_realtime",
	    &vm_pageout_protect_realtime,
	    sizeof(vm_pageout_protect_realtime));
	splx(s);

	thread_set_thread_name(current_thread(), "VM_pageout_scan");

	/*
	 * Initialize some paging parameters.
	 */

	vm_pageout_state.vm_pressure_thread_running = FALSE;
	vm_pageout_state.vm_pressure_changed = FALSE;
	vm_pageout_state.memorystatus_purge_on_warning = 2;
	vm_pageout_state.memorystatus_purge_on_urgent = 5;
	vm_pageout_state.memorystatus_purge_on_critical = 8;
	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
	vm_pageout_state.vm_page_speculative_percentage = 5;
	vm_pageout_state.vm_page_speculative_target = 0;

	vm_pageout_state.vm_pageout_swap_wait = 0;
	vm_pageout_state.vm_pageout_idle_wait = 0;
	vm_pageout_state.vm_pageout_empty_wait = 0;
	vm_pageout_state.vm_pageout_burst_wait = 0;
	vm_pageout_state.vm_pageout_deadlock_wait = 0;
	vm_pageout_state.vm_pageout_deadlock_relief = 0;
	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;

	vm_pageout_state.vm_pageout_inactive = 0;
	vm_pageout_state.vm_pageout_inactive_used = 0;
	vm_pageout_state.vm_pageout_inactive_clean = 0;

	vm_pageout_state.vm_memory_pressure = 0;
	vm_pageout_state.vm_page_filecache_min = 0;
#if CONFIG_JETSAM
	vm_pageout_state.vm_page_filecache_min_divisor = 70;
	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
#else
	vm_pageout_state.vm_page_filecache_min_divisor = 27;
	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
#endif
	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;

	vm_pageout_state.vm_pageout_considered_page_last = 0;

	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
	}

	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
	}

	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
	}

	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
	}

	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
	}

	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
	}

	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
	}
	/*
	 * even if we've already called vm_page_free_reserve,
	 * call it again here to ensure that the targets are
	 * accurately calculated (it uses vm_page_free_count_init)...
	 * calling it with an arg of 0 will not change the reserve
	 * but will re-calculate free_min and free_target
	 */
	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
	} else {
		vm_page_free_reserve(0);
	}
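
	/*
	 * Worked example with illustrative numbers only: if
	 * VM_PAGE_FREE_RESERVED(processor_count) evaluates to 100 pages while
	 * vm_page_free_reserved is currently 80, the call above passes the
	 * delta (20) and the reserve grows to 100. If the reserve already
	 * meets the target, vm_page_free_reserve(0) leaves it unchanged but
	 * still re-derives free_min and free_target from
	 * vm_page_free_count_init, captured just above.
	 */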

	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));

	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;

	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);

#if DEVELOPMENT || DEBUG
	bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
	vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
#endif /* DEVELOPMENT || DEBUG */


	/* the internal pageout thread is started when the default pager is registered for the first time */
	/* the external pageout and garbage collection threads are started here */
	struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
	ethr->id = 0;
	ethr->q = &vm_pageout_queue_external;
	/* in external_state these cheads are never used; they are used only in internal_state for the compressor */
	ethr->current_early_swapout_chead = NULL;
	ethr->current_regular_swapout_chead = NULL;
	ethr->current_late_swapout_chead = NULL;
	ethr->scratch_buf = NULL;
#if DEVELOPMENT || DEBUG
	ethr->benchmark_q = NULL;
#endif /* DEVELOPMENT || DEBUG */
	sched_cond_init(&(ethr->pgo_wakeup));

	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
	    (void *)ethr, BASEPRI_VM,
	    &(ethr->pgo_iothread));
	if (result != KERN_SUCCESS) {
		panic("vm_pageout: Unable to create external thread (%d)\n", result);
	}
	thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");

	thread_mtx_lock(vm_pageout_gc_thread);
	thread_start(vm_pageout_gc_thread);
	thread_mtx_unlock(vm_pageout_gc_thread);

#if VM_PRESSURE_EVENTS
	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
	    BASEPRI_DEFAULT,
	    &thread);

	if (result != KERN_SUCCESS) {
		panic("vm_pressure_thread: create failed");
	}

	thread_deallocate(thread);
#endif

	vm_object_reaper_init();


	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
		vm_compressor_init();
	}

#if VM_PRESSURE_EVENTS
	vm_pressure_events_enabled = TRUE;
#endif /* VM_PRESSURE_EVENTS */

#if CONFIG_PHANTOM_CACHE
	vm_phantom_cache_init();
#endif
#if VM_PAGE_BUCKETS_CHECK
#if VM_PAGE_FAKE_BUCKETS
	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
	    (uint64_t) vm_page_fake_buckets_start,
	    (uint64_t) vm_page_fake_buckets_end);
	pmap_protect(kernel_pmap,
	    vm_page_fake_buckets_start,
	    vm_page_fake_buckets_end,
	    VM_PROT_READ);
	// *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
#endif /* VM_PAGE_FAKE_BUCKETS */
#endif /* VM_PAGE_BUCKETS_CHECK */

#if VM_OBJECT_TRACKING
	vm_object_tracking_init();
#endif /* VM_OBJECT_TRACKING */

#if __arm64__
	// vm_tests();
#endif /* __arm64__ */

	vm_pageout_continue();

	/*
	 * Unreached code!
	 *
	 * The vm_pageout_continue() call above never returns, so the code below is never
	 * executed. We take advantage of this to declare several DTrace VM related probe
	 * points that our kernel doesn't have an analog for. These are probe points that
	 * exist in Solaris and are in the DTrace documentation, so people may have written
	 * scripts that use them. Declaring the probe points here means their scripts will
	 * compile and execute which we want for portability of the scripts, but since this
	 * section of code is never reached, the probe points will simply never fire. Yes,
	 * this is basically a hack. The problem is the DTrace probe points were chosen with
	 * Solaris specific VM events in mind, not portability to different VM implementations.
	 */

	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
	/*NOTREACHED*/
}
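
/*
 * Because the probes above are declared but never fired, a Solaris-style
 * script such as this hypothetical one-liner compiles and runs against
 * this kernel but simply reports no events:
 *
 *	dtrace -n 'vminfo:::swapout { @[execname] = count(); }'
 *
 * (assuming the DTRACE_VM probes surface under the vminfo provider, as
 * the probes this kernel does fire do)
 */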



kern_return_t
vm_pageout_internal_start(void)
{
	kern_return_t result = KERN_SUCCESS;
	host_basic_info_data_t hinfo;
	vm_offset_t buf, bufsize;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
#define BSD_HOST 1
	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);

	assert(hinfo.max_cpus > 0);

#if !XNU_TARGET_OS_OSX
	vm_pageout_state.vm_compressor_thread_count = 1;
#else /* !XNU_TARGET_OS_OSX */
	if (hinfo.max_cpus > 4) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	} else {
		vm_pageout_state.vm_compressor_thread_count = 1;
	}
#endif /* !XNU_TARGET_OS_OSX */
#if __AMP__
	if (vm_compressor_ebound) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	}
#endif
	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
	    sizeof(vm_pageout_state.vm_compressor_thread_count));

	/* did the boot-args hand us an unreasonable number? */
	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
	}
	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
		vm_pageout_state.vm_compressor_thread_count = 1;
	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
	}
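
	/*
	 * Worked example of the clamping above, with hypothetical numbers:
	 * on a 4-CPU system, boot-arg vmcomp_threads=8 is first rewritten to
	 * max_cpus - 1 = 3; vmcomp_threads=0 becomes 1; and anything still
	 * above MAX_COMPRESSOR_THREAD_COUNT is capped there. The final count
	 * always lands in [1, MAX_COMPRESSOR_THREAD_COUNT].
	 */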

	vm_pageout_queue_internal.pgo_maxlaundry =
	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;

	PE_parse_boot_argn("vmpgoi_maxlaundry",
	    &vm_pageout_queue_internal.pgo_maxlaundry,
	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));

#if DEVELOPMENT || DEBUG
	// Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
#endif /* DEVELOPMENT || DEBUG */

	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;

	kmem_alloc(kernel_map, &buf,
	    bufsize * vm_pageout_state.vm_compressor_thread_count,
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_COMPRESSOR);

	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
		struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
		iq->id = i;
		iq->q = &vm_pageout_queue_internal;
		iq->current_early_swapout_chead = NULL;
		iq->current_regular_swapout_chead = NULL;
		iq->current_late_swapout_chead = NULL;
		iq->scratch_buf = (char *)(buf + i * bufsize);
#if DEVELOPMENT || DEBUG
		iq->benchmark_q = &vm_pageout_queue_benchmark;
#endif /* DEVELOPMENT || DEBUG */
		sched_cond_init(&(iq->pgo_wakeup));
		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
		    (void *)iq, BASEPRI_VM,
		    &(iq->pgo_iothread));

		if (result != KERN_SUCCESS) {
			panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
		}
	}
	return result;
}

#if CONFIG_IOSCHED
/*
 * To support I/O Expedite for compressed files we mark the upls with special flags.
 * The way decmpfs works is that we create a big upl which marks all the pages needed to
 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
 * by the req upl lock (the reverse link doesn't need synchronization since we never inspect
 * this link unless the real I/O upl is being destroyed).
 */


static void
upl_set_decmp_info(upl_t upl, upl_t src_upl)
{
	assert((src_upl->flags & UPL_DECMP_REQ) != 0);

	upl_lock(src_upl);
	if (src_upl->decmp_io_upl) {
		/*
		 * If there is already an alive real I/O UPL, ignore this new UPL.
		 * This case should rarely happen and even if it does, it just means
		 * that we might issue a spurious expedite which the driver is expected
		 * to handle.
		 */
		upl_unlock(src_upl);
		return;
	}
	src_upl->decmp_io_upl = (void *)upl;
	src_upl->ref_count++;

	upl->flags |= UPL_DECMP_REAL_IO;
	upl->decmp_io_upl = (void *)src_upl;
	upl_unlock(src_upl);
}
#endif /* CONFIG_IOSCHED */
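
/*
 * The linkage built by upl_set_decmp_info() above, sketched:
 *
 *	req upl (UPL_DECMP_REQ)  --decmp_io_upl-->  real I/O upl (UPL_DECMP_REAL_IO)
 *	       ^---------------- decmp_io_upl ----------------'
 *
 * The routine also takes an extra reference on the request UPL
 * (src_upl->ref_count++); upl_destroy() below clears the forward link
 * under the request UPL's lock and then drops that reference again via
 * upl_deallocate().
 */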

#if UPL_DEBUG
int upl_debug_enabled = 1;
#else
int upl_debug_enabled = 0;
#endif

static upl_t
upl_create(int type, int flags, upl_size_t size)
{
	uint32_t pages = (uint32_t)atop(round_page_32(size));
	upl_t upl;

	assert(page_aligned(size));

	/*
	 * FIXME: this code assumes the allocation always succeeds,
	 * however `pages` can be up to MAX_UPL_SIZE.
	 *
	 * The allocation size is then above 32k on 16k-page systems
	 * (resp. 128k on 4k-page systems), which kalloc might fail
	 * to allocate.
	 */
	upl = kalloc_type(struct upl, struct upl_page_info,
	    (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
	if (type & UPL_CREATE_INTERNAL) {
		flags |= UPL_INTERNAL;
	}

	if (type & UPL_CREATE_LITE) {
		flags |= UPL_LITE;
		if (pages) {
			upl->lite_list = bitmap_alloc(pages);
		}
	}

	upl->flags = flags;
	upl->ref_count = 1;
	upl_lock_init(upl);
#if CONFIG_IOSCHED
	if (type & UPL_CREATE_IO_TRACKING) {
		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
	}

	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
		/* Only support expedite on internal UPLs */
		thread_t curthread = current_thread();
		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
		    Z_WAITOK | Z_ZERO);
		upl->flags |= UPL_EXPEDITE_SUPPORTED;
		if (curthread->decmp_upl != NULL) {
			upl_set_decmp_info(upl, curthread->decmp_upl);
		}
	}
#endif
#if CONFIG_IOSCHED || UPL_DEBUG
	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
		upl->upl_creator = current_thread();
		upl->flags |= UPL_TRACKED_BY_OBJECT;
	}
#endif

#if UPL_DEBUG
	upl->upl_create_btref = btref_get(__builtin_frame_address(0), 0);
#endif /* UPL_DEBUG */

	return upl;
}
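
/*
 * Illustrative call, mirroring the common internal "lite" case built by
 * vm_object_upl_request() below (flags and size come from the caller):
 *
 *	upl_t upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
 *
 * For UPL_CREATE_INTERNAL, the upl_page_info array is tail-allocated with
 * the upl itself (the kalloc_type() call above), so a single allocation
 * covers the header plus one upl_page_info entry per page.
 */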

static void
upl_destroy(upl_t upl)
{
	uint32_t pages;

	// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);

	if (upl->ext_ref_count) {
		panic("upl(%p) ext_ref_count", upl);
	}

#if CONFIG_IOSCHED
	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
		upl_t src_upl;
		src_upl = upl->decmp_io_upl;
		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
		upl_lock(src_upl);
		src_upl->decmp_io_upl = NULL;
		upl_unlock(src_upl);
		upl_deallocate(src_upl);
	}
#endif /* CONFIG_IOSCHED */

#if CONFIG_IOSCHED || UPL_DEBUG
	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
	    !(upl->flags & UPL_VECTOR)) {
		vm_object_t object;

		if (upl->flags & UPL_SHADOWED) {
			object = upl->map_object->shadow;
		} else {
			object = upl->map_object;
		}

		vm_object_lock(object);
		queue_remove(&object->uplq, upl, upl_t, uplq);
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
		vm_object_unlock(object);
	}
#endif
	/*
	 * drop a reference on the map_object whether or
	 * not a pageout object is inserted
	 */
	if (upl->flags & UPL_SHADOWED) {
		vm_object_deallocate(upl->map_object);
	}

	if (upl->flags & UPL_DEVICE_MEMORY) {
		pages = 1;
	} else {
		pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
	}

	upl_lock_destroy(upl);

#if CONFIG_IOSCHED
	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
	}
#endif

#if UPL_DEBUG
	for (int i = 0; i < upl->upl_commit_index; i++) {
		btref_put(upl->upl_commit_records[i].c_btref);
	}
	btref_put(upl->upl_create_btref);
#endif /* UPL_DEBUG */

	if ((upl->flags & UPL_LITE) && pages) {
		bitmap_free(upl->lite_list, pages);
	}
	kfree_type(struct upl, struct upl_page_info,
	    (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
}

void
upl_deallocate(upl_t upl)
{
	upl_lock(upl);

	if (--upl->ref_count == 0) {
		if (vector_upl_is_valid(upl)) {
			vector_upl_deallocate(upl);
		}
		upl_unlock(upl);

		if (upl->upl_iodone) {
			upl_callout_iodone(upl);
		}

		upl_destroy(upl);
	} else {
		upl_unlock(upl);
	}
}

#if CONFIG_IOSCHED
void
upl_mark_decmp(upl_t upl)
{
	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
		upl->flags |= UPL_DECMP_REQ;
		upl->upl_creator->decmp_upl = (void *)upl;
	}
}

void
upl_unmark_decmp(upl_t upl)
{
	if (upl && (upl->flags & UPL_DECMP_REQ)) {
		upl->upl_creator->decmp_upl = NULL;
	}
}

#endif /* CONFIG_IOSCHED */

#define VM_PAGE_Q_BACKING_UP(q) \
	((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))

boolean_t must_throttle_writes(void);

boolean_t
must_throttle_writes()
{
	if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
	    vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
		return TRUE;
	}

	return FALSE;
}
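
/*
 * Worked example with an illustrative pgo_maxlaundry of 128:
 * VM_PAGE_Q_BACKING_UP() fires once pgo_laundry reaches 80% of that,
 * i.e. (128 * 8) / 10 = 102 pages. Even then, must_throttle_writes()
 * returns TRUE only if pageable external pages also exceed 60% of
 * AVAILABLE_NON_COMPRESSED_MEMORY -- both the laundry queue and the
 * file cache must look congested before writers are stalled.
 */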

int vm_page_delayed_work_ctx_needed = 0;
KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);

__startup_func
static void
vm_page_delayed_work_init_ctx(void)
{
	uint16_t min_delayed_work_ctx_allocated = 16;

	/*
	 * try really hard to always keep NCPU elements around in the zone
	 * in order for the UPL code to almost always get an element.
	 */
	if (min_delayed_work_ctx_allocated < zpercpu_count()) {
		min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
	}

	zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
}
STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);

struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)
{
	struct vm_page_delayed_work_ctx * dw_ctx = NULL;

	dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);

	if (__probable(dw_ctx)) {
		dw_ctx->delayed_owner = current_thread();
	} else {
		vm_page_delayed_work_ctx_needed++;
	}
	return dw_ctx ? dw_ctx->dwp : NULL;
}

void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
{
	struct vm_page_delayed_work_ctx *ldw_ctx;

	ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
	ldw_ctx->delayed_owner = NULL;

	zfree(dw_ctx_zone, ldw_ctx);
}
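
/*
 * Intended pairing of the two routines above, sketched (see
 * vm_object_upl_request() below for the real call site):
 *
 *	struct vm_page_delayed_work *dwp = vm_page_delayed_work_get_ctx();
 *	if (dwp == NULL) {
 *		// Z_NOWAIT allocation failed: fall back to a single-entry
 *		// stack array, as the UPL code does below
 *	}
 *	...
 *	vm_page_delayed_work_finish_ctx(dwp);
 *
 * The cast inside finish_ctx() works because the dwp array handed out by
 * get_ctx() sits at the start of its containing vm_page_delayed_work_ctx.
 */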

/*
 * Routine:	vm_object_upl_request
 * Purpose:
 *	Cause the population of a portion of a vm_object.
 *	Depending on the nature of the request, the pages
 *	returned may contain valid data or be uninitialized.
 *	A page list structure, listing the physical pages,
 *	will be returned upon request.
 *	This function is called by the file system or any other
 *	supplier of backing store to a pager.
 *	IMPORTANT NOTE: The caller must still respect the relationship
 *	between the vm_object and its backing memory object. The
 *	caller MUST NOT substitute changes in the backing file
 *	without first doing a memory_object_lock_request on the
 *	target range unless it is known that the pages are not
 *	shared with another entity at the pager level.
 *	Copy_in_to:
 *		if a page list structure is present,
 *		return the mapped physical pages; where a
 *		page is not present, return a non-initialized
 *		one. If the no_sync bit is turned on, don't
 *		call the pager unlock to synchronize with other
 *		possible copies of the page. Leave pages busy
 *		in the original object, if a page list structure
 *		was specified. When a commit of the page list
 *		pages is done, the dirty bit will be set for each one.
 *	Copy_out_from:
 *		If a page list structure is present, return
 *		all mapped pages. Where a page does not exist,
 *		map a zero-filled one. Leave pages busy in
 *		the original object. If a page list structure
 *		is not specified, this call is a no-op.
 *
 *	Note: access of default pager objects has a rather interesting
 *	twist. The caller of this routine, presumably the file system
 *	page cache handling code, will never actually make a request
 *	against a default pager backed object. Only the default
 *	pager will make requests on backing store related vm_objects.
 *	In this way the default pager can maintain the relationship
 *	between backing store files (abstract memory objects) and
 *	the vm_objects (cache objects) they support.
 */


__private_extern__ kern_return_t
vm_object_upl_request(
	vm_object_t object,
	vm_object_offset_t offset,
	upl_size_t size,
	upl_t *upl_ptr,
	upl_page_info_array_t user_page_list,
	unsigned int *page_list_count,
	upl_control_flags_t cntrl_flags,
	vm_tag_t tag)
{
	vm_page_t dst_page = VM_PAGE_NULL;
	vm_object_offset_t dst_offset;
	upl_size_t xfer_size;
	unsigned int size_in_pages;
	boolean_t dirty;
	boolean_t hw_dirty;
	upl_t upl = NULL;
	unsigned int entry;
	vm_page_t alias_page = NULL;
	int refmod_state = 0;
	vm_object_t last_copy_object;
	uint32_t last_copy_version;
	struct vm_page_delayed_work dw_array;
	struct vm_page_delayed_work *dwp, *dwp_start;
	bool dwp_finish_ctx = TRUE;
	int dw_count;
	int dw_limit;
	int io_tracking_flag = 0;
	int grab_options;
	int page_grab_count = 0;
	ppnum_t phys_page;
	pmap_flush_context pmap_flush_context_storage;
	boolean_t pmap_flushes_delayed = FALSE;
	task_t task = current_task();

	dwp_start = dwp = NULL;

	if (cntrl_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		return KERN_INVALID_VALUE;
	}
	if ((!object->internal) && (object->paging_offset != 0)) {
		panic("vm_object_upl_request: external object with non-zero paging offset");
	}
	if (object->phys_contiguous) {
		panic("vm_object_upl_request: contiguous object specified");
	}

	assertf(page_aligned(offset) && page_aligned(size),
	    "offset 0x%llx size 0x%x",
	    offset, size);

	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);

	dw_count = 0;
	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
	dwp_start = vm_page_delayed_work_get_ctx();
	if (dwp_start == NULL) {
		dwp_start = &dw_array;
		dw_limit = 1;
		dwp_finish_ctx = FALSE;
	}

	dwp = dwp_start;

	if (size > MAX_UPL_SIZE_BYTES) {
		size = MAX_UPL_SIZE_BYTES;
	}

	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	}

#if CONFIG_IOSCHED || UPL_DEBUG
	if (object->io_tracking || upl_debug_enabled) {
		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
	}
#endif
#if CONFIG_IOSCHED
	if (object->io_tracking) {
		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
	}
#endif

	if (cntrl_flags & UPL_SET_INTERNAL) {
		if (cntrl_flags & UPL_SET_LITE) {
			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
		} else {
			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
		}
		user_page_list = size ? upl->page_list : NULL;
	} else {
		if (cntrl_flags & UPL_SET_LITE) {
			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
		} else {
			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
		}
	}
	*upl_ptr = upl;

	if (user_page_list) {
		user_page_list[0].device = FALSE;
	}

	if (cntrl_flags & UPL_SET_LITE) {
		upl->map_object = object;
	} else {
		upl->map_object = vm_object_allocate(size, object->vmo_provenance);
		vm_object_lock(upl->map_object);
		/*
		 * No need to lock the new object: nobody else knows
		 * about it yet, so it's all ours so far.
		 */
		upl->map_object->shadow = object;
		VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
		VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
		upl->map_object->vo_shadow_offset = offset;
		upl->map_object->wimg_bits = object->wimg_bits;
		assertf(page_aligned(upl->map_object->vo_shadow_offset),
		    "object %p shadow_offset 0x%llx",
		    upl->map_object, upl->map_object->vo_shadow_offset);
		vm_object_unlock(upl->map_object);

		alias_page = vm_page_create_fictitious();

		upl->flags |= UPL_SHADOWED;
	}
	if (cntrl_flags & UPL_FOR_PAGEOUT) {
		upl->flags |= UPL_PAGEOUT;
	}

	vm_object_lock(object);
	vm_object_activity_begin(object);

	grab_options = 0;
#if CONFIG_SECLUDED_MEMORY
	if (object->can_grab_secluded) {
		grab_options |= VM_PAGE_GRAB_SECLUDED;
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * we can lock in the paging_offset once paging_in_progress is set
	 */
	upl->u_size = size;
	upl->u_offset = offset + object->paging_offset;

#if CONFIG_IOSCHED || UPL_DEBUG
	if (object->io_tracking || upl_debug_enabled) {
		vm_object_activity_begin(object);
		queue_enter(&object->uplq, upl, upl_t, uplq);
	}
#endif
	if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
		/*
		 * Honor copy-on-write obligations
		 *
		 * The caller is gathering these pages and
		 * might modify their contents. We need to
		 * make sure that the copy object has its own
		 * private copies of these pages before we let
		 * the caller modify them.
		 */
		vm_object_update(object,
		    offset,
		    size,
		    NULL,
		    NULL,
		    FALSE,	/* should_return */
		    MEMORY_OBJECT_COPY_SYNC,
		    VM_PROT_NO_CHANGE);

		VM_PAGEOUT_DEBUG(upl_cow, 1);
		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
	}
	/*
	 * remember which copy object we synchronized with
	 */
	last_copy_object = object->vo_copy;
	last_copy_version = object->vo_copy_version;
	entry = 0;

	xfer_size = size;
	dst_offset = offset;
	size_in_pages = size / PAGE_SIZE;

	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
		object->scan_collisions = 0;
	}

	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
		boolean_t isSSD = FALSE;

#if !XNU_TARGET_OS_OSX
		isSSD = TRUE;
#else /* !XNU_TARGET_OS_OSX */
		vnode_pager_get_isSSD(object->pager, &isSSD);
#endif /* !XNU_TARGET_OS_OSX */
		vm_object_unlock(object);

		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);

		if (isSSD == TRUE) {
			delay(1000 * size_in_pages);
		} else {
			delay(5000 * size_in_pages);
		}
		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);

		vm_object_lock(object);
	}
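
	/*
	 * Note that delay() here is per-page and in microseconds, so a
	 * 64-page (256KB with 4K pages) modifying UPL against a backed-up
	 * external queue stalls ~64ms on SSD and ~320ms on rotating media:
	 * crude but effective back-pressure on writers.
	 */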

	while (xfer_size) {
		dwp->dw_mask = 0;

		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
			vm_object_unlock(object);
			alias_page = vm_page_create_fictitious();
			vm_object_lock(object);
		}
		if (cntrl_flags & UPL_COPYOUT_FROM) {
			upl->flags |= UPL_PAGE_SYNC_DONE;

			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
			    vm_page_is_fictitious(dst_page) ||
			    dst_page->vmp_absent ||
			    VMP_ERROR_GET(dst_page) ||
			    dst_page->vmp_cleaning ||
			    (VM_PAGE_WIRED(dst_page))) {
				if (user_page_list) {
					user_page_list[entry].phys_addr = 0;
				}

				goto try_next_page;
			}
			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

			/*
			 * grab this up front...
			 * a high percentage of the time we're going to
			 * need the hardware modification state a bit later
			 * anyway... so we can eliminate an extra call into
			 * the pmap layer by grabbing it here and recording it
			 */
			if (dst_page->vmp_pmapped) {
				refmod_state = pmap_get_refmod(phys_page);
			} else {
				refmod_state = 0;
			}

			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
				/*
				 * page is on inactive list and referenced...
				 * reactivate it now... this gets it out of the
				 * way of vm_pageout_scan which would have to
				 * reactivate it upon tripping over it
				 */
				dwp->dw_mask |= DW_vm_page_activate;
			}
			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
				/*
				 * we're only asking for DIRTY pages to be returned
				 */
				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
					/*
					 * if this is the page that vm_pageout_scan stole to be
					 * cleaned (as opposed to a buddy being clustered in),
					 * or this request is not being driven by a PAGEOUT
					 * cluster, then we only need to check for the page
					 * being dirty or precious to decide whether to return it
					 */
					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
						goto check_busy;
					}
					goto dont_return;
				}
				/*
				 * this is a request for a PAGEOUT cluster and this page
				 * is merely along for the ride as a 'buddy'... not only
				 * does it have to be dirty to be returned, but it also
				 * can't have been referenced recently...
				 */
				if ((hibernate_cleaning_in_progress == TRUE ||
				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
					goto check_busy;
				}
dont_return:
			/*
			 * if we reach here, we're not to return
			 * the page... go on to the next one
			 */
			if (dst_page->vmp_laundry == TRUE) {
				/*
				 * if we get here, the page is not 'cleaning' (filtered out above).
				 * since it has been referenced, remove it from the laundry
				 * so we don't pay the cost of an I/O to clean a page
				 * we're just going to take back
				 */
				vm_page_lockspin_queues();

				vm_pageout_steal_laundry(dst_page, TRUE);
				vm_page_activate(dst_page);

				vm_page_unlock_queues();
			}
			if (user_page_list) {
				user_page_list[entry].phys_addr = 0;
			}

			goto try_next_page;
		}
check_busy:
			if (dst_page->vmp_busy) {
				if (cntrl_flags & UPL_NOBLOCK) {
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}
					dwp->dw_mask = 0;

					goto try_next_page;
				}
				/*
				 * someone else is playing with the
				 * page. We will have to wait.
				 */
				vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);

				continue;
			}
			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
				vm_page_lockspin_queues();

				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
					/*
					 * we've buddied up a page for a clustered pageout
					 * that has already been moved to the pageout
					 * queue by pageout_scan... we need to remove
					 * it from the queue and drop the laundry count
					 * on that queue
					 */
					vm_pageout_throttle_up(dst_page);
				}
				vm_page_unlock_queues();
			}
			hw_dirty = refmod_state & VM_MEM_MODIFIED;
			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;

			if (phys_page > upl->highest_page) {
				upl->highest_page = phys_page;
			}

			assert(!pmap_is_noencrypt(phys_page));

			if (cntrl_flags & UPL_SET_LITE) {
				unsigned int pg_num;

				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
				bitmap_set(upl->lite_list, pg_num);

				if (hw_dirty) {
					if (pmap_flushes_delayed == FALSE) {
						pmap_flush_context_init(&pmap_flush_context_storage);
						pmap_flushes_delayed = TRUE;
					}
					pmap_clear_refmod_options(phys_page,
					    VM_MEM_MODIFIED,
					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
					    &pmap_flush_context_storage);
				}

				/*
				 * Mark original page as cleaning
				 * in place.
				 */
				dst_page->vmp_cleaning = TRUE;
				dst_page->vmp_precious = FALSE;
			} else {
				/*
				 * use pageclean setup, it is more
				 * convenient even for the pageout
				 * cases here
				 */
				vm_object_lock(upl->map_object);
				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
				vm_object_unlock(upl->map_object);

				alias_page->vmp_absent = FALSE;
				alias_page = NULL;
			}
			if (dirty) {
				SET_PAGE_DIRTY(dst_page, FALSE);
			} else {
				dst_page->vmp_dirty = FALSE;
			}

			if (!dirty) {
				dst_page->vmp_precious = TRUE;
			}

			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
				if (!VM_PAGE_WIRED(dst_page)) {
					dst_page->vmp_free_when_done = TRUE;
				}
			}
		} else {
			if ((cntrl_flags & UPL_WILL_MODIFY) &&
			    (object->vo_copy != last_copy_object ||
			    object->vo_copy_version != last_copy_version)) {
				/*
				 * Honor copy-on-write obligations
				 *
				 * The copy object has changed since we
				 * last synchronized for copy-on-write.
				 * Another copy object might have been
				 * inserted while we released the object's
				 * lock. Since someone could have seen the
				 * original contents of the remaining pages
				 * through that new object, we have to
				 * synchronize with it again for the remaining
				 * pages only. The previous pages are "busy"
				 * so they can not be seen through the new
				 * mapping. The new mapping will see our
				 * upcoming changes for those previous pages,
				 * but that's OK since they couldn't see what
				 * was there before. It's just a race anyway
				 * and there's no guarantee of consistency or
				 * atomicity. We just don't want new mappings
				 * to see both the *before* and *after* pages.
				 */
				if (object->vo_copy != VM_OBJECT_NULL) {
					vm_object_update(
						object,
						dst_offset,/* current offset */
						xfer_size, /* remaining size */
						NULL,
						NULL,
						FALSE,	   /* should_return */
						MEMORY_OBJECT_COPY_SYNC,
						VM_PROT_NO_CHANGE);

					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
				}
				/*
				 * remember the copy object we synced with
				 */
				last_copy_object = object->vo_copy;
				last_copy_version = object->vo_copy_version;
			}
			dst_page = vm_page_lookup(object, dst_offset);

			if (dst_page != VM_PAGE_NULL) {
				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
					/*
					 * skip over pages already present in the cache
					 */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}

					goto try_next_page;
				}
				if (vm_page_is_fictitious(dst_page)) {
					panic("need corner case for fictitious page");
				}

				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
					/*
					 * someone else is playing with the
					 * page. We will have to wait.
					 */
					vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);

					continue;
				}
				if (dst_page->vmp_laundry) {
					vm_pageout_steal_laundry(dst_page, FALSE);
				}
			} else {
				if (object->private) {
					/*
					 * This is a nasty wrinkle for users
					 * of upl who encounter device or
					 * private memory. However, it is
					 * unavoidable; only a fault can
					 * resolve the actual backing
					 * physical page by asking the
					 * backing device.
					 */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}

					goto try_next_page;
				}
				if (object->scan_collisions) {
					/*
					 * the pageout_scan thread is trying to steal
					 * pages from this object, but has run into our
					 * lock... grab 2 pages from the head of the object...
					 * the first is freed on behalf of pageout_scan, the
					 * 2nd is for our own use... we use vm_object_page_grab
					 * in both cases to avoid taking pages from the free
					 * list since we are under memory pressure and our
					 * lock on this object is getting in the way of
					 * relieving it
					 */
					dst_page = vm_object_page_grab(object);

					if (dst_page != VM_PAGE_NULL) {
						vm_page_release(dst_page,
						    FALSE);
					}

					dst_page = vm_object_page_grab(object);
				}
				if (dst_page == VM_PAGE_NULL) {
					/*
					 * need to allocate a page
					 */
					dst_page = vm_page_grab_options(grab_options);
					if (dst_page != VM_PAGE_NULL) {
						page_grab_count++;
					}
				}
				if (dst_page == VM_PAGE_NULL) {
					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
						/*
						 * we don't want to stall waiting for pages to come onto the free list
						 * while we're already holding absent pages in this UPL
						 * the caller will deal with the empty slots
						 */
						if (user_page_list) {
							user_page_list[entry].phys_addr = 0;
						}

						goto try_next_page;
					}
					/*
					 * no pages available... wait
					 * then try again for the same
					 * offset...
					 */
					vm_object_unlock(object);

					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

					VM_PAGE_WAIT();
					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);

					vm_object_lock(object);

					continue;
				}
				vm_page_insert(dst_page, object, dst_offset);

				dst_page->vmp_absent = TRUE;
				dst_page->vmp_busy = FALSE;

				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
					/*
					 * if UPL_RET_ONLY_ABSENT was specified,
					 * then we're definitely setting up a
					 * UPL for a clustered read/pagein
					 * operation... mark the pages as clustered
					 * so upl_commit_range can put them on the
					 * speculative list
					 */
					dst_page->vmp_clustered = TRUE;

					if (!(cntrl_flags & UPL_FILE_IO)) {
						counter_inc(&vm_statistics_pageins);
					}
				}
			}
			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

			dst_page->vmp_overwriting = TRUE;

			if (dst_page->vmp_pmapped) {
				if (!(cntrl_flags & UPL_FILE_IO)) {
					/*
					 * eliminate all mappings from the
					 * original object and its progeny
					 */
					refmod_state = pmap_disconnect(phys_page);
				} else {
					refmod_state = pmap_get_refmod(phys_page);
				}
			} else {
				refmod_state = 0;
			}

			hw_dirty = refmod_state & VM_MEM_MODIFIED;
			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;

			if (cntrl_flags & UPL_SET_LITE) {
				unsigned int pg_num;

				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
				bitmap_set(upl->lite_list, pg_num);

				if (hw_dirty) {
					pmap_clear_modify(phys_page);
				}

				/*
				 * Mark original page as cleaning
				 * in place.
				 */
				dst_page->vmp_cleaning = TRUE;
				dst_page->vmp_precious = FALSE;
			} else {
				/*
				 * use pageclean setup, it is more
				 * convenient even for the pageout
				 * cases here
				 */
				vm_object_lock(upl->map_object);
				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
				vm_object_unlock(upl->map_object);

				alias_page->vmp_absent = FALSE;
				alias_page = NULL;
			}

			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
				upl->flags &= ~UPL_CLEAR_DIRTY;
				upl->flags |= UPL_SET_DIRTY;
				dirty = TRUE;
				/*
				 * Page belonging to a code-signed object is about to
				 * be written. Mark it tainted and disconnect it from
				 * all pmaps so processes have to fault it back in and
				 * deal with the tainted bit.
				 */
				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
					vm_page_upl_tainted++;
					if (dst_page->vmp_pmapped) {
						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
						if (refmod_state & VM_MEM_REFERENCED) {
							dst_page->vmp_reference = TRUE;
						}
					}
				}
			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
				/*
				 * clean in place for read implies
				 * that a write will be done on all
				 * the pages that are dirty before
				 * a UPL commit is done. The caller
				 * is obligated to preserve the
				 * contents of all pages marked dirty
				 */
				upl->flags |= UPL_CLEAR_DIRTY;
			}
			dst_page->vmp_dirty = dirty;

			if (!dirty) {
				dst_page->vmp_precious = TRUE;
			}

			if (!VM_PAGE_WIRED(dst_page)) {
				/*
				 * deny access to the target page while
				 * it is being worked on
				 */
				dst_page->vmp_busy = TRUE;
			} else {
				dwp->dw_mask |= DW_vm_page_wire;
			}

			/*
			 * We might be about to satisfy a fault which has been
			 * requested. So no need for the "restart" bit.
			 */
			dst_page->vmp_restart = FALSE;
			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
				/*
				 * expect the page to be used
				 */
				dwp->dw_mask |= DW_set_reference;
			}
			if (cntrl_flags & UPL_PRECIOUS) {
				if (object->internal) {
					SET_PAGE_DIRTY(dst_page, FALSE);
					dst_page->vmp_precious = FALSE;
				} else {
					dst_page->vmp_precious = TRUE;
				}
			} else {
				dst_page->vmp_precious = FALSE;
			}
		}
		if (dst_page->vmp_busy) {
			upl->flags |= UPL_HAS_BUSY;
		}
		if (VM_PAGE_WIRED(dst_page)) {
			upl->flags |= UPL_HAS_WIRED;
		}

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}
		assert(!pmap_is_noencrypt(phys_page));
		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
			user_page_list[entry].absent = dst_page->vmp_absent;
			user_page_list[entry].dirty = dst_page->vmp_dirty;
			user_page_list[entry].precious = dst_page->vmp_precious;
			user_page_list[entry].device = FALSE;
			user_page_list[entry].needed = FALSE;
			if (dst_page->vmp_clustered == TRUE) {
				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
			} else {
				user_page_list[entry].speculative = FALSE;
			}
			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
			user_page_list[entry].mark = FALSE;
		}
		/*
		 * if UPL_RET_ONLY_ABSENT is set, then
		 * we are working with a fresh page and we've
		 * just set the clustered flag on it to
		 * indicate that it was dragged in as part of a
		 * speculative cluster... so leave it alone
		 */
		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
			/*
			 * someone is explicitly grabbing this page...
			 * update clustered and speculative state
			 *
			 */
			if (dst_page->vmp_clustered) {
				VM_PAGE_CONSUME_CLUSTERED(dst_page);
			}
		}
try_next_page:
		if (dwp->dw_mask) {
			if (dwp->dw_mask & DW_vm_page_activate) {
				counter_inc(&vm_statistics_reactivations);
			}

			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);

			if (dw_count >= dw_limit) {
				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);

				dwp = dwp_start;
				dw_count = 0;
			}
		}
		entry++;
		dst_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;
	}
	if (dw_count) {
		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}

	if (alias_page != NULL) {
		VM_PAGE_FREE(alias_page);
	}
	if (pmap_flushes_delayed == TRUE) {
		pmap_flush(&pmap_flush_context_storage);
	}

	if (page_list_count != NULL) {
		if (upl->flags & UPL_INTERNAL) {
			*page_list_count = 0;
		} else if (*page_list_count > entry) {
			*page_list_count = entry;
		}
	}
#if UPL_DEBUG
	upl->upl_state = 1;
#endif
	vm_object_unlock(object);

	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
	if (task != NULL) {
		counter_add(&task->pages_grabbed_upl, page_grab_count);
	}

	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return KERN_SUCCESS;
}
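
/*
 * Illustrative pageout-side invocation of the routine above (sketch only;
 * real callers live in the pager and file-system layers):
 *
 *	upl_t upl;
 *	unsigned int pl_count = 0;
 *	kern_return_t kr;
 *
 *	kr = vm_object_upl_request(object, offset, size, &upl,
 *	    NULL, &pl_count,
 *	    UPL_SET_INTERNAL | UPL_SET_LITE | UPL_COPYOUT_FROM | UPL_FOR_PAGEOUT,
 *	    VM_KERN_MEMORY_NONE);
 *	if (kr == KERN_SUCCESS) {
 *		// issue the I/O, then upl_commit() or upl_abort()
 *	}
 */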

int cs_executable_create_upl = 0;
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);

kern_return_t
vm_map_create_upl(
	vm_map_t map,
	vm_map_address_t offset,
	upl_size_t *upl_size,
	upl_t *upl,
	upl_page_info_array_t page_list,
	unsigned int *count,
	upl_control_flags_t *flags,
	vm_tag_t tag)
{
	vm_map_entry_t entry;
	upl_control_flags_t caller_flags;
	int force_data_sync;
	int sync_cow_data;
	vm_object_t local_object;
	vm_map_offset_t local_offset;
	vm_map_offset_t local_start;
	kern_return_t ret;
	vm_map_address_t original_offset;
	vm_map_size_t original_size, adjusted_size;
	vm_map_offset_t local_entry_start;
	vm_object_offset_t local_entry_offset;
	vm_object_offset_t offset_in_mapped_page;
	boolean_t release_map = FALSE;


start_with_map:

	original_offset = offset;
	original_size = *upl_size;
	adjusted_size = original_size;

	caller_flags = *flags;

	if (caller_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		ret = KERN_INVALID_VALUE;
		goto done;
	}
	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);

	if (upl == NULL) {
		ret = KERN_INVALID_ARGUMENT;
		goto done;
	}

REDISCOVER_ENTRY:
	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, offset, &entry)) {
		vm_map_unlock_read(map);
		ret = KERN_FAILURE;
		goto done;
	}

	local_entry_start = entry->vme_start;
	local_entry_offset = VME_OFFSET(entry);

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
	}

	if (entry->vme_end - original_offset < adjusted_size) {
		adjusted_size = entry->vme_end - original_offset;
		assert(adjusted_size > 0);
		*upl_size = (upl_size_t) adjusted_size;
		assert(*upl_size == adjusted_size);
	}

	if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
		*flags = 0;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) != VM_OBJECT_NULL) {
			if (VME_OBJECT(entry)->private) {
				*flags = UPL_DEV_MEMORY;
			}

			if (VME_OBJECT(entry)->phys_contiguous) {
				*flags |= UPL_PHYS_CONTIG;
			}
		}
		vm_map_unlock_read(map);
		ret = KERN_SUCCESS;
		goto done;
	}

	offset_in_mapped_page = 0;
	if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
		offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
		*upl_size = (upl_size_t)
		    (vm_map_round_page(original_offset + adjusted_size,
		    VM_MAP_PAGE_MASK(map))
		    - offset);

		offset_in_mapped_page = original_offset - offset;
		assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));

		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
	}

	if (!entry->is_sub_map) {
		if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
		    !VME_OBJECT(entry)->phys_contiguous) {
			if (*upl_size > MAX_UPL_SIZE_BYTES) {
				*upl_size = MAX_UPL_SIZE_BYTES;
			}
		}

		/*
		 * Create an object if necessary.
		 */
		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			if (entry->max_protection == VM_PROT_NONE) {
				/* don't create an object for a reserved range */
				vm_map_unlock_read(map);
				ret = KERN_PROTECTION_FAILURE;
				goto done;
			}

			if (vm_map_lock_read_to_write(map)) {
				goto REDISCOVER_ENTRY;
			}

			VME_OBJECT_SET(entry,
			    vm_object_allocate((vm_size_t)
			    vm_object_round_page((entry->vme_end - entry->vme_start)), map->serial_id),
			    false, 0);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);

			vm_map_lock_write_to_read(map);
		}

		if (!(caller_flags & UPL_COPYOUT_FROM) &&
		    !(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock_read(map);
			ret = KERN_PROTECTION_FAILURE;
			goto done;
		}
	}

#if !XNU_TARGET_OS_OSX
	if (map->pmap != kernel_pmap &&
	    (caller_flags & UPL_COPYOUT_FROM) &&
	    (entry->protection & VM_PROT_EXECUTE) &&
	    !(entry->protection & VM_PROT_WRITE)) {
		vm_offset_t kaddr;
		vm_size_t ksize;

		/*
		 * We're about to create a read-only UPL backed by
		 * memory from an executable mapping.
		 * Wiring the pages would result in the pages being copied
		 * (due to the "MAP_PRIVATE" mapping) and no longer
		 * code-signed, so no longer eligible for execution.
		 * Instead, let's copy the data into a kernel buffer and
		 * create the UPL from this kernel buffer.
		 * The kernel buffer is then freed, leaving the UPL holding
		 * the last reference on the VM object, so the memory will
		 * be released when the UPL is committed.
		 */

		vm_map_unlock_read(map);
		entry = VM_MAP_ENTRY_NULL;
		/* allocate kernel buffer */
		ksize = round_page(*upl_size);
		kaddr = 0;
		ret = kmem_alloc(kernel_map, &kaddr, ksize,
		    KMA_PAGEABLE | KMA_DATA, tag);
		if (ret == KERN_SUCCESS) {
			/* copyin the user data */
			ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
		}
		if (ret == KERN_SUCCESS) {
			if (ksize > *upl_size) {
				/* zero out the extra space in kernel buffer */
				memset((void *)(kaddr + *upl_size),
				    0,
				    ksize - *upl_size);
			}
			/* create the UPL from the kernel buffer */
			vm_object_offset_t offset_in_object;
			vm_object_offset_t offset_in_object_page;

			offset_in_object = offset - local_entry_start + local_entry_offset;
			offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
			assert(offset_in_object_page < PAGE_SIZE);
			assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
			*upl_size -= offset_in_object_page + offset_in_mapped_page;
			ret = vm_map_create_upl(kernel_map,
			    (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
			    upl_size, upl, page_list, count, flags, tag);
		}
		if (kaddr != 0) {
			/* free the kernel buffer */
			kmem_free(kernel_map, kaddr, ksize);
			kaddr = 0;
			ksize = 0;
		}
#if DEVELOPMENT || DEBUG
		DTRACE_VM4(create_upl_from_executable,
		    vm_map_t, map,
		    vm_map_address_t, offset,
		    upl_size_t, *upl_size,
		    kern_return_t, ret);
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}
#endif /* !XNU_TARGET_OS_OSX */

	if (!entry->is_sub_map) {
		local_object = VME_OBJECT(entry);
		assert(local_object != VM_OBJECT_NULL);
	}

	if (!entry->is_sub_map &&
	    !entry->needs_copy &&
	    *upl_size != 0 &&
	    local_object->vo_size > *upl_size && /* partial UPL */
	    entry->wired_count == 0 && /* No COW for entries that are wired */
	    (map->pmap != kernel_pmap) && /* alias checks */
	    (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
	    ||
	    ( /* case 2 */
		local_object->internal &&
		(local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
		os_ref_get_count_raw(&local_object->ref_count) > 1))) {
		vm_prot_t prot;

		/*
		 * Case 1:
		 * Set up the targeted range for copy-on-write to avoid
		 * applying true_share/copy_delay to the entire object.
		 *
		 * Case 2:
		 * This map entry covers only part of an internal
		 * object. There could be other map entries covering
		 * other areas of this object and some of these map
		 * entries could be marked as "needs_copy", which
		 * assumes that the object is COPY_SYMMETRIC.
		 * To avoid marking this object as COPY_DELAY and
		 * "true_share", let's shadow it and mark the new
		 * (smaller) object as "true_share" and COPY_DELAY.
		 */

		if (vm_map_lock_read_to_write(map)) {
			goto REDISCOVER_ENTRY;
		}
		vm_map_lock_assert_exclusive(map);
		assert(VME_OBJECT(entry) == local_object);

		vm_map_clip_start(map,
		    entry,
		    vm_map_trunc_page(offset,
		    VM_MAP_PAGE_MASK(map)));
		vm_map_clip_end(map,
		    entry,
		    vm_map_round_page(offset + *upl_size,
		    VM_MAP_PAGE_MASK(map)));
		if ((entry->vme_end - offset) < *upl_size) {
			*upl_size = (upl_size_t) (entry->vme_end - offset);
			assert(*upl_size == entry->vme_end - offset);
		}

		prot = entry->protection & ~VM_PROT_WRITE;
		if (override_nx(map, VME_ALIAS(entry)) && prot) {
			prot |= VM_PROT_EXECUTE;
		}
		vm_object_pmap_protect(local_object,
		    VME_OFFSET(entry),
		    entry->vme_end - entry->vme_start,
		    ((entry->is_shared ||
		    map->mapped_in_other_pmaps)
		    ? PMAP_NULL
		    : map->pmap),
		    VM_MAP_PAGE_SIZE(map),
		    entry->vme_start,
		    prot);

		assert(entry->wired_count == 0);

		/*
		 * Lock the VM object and re-check its status: if it's mapped
		 * in another address space, we could still be racing with
		 * another thread holding that other VM map exclusively.
		 */
		vm_object_lock(local_object);
		if (local_object->true_share) {
			/* object is already in proper state: no COW needed */
			assert(local_object->copy_strategy !=
			    MEMORY_OBJECT_COPY_SYMMETRIC);
		} else {
			/* not true_share: ask for copy-on-write below */
			assert(local_object->copy_strategy ==
			    MEMORY_OBJECT_COPY_SYMMETRIC);
			entry->needs_copy = TRUE;
		}
		vm_object_unlock(local_object);

		vm_map_lock_write_to_read(map);
	}

	if (entry->needs_copy) {
		/*
		 * Honor copy-on-write for COPY_SYMMETRIC
		 * strategy.
		 */
		vm_map_t local_map;
		vm_object_t object;
		vm_object_offset_t new_offset;
		vm_prot_t prot;
		boolean_t wired;
		vm_map_version_t version;
		vm_map_t real_map;
		vm_prot_t fault_type;

		local_map = map;

		if (caller_flags & UPL_COPYOUT_FROM) {
			fault_type = VM_PROT_READ | VM_PROT_COPY;
			vm_counters.create_upl_extra_cow++;
			vm_counters.create_upl_extra_cow_pages +=
			    (entry->vme_end - entry->vme_start) / PAGE_SIZE;
		} else {
			fault_type = VM_PROT_WRITE;
		}
		if (vm_map_lookup_and_lock_object(&local_map,
		    offset, fault_type,
		    OBJECT_LOCK_EXCLUSIVE,
		    &version, &object,
		    &new_offset, &prot, &wired,
		    NULL,
		    &real_map, NULL) != KERN_SUCCESS) {
			if (fault_type == VM_PROT_WRITE) {
				vm_counters.create_upl_lookup_failure_write++;
			} else {
				vm_counters.create_upl_lookup_failure_copy++;
			}
			vm_map_unlock_read(local_map);
			ret = KERN_FAILURE;
			goto done;
		}
		if (real_map != local_map) {
			vm_map_unlock(real_map);
		}
		vm_map_unlock_read(local_map);

		vm_object_unlock(object);

		goto REDISCOVER_ENTRY;
	}

	if (entry->is_sub_map) {
		vm_map_t submap;

		submap = VME_SUBMAP(entry);
		local_start = entry->vme_start;
		local_offset = (vm_map_offset_t)VME_OFFSET(entry);

		vm_map_reference(submap);
		vm_map_unlock_read(map);

		DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
		offset += offset_in_mapped_page;
		*upl_size -= offset_in_mapped_page;

		if (release_map) {
			vm_map_deallocate(map);
		}
		map = submap;
		release_map = TRUE;
		offset = local_offset + (offset - local_start);
		goto start_with_map;
	}

	if (sync_cow_data &&
	    (VME_OBJECT(entry)->shadow ||
	    VME_OBJECT(entry)->vo_copy)) {
		local_object = VME_OBJECT(entry);
		local_start = entry->vme_start;
		local_offset = (vm_map_offset_t)VME_OFFSET(entry);

		vm_object_reference(local_object);
		vm_map_unlock_read(map);

		if (local_object->shadow && local_object->vo_copy) {
			vm_object_lock_request(local_object->shadow,
			    ((vm_object_offset_t)
			    ((offset - local_start) +
			    local_offset) +
			    local_object->vo_shadow_offset),
			    *upl_size, FALSE,
			    MEMORY_OBJECT_DATA_SYNC,
			    VM_PROT_NO_CHANGE);
		}
		sync_cow_data = FALSE;
		vm_object_deallocate(local_object);

		goto REDISCOVER_ENTRY;
	}
	if (force_data_sync) {
		local_object = VME_OBJECT(entry);
		local_start = entry->vme_start;
		local_offset = (vm_map_offset_t)VME_OFFSET(entry);

		vm_object_reference(local_object);
		vm_map_unlock_read(map);

		vm_object_lock_request(local_object,
		    ((vm_object_offset_t)
		    ((offset - local_start) +
		    local_offset)),
		    (vm_object_size_t)*upl_size,
		    FALSE,
		    MEMORY_OBJECT_DATA_SYNC,
		    VM_PROT_NO_CHANGE);

		force_data_sync = FALSE;
		vm_object_deallocate(local_object);

		goto REDISCOVER_ENTRY;
	}
	if (VME_OBJECT(entry)->private) {
7070 *flags = UPL_DEV_MEMORY;
7071 } else {
7072 *flags = 0;
7073 }
7074
7075 if (VME_OBJECT(entry)->phys_contiguous) {
7076 *flags |= UPL_PHYS_CONTIG;
7077 }
7078
7079 local_object = VME_OBJECT(entry);
7080 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7081 local_start = entry->vme_start;
7082
7083
7084 /*
7085 * Wiring will copy the pages to the shadow object.
7086 * The shadow object will not be code-signed so
7087 * attempting to execute code from these copied pages
7088 * would trigger a code-signing violation.
7089 */
7090 if (entry->protection & VM_PROT_EXECUTE) {
7091 #if MACH_ASSERT
7092 printf("pid %d[%s] create_upl out of executable range from "
7093 "0x%llx to 0x%llx: side effects may include "
7094 "code-signing violations later on\n",
7095 proc_selfpid(),
7096 (get_bsdtask_info(current_task())
7097 ? proc_name_address(get_bsdtask_info(current_task()))
7098 : "?"),
7099 (uint64_t) entry->vme_start,
7100 (uint64_t) entry->vme_end);
7101 #endif /* MACH_ASSERT */
7102 DTRACE_VM2(cs_executable_create_upl,
7103 uint64_t, (uint64_t)entry->vme_start,
7104 uint64_t, (uint64_t)entry->vme_end);
7105 cs_executable_create_upl++;
7106 }
7107
7108 vm_object_lock(local_object);
7109
7110 /*
7111 * Ensure that this object is "true_share" and "copy_delay" now,
7112 * while we're still holding the VM map lock. After we unlock the map,
7113 * anything could happen to that mapping, including some copy-on-write
7114 * activity. We need to make sure that the IOPL will point at the
7115 * same memory as the mapping.
7116 */
7117 if (local_object->true_share) {
7118 assert(local_object->copy_strategy !=
7119 MEMORY_OBJECT_COPY_SYMMETRIC);
7120 } else if (!is_kernel_object(local_object) &&
7121 local_object != compressor_object &&
7122 !local_object->phys_contiguous) {
7123 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7124 if (!local_object->true_share &&
7125 vm_object_tracking_btlog) {
7126 btlog_record(vm_object_tracking_btlog, local_object,
7127 VM_OBJECT_TRACKING_OP_TRUESHARE,
7128 btref_get(__builtin_frame_address(0), 0));
7129 }
7130 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7131 VM_OBJECT_SET_TRUE_SHARE(local_object, TRUE);
7132 if (local_object->copy_strategy ==
7133 MEMORY_OBJECT_COPY_SYMMETRIC) {
7134 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7135 }
7136 }
7137
7138 vm_object_reference_locked(local_object);
7139 vm_object_unlock(local_object);
7140
7141 vm_map_unlock_read(map);
7142
7143 offset += offset_in_mapped_page;
7144 assert(*upl_size > offset_in_mapped_page);
7145 *upl_size -= offset_in_mapped_page;
7146
7147 ret = vm_object_iopl_request(local_object,
7148 ((vm_object_offset_t)
7149 ((offset - local_start) + local_offset)),
7150 *upl_size,
7151 upl,
7152 page_list,
7153 count,
7154 caller_flags,
7155 tag);
7156 vm_object_deallocate(local_object);
7157
7158
7159 done:
7160 if (release_map) {
7161 vm_map_deallocate(map);
7162 }
7163
7164 return ret;
7165 }
7166
7167 /*
7168 * Internal routine to enter a UPL into a VM map.
7169 *
7170 * JMM - This should just be doable through the standard
7171 * vm_map_enter() API.
7172 */
7173 kern_return_t
7174 vm_map_enter_upl_range(
7175 vm_map_t map,
7176 upl_t upl,
7177 vm_object_offset_t offset_to_map,
7178 vm_size_t size_to_map,
7179 vm_prot_t prot_to_map,
7180 vm_map_offset_t *dst_addr)
7181 {
7182 vm_map_size_t size;
7183 vm_object_offset_t offset;
7184 vm_map_offset_t addr;
7185 vm_page_t m;
7186 kern_return_t kr;
7187 int isVectorUPL = 0, curr_upl = 0;
7188 upl_t vector_upl = NULL;
7189 mach_vm_offset_t vector_upl_dst_addr = 0;
7190 vm_map_t vector_upl_submap = NULL;
7191 upl_offset_t subupl_offset = 0;
7192 upl_size_t subupl_size = 0;
7193
7194 if (upl == UPL_NULL) {
7195 return KERN_INVALID_ARGUMENT;
7196 }
7197
7198 DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%lx (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7199 assert(map == kernel_map);
7200
7201 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7202 int mapped = 0, valid_upls = 0;
7203 vector_upl = upl;
7204
7205 upl_lock(vector_upl);
7206 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7207 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7208 if (upl == NULL) {
7209 continue;
7210 }
7211 valid_upls++;
7212 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7213 mapped++;
7214 }
7215 }
7216
7217 if (mapped) {
7218 if (mapped != valid_upls) {
7219 panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7220 } else {
7221 upl_unlock(vector_upl);
7222 return KERN_FAILURE;
7223 }
7224 }
7225
7226 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7227 panic("TODO4K: vector UPL not implemented");
7228 }
7229
7230 vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7231 vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7232 VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7233 VM_KERN_MEMORY_NONE).kmr_submap;
7234 map = vector_upl_submap;
7235 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7236 curr_upl = 0;
7237 } else {
7238 upl_lock(upl);
7239 }
7240
7241 process_upl_to_enter:
7242 if (isVectorUPL) {
7243 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7244 *dst_addr = vector_upl_dst_addr;
7245 upl_unlock(vector_upl);
7246 return KERN_SUCCESS;
7247 }
7248 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7249 if (upl == NULL) {
7250 goto process_upl_to_enter;
7251 }
7252
7253 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7254 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7255 } else {
7256 /*
7257 * check to see if already mapped
7258 */
7259 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7260 upl_unlock(upl);
7261 return KERN_FAILURE;
7262 }
7263 }
7264
7265 if ((!(upl->flags & UPL_SHADOWED)) &&
7266 ((upl->flags & UPL_HAS_BUSY) ||
7267 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7268 vm_object_t object;
7269 vm_page_t alias_page;
7270 vm_object_offset_t new_offset;
7271 unsigned int pg_num;
7272
7273 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7274 object = upl->map_object;
7275 upl->map_object = vm_object_allocate(
7276 vm_object_round_page(size),
7277 /* Provenance is copied from the object we're shadowing */
7278 object->vmo_provenance);
7279
7280 vm_object_lock(upl->map_object);
7281
7282 upl->map_object->shadow = object;
7283 VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
7284 VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
7285 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7286 upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7287 assertf(page_aligned(upl->map_object->vo_shadow_offset),
7288 "object %p shadow_offset 0x%llx",
7289 upl->map_object,
7290 (uint64_t)upl->map_object->vo_shadow_offset);
7291 upl->map_object->wimg_bits = object->wimg_bits;
7292 offset = upl->map_object->vo_shadow_offset;
7293 new_offset = 0;
7294
7295 upl->flags |= UPL_SHADOWED;
7296
7297 while (size) {
7298 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7299 assert(pg_num == new_offset / PAGE_SIZE);
7300
7301 if (bitmap_test(upl->lite_list, pg_num)) {
7302 alias_page = vm_page_create_fictitious();
7303
7304 vm_object_lock(object);
7305
7306 m = vm_page_lookup(object, offset);
7307 if (m == VM_PAGE_NULL) {
7308 panic("vm_upl_map: page missing");
7309 }
7310
7311 /*
7312 * Convert the fictitious page to a private
7313 * shadow of the real page.
7314 */
7315 alias_page->vmp_free_when_done = TRUE;
7316 /*
				 * Since m is a page in the UPL it must
				 * already be wired or BUSY, so it's
				 * safe to assign the underlying physical
				 * page to the alias.
7321 */
7322
7323 vm_object_unlock(object);
7324
7325 vm_page_lockspin_queues();
7326 vm_page_make_private(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7327 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7328 vm_page_unlock_queues();
7329
7330 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7331
7332 assert(!alias_page->vmp_wanted);
7333 alias_page->vmp_busy = FALSE;
7334 alias_page->vmp_absent = FALSE;
7335 }
7336 size -= PAGE_SIZE;
7337 offset += PAGE_SIZE_64;
7338 new_offset += PAGE_SIZE_64;
7339 }
7340 vm_object_unlock(upl->map_object);
7341 }
7342 if (upl->flags & UPL_SHADOWED) {
7343 if (isVectorUPL) {
7344 offset = 0;
7345 } else {
7346 offset = offset_to_map;
7347 }
7348 } else {
7349 offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7350 if (!isVectorUPL) {
7351 offset += offset_to_map;
7352 }
7353 }
7354
7355 if (isVectorUPL) {
7356 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7357 } else {
7358 size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7359 }
7360
7361 vm_object_reference(upl->map_object);
7362
7363 if (!isVectorUPL) {
7364 *dst_addr = 0;
7365 /*
7366 * NEED A UPL_MAP ALIAS
7367 */
7368 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7369 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7370 upl->map_object, offset, FALSE,
7371 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7372
7373 if (kr != KERN_SUCCESS) {
7374 vm_object_deallocate(upl->map_object);
7375 upl_unlock(upl);
7376 return kr;
7377 }
7378 } else {
7379 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7380 VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7381 upl->map_object, offset, FALSE,
7382 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7383 if (kr) {
7384 panic("vm_map_enter failed for a Vector UPL");
7385 }
7386 }
	/*
	 * When we allow multiple submappings of the UPL, this will have
	 * to be an increment rather than an assignment.
	 */
	upl->u_mapped_size = (upl_size_t) size;
7390 vm_object_lock(upl->map_object);
7391
7392 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7393 m = vm_page_lookup(upl->map_object, offset);
7394
7395 if (m) {
7396 m->vmp_pmapped = TRUE;
7397
7398 /*
7399 * CODE SIGNING ENFORCEMENT: page has been wpmapped,
7400 * but only in kernel space. If this was on a user map,
7401 * we'd have to set the wpmapped bit.
7402 */
7403 /* m->vmp_wpmapped = TRUE; */
7404 assert(map->pmap == kernel_pmap);
7405
7406 kr = pmap_enter_check(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, TRUE);
7407
7408 assert(kr == KERN_SUCCESS);
7409 #if KASAN
7410 kasan_notify_address(addr, PAGE_SIZE_64);
7411 #endif
7412 }
7413 offset += PAGE_SIZE_64;
7414 }
7415 vm_object_unlock(upl->map_object);
7416
7417 /*
7418 * hold a reference for the mapping
7419 */
7420 upl->ref_count++;
7421 upl->flags |= UPL_PAGE_LIST_MAPPED;
7422 upl->kaddr = (vm_offset_t) *dst_addr;
7423 assert(upl->kaddr == *dst_addr);
7424
7425 if (isVectorUPL) {
7426 goto process_upl_to_enter;
7427 }
7428
7429 if (!isVectorUPL) {
7430 vm_map_offset_t addr_adjustment;
7431
7432 addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7433 if (addr_adjustment) {
7434 assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7435 DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7436 *dst_addr += addr_adjustment;
7437 }
7438 }
7439
7440 upl_unlock(upl);
7441
7442 return KERN_SUCCESS;
7443 }
7444
7445 kern_return_t
7446 vm_map_enter_upl(
7447 vm_map_t map,
7448 upl_t upl,
7449 vm_map_offset_t *dst_addr)
7450 {
7451 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7452 return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7453 }
7454
7455 /*
7456 * Internal routine to remove a UPL mapping from a VM map.
7457 *
7458 * XXX - This should just be doable through a standard
7459 * vm_map_remove() operation. Otherwise, implicit clean-up
7460 * of the target map won't be able to correctly remove
7461 * these (and release the reference on the UPL). Having
7462 * to do this means we can't map these into user-space
7463 * maps yet.
7464 */
7465 kern_return_t
7466 vm_map_remove_upl_range(
7467 vm_map_t map,
7468 upl_t upl,
7469 __unused vm_object_offset_t offset_to_unmap,
7470 __unused vm_size_t size_to_unmap)
7471 {
7472 vm_address_t addr;
7473 upl_size_t size;
7474 int isVectorUPL = 0, curr_upl = 0;
7475 upl_t vector_upl = NULL;
7476
7477 if (upl == UPL_NULL) {
7478 return KERN_INVALID_ARGUMENT;
7479 }
7480
7481 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7482 int unmapped = 0, valid_upls = 0;
7483 vector_upl = upl;
7484 upl_lock(vector_upl);
7485 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7486 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7487 if (upl == NULL) {
7488 continue;
7489 }
7490 valid_upls++;
7491 if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7492 unmapped++;
7493 }
7494 }
7495
7496 if (unmapped) {
7497 if (unmapped != valid_upls) {
7498 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
7499 } else {
7500 upl_unlock(vector_upl);
7501 return KERN_FAILURE;
7502 }
7503 }
7504 curr_upl = 0;
7505 } else {
7506 upl_lock(upl);
7507 }
7508
7509 process_upl_to_remove:
7510 if (isVectorUPL) {
7511 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7512 vm_map_t v_upl_submap;
7513 vm_offset_t v_upl_submap_dst_addr;
7514 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7515
7516 kmem_free_guard(map, v_upl_submap_dst_addr,
7517 vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
7518 vm_map_deallocate(v_upl_submap);
7519 upl_unlock(vector_upl);
7520 return KERN_SUCCESS;
7521 }
7522
7523 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7524 if (upl == NULL) {
7525 goto process_upl_to_remove;
7526 }
7527 }
7528
7529 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7530 addr = upl->kaddr;
7531 size = upl->u_mapped_size;
7532
7533 assert(upl->ref_count > 1);
7534 upl->ref_count--; /* removing mapping ref */
7535
7536 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7537 upl->kaddr = (vm_offset_t) 0;
7538 upl->u_mapped_size = 0;
7539
7540 if (isVectorUPL) {
7541 /*
7542 * If it's a Vectored UPL, we'll be removing the entire
		 * submap anyway, so there is no need to remove individual UPL
7544 * element mappings from within the submap
7545 */
7546 goto process_upl_to_remove;
7547 }
7548
7549 upl_unlock(upl);
7550
7551 vm_map_remove(map,
7552 vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
7553 vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
7554 return KERN_SUCCESS;
7555 }
7556 upl_unlock(upl);
7557
7558 return KERN_FAILURE;
7559 }
7560
7561 kern_return_t
7562 vm_map_remove_upl(
7563 vm_map_t map,
7564 upl_t upl)
7565 {
7566 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7567 return vm_map_remove_upl_range(map, upl, 0, upl_size);
7568 }
7569
7570 void
7571 iopl_valid_data(
7572 upl_t upl,
7573 vm_tag_t tag)
7574 {
7575 vm_object_t object;
7576 vm_offset_t offset;
7577 vm_page_t m, nxt_page = VM_PAGE_NULL;
7578 upl_size_t size;
7579 int wired_count = 0;
7580
7581 if (upl == NULL) {
7582 panic("iopl_valid_data: NULL upl");
7583 }
7584 if (vector_upl_is_valid(upl)) {
7585 panic("iopl_valid_data: vector upl");
7586 }
7587 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
7588 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
7589 }
7590
7591 object = upl->map_object;
7592
7593 if (is_kernel_object(object) || object == compressor_object) {
7594 panic("iopl_valid_data: object == kernel or compressor");
7595 }
7596
7597 if (object->purgable == VM_PURGABLE_VOLATILE ||
7598 object->purgable == VM_PURGABLE_EMPTY) {
7599 panic("iopl_valid_data: object %p purgable %d",
7600 object, object->purgable);
7601 }
7602
7603 size = upl_adjusted_size(upl, PAGE_MASK);
7604
7605 vm_object_lock(object);
7606 VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
7607
7608 bool whole_object;
7609
7610 if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
7611 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
7612 whole_object = true;
7613 } else {
7614 offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
7615 whole_object = false;
7616 }
7617
7618 while (size) {
7619 if (whole_object) {
7620 if (nxt_page != VM_PAGE_NULL) {
7621 m = nxt_page;
7622 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7623 }
7624 } else {
7625 m = vm_page_lookup(object, offset);
7626 offset += PAGE_SIZE;
7627
7628 if (m == VM_PAGE_NULL) {
7629 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
7630 }
7631 }
7632 if (m->vmp_busy) {
7633 if (!m->vmp_absent) {
7634 panic("iopl_valid_data: busy page w/o absent");
7635 }
7636
7637 if (m->vmp_pageq.next || m->vmp_pageq.prev) {
7638 panic("iopl_valid_data: busy+absent page on page queue");
7639 }
7640 if (m->vmp_reusable) {
7641 panic("iopl_valid_data: %p is reusable", m);
7642 }
7643
7644 m->vmp_absent = FALSE;
7645 m->vmp_dirty = TRUE;
7646 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7647 assert(m->vmp_wire_count == 0);
7648 m->vmp_wire_count++;
7649 assert(m->vmp_wire_count);
7650 if (m->vmp_wire_count == 1) {
7651 m->vmp_q_state = VM_PAGE_IS_WIRED;
7652 wired_count++;
7653 } else {
7654 panic("iopl_valid_data: %p already wired", m);
7655 }
7656
7657
7658 vm_page_wakeup_done(object, m);
7659 }
7660 size -= PAGE_SIZE;
7661 }
7662 if (wired_count) {
7663 VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
7664 assert(object->resident_page_count >= object->wired_page_count);
7665
7666 /* no need to adjust purgeable accounting for this object: */
7667 assert(object->purgable != VM_PURGABLE_VOLATILE);
7668 assert(object->purgable != VM_PURGABLE_EMPTY);
7669
7670 vm_page_lockspin_queues();
7671 vm_page_wire_count += wired_count;
7672 vm_page_unlock_queues();
7673 }
7674 VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
7675 vm_object_unlock(object);
7676 }
7677
7678
7679 void
7680 vm_object_set_pmap_cache_attr(
7681 vm_object_t object,
7682 upl_page_info_array_t user_page_list,
7683 unsigned int num_pages,
7684 boolean_t batch_pmap_op)
7685 {
7686 unsigned int cache_attr = 0;
7687
7688 cache_attr = object->wimg_bits & VM_WIMG_MASK;
7689 assert(user_page_list);
7690 if (!HAS_DEFAULT_CACHEABILITY(cache_attr)) {
7691 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
7692 }
7693 }
7694
7695
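/*
 * Fast path for vm_object_iopl_request() when every page of the
 * object is already resident: wire all resident pages in a single
 * pass.  Returns FALSE if any page is in a state the fast path can't
 * handle (busy, absent, fictitious, error, cleaning, restart or
 * laundry), in which case the caller falls back to the slow path.
 */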
7696 static bool
7697 vm_object_iopl_wire_full(
7698 vm_object_t object,
7699 upl_t upl,
7700 upl_page_info_array_t user_page_list,
7701 upl_control_flags_t cntrl_flags,
7702 vm_tag_t tag)
7703 {
7704 vm_page_t dst_page;
7705 unsigned int entry;
7706 int page_count;
7707 int delayed_unlock = 0;
7708 boolean_t retval = TRUE;
7709 ppnum_t phys_page;
7710
7711 vm_object_lock_assert_exclusive(object);
7712 assert(object->purgable != VM_PURGABLE_VOLATILE);
7713 assert(object->purgable != VM_PURGABLE_EMPTY);
7714 assert(object->pager == NULL);
7715 assert(object->vo_copy == NULL);
7716 assert(object->shadow == NULL);
7717
7718 page_count = object->resident_page_count;
7719 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
7720
7721 vm_page_lock_queues();
7722
7723 while (page_count--) {
7724 if (dst_page->vmp_busy ||
7725 vm_page_is_fictitious(dst_page) ||
7726 dst_page->vmp_absent ||
7727 VMP_ERROR_GET(dst_page) ||
7728 dst_page->vmp_cleaning ||
7729 dst_page->vmp_restart ||
7730 dst_page->vmp_laundry) {
7731 retval = FALSE;
7732 goto done;
7733 }
7734 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
7735 retval = FALSE;
7736 goto done;
7737 }
7738 dst_page->vmp_reference = TRUE;
7739
7740 vm_page_wire(dst_page, tag, FALSE);
7741
7742 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7743 SET_PAGE_DIRTY(dst_page, FALSE);
7744 }
7745 entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
7746 assert(entry >= 0 && entry < object->resident_page_count);
7747 bitmap_set(upl->lite_list, entry);
7748
7749 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7750
7751 if (phys_page > upl->highest_page) {
7752 upl->highest_page = phys_page;
7753 }
7754
7755 if (user_page_list) {
7756 user_page_list[entry].phys_addr = phys_page;
7757 user_page_list[entry].absent = dst_page->vmp_absent;
7758 user_page_list[entry].dirty = dst_page->vmp_dirty;
7759 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
7760 user_page_list[entry].precious = dst_page->vmp_precious;
7761 user_page_list[entry].device = FALSE;
7762 user_page_list[entry].speculative = FALSE;
7763 user_page_list[entry].cs_validated = FALSE;
7764 user_page_list[entry].cs_tainted = FALSE;
7765 user_page_list[entry].cs_nx = FALSE;
7766 user_page_list[entry].needed = FALSE;
7767 user_page_list[entry].mark = FALSE;
7768 }
7769 if (delayed_unlock++ > 256) {
7770 delayed_unlock = 0;
7771 lck_mtx_yield(&vm_page_queue_lock);
7772
7773 VM_CHECK_MEMORYSTATUS;
7774 }
7775 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
7776 }
7777 done:
7778 vm_page_unlock_queues();
7779
7780 VM_CHECK_MEMORYSTATUS;
7781
7782 return retval;
7783 }
7784
7785
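/*
 * Fast path for vm_object_iopl_request() when the object has no
 * resident pages: grab fresh pages, insert and wire them, and fill in
 * the caller's page list.  May block waiting for free pages; returns
 * MACH_SEND_INTERRUPTED if the wait is interrupted.
 */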
7786 static kern_return_t
7787 vm_object_iopl_wire_empty(
7788 vm_object_t object,
7789 upl_t upl,
7790 upl_page_info_array_t user_page_list,
7791 upl_control_flags_t cntrl_flags,
7792 vm_tag_t tag,
7793 vm_object_offset_t *dst_offset,
7794 int page_count,
7795 int *page_grab_count)
7796 {
7797 vm_page_t dst_page;
7798 boolean_t no_zero_fill = FALSE;
7799 int interruptible;
7800 int pages_wired = 0;
7801 int pages_inserted = 0;
7802 int entry = 0;
7803 uint64_t delayed_ledger_update = 0;
7804 kern_return_t ret = KERN_SUCCESS;
7805 int grab_options;
7806 ppnum_t phys_page;
7807
7808 vm_object_lock_assert_exclusive(object);
7809 assert(object->purgable != VM_PURGABLE_VOLATILE);
7810 assert(object->purgable != VM_PURGABLE_EMPTY);
7811 assert(object->pager == NULL);
7812 assert(object->vo_copy == NULL);
7813 assert(object->shadow == NULL);
7814
7815 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
7816 interruptible = THREAD_ABORTSAFE;
7817 } else {
7818 interruptible = THREAD_UNINT;
7819 }
7820
7821 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
7822 no_zero_fill = TRUE;
7823 }
7824
7825 grab_options = 0;
7826 #if CONFIG_SECLUDED_MEMORY
7827 if (object->can_grab_secluded) {
7828 grab_options |= VM_PAGE_GRAB_SECLUDED;
7829 }
7830 #endif /* CONFIG_SECLUDED_MEMORY */
7831
7832 while (page_count--) {
7833 while ((dst_page = vm_page_grab_options(grab_options))
7834 == VM_PAGE_NULL) {
7835 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
7836
7837 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
7838
7839 if (vm_page_wait(interruptible) == FALSE) {
7840 /*
7841 * interrupted case
7842 */
7843 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7844
7845 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
7846
7847 ret = MACH_SEND_INTERRUPTED;
7848 goto done;
7849 }
7850 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7851
7852 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
7853 }
7854
7855 dst_page->vmp_absent = no_zero_fill;
7856 dst_page->vmp_reference = TRUE;
7857
7858 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7859 SET_PAGE_DIRTY(dst_page, FALSE);
7860 }
7861 if (dst_page->vmp_absent == FALSE) {
7862 assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
7863 assert(dst_page->vmp_wire_count == 0);
7864 dst_page->vmp_wire_count++;
7865 dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
7866 assert(dst_page->vmp_wire_count);
7867 pages_wired++;
7868
7869
7870 vm_page_wakeup_done(object, dst_page);
7871 }
7872 pages_inserted++;
7873
7874 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
7875
7876 if (no_zero_fill == FALSE) {
7877 vm_page_zero_fill(
7878 dst_page
7879 );
7880 }
7881
7882 bitmap_set(upl->lite_list, entry);
7883
7884 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7885
7886 if (phys_page > upl->highest_page) {
7887 upl->highest_page = phys_page;
7888 }
7889
7890 if (user_page_list) {
7891 user_page_list[entry].phys_addr = phys_page;
7892 user_page_list[entry].absent = dst_page->vmp_absent;
7893 user_page_list[entry].dirty = dst_page->vmp_dirty;
7894 user_page_list[entry].free_when_done = FALSE;
7895 user_page_list[entry].precious = FALSE;
7896 user_page_list[entry].device = FALSE;
7897 user_page_list[entry].speculative = FALSE;
7898 user_page_list[entry].cs_validated = FALSE;
7899 user_page_list[entry].cs_tainted = FALSE;
7900 user_page_list[entry].cs_nx = FALSE;
7901 user_page_list[entry].needed = FALSE;
7902 user_page_list[entry].mark = FALSE;
7903 }
7904 entry++;
7905 *dst_offset += PAGE_SIZE_64;
7906 }
7907 done:
7908 if (pages_wired) {
7909 vm_page_lockspin_queues();
7910 vm_page_wire_count += pages_wired;
7911 vm_page_unlock_queues();
7912 }
7913 if (pages_inserted) {
7914 if (object->internal) {
7915 OSAddAtomic(pages_inserted, &vm_page_internal_count);
7916 } else {
7917 OSAddAtomic(pages_inserted, &vm_page_external_count);
7918 }
7919 }
7920 if (delayed_ledger_update) {
7921 task_t owner;
7922 int ledger_idx_volatile;
7923 int ledger_idx_nonvolatile;
7924 int ledger_idx_volatile_compressed;
7925 int ledger_idx_nonvolatile_compressed;
7926 int ledger_idx_composite;
7927 int ledger_idx_external_wired;
7928 boolean_t do_footprint;
7929
7930 owner = VM_OBJECT_OWNER(object);
7931 assert(owner);
7932
7933 vm_object_ledger_tag_ledgers(object,
7934 &ledger_idx_volatile,
7935 &ledger_idx_nonvolatile,
7936 &ledger_idx_volatile_compressed,
7937 &ledger_idx_nonvolatile_compressed,
7938 &ledger_idx_composite,
7939 &ledger_idx_external_wired,
7940 &do_footprint);
7941
7942 if (object->internal) {
7943 /* more non-volatile bytes */
7944 ledger_credit(owner->ledger,
7945 ledger_idx_nonvolatile,
7946 delayed_ledger_update);
7947 if (do_footprint) {
7948 /* more footprint */
7949 ledger_credit(owner->ledger,
7950 task_ledgers.phys_footprint,
7951 delayed_ledger_update);
7952 } else if (ledger_idx_composite != -1) {
7953 ledger_credit(owner->ledger,
7954 ledger_idx_composite,
7955 delayed_ledger_update);
7956 }
7957 } else {
7958 /* more external wired bytes */
7959 ledger_credit(owner->ledger,
7960 ledger_idx_external_wired,
7961 delayed_ledger_update);
7962 if (do_footprint) {
7963 /* more footprint */
7964 ledger_credit(owner->ledger,
7965 task_ledgers.phys_footprint,
7966 delayed_ledger_update);
7967 } else if (ledger_idx_composite != -1) {
7968 ledger_credit(owner->ledger,
7969 ledger_idx_composite,
7970 delayed_ledger_update);
7971 }
7972 }
7973 }
7974
7975 assert(page_grab_count);
7976 *page_grab_count = pages_inserted;
7977
7978 return ret;
7979 }
7980
7981
7982 kern_return_t
7983 vm_object_iopl_request(
7984 vm_object_t object,
7985 vm_object_offset_t offset,
7986 upl_size_t size,
7987 upl_t *upl_ptr,
7988 upl_page_info_array_t user_page_list,
7989 unsigned int *page_list_count,
7990 upl_control_flags_t cntrl_flags,
7991 vm_tag_t tag)
7992 {
7993 vm_page_t dst_page;
7994 vm_object_offset_t dst_offset;
7995 upl_size_t xfer_size;
7996 upl_t upl = NULL;
7997 unsigned int entry;
7998 int no_zero_fill = FALSE;
7999 unsigned int size_in_pages;
8000 int page_grab_count = 0;
8001 u_int32_t psize;
8002 kern_return_t ret;
8003 vm_prot_t prot;
8004 struct vm_object_fault_info fault_info = {};
8005 struct vm_page_delayed_work dw_array;
8006 struct vm_page_delayed_work *dwp, *dwp_start;
8007 bool dwp_finish_ctx = TRUE;
8008 int dw_count;
8009 int dw_limit;
8010 int dw_index;
8011 boolean_t caller_lookup;
8012 int io_tracking_flag = 0;
8013 int interruptible;
8014 ppnum_t phys_page;
8015
8016 boolean_t set_cache_attr_needed = FALSE;
8017 boolean_t free_wired_pages = FALSE;
8018 boolean_t fast_path_empty_req = FALSE;
8019 boolean_t fast_path_full_req = FALSE;
8020
8021 task_t task = current_task();
8022
8023 dwp_start = dwp = NULL;
8024
8025 vm_object_offset_t original_offset = offset;
8026 upl_size_t original_size = size;
8027
8028 // DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
8029
8030 size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
8031 offset = vm_object_trunc_page(offset);
8032 if (size != original_size || offset != original_offset) {
8033 DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
8034 }
8035
8036 if (cntrl_flags & ~UPL_VALID_FLAGS) {
8037 /*
8038 * For forward compatibility's sake,
8039 * reject any unknown flag.
8040 */
8041 return KERN_INVALID_VALUE;
8042 }
8043 if (vm_lopage_needed == FALSE) {
8044 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
8045 }
8046
8047 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
8048 if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
8049 return KERN_INVALID_VALUE;
8050 }
8051
8052 if (object->phys_contiguous) {
8053 if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
8054 return KERN_INVALID_ADDRESS;
8055 }
8056
8057 if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
8058 return KERN_INVALID_ADDRESS;
8059 }
8060 }
8061 }
8062 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8063 no_zero_fill = TRUE;
8064 }
8065
8066 if (cntrl_flags & UPL_COPYOUT_FROM) {
8067 prot = VM_PROT_READ;
8068 } else {
8069 prot = VM_PROT_READ | VM_PROT_WRITE;
8070 }
8071
8072 if ((!object->internal) && (object->paging_offset != 0)) {
8073 panic("vm_object_iopl_request: external object with non-zero paging offset");
8074 }
8075
8076 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
8077
8078 #if CONFIG_IOSCHED || UPL_DEBUG
8079 if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) {
8080 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
8081 }
8082 #endif
8083
8084 #if CONFIG_IOSCHED
8085 if (object->io_tracking) {
8086 /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
8087 if (!is_kernel_object(object)) {
8088 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
8089 }
8090 }
8091 #endif
8092
8093 if (object->phys_contiguous) {
8094 psize = PAGE_SIZE;
8095 } else {
8096 psize = size;
8097
8098 dw_count = 0;
8099 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8100 dwp_start = vm_page_delayed_work_get_ctx();
8101 if (dwp_start == NULL) {
8102 dwp_start = &dw_array;
8103 dw_limit = 1;
8104 dwp_finish_ctx = FALSE;
8105 }
8106
8107 dwp = dwp_start;
8108 }
8109
8110 if (cntrl_flags & UPL_SET_INTERNAL) {
8111 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8112 user_page_list = size ? upl->page_list : NULL;
8113 } else {
8114 upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8115 }
8116 if (user_page_list) {
8117 user_page_list[0].device = FALSE;
8118 }
8119 *upl_ptr = upl;
8120
8121 if (cntrl_flags & UPL_NOZEROFILLIO) {
8122 DTRACE_VM4(upl_nozerofillio,
8123 vm_object_t, object,
8124 vm_object_offset_t, offset,
8125 upl_size_t, size,
8126 upl_t, upl);
8127 }
8128
8129 upl->map_object = object;
8130 upl->u_offset = original_offset;
8131 upl->u_size = original_size;
8132
8133 size_in_pages = size / PAGE_SIZE;
8134
8135 if (is_kernel_object(object) &&
8136 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
8137 upl->flags |= UPL_KERNEL_OBJECT;
8138 #if UPL_DEBUG
8139 vm_object_lock(object);
8140 #else
8141 vm_object_lock_shared(object);
8142 #endif
8143 } else {
8144 vm_object_lock(object);
8145 vm_object_activity_begin(object);
8146 }
8147 /*
8148 * paging in progress also protects the paging_offset
8149 */
8150 upl->u_offset = original_offset + object->paging_offset;
8151
8152 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8153 /*
8154 * The user requested that access to the pages in this UPL
	 * be blocked until the UPL is committed or aborted.
8156 */
8157 upl->flags |= UPL_ACCESS_BLOCKED;
8158 }
8159
8160 #if CONFIG_IOSCHED || UPL_DEBUG
8161 if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8162 vm_object_activity_begin(object);
8163 queue_enter(&object->uplq, upl, upl_t, uplq);
8164 }
8165 #endif
8166
8167 if (object->phys_contiguous) {
8168 if (upl->flags & UPL_ACCESS_BLOCKED) {
8169 assert(!object->blocked_access);
8170 object->blocked_access = TRUE;
8171 }
8172
8173 vm_object_unlock(object);
8174
8175 /*
8176 * don't need any shadow mappings for this one
8177 * since it is already I/O memory
8178 */
8179 upl->flags |= UPL_DEVICE_MEMORY;
8180
8181 upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
8182
8183 if (user_page_list) {
8184 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
8185 user_page_list[0].device = TRUE;
8186 }
8187 if (page_list_count != NULL) {
8188 if (upl->flags & UPL_INTERNAL) {
8189 *page_list_count = 0;
8190 } else {
8191 *page_list_count = 1;
8192 }
8193 }
8194
8195 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8196 if (task != NULL) {
8197 counter_add(&task->pages_grabbed_iopl, page_grab_count);
8198 }
8199 return KERN_SUCCESS;
8200 }
8201 if (!is_kernel_object(object) && object != compressor_object) {
8202 /*
8203 * Protect user space from future COW operations
8204 */
8205 #if VM_OBJECT_TRACKING_OP_TRUESHARE
8206 if (!object->true_share &&
8207 vm_object_tracking_btlog) {
8208 btlog_record(vm_object_tracking_btlog, object,
8209 VM_OBJECT_TRACKING_OP_TRUESHARE,
8210 btref_get(__builtin_frame_address(0), 0));
8211 }
8212 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
8213
8214 vm_object_lock_assert_exclusive(object);
8215 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
8216
8217 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
8218 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
8219 }
8220 }
8221
8222 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
8223 object->vo_copy != VM_OBJECT_NULL) {
8224 /*
8225 * Honor copy-on-write obligations
8226 *
8227 * The caller is gathering these pages and
8228 * might modify their contents. We need to
8229 * make sure that the copy object has its own
8230 * private copies of these pages before we let
8231 * the caller modify them.
8232 *
8233 * NOTE: someone else could map the original object
8234 * after we've done this copy-on-write here, and they
8235 * could then see an inconsistent picture of the memory
8236 * while it's being modified via the UPL. To prevent this,
8237 * we would have to block access to these pages until the
8238 * UPL is released. We could use the UPL_BLOCK_ACCESS
8239 * code path for that...
8240 */
8241 vm_object_update(object,
8242 offset,
8243 size,
8244 NULL,
8245 NULL,
8246 FALSE, /* should_return */
8247 MEMORY_OBJECT_COPY_SYNC,
8248 VM_PROT_NO_CHANGE);
8249 VM_PAGEOUT_DEBUG(iopl_cow, 1);
8250 VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
8251 }
8252 if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
8253 object->purgable != VM_PURGABLE_VOLATILE &&
8254 object->purgable != VM_PURGABLE_EMPTY &&
8255 object->vo_copy == NULL &&
8256 size == object->vo_size &&
8257 offset == 0 &&
8258 object->shadow == NULL &&
8259 object->pager == NULL) {
8260 if (object->resident_page_count == size_in_pages) {
8261 assert(object != compressor_object);
8262 assert(!is_kernel_object(object));
8263 fast_path_full_req = TRUE;
8264 } else if (object->resident_page_count == 0) {
8265 assert(object != compressor_object);
8266 assert(!is_kernel_object(object));
8267 fast_path_empty_req = TRUE;
8268 set_cache_attr_needed = TRUE;
8269 }
8270 }
8271
8272 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
8273 interruptible = THREAD_ABORTSAFE;
8274 } else {
8275 interruptible = THREAD_UNINT;
8276 }
8277
8278 entry = 0;
8279
8280 xfer_size = size;
8281 dst_offset = offset;
8282
8283 if (fast_path_full_req) {
8284 if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
8285 goto finish;
8286 }
8287 /*
8288 * we couldn't complete the processing of this request on the fast path
8289 * so fall through to the slow path and finish up
8290 */
8291 } else if (fast_path_empty_req) {
8292 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8293 ret = KERN_MEMORY_ERROR;
8294 goto return_err;
8295 }
8296 ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
8297 cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
8298
8299 if (ret) {
8300 free_wired_pages = TRUE;
8301 goto return_err;
8302 }
8303 goto finish;
8304 }
8305
8306 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
8307 fault_info.lo_offset = offset;
8308 fault_info.hi_offset = offset + xfer_size;
8309 fault_info.mark_zf_absent = TRUE;
8310 fault_info.interruptible = interruptible;
8311 fault_info.batch_pmap_op = TRUE;
8312
8313 while (xfer_size) {
8314 vm_fault_return_t result;
8315
8316 dwp->dw_mask = 0;
8317
8318 if (fast_path_full_req) {
8319 /*
8320 * if we get here, it means that we ran into a page
8321 * state we couldn't handle in the fast path and
8322 * bailed out to the slow path... since the order
			 * we look at pages is different between the two paths,
8324 * the following check is needed to determine whether
8325 * this page was already processed in the fast path
8326 */
8327 if (bitmap_test(upl->lite_list, entry)) {
8328 goto skip_page;
8329 }
8330 }
8331 dst_page = vm_page_lookup(object, dst_offset);
8332
8333 if (dst_page == VM_PAGE_NULL ||
8334 dst_page->vmp_busy ||
8335 VMP_ERROR_GET(dst_page) ||
8336 dst_page->vmp_restart ||
8337 dst_page->vmp_absent ||
8338 vm_page_is_fictitious(dst_page)) {
8339 if (is_kernel_object(object)) {
8340 panic("vm_object_iopl_request: missing/bad page in kernel object");
8341 }
8342 if (object == compressor_object) {
8343 panic("vm_object_iopl_request: missing/bad page in compressor object");
8344 }
8345
8346 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8347 ret = KERN_MEMORY_ERROR;
8348 goto return_err;
8349 }
8350
8351 if (dst_page != VM_PAGE_NULL &&
8352 dst_page->vmp_busy) {
8353 wait_result_t wait_result;
8354 vm_object_lock_assert_exclusive(object);
8355 wait_result = vm_page_sleep(object, dst_page,
8356 interruptible, LCK_SLEEP_DEFAULT);
8357 if (wait_result == THREAD_AWAKENED ||
8358 wait_result == THREAD_RESTART) {
8359 continue;
8360 }
8361 ret = MACH_SEND_INTERRUPTED;
8362 goto return_err;
8363 }
8364
8365 set_cache_attr_needed = TRUE;
8366
8367 /*
8368 * We just looked up the page and the result remains valid
		 * until the object lock is released, so send it to
8370 * vm_fault_page() (as "dst_page"), to avoid having to
8371 * look it up again there.
8372 */
8373 caller_lookup = TRUE;
8374
8375 do {
8376 vm_page_t top_page;
8377 kern_return_t error_code;
8378
8379 fault_info.cluster_size = xfer_size;
8380 vm_object_paging_begin(object);
8381
8382 result = vm_fault_page(object, dst_offset,
8383 prot | VM_PROT_WRITE, FALSE,
8384 caller_lookup,
8385 &prot, &dst_page, &top_page,
8386 (int *)0,
8387 &error_code, no_zero_fill,
8388 &fault_info);
8389
8390 /* our lookup is no longer valid at this point */
8391 caller_lookup = FALSE;
8392
8393 switch (result) {
8394 case VM_FAULT_SUCCESS:
8395 page_grab_count++;
8396
8397 if (!dst_page->vmp_absent) {
8398 vm_page_wakeup_done(object, dst_page);
8399 } else {
8400 /*
8401 * we only get back an absent page if we
8402 * requested that it not be zero-filled
8403 * because we are about to fill it via I/O
8404 *
8405 * absent pages should be left BUSY
8406 * to prevent them from being faulted
8407 * into an address space before we've
8408 * had a chance to complete the I/O on
8409 * them since they may contain info that
8410 * shouldn't be seen by the faulting task
8411 */
8412 }
8413 /*
8414 * Release paging references and
8415 * top-level placeholder page, if any.
8416 */
8417 if (top_page != VM_PAGE_NULL) {
8418 vm_object_t local_object;
8419
8420 local_object = VM_PAGE_OBJECT(top_page);
8421
8422 /*
8423 * comparing 2 packed pointers
8424 */
8425 if (top_page->vmp_object != dst_page->vmp_object) {
8426 vm_object_lock(local_object);
8427 VM_PAGE_FREE(top_page);
8428 vm_object_paging_end(local_object);
8429 vm_object_unlock(local_object);
8430 } else {
8431 VM_PAGE_FREE(top_page);
8432 vm_object_paging_end(local_object);
8433 }
8434 }
8435 vm_object_paging_end(object);
8436 break;
8437
8438 case VM_FAULT_RETRY:
8439 vm_object_lock(object);
8440 break;
8441
8442 case VM_FAULT_MEMORY_SHORTAGE:
8443 OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
8444
8445 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8446
8447 if (vm_page_wait(interruptible)) {
8448 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8449
8450 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8451 vm_object_lock(object);
8452
8453 break;
8454 }
8455 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8456
8457 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8458 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
8459 OS_FALLTHROUGH;
8460
8461 case VM_FAULT_INTERRUPTED:
8462 error_code = MACH_SEND_INTERRUPTED;
8463 OS_FALLTHROUGH;
8464 case VM_FAULT_MEMORY_ERROR:
8465 memory_error:
8466 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
8467
8468 vm_object_lock(object);
8469 goto return_err;
8470
8471 case VM_FAULT_SUCCESS_NO_VM_PAGE:
8472 /* success but no page: fail */
8473 vm_object_paging_end(object);
8474 vm_object_unlock(object);
8475 goto memory_error;
8476
8477 default:
8478 panic("vm_object_iopl_request: unexpected error"
8479 " 0x%x from vm_fault_page()\n", result);
8480 }
8481 } while (result != VM_FAULT_SUCCESS);
8482 }
8483 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8484
8485 if (upl->flags & UPL_KERNEL_OBJECT) {
8486 goto record_phys_addr;
8487 }
8488
8489 if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
8490 dst_page->vmp_busy = TRUE;
8491 goto record_phys_addr;
8492 }
8493
8494 if (dst_page->vmp_cleaning) {
8495 /*
8496 * Someone else is cleaning this page in place.
		 * In theory, we should be able to proceed and use this
		 * page, but they'll probably end up clearing the "busy"
		 * bit in upl_commit_range() even though they didn't set
		 * it, which would clear our "busy" bit and open
		 * us to race conditions.
8502 * We'd better wait for the cleaning to complete and
8503 * then try again.
8504 */
8505 VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
8506 vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
8507 continue;
8508 }
8509 if (dst_page->vmp_laundry) {
8510 vm_pageout_steal_laundry(dst_page, FALSE);
8511 }
8512
8513 if (
8514 ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
8515 phys_page >= (max_valid_dma_address >> PAGE_SHIFT))) {
8516 vm_page_t new_page;
8517 int refmod;
8518
8519 /*
			 * Support devices that can't DMA above 32 bits
			 * by substituting pages from a pool of low-address
			 * memory for any pages we find above the 4G mark.
			 * We can't substitute if the page is already wired,
			 * because we don't know whether that physical address
			 * has been handed out to some other 64-bit-capable
			 * DMA device to use.
8526 */
8527 if (VM_PAGE_WIRED(dst_page)) {
8528 ret = KERN_PROTECTION_FAILURE;
8529 goto return_err;
8530 }
8531
8532 {
8533 new_page = vm_page_grablo();
8534 }
8535
8536 if (new_page == VM_PAGE_NULL) {
8537 ret = KERN_RESOURCE_SHORTAGE;
8538 goto return_err;
8539 }
8540 /*
8541 * from here until the vm_page_replace completes
			 * we mustn't drop the object lock... we don't
8543 * want anyone refaulting this page in and using
8544 * it after we disconnect it... we want the fault
8545 * to find the new page being substituted.
8546 */
8547 if (dst_page->vmp_pmapped) {
8548 refmod = pmap_disconnect(phys_page);
8549 } else {
8550 refmod = 0;
8551 }
8552
8553 if (!dst_page->vmp_absent) {
8554 vm_page_copy(dst_page, new_page);
8555 }
8556
8557 new_page->vmp_reference = dst_page->vmp_reference;
8558 new_page->vmp_dirty = dst_page->vmp_dirty;
8559 new_page->vmp_absent = dst_page->vmp_absent;
8560
8561 if (refmod & VM_MEM_REFERENCED) {
8562 new_page->vmp_reference = TRUE;
8563 }
8564 if (refmod & VM_MEM_MODIFIED) {
8565 SET_PAGE_DIRTY(new_page, FALSE);
8566 }
8567
8568 vm_page_replace(new_page, object, dst_offset);
8569
8570 dst_page = new_page;
8571 /*
8572 * vm_page_grablo returned the page marked
8573 * BUSY... we don't need a PAGE_WAKEUP_DONE
8574 * here, because we've never dropped the object lock
8575 */
8576 if (!dst_page->vmp_absent) {
8577 dst_page->vmp_busy = FALSE;
8578 }
8579
8580 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8581 }
8582 if (!dst_page->vmp_busy) {
8583 dwp->dw_mask |= DW_vm_page_wire;
8584 }
8585
8586 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8587 /*
8588 * Mark the page "busy" to block any future page fault
8589 * on this page in addition to wiring it.
8590 * We'll also remove the mapping
8591 * of all these pages before leaving this routine.
8592 */
8593 assert(!vm_page_is_fictitious(dst_page));
8594 dst_page->vmp_busy = TRUE;
8595 }
8596 /*
8597 * expect the page to be used
8598 * page queues lock must be held to set 'reference'
8599 */
8600 dwp->dw_mask |= DW_set_reference;
8601
8602 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8603 SET_PAGE_DIRTY(dst_page, TRUE);
8604 /*
8605 * Page belonging to a code-signed object is about to
8606 * be written. Mark it tainted and disconnect it from
8607 * all pmaps so processes have to fault it back in and
8608 * deal with the tainted bit.
8609 */
8610 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
8611 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
8612 vm_page_iopl_tainted++;
8613 if (dst_page->vmp_pmapped) {
8614 int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
8615 if (refmod & VM_MEM_REFERENCED) {
8616 dst_page->vmp_reference = TRUE;
8617 }
8618 }
8619 }
8620 }
8621 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8622 pmap_sync_page_attributes_phys(phys_page);
8623 dst_page->vmp_written_by_kernel = FALSE;
8624 }
8625
8626 record_phys_addr:
8627 if (dst_page->vmp_busy) {
8628 upl->flags |= UPL_HAS_BUSY;
8629 }
8630
8631 bitmap_set(upl->lite_list, entry);
8632
8633 if (phys_page > upl->highest_page) {
8634 upl->highest_page = phys_page;
8635 }
8636
8637 if (user_page_list) {
8638 user_page_list[entry].phys_addr = phys_page;
8639 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
8640 user_page_list[entry].absent = dst_page->vmp_absent;
8641 user_page_list[entry].dirty = dst_page->vmp_dirty;
8642 user_page_list[entry].precious = dst_page->vmp_precious;
8643 user_page_list[entry].device = FALSE;
8644 user_page_list[entry].needed = FALSE;
8645 if (dst_page->vmp_clustered == TRUE) {
8646 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
8647 } else {
8648 user_page_list[entry].speculative = FALSE;
8649 }
8650 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
8651 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
8652 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
8653 user_page_list[entry].mark = FALSE;
8654 }
8655 if (!is_kernel_object(object) && object != compressor_object) {
8656 /*
8657 * someone is explicitly grabbing this page...
8658 * update clustered and speculative state
8659 *
8660 */
8661 if (dst_page->vmp_clustered) {
8662 VM_PAGE_CONSUME_CLUSTERED(dst_page);
8663 }
8664 }
8665 skip_page:
8666 entry++;
8667 dst_offset += PAGE_SIZE_64;
8668 xfer_size -= PAGE_SIZE;
8669
8670 if (dwp->dw_mask) {
8671 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
8672
8673 if (dw_count >= dw_limit) {
8674 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8675
8676 dwp = dwp_start;
8677 dw_count = 0;
8678 }
8679 }
8680 }
8681 assert(entry == size_in_pages);
8682
8683 if (dw_count) {
8684 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8685 dwp = dwp_start;
8686 dw_count = 0;
8687 }
8688 finish:
8689 if (user_page_list && set_cache_attr_needed == TRUE) {
8690 vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
8691 }
8692
8693 if (page_list_count != NULL) {
8694 if (upl->flags & UPL_INTERNAL) {
8695 *page_list_count = 0;
8696 } else if (*page_list_count > size_in_pages) {
8697 *page_list_count = size_in_pages;
8698 }
8699 }
8700 vm_object_unlock(object);
8701
8702 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8703 /*
8704 * We've marked all the pages "busy" so that future
8705 * page faults will block.
8706 * Now remove the mapping for these pages, so that they
8707 * can't be accessed without causing a page fault.
8708 */
8709 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
8710 PMAP_NULL,
8711 PAGE_SIZE,
8712 0, VM_PROT_NONE);
8713 assert(!object->blocked_access);
8714 object->blocked_access = TRUE;
8715 }
8716
8717 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8718 if (task != NULL) {
8719 counter_add(&task->pages_grabbed_iopl, page_grab_count);
8720 }
8721
8722 if (dwp_start && dwp_finish_ctx) {
8723 vm_page_delayed_work_finish_ctx(dwp_start);
8724 dwp_start = dwp = NULL;
8725 }
8726
8727 return KERN_SUCCESS;
8728
8729 return_err:
8730 dw_index = 0;
8731
8732 for (; offset < dst_offset; offset += PAGE_SIZE) {
8733 boolean_t need_unwire;
8734 bool need_wakeup;
8735
8736 dst_page = vm_page_lookup(object, offset);
8737
8738 if (dst_page == VM_PAGE_NULL) {
8739 panic("vm_object_iopl_request: Wired page missing.");
8740 }
8741
8742 /*
8743 * if we've already processed this page in an earlier
8744 * dw_do_work, we need to undo the wiring... we will
8745 * leave the dirty and reference bits on if they
8746 * were set, since we don't have a good way of knowing
8747 * what the previous state was and we won't get here
8748 * under any normal circumstances... we will always
	 * clear BUSY and wake up any waiters via vm_page_free
8750 * or PAGE_WAKEUP_DONE
8751 */
8752 need_unwire = TRUE;
8753
8754 need_wakeup = false;
8755 if (dw_count) {
8756 if ((dwp_start)[dw_index].dw_m == dst_page) {
8757 /*
8758 * still in the deferred work list
8759 * which means we haven't yet called
8760 * vm_page_wire on this page
8761 */
8762 need_unwire = FALSE;
8763
8764 if (dst_page->vmp_busy &&
8765 ((dwp_start)[dw_index].dw_mask & DW_clear_busy)) {
8766 /*
8767 * It's our own "busy" bit, so we need to clear it
8768 * now and wake up waiters below.
8769 */
8770 dst_page->vmp_busy = false;
8771 need_wakeup = true;
8772 }
8773
8774 dw_index++;
8775 dw_count--;
8776 }
8777 }
8778 vm_page_lock_queues();
8779
8780 if (dst_page->vmp_absent || free_wired_pages == TRUE) {
8781 vm_page_free(dst_page);
8782
8783 need_unwire = FALSE;
8784 } else {
8785 if (need_unwire == TRUE) {
8786 vm_page_unwire(dst_page, TRUE);
8787 }
8788 if (dst_page->vmp_busy) {
8789 /* not our "busy" or we would have cleared it above */
8790 assert(!need_wakeup);
8791 }
8792 if (need_wakeup) {
8793 assert(!dst_page->vmp_busy);
8794 vm_page_wakeup(object, dst_page);
8795 }
8796 }
8797 vm_page_unlock_queues();
8798
8799 if (need_unwire == TRUE) {
8800 counter_inc(&vm_statistics_reactivations);
8801 }
8802 }
8803 #if UPL_DEBUG
8804 upl->upl_state = 2;
8805 #endif
8806 if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8807 vm_object_activity_end(object);
8808 vm_object_collapse(object, 0, TRUE);
8809 }
8810 vm_object_unlock(object);
8811 upl_destroy(upl);
8812
8813 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
8814 if (task != NULL) {
8815 counter_add(&task->pages_grabbed_iopl, page_grab_count);
8816 }
8817
8818 if (dwp_start && dwp_finish_ctx) {
8819 vm_page_delayed_work_finish_ctx(dwp_start);
8820 dwp_start = dwp = NULL;
8821 }
8822 return ret;
8823 }
8824
8825 kern_return_t
8826 upl_transpose(
8827 upl_t upl1,
8828 upl_t upl2)
8829 {
8830 kern_return_t retval;
8831 boolean_t upls_locked;
8832 vm_object_t object1, object2;
8833
8834 /* LD: Should mapped UPLs be eligible for a transpose? */
8835 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
8836 return KERN_INVALID_ARGUMENT;
8837 }
8838
8839 upls_locked = FALSE;
8840
8841 /*
8842 * Since we need to lock both UPLs at the same time,
8843 * avoid deadlocks by always taking locks in the same order.
8844 */
8845 if (upl1 < upl2) {
8846 upl_lock(upl1);
8847 upl_lock(upl2);
8848 } else {
8849 upl_lock(upl2);
8850 upl_lock(upl1);
8851 }
8852 upls_locked = TRUE; /* the UPLs will need to be unlocked */
8853
8854 object1 = upl1->map_object;
8855 object2 = upl2->map_object;
8856
8857 if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
8858 upl1->u_size != upl2->u_size) {
8859 /*
8860 * We deal only with full objects, not subsets.
8861 * That's because we exchange the entire backing store info
8862 * for the objects: pager, resident pages, etc... We can't do
8863 * only part of it.
8864 */
8865 retval = KERN_INVALID_VALUE;
8866 goto done;
8867 }
8868
8869 /*
8870 * Tranpose the VM objects' backing store.
8871 */
8872 retval = vm_object_transpose(object1, object2,
8873 upl_adjusted_size(upl1, PAGE_MASK));
8874
8875 if (retval == KERN_SUCCESS) {
8876 /*
8877 * Make each UPL point to the correct VM object, i.e. the
8878 * object holding the pages that the UPL refers to...
8879 */
8880 #if CONFIG_IOSCHED || UPL_DEBUG
8881 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
8882 vm_object_lock(object1);
8883 vm_object_lock(object2);
8884 }
8885 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8886 queue_remove(&object1->uplq, upl1, upl_t, uplq);
8887 }
8888 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8889 queue_remove(&object2->uplq, upl2, upl_t, uplq);
8890 }
8891 #endif
8892 upl1->map_object = object2;
8893 upl2->map_object = object1;
8894
8895 #if CONFIG_IOSCHED || UPL_DEBUG
8896 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8897 queue_enter(&object2->uplq, upl1, upl_t, uplq);
8898 }
8899 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8900 queue_enter(&object1->uplq, upl2, upl_t, uplq);
8901 }
8902 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
8903 vm_object_unlock(object2);
8904 vm_object_unlock(object1);
8905 }
8906 #endif
8907 }
8908
8909 done:
8910 /*
8911 * Cleanup.
8912 */
8913 if (upls_locked) {
8914 upl_unlock(upl1);
8915 upl_unlock(upl2);
8916 upls_locked = FALSE;
8917 }
8918
8919 return retval;
8920 }
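
/*
 * Illustrative sketch (not compiled): the address-ordered locking pattern
 * used by upl_transpose() above, written out as a stand-alone helper. Any
 * two locks that may be held simultaneously must always be acquired in a
 * single global order -- here, ascending pointer value -- so that two
 * threads locking the same pair of UPLs cannot deadlock. upl_lock_both()
 * is a hypothetical name, not an existing kernel function.
 */
#if 0
static void
upl_lock_both(upl_t a, upl_t b)
{
    assert(a != b);
    if (a < b) {
        upl_lock(a);
        upl_lock(b);
    } else {
        upl_lock(b);
        upl_lock(a);
    }
    /* the unlock order doesn't matter; only the acquisition order does */
}
#endif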

void
upl_range_needed(
    upl_t upl,
    int index,
    int count)
{
    int size_in_pages;

    if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
        return;
    }

    size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;

    while (count-- && index < size_in_pages) {
        upl->page_list[index++].needed = TRUE;
    }
}
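
/*
 * Illustrative sketch (not compiled): how a caller might use
 * upl_range_needed() to flag the sub-range of an internal UPL that was
 * actually requested, so only those pages are treated as "needed" at
 * commit time. The offset/length pair is assumed to come from the caller;
 * mark_requested_subrange() is a hypothetical helper name.
 */
#if 0
static void
mark_requested_subrange(upl_t upl, upl_offset_t offset, upl_size_t length)
{
    int first_page = (int)(offset / PAGE_SIZE);
    int num_pages = (int)(round_page_32(length) / PAGE_SIZE);

    /* a no-op unless the UPL carries an internal page list */
    upl_range_needed(upl, first_page, num_pages);
}
#endif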

/*
 * Reserve of virtual addresses in the kernel address space.
 * We need to map the physical pages in the kernel, so that we
 * can call the code-signing or slide routines with a kernel
 * virtual address. We keep this pool of pre-allocated kernel
 * virtual addresses so that we don't have to scan the kernel's
 * virtual address space each time we need to work with
 * a physical page.
 */
SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
#define VM_PAGING_NUM_PAGES     64
SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
bool vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
int vm_paging_max_index = 0;
int vm_paging_page_waiter = 0;
int vm_paging_page_waiter_total = 0;

unsigned long vm_paging_no_kernel_page = 0;
unsigned long vm_paging_objects_mapped = 0;
unsigned long vm_paging_pages_mapped = 0;
unsigned long vm_paging_objects_mapped_slow = 0;
unsigned long vm_paging_pages_mapped_slow = 0;

__startup_func
static void
vm_paging_map_init(void)
{
    kmem_alloc(kernel_map, &vm_paging_base_address,
        ptoa(VM_PAGING_NUM_PAGES),
        KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
        VM_KERN_MEMORY_NONE);
}
STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);

/*
 * vm_paging_map_object:
 *      Maps part of a VM object's pages in the kernel
 *      virtual address space, using the pre-allocated
 *      kernel virtual addresses, if possible.
 * Context:
 *      The VM object is locked. This lock will get
 *      dropped and re-acquired though, so the caller
 *      must make sure the VM object is kept alive
 *      (by holding a VM map that has a reference
 *      on it, for example, or taking an extra reference).
 *      The page should also be kept busy to prevent
 *      it from being reclaimed.
 */
kern_return_t
vm_paging_map_object(
    vm_page_t page,
    vm_object_t object,
    vm_object_offset_t offset,
    vm_prot_t protection,
    boolean_t can_unlock_object,
    vm_map_size_t *size,            /* IN/OUT */
    vm_map_offset_t *address,       /* OUT */
    boolean_t *need_unmap)          /* OUT */
{
    kern_return_t kr;
    vm_map_offset_t page_map_offset;
    vm_map_size_t map_size;
    vm_object_offset_t object_offset;
    int i;

    if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
        /* use the permanent 1-to-1 kernel mapping of physical memory */
        *address = (vm_map_offset_t)
            phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
        *need_unmap = FALSE;
        return KERN_SUCCESS;

        /*
         * NOTE: the unconditional return above makes the pre-allocated
         * pool path below unreachable for the single-page case; it is
         * retained for configurations that cannot rely on a permanent
         * physical aperture.
         */
        assert(page->vmp_busy);
        /*
         * Use one of the pre-allocated kernel virtual addresses
         * and just enter the VM page in the kernel address space
         * at that virtual address.
         */
        simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);

        /*
         * Try and find an available kernel virtual address
         * from our pre-allocated pool.
         */
        page_map_offset = 0;
        for (;;) {
            for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
                if (vm_paging_page_inuse[i] == FALSE) {
                    page_map_offset =
                        vm_paging_base_address +
                        (i * PAGE_SIZE);
                    break;
                }
            }
            if (page_map_offset != 0) {
                /* found a space to map our page ! */
                break;
            }

            if (can_unlock_object) {
                /*
                 * If we can afford to unlock the VM object,
                 * let's take the slow path now...
                 */
                break;
            }
            /*
             * We can't afford to unlock the VM object, so
             * let's wait for a space to become available...
             */
            vm_paging_page_waiter_total++;
            vm_paging_page_waiter++;
            kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
            if (kr == THREAD_WAITING) {
                simple_unlock(&vm_paging_lock);
                kr = thread_block(THREAD_CONTINUE_NULL);
                simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
            }
            vm_paging_page_waiter--;
            /* ... and try again */
        }

        if (page_map_offset != 0) {
            /*
             * We found a kernel virtual address;
             * map the physical page to that virtual address.
             */
            if (i > vm_paging_max_index) {
                vm_paging_max_index = i;
            }
            vm_paging_page_inuse[i] = TRUE;
            simple_unlock(&vm_paging_lock);

            page->vmp_pmapped = TRUE;

            /*
             * Keep the VM object locked over the PMAP_ENTER
             * and the actual use of the page by the kernel,
             * or this pmap mapping might get undone by a
             * vm_object_pmap_protect() call...
             */
            kr = pmap_enter_check(kernel_pmap,
                page_map_offset,
                page,
                protection,
                VM_PROT_NONE,
                TRUE);
            assert(kr == KERN_SUCCESS);
            vm_paging_objects_mapped++;
            vm_paging_pages_mapped++;
            *address = page_map_offset;
            *need_unmap = TRUE;

#if KASAN
            kasan_notify_address(page_map_offset, PAGE_SIZE);
#endif

            /* all done and mapped, ready to use ! */
            return KERN_SUCCESS;
        }

        /*
         * We ran out of pre-allocated kernel virtual
         * addresses. Just map the page in the kernel
         * the slow and regular way.
         */
        vm_paging_no_kernel_page++;
        simple_unlock(&vm_paging_lock);
    }

    if (!can_unlock_object) {
        *address = 0;
        *size = 0;
        *need_unmap = FALSE;
        return KERN_NOT_SUPPORTED;
    }

    object_offset = vm_object_trunc_page(offset);
    map_size = vm_map_round_page(*size,
        VM_MAP_PAGE_MASK(kernel_map));

    /*
     * Try and map the required range of the object
     * in the kernel_map. Given that the allocation is
     * for pageable memory, it shouldn't contain
     * pointers and is mapped into the data range.
     */

    vm_object_reference_locked(object);     /* for the map entry */
    vm_object_unlock(object);

    kr = vm_map_enter(kernel_map,
        address,
        map_size,
        0,
        VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
        object,
        object_offset,
        FALSE,
        protection,
        VM_PROT_ALL,
        VM_INHERIT_NONE);
    if (kr != KERN_SUCCESS) {
        *address = 0;
        *size = 0;
        *need_unmap = FALSE;
        vm_object_deallocate(object);   /* for the map entry */
        vm_object_lock(object);
        return kr;
    }

    *size = map_size;

    /*
     * Enter the mapped pages in the page table now.
     */
    vm_object_lock(object);
    /*
     * VM object must be kept locked from before PMAP_ENTER()
     * until after the kernel is done accessing the page(s).
     * Otherwise, the pmap mappings in the kernel could be
     * undone by a call to vm_object_pmap_protect().
     */

    for (page_map_offset = 0;
        map_size != 0;
        map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
        page = vm_page_lookup(object, offset + page_map_offset);
        if (page == VM_PAGE_NULL) {
            printf("vm_paging_map_object: no page !?\n");
            vm_object_unlock(object);
            vm_map_remove(kernel_map, *address, *size);
            *address = 0;
            *size = 0;
            *need_unmap = FALSE;
            vm_object_lock(object);
            return KERN_MEMORY_ERROR;
        }
        page->vmp_pmapped = TRUE;

        kr = pmap_enter_check(kernel_pmap,
            *address + page_map_offset,
            page,
            protection,
            VM_PROT_NONE,
            TRUE);
        assert(kr == KERN_SUCCESS);
#if KASAN
        kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
#endif
    }

    vm_paging_objects_mapped_slow++;
    /* map_size was consumed by the loop above; *size holds the mapped size */
    vm_paging_pages_mapped_slow += (unsigned long) (*size / PAGE_SIZE_64);

    *need_unmap = TRUE;

    return KERN_SUCCESS;
}

/*
 * vm_paging_unmap_object:
 *      Unmaps part of a VM object's pages from the kernel
 *      virtual address space.
 * Context:
 *      The VM object is locked. This lock will get
 *      dropped and re-acquired though.
 */
void
vm_paging_unmap_object(
    vm_object_t object,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
    int i;

    if ((vm_paging_base_address == 0) ||
        (start < vm_paging_base_address) ||
        (end > (vm_paging_base_address
        + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
        /*
         * We didn't use our pre-allocated pool of
         * kernel virtual addresses. Deallocate the
         * virtual memory.
         */
        if (object != VM_OBJECT_NULL) {
            vm_object_unlock(object);
        }
        vm_map_remove(kernel_map, start, end);
        if (object != VM_OBJECT_NULL) {
            vm_object_lock(object);
        }
    } else {
        /*
         * We used a kernel virtual address from our
         * pre-allocated pool. Put it back in the pool
         * for next time.
         */
        assert(end - start == PAGE_SIZE);
        i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
        assert(i >= 0 && i < VM_PAGING_NUM_PAGES);

        /* undo the pmap mapping */
        pmap_remove(kernel_pmap, start, end);

        simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
        vm_paging_page_inuse[i] = FALSE;
        if (vm_paging_page_waiter) {
            thread_wakeup(&vm_paging_page_waiter);
        }
        simple_unlock(&vm_paging_lock);
    }
}
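
/*
 * Illustrative sketch (not compiled): the expected calling pattern for
 * vm_paging_map_object()/vm_paging_unmap_object(). The object must be
 * locked and kept alive by the caller and the page kept busy; the mapping
 * is only torn down when *need_unmap comes back TRUE (the single-page
 * case can be satisfied by the permanent physical map, in which case
 * there is nothing to undo). checksum_one_page() is a hypothetical
 * caller, not an existing kernel function.
 */
#if 0
static kern_return_t
checksum_one_page(vm_object_t object, vm_page_t page, vm_object_offset_t offset)
{
    vm_map_size_t size = PAGE_SIZE;
    vm_map_offset_t kaddr;
    boolean_t need_unmap;
    kern_return_t kr;

    /* object locked, page busy */
    kr = vm_paging_map_object(page, object, offset, VM_PROT_READ,
        FALSE,          /* can_unlock_object */
        &size, &kaddr, &need_unmap);
    if (kr != KERN_SUCCESS) {
        return kr;
    }

    /* ... read the page's contents through (const char *)kaddr ... */

    if (need_unmap) {
        vm_paging_unmap_object(object, kaddr, kaddr + size);
    }
    return KERN_SUCCESS;
}
#endif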

/*
 * page->vmp_object must be locked
 */
void
vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
{
    if (!queues_locked) {
        vm_page_lockspin_queues();
    }

    page->vmp_free_when_done = FALSE;
    /*
     * We need to drop the laundry count...
     * we may also need to remove it
     * from the I/O paging queue...
     * vm_pageout_throttle_up handles both cases
     *
     * the laundry and pageout_queue flags are cleared...
     */
    vm_pageout_throttle_up(page);

    if (!queues_locked) {
        vm_page_unlock_queues();
    }
}

#define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64

upl_t
vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
{
    int i = 0;
    upl_t upl;

    assert(max_upls > 0);
    if (max_upls == 0) {
        return NULL;
    }

    if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
        max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
    }
    vector_upl_t vector_upl = kalloc_type(struct _vector_upl,
        typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);

    upl = upl_create(0, UPL_VECTOR, 0);
    upl->vector_upl = vector_upl;
    upl->u_offset = upl_offset;
    vector_upl->size = 0;
    vector_upl->offset = upl_offset;
    vector_upl->invalid_upls = 0;
    vector_upl->num_upls = 0;
    vector_upl->pagelist = NULL;
    vector_upl->max_upls = max_upls;

    for (i = 0; i < max_upls; i++) {
        vector_upl->upls[i].iostate.size = 0;
        vector_upl->upls[i].iostate.offset = 0;
    }
    return upl;
}
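
/*
 * Illustrative sketch (not compiled): assembling a vector UPL from a set
 * of already-created sub-UPLs, in the style of the cluster I/O layer.
 * Each sub-UPL is registered with its I/O size via vector_upl_set_subupl()
 * and its iostate recorded so that vector_upl_subupl_byoffset() can later
 * translate a vector-relative offset back to the right sub-UPL.
 * build_vector_upl() is a hypothetical helper; the subupls/sizes arrays
 * are assumed to come from the caller.
 */
#if 0
static upl_t
build_vector_upl(vm_offset_t base_offset, upl_t *subupls, upl_size_t *sizes, uint32_t count)
{
    upl_t vupl = vector_upl_create(base_offset, count);
    upl_offset_t cursor = 0;

    for (uint32_t i = 0; i < count; i++) {
        vector_upl_set_subupl(vupl, subupls[i], sizes[i]);
        vector_upl_set_iostate(vupl, subupls[i], cursor, sizes[i]);
        cursor += sizes[i];
    }
    /* flatten the sub-UPLs' page lists into one contiguous list */
    vector_upl_set_pagelist(vupl);
    return vupl;
}
#endif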

upl_size_t
vector_upl_get_size(const upl_t upl)
{
    if (!vector_upl_is_valid(upl)) {
        return upl_get_size(upl);
    } else {
        return round_page_32(upl->vector_upl->size);
    }
}

uint32_t
vector_upl_max_upls(const upl_t upl)
{
    if (!vector_upl_is_valid(upl)) {
        return 0;
    }
    return ((vector_upl_t)(upl->vector_upl))->max_upls;
}

void
vector_upl_deallocate(upl_t upl)
{
    vector_upl_t vector_upl = upl->vector_upl;

    assert(vector_upl_is_valid(upl));

    if (vector_upl->invalid_upls != vector_upl->num_upls) {
        panic("Deallocating non-empty Vectored UPL");
    }
    uint32_t max_upls = vector_upl->max_upls;
    kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
    kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
    upl->vector_upl = NULL;
}

boolean_t
vector_upl_is_valid(upl_t upl)
{
    return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
}

boolean_t
vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
{
    if (vector_upl_is_valid(upl)) {
        vector_upl_t vector_upl = upl->vector_upl;

        if (vector_upl) {
            if (subupl) {
                if (io_size) {
                    if (io_size < PAGE_SIZE) {
                        io_size = PAGE_SIZE;
                    }
                    subupl->vector_upl = (void*)vector_upl;
                    vector_upl->upls[vector_upl->num_upls++].elem = subupl;
                    vector_upl->size += io_size;
                    upl->u_size += io_size;
                } else {
                    uint32_t i = 0, invalid_upls = 0;
                    for (i = 0; i < vector_upl->num_upls; i++) {
                        if (vector_upl->upls[i].elem == subupl) {
                            break;
                        }
                    }
                    if (i == vector_upl->num_upls) {
                        panic("Trying to remove sub-upl when none exists");
                    }

                    vector_upl->upls[i].elem = NULL;
                    invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
                        relaxed);
                    if (invalid_upls == vector_upl->num_upls) {
                        return TRUE;
                    } else {
                        return FALSE;
                    }
                }
            } else {
                panic("vector_upl_set_subupl was passed a NULL UPL element");
            }
        } else {
            panic("vector_upl_set_subupl was passed a non-vectored UPL");
        }
    } else {
        panic("vector_upl_set_subupl was passed a NULL UPL");
    }

    return FALSE;
}

void
vector_upl_set_pagelist(upl_t upl)
{
    if (vector_upl_is_valid(upl)) {
        uint32_t i = 0;
        vector_upl_t vector_upl = upl->vector_upl;

        if (vector_upl) {
            vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;

            vector_upl->pagelist = kalloc_type(struct upl_page_info,
                atop(vector_upl->size), Z_WAITOK);

            for (i = 0; i < vector_upl->num_upls; i++) {
                cur_upl_pagelist_size = sizeof(struct upl_page_info) *
                    upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
                bcopy(vector_upl->upls[i].elem->page_list,
                    (char*)vector_upl->pagelist + pagelist_size,
                    cur_upl_pagelist_size);
                pagelist_size += cur_upl_pagelist_size;
                if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
                    upl->highest_page = vector_upl->upls[i].elem->highest_page;
                }
            }
            assert(pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
        } else {
            panic("vector_upl_set_pagelist was passed a non-vectored UPL");
        }
    } else {
        panic("vector_upl_set_pagelist was passed a NULL UPL");
    }
}

upl_t
vector_upl_subupl_byindex(upl_t upl, uint32_t index)
{
    if (vector_upl_is_valid(upl)) {
        vector_upl_t vector_upl = upl->vector_upl;
        if (vector_upl) {
            if (index < vector_upl->num_upls) {
                return vector_upl->upls[index].elem;
            }
        } else {
            panic("vector_upl_subupl_byindex was passed a non-vectored UPL");
        }
    }
    return NULL;
}

upl_t
vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
{
    if (vector_upl_is_valid(upl)) {
        uint32_t i = 0;
        vector_upl_t vector_upl = upl->vector_upl;

        if (vector_upl) {
            upl_t subupl = NULL;
            vector_upl_iostates_t subupl_state;

            for (i = 0; i < vector_upl->num_upls; i++) {
                subupl = vector_upl->upls[i].elem;
                subupl_state = vector_upl->upls[i].iostate;
                if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
                    /*
                     * We could have been passed an offset/size pair that belongs
                     * to a UPL element that has already been committed/aborted.
                     * If so, return NULL.
                     */
                    if (subupl == NULL) {
                        return NULL;
                    }
                    if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
                        *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
                        if (*upl_size > subupl_state.size) {
                            *upl_size = subupl_state.size;
                        }
                    }
                    if (*upl_offset >= subupl_state.offset) {
                        *upl_offset -= subupl_state.offset;
                    } else if (i) {
                        panic("Vector UPL offset miscalculation");
                    }
                    return subupl;
                }
            }
        } else {
            panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
        }
    }
    return NULL;
}

void
vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
{
    *v_upl_submap = NULL;

    if (vector_upl_is_valid(upl)) {
        vector_upl_t vector_upl = upl->vector_upl;
        if (vector_upl) {
            *v_upl_submap = vector_upl->submap;
            *submap_dst_addr = vector_upl->submap_dst_addr;
        } else {
            panic("vector_upl_get_submap was passed a non-vectored UPL");
        }
    } else {
        panic("vector_upl_get_submap was passed a NULL UPL");
    }
}

void
vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
{
    if (vector_upl_is_valid(upl)) {
        vector_upl_t vector_upl = upl->vector_upl;
        if (vector_upl) {
            vector_upl->submap = submap;
            vector_upl->submap_dst_addr = submap_dst_addr;
        } else {
            panic("vector_upl_set_submap was passed a non-vectored UPL");
        }
    } else {
        panic("vector_upl_set_submap was passed a NULL UPL");
    }
}

void
vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
{
    if (vector_upl_is_valid(upl)) {
        uint32_t i = 0;
        vector_upl_t vector_upl = upl->vector_upl;

        if (vector_upl) {
            for (i = 0; i < vector_upl->num_upls; i++) {
                if (vector_upl->upls[i].elem == subupl) {
                    break;
                }
            }

            if (i == vector_upl->num_upls) {
                panic("setting sub-upl iostate when none exists");
            }

            vector_upl->upls[i].iostate.offset = offset;
            if (size < PAGE_SIZE) {
                size = PAGE_SIZE;
            }
            vector_upl->upls[i].iostate.size = size;
        } else {
            panic("vector_upl_set_iostate was passed a non-vectored UPL");
        }
    } else {
        panic("vector_upl_set_iostate was passed a NULL UPL");
    }
}

void
vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
{
    if (vector_upl_is_valid(upl)) {
        uint32_t i = 0;
        vector_upl_t vector_upl = upl->vector_upl;

        if (vector_upl) {
            for (i = 0; i < vector_upl->num_upls; i++) {
                if (vector_upl->upls[i].elem == subupl) {
                    break;
                }
            }

            if (i == vector_upl->num_upls) {
                panic("getting sub-upl iostate when none exists");
            }

            *offset = vector_upl->upls[i].iostate.offset;
            *size = vector_upl->upls[i].iostate.size;
        } else {
            panic("vector_upl_get_iostate was passed a non-vectored UPL");
        }
    } else {
        panic("vector_upl_get_iostate was passed a NULL UPL");
    }
}

void
vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
{
    if (vector_upl_is_valid(upl)) {
        vector_upl_t vector_upl = upl->vector_upl;
        if (vector_upl) {
            if (index < vector_upl->num_upls) {
                *offset = vector_upl->upls[index].iostate.offset;
                *size = vector_upl->upls[index].iostate.size;
            } else {
                *offset = *size = 0;
            }
        } else {
            panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
        }
    } else {
        panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
    }
}

void *
upl_get_internal_vectorupl(upl_t upl)
{
    return upl->vector_upl;
}

upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)
{
    return upl->vector_upl->pagelist;
}

upl_page_info_t *
upl_get_internal_page_list(upl_t upl)
{
    return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
}

void
upl_clear_dirty(
    upl_t upl,
    boolean_t value)
{
    if (value) {
        upl->flags |= UPL_CLEAR_DIRTY;
    } else {
        upl->flags &= ~UPL_CLEAR_DIRTY;
    }
}

void
upl_set_referenced(
    upl_t upl,
    boolean_t value)
{
    upl_lock(upl);
    if (value) {
        upl->ext_ref_count++;
    } else {
        if (!upl->ext_ref_count) {
            panic("upl_set_referenced: no external reference to drop on UPL %p", upl);
        }
        upl->ext_ref_count--;
    }
    upl_unlock(upl);
}

void
upl_set_map_exclusive(upl_t upl)
{
    upl_lock(upl);
    while (upl->map_addr_owner) {
        upl->flags |= UPL_MAP_EXCLUSIVE_WAIT;
        upl_lock_sleep(upl, &upl->map_addr_owner, ctid_get_thread(upl->map_addr_owner));
    }
    upl->map_addr_owner = thread_get_ctid(current_thread());
    upl_unlock(upl);
}

void
upl_clear_map_exclusive(upl_t upl)
{
    assert(upl->map_addr_owner == thread_get_ctid(current_thread()));
    upl_lock(upl);
    if (upl->flags & UPL_MAP_EXCLUSIVE_WAIT) {
        upl->flags &= ~UPL_MAP_EXCLUSIVE_WAIT;
        upl_wakeup(&upl->map_addr_owner);
    }
    upl->map_addr_owner = 0;
    upl_unlock(upl);
}
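
/*
 * Illustrative sketch (not compiled): bracketing a UPL mapping with the
 * exclusive-owner protocol above. upl_set_map_exclusive() parks the
 * calling thread until no other thread owns the mapping address, then
 * records the caller as owner; upl_clear_map_exclusive() hands ownership
 * back and wakes any waiters. vm_upl_map()/map_upl_exclusively() are
 * assumptions here -- the former an existing mapping entry point, the
 * latter a hypothetical wrapper.
 */
#if 0
static kern_return_t
map_upl_exclusively(vm_map_t map, upl_t upl, vm_map_offset_t *addr)
{
    kern_return_t kr;

    upl_set_map_exclusive(upl);
    kr = vm_upl_map(map, upl, addr);
    if (kr != KERN_SUCCESS) {
        /* nothing got mapped: release ownership immediately */
        upl_clear_map_exclusive(upl);
    }
    return kr;
}
#endif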

#if CONFIG_IOSCHED
void
upl_set_blkno(
    upl_t upl,
    vm_offset_t upl_offset,
    int io_size,
    int64_t blkno)
{
    int i, j;
    if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
        return;
    }

    assert(upl->upl_reprio_info != 0);
    for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
        UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
    }
}
#endif

inline void
memoryshot(unsigned int event, unsigned int control)
{
    if (vm_debug_events) {
        KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
            vm_page_active_count, vm_page_inactive_count,
            vm_page_free_count, vm_page_speculative_count,
            vm_page_throttled_count);
    } else {
        (void) event;
        (void) control;
    }
}

#ifdef MACH_BSD

boolean_t
upl_device_page(upl_page_info_t *upl)
{
    return UPL_DEVICE_PAGE(upl);
}

boolean_t
upl_page_present(upl_page_info_t *upl, int index)
{
    return UPL_PAGE_PRESENT(upl, index);
}

boolean_t
upl_speculative_page(upl_page_info_t *upl, int index)
{
    return UPL_SPECULATIVE_PAGE(upl, index);
}

boolean_t
upl_dirty_page(upl_page_info_t *upl, int index)
{
    return UPL_DIRTY_PAGE(upl, index);
}

boolean_t
upl_valid_page(upl_page_info_t *upl, int index)
{
    return UPL_VALID_PAGE(upl, index);
}

ppnum_t
upl_phys_page(upl_page_info_t *upl, int index)
{
    return UPL_PHYS_PAGE(upl, index);
}

void
upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
{
    upl[index].mark = v;
}

boolean_t
upl_page_get_mark(upl_page_info_t *upl, int index)
{
    return upl[index].mark;
}

void
vm_countdirtypages(void)
{
    vm_page_t m;
    int dpages;
    int pgopages;
    int precpages;

    dpages = 0;
    pgopages = 0;
    precpages = 0;

    vm_page_lock_queues();
    m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
    do {
        if (m == (vm_page_t)0) {
            break;
        }

        if (m->vmp_dirty) {
            dpages++;
        }
        if (m->vmp_free_when_done) {
            pgopages++;
        }
        if (m->vmp_precious) {
            precpages++;
        }

        assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
        m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
        if (m == (vm_page_t)0) {
            break;
        }
    } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
    vm_page_unlock_queues();

    vm_page_lock_queues();
    m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
    do {
        if (m == (vm_page_t)0) {
            break;
        }

        dpages++;
        assert(m->vmp_dirty);
        assert(!m->vmp_free_when_done);
        assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
        m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
        if (m == (vm_page_t)0) {
            break;
        }
    } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
    vm_page_unlock_queues();

    vm_page_lock_queues();
    m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
    do {
        if (m == (vm_page_t)0) {
            break;
        }

        if (m->vmp_dirty) {
            dpages++;
        }
        if (m->vmp_free_when_done) {
            pgopages++;
        }
        if (m->vmp_precious) {
            precpages++;
        }

        assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
        m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
        if (m == (vm_page_t)0) {
            break;
        }
    } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
    vm_page_unlock_queues();

    printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);

    dpages = 0;
    pgopages = 0;
    precpages = 0;

    vm_page_lock_queues();
    m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);

    do {
        if (m == (vm_page_t)0) {
            break;
        }
        if (m->vmp_dirty) {
            dpages++;
        }
        if (m->vmp_free_when_done) {
            pgopages++;
        }
        if (m->vmp_precious) {
            precpages++;
        }

        assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
        m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
        if (m == (vm_page_t)0) {
            break;
        }
    } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
    vm_page_unlock_queues();

    printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
}
#endif /* MACH_BSD */


#if CONFIG_IOSCHED
int
upl_get_cached_tier(upl_t upl)
{
    assert(upl);
    if (upl->flags & UPL_TRACKED_BY_OBJECT) {
        return upl->upl_priority;
    }
    return -1;
}
#endif /* CONFIG_IOSCHED */


void
upl_callout_iodone(upl_t upl)
{
    struct upl_io_completion *upl_ctx = upl->upl_iodone;

    if (upl_ctx) {
        void (*iodone_func)(void *, int) = upl_ctx->io_done;

        assert(upl_ctx->io_done);

        (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
    }
}

void
upl_set_iodone(upl_t upl, void *upl_iodone)
{
    upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
}

void
upl_set_iodone_error(upl_t upl, int error)
{
    struct upl_io_completion *upl_ctx = upl->upl_iodone;

    if (upl_ctx) {
        upl_ctx->io_error = error;
    }
}


ppnum_t
upl_get_highest_page(
    upl_t upl)
{
    return upl->highest_page;
}

upl_size_t
upl_get_size(
    upl_t upl)
{
    return upl_adjusted_size(upl, PAGE_MASK);
}

upl_size_t
upl_adjusted_size(
    upl_t upl,
    vm_map_offset_t pgmask)
{
    vm_object_offset_t start_offset, end_offset;

    start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
    end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);

    return (upl_size_t)(end_offset - start_offset);
}

vm_object_offset_t
upl_adjusted_offset(
    upl_t upl,
    vm_map_offset_t pgmask)
{
    return trunc_page_mask_64(upl->u_offset, pgmask);
}

vm_object_offset_t
upl_get_data_offset(
    upl_t upl)
{
    return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
}
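
/*
 * Illustrative sketch (not compiled): how the three accessors above relate
 * for a UPL whose offset/size are not page aligned. With 16K pages
 * (PAGE_MASK == 0x3FFF), a UPL covering [0x4200, 0x4200 + 0x8000) reports:
 *
 *   upl_adjusted_offset(upl, PAGE_MASK) == 0x4000  (offset truncated to a page)
 *   upl_adjusted_size(upl, PAGE_MASK)   == 0xC000  (rounded out to 3 whole pages)
 *   upl_get_data_offset(upl)            == 0x200   (start of data within the first page)
 *
 * upl_geometry_example() is a hypothetical name used only for this sketch.
 */
#if 0
static void
upl_geometry_example(upl_t upl)
{
    vm_object_offset_t page_start = upl_adjusted_offset(upl, PAGE_MASK);
    upl_size_t whole_pages = upl_adjusted_size(upl, PAGE_MASK);
    vm_object_offset_t data_skew = upl_get_data_offset(upl);

    /* the page-aligned start plus the skew recovers the raw offset */
    assert(page_start + data_skew == upl->u_offset);
    /* the rounded size always covers the skewed data completely */
    assert(whole_pages >= upl->u_size + data_skew);
}
#endif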

upl_t
upl_associated_upl(upl_t upl)
{
    return upl->associated_upl;
}

void
upl_set_associated_upl(upl_t upl, upl_t associated_upl)
{
    upl->associated_upl = associated_upl;
}

struct vnode *
upl_lookup_vnode(upl_t upl)
{
    if (!upl->map_object->internal) {
        return vnode_pager_lookup_vnode(upl->map_object->pager);
    } else {
        return NULL;
    }
}

boolean_t
upl_has_wired_pages(upl_t upl)
{
    return (upl->flags & UPL_HAS_WIRED) ? TRUE : FALSE;
}

#if UPL_DEBUG
kern_return_t
upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
{
    upl->ubc_alias1 = alias1;
    upl->ubc_alias2 = alias2;
    return KERN_SUCCESS;
}

int
upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
{
    if (al) {
        *al = upl->ubc_alias1;
    }
    if (al2) {
        *al2 = upl->ubc_alias2;
    }
    return KERN_SUCCESS;
}
#endif /* UPL_DEBUG */

#if VM_PRESSURE_EVENTS
/*
 * Upward trajectory.
 */

boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)
{
    if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
        /* Available pages below our threshold */
        uint32_t available_pages = memorystatus_get_available_page_count();
        if (available_pages < memorystatus_get_soft_memlimit_page_shortage_threshold()) {
#if CONFIG_FREEZE
            /* No frozen processes to kill */
            if (memorystatus_frozen_count == 0) {
                /* Not enough suspended processes available. */
                if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
                    return TRUE;
                }
            }
#else /* CONFIG_FREEZE */
            return TRUE;
#endif /* CONFIG_FREEZE */
        }
        return FALSE;
    } else {
        return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
    }
}

boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)
{
    if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
        /* Available pages below our threshold */
        uint32_t available_pages = memorystatus_get_available_page_count();
        return available_pages < memorystatus_get_critical_page_shortage_threshold();
    } else {
        return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
    }
}

/*
 * Downward trajectory.
 */
boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)
{
    if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
        /* Available pages above our threshold */
        uint32_t available_pages = memorystatus_get_available_page_count();
        uint32_t target_threshold = (115 * memorystatus_get_soft_memlimit_page_shortage_threshold()) / 100;
        return available_pages > target_threshold;
    } else {
        return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
    }
}

boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)
{
    if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
        uint32_t available_pages = memorystatus_get_available_page_count();
        uint32_t target_threshold = (115 * memorystatus_get_critical_page_shortage_threshold()) / 100;
        return available_pages > target_threshold;
    } else {
        return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
    }
}
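
/*
 * Illustrative sketch (not compiled): the four predicates above form a
 * hysteresis band. The upward transitions fire at the raw thresholds,
 * while the downward transitions require extra headroom beyond them (the
 * 115/100, 12/10 and 14/10 scalings), so the pressure level does not flap
 * when availability hovers near a threshold. A caller re-evaluating the
 * level might step it as below; vm_pressure_step() is a hypothetical
 * helper, and the vm_pressure_level_t/kVMPressure* names are assumed from
 * the pressure-level headers used elsewhere in the kernel.
 */
#if 0
static vm_pressure_level_t
vm_pressure_step(vm_pressure_level_t level)
{
    switch (level) {
    case kVMPressureNormal:
        return VM_PRESSURE_NORMAL_TO_WARNING() ? kVMPressureWarning : kVMPressureNormal;
    case kVMPressureWarning:
        if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
            return kVMPressureCritical;
        }
        return VM_PRESSURE_WARNING_TO_NORMAL() ? kVMPressureNormal : kVMPressureWarning;
    case kVMPressureCritical:
        return VM_PRESSURE_CRITICAL_TO_WARNING() ? kVMPressureWarning : kVMPressureCritical;
    default:
        return level;
    }
}
#endif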
#endif /* VM_PRESSURE_EVENTS */

#if DEVELOPMENT || DEBUG
bool compressor_running_perf_test;
uint64_t compressor_perf_test_pages_processed;

static kern_return_t
move_pages_to_queue(
    vm_map_t map,
    user_addr_t start_addr,
    size_t buffer_size,
    vm_page_queue_head_t *queue,
    size_t *pages_moved)
{
    kern_return_t err = KERN_SUCCESS;
    vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
    boolean_t addr_in_map = FALSE;
    user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
    vm_object_t curr_object = VM_OBJECT_NULL;
    *pages_moved = 0;

    if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
        /*
         * We don't currently support benchmarking maps with a different page size
         * than the kernel.
         */
        return KERN_INVALID_ARGUMENT;
    }

    if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
        return KERN_INVALID_ARGUMENT;
    }

    vm_map_lock_read(map);
    curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
    end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));

    while (curr_addr < end_addr) {
        addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
        if (!addr_in_map) {
            err = KERN_INVALID_ARGUMENT;
            break;
        }
        curr_object = VME_OBJECT(curr_entry);
        if (curr_object == VM_OBJECT_NULL) {
            /*
             * No object backs this entry, so there is nothing to move;
             * treat it as an invalid buffer rather than spinning on an
             * address we could never advance past.
             */
            err = KERN_INVALID_ARGUMENT;
            break;
        }
        vm_object_lock(curr_object);
        /* We really only want anonymous memory that's in the top level map and object here. */
        if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
            curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
            err = KERN_INVALID_ARGUMENT;
            vm_object_unlock(curr_object);
            break;
        }
        vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
        vm_map_offset_t end_offset = (MIN(curr_entry->vme_end, end_addr) -
            curr_entry->vme_start) + VME_OFFSET(curr_entry);
        vm_map_offset_t curr_offset = start_offset;
        vm_page_t curr_page;
        while (curr_offset < end_offset) {
            curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
            if (curr_page != VM_PAGE_NULL) {
                vm_page_lock_queues();
                if (curr_page->vmp_laundry) {
                    vm_pageout_steal_laundry(curr_page, TRUE);
                }
                /*
                 * We've already factored out pages in the laundry, which
                 * means this page can't be on the pageout queue, so it's
                 * safe to do the vm_page_queues_remove.
                 */
                bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
                vm_page_queues_remove(curr_page, TRUE);
                if (donate) {
                    /*
                     * The compressor needs to see this bit to know
                     * where this page needs to land. Also if stolen,
                     * this bit helps put the page back in the right
                     * special queue where it belongs.
                     */
                    curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
                }
                /* Clear the referenced bit so we ensure this gets paged out */
                curr_page->vmp_reference = false;
                if (curr_page->vmp_pmapped) {
                    pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
                        VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
                }
                vm_page_queue_enter(queue, curr_page, vmp_pageq);
                vm_page_unlock_queues();
                *pages_moved += 1;
            }
            curr_offset += PAGE_SIZE_64;
            curr_addr += PAGE_SIZE_64;
        }
        vm_object_unlock(curr_object);
    }
    vm_map_unlock_read(map);
    return err;
}

/*
 * Local queue for processing benchmark pages.
 * Can't be allocated on the stack because the pointer has to
 * be packable.
 */
vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;

kern_return_t
run_compressor_perf_test(
    user_addr_t buf,
    size_t buffer_size,
    uint64_t *time,
    uint64_t *bytes_compressed,
    uint64_t *compressor_growth)
{
    kern_return_t err = KERN_SUCCESS;
    if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
        return KERN_NOT_SUPPORTED;
    }
    if (current_task() == kernel_task) {
        return KERN_INVALID_ARGUMENT;
    }
    vm_page_lock_queues();
    if (compressor_running_perf_test) {
        /* Only run one instance of the benchmark at a time. */
        vm_page_unlock_queues();
        return KERN_RESOURCE_SHORTAGE;
    }
    vm_page_unlock_queues();
    size_t page_count = 0;
    vm_map_t map;
    vm_page_t p, next;
    uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
    uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
    *bytes_compressed = *compressor_growth = 0;

    vm_page_queue_init(&compressor_perf_test_queue);
    map = current_task()->map;
    err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
    if (err != KERN_SUCCESS) {
        goto out;
    }

    vm_page_lock_queues();
    compressor_running_perf_test = true;
    compressor_perf_test_pages_processed = 0;
    /*
     * At this point the compressor threads should only process the benchmark
     * queue, so the growth of c_segment_compressed_bytes while the test runs
     * tells us how many compressed bytes we ended up using.
     */
    compressed_bytes_start = os_atomic_load(&c_segment_compressed_bytes, relaxed);
    vm_page_unlock_queues();

    page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);

    vm_page_lock_queues();
    compressor_perf_test_start = mach_absolute_time();

    /* Wake up the compressor thread(s) */
    sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
        pgo_iothread_internal_state[0].pgo_iothread);

    /*
     * Depending on when this test is run we could overshoot or be right on the mark
     * with our page_count. So the comparison is of the _less than_ variety.
     */
    while (compressor_perf_test_pages_processed < page_count) {
        assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
        vm_page_unlock_queues();
        thread_block(THREAD_CONTINUE_NULL);
        vm_page_lock_queues();
    }
    compressor_perf_test_end = mach_absolute_time();
    compressed_bytes_end = os_atomic_load(&c_segment_compressed_bytes, relaxed);
    vm_page_unlock_queues();

out:
    /*
     * If we errored out above, then we could still have some pages
     * on the local queue. Make sure to put them back on the active queue before
     * returning so they're not orphaned.
     */
    vm_page_lock_queues();
    absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
    p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
    while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
        next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);

        vm_page_enqueue_active(p, FALSE);
        p = next;
    }

    compressor_running_perf_test = false;
    vm_page_unlock_queues();
    if (err == KERN_SUCCESS) {
        *bytes_compressed = page_count * PAGE_SIZE_64;
        *compressor_growth = compressed_bytes_end - compressed_bytes_start;
    }

    /*
     * pageout_scan will consider waking the compactor swapper
     * before it blocks. Do the same thing here before we return
     * to ensure that back-to-back benchmark runs can't overly fragment the
     * compressor pool.
     */
    vm_consider_waking_compactor_swapper();
    return err;
}
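
/*
 * Illustrative sketch (not compiled): driving the benchmark above from a
 * sysctl-style handler. The caller supplies a user buffer that has been
 * faulted in; the routine reports wall-clock time, bytes submitted and
 * how much the compressor pool grew, from which a compression ratio and
 * throughput can be derived. compressor_perf_report() is a hypothetical
 * wrapper name.
 */
#if 0
static kern_return_t
compressor_perf_report(user_addr_t buf, size_t len)
{
    uint64_t ns = 0, submitted = 0, growth = 0;
    kern_return_t kr;

    kr = run_compressor_perf_test(buf, len, &ns, &submitted, &growth);
    if (kr == KERN_SUCCESS && ns != 0) {
        printf("compressor perf: %llu bytes in %llu ns (pool grew %llu bytes)\n",
            submitted, ns, growth);
    }
    return kr;
}
#endif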
#endif /* DEVELOPMENT || DEBUG */