1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include "mach/kern_return.h"
67 #include <stdint.h>
68 #include <ptrauth.h>
69
70 #include <debug.h>
71
72 #include <mach/mach_types.h>
73 #include <mach/memory_object.h>
74 #include <mach/mach_host_server.h>
75 #include <mach/upl.h>
76 #include <mach/vm_map.h>
77 #include <mach/vm_param.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/sdt.h>
80
81 #include <kern/kern_types.h>
82 #include <kern/counter.h>
83 #include <kern/host_statistics.h>
84 #include <kern/machine.h>
85 #include <kern/misc_protos.h>
86 #include <kern/sched.h>
87 #include <kern/thread.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/policy_internal.h>
91 #include <kern/thread_group.h>
92
93 #include <os/log.h>
94
95 #include <sys/kdebug_triage.h>
96
97 #include <machine/vm_tuning.h>
98 #include <machine/commpage.h>
99
100 #include <vm/pmap.h>
101 #include <vm/vm_compressor_pager_internal.h>
102 #include <vm/vm_fault_internal.h>
103 #include <vm/vm_map_internal.h>
104 #include <vm/vm_object_internal.h>
105 #include <vm/vm_page_internal.h>
106 #include <vm/vm_pageout_internal.h>
107 #include <vm/vm_protos_internal.h> /* must be last */
108 #include <vm/memory_object.h>
109 #include <vm/vm_purgeable_internal.h>
110 #include <vm/vm_shared_region.h>
111 #include <vm/vm_compressor_internal.h>
112 #include <vm/vm_kern_xnu.h>
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_ubc.h>
115
116 #include <san/kasan.h>
117
118 #if CONFIG_PHANTOM_CACHE
119 #include <vm/vm_phantom_cache_internal.h>
120 #endif
121
122 #if UPL_DEBUG
123 #include <libkern/OSDebug.h>
124 #endif
125
126 extern int cs_debug;
127
128 #if CONFIG_MBUF_MCACHE
129 extern void mbuf_drain(boolean_t);
130 #endif /* CONFIG_MBUF_MCACHE */
131
132 #if VM_PRESSURE_EVENTS
133 #if CONFIG_JETSAM
134 extern unsigned int memorystatus_available_pages;
135 extern unsigned int memorystatus_available_pages_pressure;
136 extern unsigned int memorystatus_available_pages_critical;
137 #else /* CONFIG_JETSAM */
138 extern uint64_t memorystatus_available_pages;
139 extern uint64_t memorystatus_available_pages_pressure;
140 extern uint64_t memorystatus_available_pages_critical;
141 #endif /* CONFIG_JETSAM */
142 #if CONFIG_FREEZE
143 extern unsigned int memorystatus_frozen_count;
144 extern unsigned int memorystatus_suspended_count;
145 #endif /* CONFIG_FREEZE */
146 extern vm_pressure_level_t memorystatus_vm_pressure_level;
147
148 extern lck_mtx_t memorystatus_jetsam_broadcast_lock;
149 extern uint32_t memorystatus_jetsam_fg_band_waiters;
150 extern uint32_t memorystatus_jetsam_bg_band_waiters;
151
152 void vm_pressure_response(void);
153 extern void consider_vm_pressure_events(void);
154
155 #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
156 #endif /* VM_PRESSURE_EVENTS */
157
158 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
159 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
160 #if CONFIG_VPS_DYNAMIC_PRIO
161 TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
162 #else
163 const bool vps_dynamic_priority_enabled = false;
164 #endif
165 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
166
167 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
168 #if !XNU_TARGET_OS_OSX
169 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
170 #else /* !XNU_TARGET_OS_OSX */
171 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
172 #endif /* !XNU_TARGET_OS_OSX */
173 #endif
174
175 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
176 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
177 #endif
178
179 #ifndef VM_PAGE_LAUNDRY_MAX
180 #define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */
181 #endif /* VM_PAGE_LAUNDRY_MAX */
182
183 #ifndef VM_PAGEOUT_BURST_WAIT
184 #define VM_PAGEOUT_BURST_WAIT 1 /* milliseconds */
185 #endif /* VM_PAGEOUT_BURST_WAIT */
186
187 #ifndef VM_PAGEOUT_EMPTY_WAIT
188 #define VM_PAGEOUT_EMPTY_WAIT 50 /* milliseconds */
189 #endif /* VM_PAGEOUT_EMPTY_WAIT */
190
191 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
192 #define VM_PAGEOUT_DEADLOCK_WAIT 100 /* milliseconds */
193 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
194
195 #ifndef VM_PAGEOUT_IDLE_WAIT
196 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
197 #endif /* VM_PAGEOUT_IDLE_WAIT */
198
199 #ifndef VM_PAGEOUT_SWAP_WAIT
200 #define VM_PAGEOUT_SWAP_WAIT 10 /* milliseconds */
201 #endif /* VM_PAGEOUT_SWAP_WAIT */
202
203 /*
204 * vm_page_max_speculative_age_q should be less than or equal to
205 * VM_PAGE_RESERVED_SPECULATIVE_AGE_Q, which is the number of allocated
206 * vm_page_queue_speculative entries.
207 */
208
209 TUNABLE_DEV_WRITEABLE(unsigned int, vm_page_max_speculative_age_q, "vm_page_max_speculative_age_q", VM_PAGE_DEFAULT_MAX_SPECULATIVE_AGE_Q);
210 #ifndef VM_PAGE_SPECULATIVE_TARGET
211 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
212 #endif /* VM_PAGE_SPECULATIVE_TARGET */
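/*
 * Worked example (illustrative arithmetic only, not taken from this file):
 * because the divisor is computed with integer math, the macro effectively
 * rounds the percentage to an integral divisor of 100.  Assuming
 * vm_page_speculative_percentage were 5, the target would be
 * (total) / (100 / 5) == (total) / 20, i.e. 5% of "total"; a value of 3
 * would yield (total) / 33, roughly 3%.
 */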
213
214
215 /*
216 * To obtain a reasonable LRU approximation, the inactive queue
217 * needs to be large enough to give pages on it a chance to be
218 * referenced a second time. This macro defines the fraction
219 * of active+inactive pages that should be inactive.
220 * The pageout daemon uses it to update vm_page_inactive_target.
221 *
222 * If vm_page_free_count falls below vm_page_free_target and
223 * vm_page_inactive_count is below vm_page_inactive_target,
224 * then the pageout daemon starts running.
225 */
226
227 #ifndef VM_PAGE_INACTIVE_TARGET
228 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
229 #endif /* VM_PAGE_INACTIVE_TARGET */
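/*
 * For example (illustrative only): with 1,000,000 active+inactive pages,
 * VM_PAGE_INACTIVE_TARGET() asks that roughly 500,000 of them be kept on
 * the inactive queue.
 */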
230
231 /*
232 * Once the pageout daemon starts running, it keeps going
233 * until vm_page_free_count meets or exceeds vm_page_free_target.
234 */
235
236 #ifndef VM_PAGE_FREE_TARGET
237 #if !XNU_TARGET_OS_OSX
238 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
239 #else /* !XNU_TARGET_OS_OSX */
240 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
241 #endif /* !XNU_TARGET_OS_OSX */
242 #endif /* VM_PAGE_FREE_TARGET */
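/*
 * Worked example (illustrative only): for an argument of 1,000,000 pages,
 * the macro yields 15 + 1,000,000/100 = 10,015 pages on embedded targets
 * and 15 + 1,000,000/80 = 12,515 pages on macOS.
 */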
243
244
245 /*
246 * The pageout daemon always starts running once vm_page_free_count
247 * falls below vm_page_free_min.
248 */
249
250 #ifndef VM_PAGE_FREE_MIN
251 #if !XNU_TARGET_OS_OSX
252 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
253 #else /* !XNU_TARGET_OS_OSX */
254 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
255 #endif /* !XNU_TARGET_OS_OSX */
256 #endif /* VM_PAGE_FREE_MIN */
257
258 #if !XNU_TARGET_OS_OSX
259 #define VM_PAGE_FREE_RESERVED_LIMIT 100
260 #define VM_PAGE_FREE_MIN_LIMIT 1500
261 #define VM_PAGE_FREE_TARGET_LIMIT 2000
262 #else /* !XNU_TARGET_OS_OSX */
263 #define VM_PAGE_FREE_RESERVED_LIMIT 1700
264 #define VM_PAGE_FREE_MIN_LIMIT 3500
265 #define VM_PAGE_FREE_TARGET_LIMIT 4000
266 #endif /* !XNU_TARGET_OS_OSX */
267
268 /*
269 * When vm_page_free_count falls below vm_page_free_reserved,
270 * only vm-privileged threads can allocate pages. vm-privilege
271 * allows the pageout daemon and default pager (and any other
272 * associated threads needed for default pageout) to continue
273 * operation by dipping into the reserved pool of pages.
274 */
275
276 #ifndef VM_PAGE_FREE_RESERVED
277 #define VM_PAGE_FREE_RESERVED(n) \
278 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
279 #endif /* VM_PAGE_FREE_RESERVED */
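/*
 * Worked example (illustrative only): with VM_PAGE_LAUNDRY_MAX at its
 * default of 128, VM_PAGE_FREE_RESERVED(n) evaluates to
 * 6 * 128 + n = 768 + n reserved pages.
 */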
280
281 /*
282 * When we dequeue pages from the inactive list, they are
283 * reactivated (ie, put back on the active queue) if referenced.
284 * However, it is possible to starve the free list if other
285 * processors are referencing pages faster than we can turn off
286 * the referenced bit. So we limit the number of reactivations
287 * we will make per call of vm_pageout_scan().
288 */
289 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
290
291 #ifndef VM_PAGE_REACTIVATE_LIMIT
292 #if !XNU_TARGET_OS_OSX
293 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
294 #else /* !XNU_TARGET_OS_OSX */
295 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20, VM_PAGE_REACTIVATE_LIMIT_MAX))
296 #endif /* !XNU_TARGET_OS_OSX */
297 #endif /* VM_PAGE_REACTIVATE_LIMIT */
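/*
 * Worked example (illustrative only): for 1,000,000 available pages the
 * per-scan reactivation limit is VM_PAGE_INACTIVE_TARGET(avail) / 2 =
 * 250,000 on embedded targets, and MAX(1,000,000 / 20, 20000) = 50,000
 * on macOS.
 */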
298 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
299
300 int vm_pageout_protect_realtime = true;
301
302 extern boolean_t hibernate_cleaning_in_progress;
303
304 struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
305 struct pgo_iothread_state pgo_iothread_external_state;
306
307 #if VM_PRESSURE_EVENTS
308 void vm_pressure_thread(void);
309
310 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
311 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
312
313 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
314 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
315 #endif
316
317 static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
318 static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
319 static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);
320
321 extern void vm_pageout_continue(void);
322 extern void vm_pageout_scan(void);
323
324 boolean_t vm_pageout_running = FALSE;
325
326 uint32_t vm_page_upl_tainted = 0;
327 uint32_t vm_page_iopl_tainted = 0;
328
329 #if XNU_TARGET_OS_OSX
330 static boolean_t vm_pageout_waiter = FALSE;
331 #endif /* XNU_TARGET_OS_OSX */
332
333
334 #if DEVELOPMENT || DEBUG
335 struct vm_pageout_debug vm_pageout_debug;
336 #endif
337 struct vm_pageout_vminfo vm_pageout_vminfo;
338 struct vm_pageout_state vm_pageout_state;
339 struct vm_config vm_config;
340
341 struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
342 struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
343 #if DEVELOPMENT || DEBUG
344 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
345 #endif /* DEVELOPMENT || DEBUG */
346
347 int vm_upl_wait_for_pages = 0;
348 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
349
350 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
351
352 int vm_debug_events = 0;
353
354 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
355
356 #if CONFIG_MEMORYSTATUS
357 extern void memorystatus_kill_on_vps_starvation(void);
358
359 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
360 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
361
362 #endif
363
364 #if __AMP__
365
366
367 /*
368 * Bind compressor threads to e-cores unless there are multiple non-e clusters
369 */
370 #if (MAX_CPU_CLUSTERS > 2)
371 #define VM_COMPRESSOR_EBOUND_DEFAULT false
372 #elif defined(XNU_TARGET_OS_XR)
373 #define VM_COMPRESSOR_EBOUND_DEFAULT false
374 #else
375 #define VM_COMPRESSOR_EBOUND_DEFAULT true
376 #endif
377
378 TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
379 int vm_pgo_pbound = 0;
380 extern void thread_bind_cluster_type(thread_t, char, bool);
381
382 #endif /* __AMP__ */
383
384
385 /*
386 * Routine: vm_pageout_object_terminate
387 * Purpose:
388 * Destroy the pageout_object, and perform all of the
389 * required cleanup actions.
390 *
391 * In/Out conditions:
392 * The object must be locked, and will be returned locked.
393 */
394 void
395 vm_pageout_object_terminate(
396 vm_object_t object)
397 {
398 vm_object_t shadow_object;
399
400 /*
401 * Deal with the deallocation (last reference) of a pageout object
402 * (used for cleaning-in-place) by dropping the paging references/
403 * freeing pages in the original object.
404 */
405
406 assert(object->pageout);
407 shadow_object = object->shadow;
408 vm_object_lock(shadow_object);
409
410 while (!vm_page_queue_empty(&object->memq)) {
411 vm_page_t p, m;
412 vm_object_offset_t offset;
413
414 p = (vm_page_t) vm_page_queue_first(&object->memq);
415
416 assert(p->vmp_private);
417 assert(p->vmp_free_when_done);
418 p->vmp_free_when_done = FALSE;
419 assert(!p->vmp_cleaning);
420 assert(!p->vmp_laundry);
421
422 offset = p->vmp_offset;
423 VM_PAGE_FREE(p);
424 p = VM_PAGE_NULL;
425
426 m = vm_page_lookup(shadow_object,
427 offset + object->vo_shadow_offset);
428
429 if (m == VM_PAGE_NULL) {
430 continue;
431 }
432
433 assert((m->vmp_dirty) || (m->vmp_precious) ||
434 (m->vmp_busy && m->vmp_cleaning));
435
436 /*
437 * Handle the trusted pager throttle.
438 * Also decrement the burst throttle (if external).
439 */
440 vm_page_lock_queues();
441 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
442 vm_pageout_throttle_up(m);
443 }
444
445 /*
446 * Handle the "target" page(s). These pages are to be freed if
447 * successfully cleaned. Target pages are always busy, and are
448 * wired exactly once. The initial target pages are not mapped,
449 * (so cannot be referenced or modified) but converted target
450 * pages may have been modified between the selection as an
451 * adjacent page and conversion to a target.
452 */
453 if (m->vmp_free_when_done) {
454 assert(m->vmp_busy);
455 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
456 assert(m->vmp_wire_count == 1);
457 m->vmp_cleaning = FALSE;
458 m->vmp_free_when_done = FALSE;
459 /*
460 * Revoke all access to the page. Since the object is
461 * locked, and the page is busy, this prevents the page
462 * from being dirtied after the pmap_disconnect() call
463 * returns.
464 *
465 * Since the page is left "dirty" but "not modified", we
466 * can detect whether the page was redirtied during
467 * pageout by checking the modify state.
468 */
469 if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
470 SET_PAGE_DIRTY(m, FALSE);
471 } else {
472 m->vmp_dirty = FALSE;
473 }
474
475 if (m->vmp_dirty) {
476 vm_page_unwire(m, TRUE); /* reactivates */
477 counter_inc(&vm_statistics_reactivations);
478 vm_page_wakeup_done(object, m);
479 } else {
480 vm_page_free(m); /* clears busy, etc. */
481 }
482 vm_page_unlock_queues();
483 continue;
484 }
485 /*
486 * Handle the "adjacent" pages. These pages were cleaned in
487 * place, and should be left alone.
488 * If prep_pin_count is nonzero, then someone is using the
489 * page, so make it active.
490 */
491 if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
492 if (m->vmp_reference) {
493 vm_page_activate(m);
494 } else {
495 vm_page_deactivate(m);
496 }
497 }
498 if (m->vmp_overwriting) {
499 /*
500 * the (COPY_OUT_FROM == FALSE) request_page_list case
501 */
502 if (m->vmp_busy) {
503 /*
504 * We do not re-set m->vmp_dirty !
505 * The page was busy so no extraneous activity
506 * could have occurred. COPY_INTO is a read into the
507 * new pages. CLEAN_IN_PLACE does actually write
508 * out the pages but handling outside of this code
509 * will take care of resetting dirty. We clear the
510 * modify however for the Programmed I/O case.
511 */
512 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
513
514 m->vmp_busy = FALSE;
515 m->vmp_absent = FALSE;
516 } else {
517 /*
518 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
519 * Occurs when the original page was wired
520 * at the time of the list request
521 */
522 assert(VM_PAGE_WIRED(m));
523 vm_page_unwire(m, TRUE); /* reactivates */
524 }
525 m->vmp_overwriting = FALSE;
526 } else {
527 m->vmp_dirty = FALSE;
528 }
529 m->vmp_cleaning = FALSE;
530
531 /*
532 * Wakeup any thread waiting for the page to be un-cleaning.
533 */
534 vm_page_wakeup(object, m);
535 vm_page_unlock_queues();
536 }
537 /*
538 * Account for the paging reference taken in vm_paging_object_allocate.
539 */
540 vm_object_activity_end(shadow_object);
541 vm_object_unlock(shadow_object);
542
543 assert(object->ref_count == 0);
544 assert(object->paging_in_progress == 0);
545 assert(object->activity_in_progress == 0);
546 assert(object->resident_page_count == 0);
547 return;
548 }
549
550 /*
551 * Routine: vm_pageclean_setup
552 *
553 * Purpose: setup a page to be cleaned (made non-dirty), but not
554 * necessarily flushed from the VM page cache.
555 * This is accomplished by cleaning in place.
556 *
557 * The page must not be busy, and new_object
558 * must be locked.
559 *
560 */
561 static void
562 vm_pageclean_setup(
563 vm_page_t m,
564 vm_page_t new_m,
565 vm_object_t new_object,
566 vm_object_offset_t new_offset)
567 {
568 assert(!m->vmp_busy);
569 #if 0
570 assert(!m->vmp_cleaning);
571 #endif
572
573 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
574
575 /*
576 * Mark original page as cleaning in place.
577 */
578 m->vmp_cleaning = TRUE;
579 SET_PAGE_DIRTY(m, FALSE);
580 m->vmp_precious = FALSE;
581
582 /*
583 * Convert the fictitious page to a private shadow of
584 * the real page.
585 */
586 assert(new_m->vmp_fictitious);
587 assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
588 new_m->vmp_fictitious = FALSE;
589 new_m->vmp_private = TRUE;
590 new_m->vmp_free_when_done = TRUE;
591 VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
592
593 vm_page_lockspin_queues();
594 vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
595 vm_page_unlock_queues();
596
597 vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
598 assert(!new_m->vmp_wanted);
599 new_m->vmp_busy = FALSE;
600 }
601
602 /*
603 * Routine: vm_pageout_initialize_page
604 * Purpose:
605 * Causes the specified page to be initialized in
606 * the appropriate memory object. This routine is used to push
607 * pages into a copy-object when they are modified in the
608 * permanent object.
609 *
610 * The page is moved to a temporary object and paged out.
611 *
612 * In/out conditions:
613 * The page in question must not be on any pageout queues.
614 * The object to which it belongs must be locked.
615 * The page must be busy, but not hold a paging reference.
616 *
617 * Implementation:
618 * Move this page to a completely new object.
619 */
620 void
621 vm_pageout_initialize_page(
622 vm_page_t m)
623 {
624 vm_object_t object;
625 vm_object_offset_t paging_offset;
626 memory_object_t pager;
627
628 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
629
630 object = VM_PAGE_OBJECT(m);
631
632 assert(m->vmp_busy);
633 assert(object->internal);
634
635 /*
636 * Verify that we really want to clean this page
637 */
638 assert(!m->vmp_absent);
639 assert(m->vmp_dirty);
640
641 /*
642 * Create a paging reference to let us play with the object.
643 */
644 paging_offset = m->vmp_offset + object->paging_offset;
645
646 if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
647 panic("reservation without pageout?"); /* alan */
648
649 VM_PAGE_FREE(m);
650 vm_object_unlock(object);
651
652 return;
653 }
654
655 /*
656 * If there's no pager, then we can't clean the page. This should
657 * never happen since this should be a copy object and therefore not
658 * an external object, so the pager should always be there.
659 */
660
661 pager = object->pager;
662
663 if (pager == MEMORY_OBJECT_NULL) {
664 panic("missing pager for copy object");
665
666 VM_PAGE_FREE(m);
667 return;
668 }
669
670 /*
671 * set the page for future call to vm_fault_list_request
672 */
673 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
674 SET_PAGE_DIRTY(m, FALSE);
675
676 /*
677 * keep the object from collapsing or terminating
678 */
679 vm_object_paging_begin(object);
680 vm_object_unlock(object);
681
682 /*
683 * Write the data to its pager.
684 * Note that the data is passed by naming the new object,
685 * not a virtual address; the pager interface has been
686 * manipulated to use the "internal memory" data type.
687 * [The object reference from its allocation is donated
688 * to the eventual recipient.]
689 */
690 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
691
692 vm_object_lock(object);
693 vm_object_paging_end(object);
694 }
695
696
697 /*
698 * vm_pageout_cluster:
699 *
700 * Given a page, queue it to the appropriate I/O thread,
701 * which will page it out and attempt to clean adjacent pages
702 * in the same operation.
703 *
704 * The object and queues must be locked. We will take a
705 * paging reference to prevent deallocation or collapse when we
706 * release the object lock back at the call site. The I/O thread
707 * is responsible for consuming this reference.
708 *
709 * The page must not be on any pageout queue.
710 */
711 #if DEVELOPMENT || DEBUG
712 vmct_stats_t vmct_stats;
713
714 int32_t vmct_active = 0;
715 uint64_t vm_compressor_epoch_start = 0;
716 uint64_t vm_compressor_epoch_stop = 0;
717
718 typedef enum vmct_state_t {
719 VMCT_IDLE,
720 VMCT_AWAKENED,
721 VMCT_ACTIVE,
722 } vmct_state_t;
723 vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
724 #endif
725
726
727
728 static void
729 vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
730 {
731 vm_object_t object = VM_PAGE_OBJECT(m);
732
733 VM_PAGE_CHECK(m);
734 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
735 vm_object_lock_assert_exclusive(object);
736
737 /*
738 * Make sure it's OK to page this out.
739 */
740 assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
741 assert(!m->vmp_cleaning && !m->vmp_laundry);
742 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
743
744 /*
745 * protect the object from collapse or termination
746 */
747 vm_object_activity_begin(object);
748
749
750 /*
751 * pgo_laundry count is tied to the laundry bit
752 */
753 m->vmp_laundry = TRUE;
754 q->pgo_laundry++;
755
756 m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
757 vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
758
759 if (object->internal == TRUE) {
760 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
761 m->vmp_busy = TRUE;
762 #if DEVELOPMENT || DEBUG
763 /*
764 * The benchmark queue will be woken up independently by the benchmark
765 * itself.
766 */
767 if (q != &vm_pageout_queue_benchmark) {
768 #else /* DEVELOPMENT || DEBUG */
769 if (true) {
770 #endif /* DEVELOPMENT || DEBUG */
771 /*
772 * Wake up the first compressor thread. It will wake subsequent
773 * threads if necessary.
774 */
775 sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
776 pgo_iothread_internal_state[0].pgo_iothread);
777 }
778 } else {
779 sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
780 }
781 VM_PAGE_CHECK(m);
782 }
783
784 void
785 vm_pageout_cluster(vm_page_t m)
786 {
787 struct vm_pageout_queue *q;
788 vm_object_t object = VM_PAGE_OBJECT(m);
789 if (object->internal) {
790 q = &vm_pageout_queue_internal;
791 } else {
792 q = &vm_pageout_queue_external;
793 }
794 vm_pageout_cluster_to_queue(m, q);
795 }
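/*
 * Illustrative call sequence (a sketch, not a verbatim caller from this
 * file): per the contract above, the caller holds the page queues lock and
 * the page's object lock (exclusive), and the page is not yet on any
 * pageout queue:
 *
 *	vm_object_lock(object);
 *	vm_page_lock_queues();
 *	...
 *	vm_pageout_cluster(m);
 *	...
 *	vm_page_unlock_queues();
 *	vm_object_unlock(object);
 *
 * vm_pageout_cluster() takes a paging reference on "object"; the I/O
 * thread that services the queue is responsible for consuming it.
 */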
796
797
798 /*
799 * A page is back from laundry or we are stealing it back from
800 * the laundering state. See if there are some pages waiting to
801 * go to laundry and if we can let some of them go now.
802 *
803 * Object and page queues must be locked.
804 */
805 void
806 vm_pageout_throttle_up(
807 vm_page_t m)
808 {
809 struct vm_pageout_queue *q;
810 vm_object_t m_object;
811
812 m_object = VM_PAGE_OBJECT(m);
813
814 assert(m_object != VM_OBJECT_NULL);
815 assert(!is_kernel_object(m_object));
816
817 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
818 vm_object_lock_assert_exclusive(m_object);
819
820 if (m_object->internal == TRUE) {
821 q = &vm_pageout_queue_internal;
822 } else {
823 q = &vm_pageout_queue_external;
824 }
825
826 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
827 vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
828 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
829
830 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
831
832 vm_object_activity_end(m_object);
833
834 VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
835 }
836 if (m->vmp_laundry == TRUE) {
837 m->vmp_laundry = FALSE;
838 q->pgo_laundry--;
839
840 if (q->pgo_throttled == TRUE) {
841 q->pgo_throttled = FALSE;
842 thread_wakeup((event_t) &q->pgo_laundry);
843 }
844 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
845 q->pgo_draining = FALSE;
846 thread_wakeup((event_t) (&q->pgo_laundry + 1));
847 }
848 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
849 }
850 }
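/*
 * Note on the wakeup events used above (and in vm_pageout_throttle_up_batch()
 * below): threads throttled on a full laundry queue block on &q->pgo_laundry,
 * while threads draining the queue block on &q->pgo_laundry + 1, so the two
 * conditions are signalled independently.
 */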
851
852
853 static void
854 vm_pageout_throttle_up_batch(
855 struct vm_pageout_queue *q,
856 int batch_cnt)
857 {
858 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
859
860 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
861
862 q->pgo_laundry -= batch_cnt;
863
864 if (q->pgo_throttled == TRUE) {
865 q->pgo_throttled = FALSE;
866 thread_wakeup((event_t) &q->pgo_laundry);
867 }
868 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
869 q->pgo_draining = FALSE;
870 thread_wakeup((event_t) (&q->pgo_laundry + 1));
871 }
872 }
873
874
875
876 /*
877 * VM memory pressure monitoring.
878 *
879 * vm_pageout_scan() keeps track of the number of pages it considers and
880 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
881 *
882 * compute_memory_pressure() is called every second from compute_averages()
883 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
884 * of reclaimed pages in a new vm_pageout_stat[] bucket.
885 *
886 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
887 * The caller provides the number of seconds ("nsecs") of statistics it wants,
888 * bounded by the vm_pageout_stats[] depth (30s on DEVELOPMENT/DEBUG kernels, 1s otherwise).
889 * It computes the number of pages reclaimed in the past "nsecs" seconds and
890 * also returns the number of pages the system still needs to reclaim at this
891 * moment in time.
892 */
893 #if DEVELOPMENT || DEBUG
894 #define VM_PAGEOUT_STAT_SIZE ((30 * 8) + 1)
895 #else
896 #define VM_PAGEOUT_STAT_SIZE ((1 * 8) + 1)
897 #endif
898 struct vm_pageout_stat {
899 unsigned long vm_page_active_count;
900 unsigned long vm_page_speculative_count;
901 unsigned long vm_page_inactive_count;
902 unsigned long vm_page_anonymous_count;
903
904 unsigned long vm_page_free_count;
905 unsigned long vm_page_wire_count;
906 unsigned long vm_page_compressor_count;
907
908 unsigned long vm_page_pages_compressed;
909 unsigned long vm_page_pageable_internal_count;
910 unsigned long vm_page_pageable_external_count;
911 unsigned long vm_page_xpmapped_external_count;
912
913 unsigned int pages_grabbed;
914 unsigned int pages_freed;
915
916 unsigned int pages_compressed;
917 unsigned int pages_grabbed_by_compressor;
918 unsigned int failed_compressions;
919
920 unsigned int pages_evicted;
921 unsigned int pages_purged;
922
923 unsigned int considered;
924 unsigned int considered_bq_internal;
925 unsigned int considered_bq_external;
926
927 unsigned int skipped_external;
928 unsigned int skipped_internal;
929 unsigned int filecache_min_reactivations;
930
931 unsigned int freed_speculative;
932 unsigned int freed_cleaned;
933 unsigned int freed_internal;
934 unsigned int freed_external;
935
936 unsigned int cleaned_dirty_external;
937 unsigned int cleaned_dirty_internal;
938
939 unsigned int inactive_referenced;
940 unsigned int inactive_nolock;
941 unsigned int reactivation_limit_exceeded;
942 unsigned int forced_inactive_reclaim;
943
944 unsigned int throttled_internal_q;
945 unsigned int throttled_external_q;
946
947 unsigned int phantom_ghosts_found;
948 unsigned int phantom_ghosts_added;
949
950 unsigned int vm_page_realtime_count;
951 unsigned int forcereclaimed_sharedcache;
952 unsigned int forcereclaimed_realtime;
953 unsigned int protected_sharedcache;
954 unsigned int protected_realtime;
955 } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];
956
957 unsigned int vm_pageout_stat_now = 0;
958
959 #define VM_PAGEOUT_STAT_BEFORE(i) \
960 (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
961 #define VM_PAGEOUT_STAT_AFTER(i) \
962 (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
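/*
 * The two macros above implement wrap-around indexing for the circular
 * vm_pageout_stats[] buffer.  For example (illustrative only), with the
 * release-kernel size of 9 buckets, VM_PAGEOUT_STAT_BEFORE(0) == 8 and
 * VM_PAGEOUT_STAT_AFTER(8) == 0; every other index simply moves by one.
 */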
963
964 #if VM_PAGE_BUCKETS_CHECK
965 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
966 #endif /* VM_PAGE_BUCKETS_CHECK */
967
968
969 void
970 record_memory_pressure(void);
971 void
972 record_memory_pressure(void)
973 {
974 unsigned int vm_pageout_next;
975
976 #if VM_PAGE_BUCKETS_CHECK
977 /* check the consistency of VM page buckets at regular interval */
978 static int counter = 0;
979 if ((++counter % vm_page_buckets_check_interval) == 0) {
980 vm_page_buckets_check();
981 }
982 #endif /* VM_PAGE_BUCKETS_CHECK */
983
984 vm_pageout_state.vm_memory_pressure =
985 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
986 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
987 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
988 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
989
990 commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure);
991
992 /* move "now" forward */
993 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
994
995 bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
996
997 vm_pageout_stat_now = vm_pageout_next;
998 }
999
1000
1001 /*
1002 * IMPORTANT
1003 * mach_vm_ctl_page_free_wanted() is called indirectly, via
1004 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
1005 * it must be safe in the restricted stackshot context. Locks and/or
1006 * blocking are not allowable.
1007 */
1008 unsigned int
1009 mach_vm_ctl_page_free_wanted(void)
1010 {
1011 unsigned int page_free_target, page_free_count, page_free_wanted;
1012
1013 page_free_target = vm_page_free_target;
1014 page_free_count = vm_page_free_count;
1015 if (page_free_target > page_free_count) {
1016 page_free_wanted = page_free_target - page_free_count;
1017 } else {
1018 page_free_wanted = 0;
1019 }
1020
1021 return page_free_wanted;
1022 }
1023
1024
1025 /*
1026 * IMPORTANT:
1027 * mach_vm_pressure_monitor() is called when taking a stackshot, with
1028 * wait_for_pressure FALSE, so that code path must remain safe in the
1029 * restricted stackshot context. No blocking or locks are allowable
1030 * on that code path.
1031 */
1032
1033 kern_return_t
1034 mach_vm_pressure_monitor(
1035 boolean_t wait_for_pressure,
1036 unsigned int nsecs_monitored,
1037 unsigned int *pages_reclaimed_p,
1038 unsigned int *pages_wanted_p)
1039 {
1040 wait_result_t wr;
1041 unsigned int vm_pageout_then, vm_pageout_now;
1042 unsigned int pages_reclaimed;
1043 unsigned int units_of_monitor;
1044
1045 units_of_monitor = 8 * nsecs_monitored;
1046 /*
1047 * We don't take the vm_page_queue_lock here because we don't want
1048 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1049 * thread when it's trying to reclaim memory. We don't need fully
1050 * accurate monitoring anyway...
1051 */
1052
1053 if (wait_for_pressure) {
1054 /* wait until there's memory pressure */
1055 while (vm_page_free_count >= vm_page_free_target) {
1056 wr = assert_wait((event_t) &vm_page_free_wanted,
1057 THREAD_INTERRUPTIBLE);
1058 if (wr == THREAD_WAITING) {
1059 wr = thread_block(THREAD_CONTINUE_NULL);
1060 }
1061 if (wr == THREAD_INTERRUPTED) {
1062 return KERN_ABORTED;
1063 }
1064 if (wr == THREAD_AWAKENED) {
1065 /*
1066 * The memory pressure might have already
1067 * been relieved but let's not block again
1068 * and let's report that there was memory
1069 * pressure at some point.
1070 */
1071 break;
1072 }
1073 }
1074 }
1075
1076 /* provide the number of pages the system wants to reclaim */
1077 if (pages_wanted_p != NULL) {
1078 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1079 }
1080
1081 if (pages_reclaimed_p == NULL) {
1082 return KERN_SUCCESS;
1083 }
1084
1085 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1086 vm_pageout_now = vm_pageout_stat_now;
1087 pages_reclaimed = 0;
1088 for (vm_pageout_then =
1089 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1090 vm_pageout_then != vm_pageout_now &&
1091 units_of_monitor-- != 0;
1092 vm_pageout_then =
1093 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1094 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1095 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1096 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1097 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1098 }
1099 *pages_reclaimed_p = pages_reclaimed;
1100
1101 return KERN_SUCCESS;
1102 }
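/*
 * Illustrative usage (a sketch, not a caller taken from this file): to
 * sample recent reclaim activity without blocking, pass wait_for_pressure
 * as FALSE; the window actually covered is bounded by the depth of
 * vm_pageout_stats[]:
 *
 *	unsigned int reclaimed = 0, wanted = 0;
 *	kern_return_t kr;
 *
 *	kr = mach_vm_pressure_monitor(FALSE, 10, &reclaimed, &wanted);
 *
 * On success, "reclaimed" is the number of pages freed over the sampled
 * window and "wanted" is how many pages the system still needs to reach
 * vm_page_free_target.
 */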
1103
1104
1105
1106 #if DEVELOPMENT || DEBUG
1107
1108 static void
1109 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1110
1111 /*
1112 * condition variable used to make sure there is
1113 * only a single sweep going on at a time
1114 */
1115 bool vm_pageout_disconnect_all_pages_active = false;
1116
1117 void
1118 vm_pageout_disconnect_all_pages()
1119 {
1120 vm_page_lock_queues();
1121
1122 if (vm_pageout_disconnect_all_pages_active) {
1123 vm_page_unlock_queues();
1124 return;
1125 }
1126 vm_pageout_disconnect_all_pages_active = true;
1127
1128 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled,
1129 vm_page_throttled_count);
1130 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous,
1131 vm_page_anonymous_count);
1132 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive,
1133 (vm_page_inactive_count - vm_page_anonymous_count));
1134 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active,
1135 vm_page_active_count);
1136 #if CONFIG_SECLUDED_MEMORY
1137 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded,
1138 vm_page_secluded_count);
1139 #endif /* CONFIG_SECLUDED_MEMORY */
1140 vm_page_unlock_queues();
1141
1142 vm_pageout_disconnect_all_pages_active = false;
1143 }
1144
1145 /* NB: assumes the page_queues lock is held on entry, returns with page queue lock held */
1146 void
1147 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1148 {
1149 vm_page_t m;
1150 vm_object_t t_object = NULL;
1151 vm_object_t l_object = NULL;
1152 vm_object_t m_object = NULL;
1153 int delayed_unlock = 0;
1154 int try_failed_count = 0;
1155 int disconnected_count = 0;
1156 int paused_count = 0;
1157 int object_locked_count = 0;
1158
1159 KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1160 DBG_FUNC_START),
1161 q, qcount);
1162
1163 while (qcount && !vm_page_queue_empty(q)) {
1164 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1165
1166 m = (vm_page_t) vm_page_queue_first(q);
1167 m_object = VM_PAGE_OBJECT(m);
1168
1169 if (m_object == VM_OBJECT_NULL) {
1170 /*
1171 * Bumped into a free page. This should only happen on the
1172 * secluded queue
1173 */
1174 #if CONFIG_SECLUDED_MEMORY
1175 assert(q == &vm_page_queue_secluded);
1176 #endif /* CONFIG_SECLUDED_MEMORY */
1177 goto reenter_pg_on_q;
1178 }
1179
1180 /*
1181 * check to see if we currently are working
1182 * with the same object... if so, we've
1183 * already got the lock
1184 */
1185 if (m_object != l_object) {
1186 /*
1187 * the object associated with candidate page is
1188 * different from the one we were just working
1189 * with... dump the lock if we still own it
1190 */
1191 if (l_object != NULL) {
1192 vm_object_unlock(l_object);
1193 l_object = NULL;
1194 }
1195 if (m_object != t_object) {
1196 try_failed_count = 0;
1197 }
1198
1199 /*
1200 * Try to lock object; since we've already got the
1201 * page queues lock, we can only 'try' for this one.
1202 * if the 'try' fails, we need to do a mutex_pause
1203 * to allow the owner of the object lock a chance to
1204 * run...
1205 */
1206 if (!vm_object_lock_try_scan(m_object)) {
1207 if (try_failed_count > 20) {
1208 goto reenter_pg_on_q;
1209 }
1210 vm_page_unlock_queues();
1211 mutex_pause(try_failed_count++);
1212 vm_page_lock_queues();
1213 delayed_unlock = 0;
1214
1215 paused_count++;
1216
1217 t_object = m_object;
1218 continue;
1219 }
1220 object_locked_count++;
1221
1222 l_object = m_object;
1223 }
1224 if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry ||
1225 m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) ||
1226 m->vmp_free_when_done) {
1227 /*
1228 * put it back on the head of its queue
1229 */
1230 goto reenter_pg_on_q;
1231 }
1232 if (m->vmp_pmapped == TRUE) {
1233 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1234
1235 disconnected_count++;
1236 }
1237 reenter_pg_on_q:
1238 vm_page_queue_remove(q, m, vmp_pageq);
1239 vm_page_queue_enter(q, m, vmp_pageq);
1240
1241 qcount--;
1242 try_failed_count = 0;
1243
1244 if (delayed_unlock++ > 128) {
1245 if (l_object != NULL) {
1246 vm_object_unlock(l_object);
1247 l_object = NULL;
1248 }
1249 lck_mtx_yield(&vm_page_queue_lock);
1250 delayed_unlock = 0;
1251 }
1252 }
1253 if (l_object != NULL) {
1254 vm_object_unlock(l_object);
1255 l_object = NULL;
1256 }
1257
1258 KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1259 DBG_FUNC_END),
1260 q, disconnected_count, object_locked_count, paused_count);
1261 }
1262
1263 extern const char *proc_best_name(struct proc* proc);
1264
1265 int
1266 vm_toggle_task_selfdonate_pages(task_t task)
1267 {
1268 int state = 0;
1269 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1270 printf("VM Donation mode is OFF on the system\n");
1271 return state;
1272 }
1273 if (task != kernel_task) {
1274 task_lock(task);
1275 if (!task->donates_own_pages) {
1276 printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1277 task->donates_own_pages = true;
1278 state = 1;
1279 } else if (task->donates_own_pages) {
1280 printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1281 task->donates_own_pages = false;
1282 state = 0;
1283 }
1284 task_unlock(task);
1285 }
1286 return state;
1287 }
1288 #endif /* DEVELOPMENT || DEBUG */
1289
1290 void
1291 vm_task_set_selfdonate_pages(task_t task, bool donate)
1292 {
1293 assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
1294 assert(task != kernel_task);
1295
1296 task_lock(task);
1297 task->donates_own_pages = donate;
1298 task_unlock(task);
1299 }
1300
1301
1302
1303 static size_t
1304 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1305
1306 /*
1307 * condition variable used to make sure there is
1308 * only a single sweep going on at a time
1309 */
1310 boolean_t vm_pageout_anonymous_pages_active = FALSE;
1311
1312
1313 kern_return_t
1314 vm_pageout_anonymous_pages()
1315 {
1316 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1317 size_t throttled_pages_moved, anonymous_pages_moved, active_pages_moved;
1318 vm_page_lock_queues();
1319
1320 if (vm_pageout_anonymous_pages_active == TRUE) {
1321 vm_page_unlock_queues();
1322 return KERN_RESOURCE_SHORTAGE;
1323 }
1324 vm_pageout_anonymous_pages_active = TRUE;
1325 vm_page_unlock_queues();
1326
1327 throttled_pages_moved = vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1328 anonymous_pages_moved = vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1329 active_pages_moved = vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1330
1331 os_log(OS_LOG_DEFAULT,
1332 "%s: throttled pages moved: %zu, anonymous pages moved: %zu, active pages moved: %zu",
1333 __func__, throttled_pages_moved, anonymous_pages_moved, active_pages_moved);
1334
1335 if (VM_CONFIG_SWAP_IS_PRESENT) {
1336 vm_consider_swapping();
1337 }
1338
1339 vm_page_lock_queues();
1340 vm_pageout_anonymous_pages_active = FALSE;
1341 vm_page_unlock_queues();
1342 return KERN_SUCCESS;
1343 } else {
1344 return KERN_NOT_SUPPORTED;
1345 }
1346 }
1347
1348
1349 size_t
1350 vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
1351 {
1352 vm_page_t m;
1353 vm_object_t t_object = NULL;
1354 vm_object_t l_object = NULL;
1355 vm_object_t m_object = NULL;
1356 int delayed_unlock = 0;
1357 int try_failed_count = 0;
1358 int refmod_state;
1359 int pmap_options;
1360 struct vm_pageout_queue *iq;
1361 ppnum_t phys_page;
1362 size_t pages_moved = 0;
1363
1364
1365 iq = &vm_pageout_queue_internal;
1366
1367 vm_page_lock_queues();
1368
1369 #if DEVELOPMENT || DEBUG
1370 if (perf_test) {
1371 iq = &vm_pageout_queue_benchmark;
1372 // ensure the benchmark queue isn't throttled
1373 iq->pgo_maxlaundry = (unsigned int) qcount;
1374 }
1375 #endif /* DEVELOPMENT || DEBUG */
1376
1377 while (qcount && !vm_page_queue_empty(q)) {
1378 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1379
1380 if (VM_PAGE_Q_THROTTLED(iq)) {
1381 if (l_object != NULL) {
1382 vm_object_unlock(l_object);
1383 l_object = NULL;
1384 }
1385 iq->pgo_draining = TRUE;
1386
1387 assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1388 vm_page_unlock_queues();
1389
1390 thread_block(THREAD_CONTINUE_NULL);
1391
1392 vm_page_lock_queues();
1393 delayed_unlock = 0;
1394 continue;
1395 }
1396 m = (vm_page_t) vm_page_queue_first(q);
1397 m_object = VM_PAGE_OBJECT(m);
1398
1399 /*
1400 * check to see if we currently are working
1401 * with the same object... if so, we've
1402 * already got the lock
1403 */
1404 if (m_object != l_object) {
1405 if (!m_object->internal) {
1406 goto reenter_pg_on_q;
1407 }
1408
1409 /*
1410 * the object associated with candidate page is
1411 * different from the one we were just working
1412 * with... dump the lock if we still own it
1413 */
1414 if (l_object != NULL) {
1415 vm_object_unlock(l_object);
1416 l_object = NULL;
1417 }
1418 if (m_object != t_object) {
1419 try_failed_count = 0;
1420 }
1421
1422 /*
1423 * Try to lock object; since we've already got the
1424 * page queues lock, we can only 'try' for this one.
1425 * if the 'try' fails, we need to do a mutex_pause
1426 * to allow the owner of the object lock a chance to
1427 * run...
1428 */
1429 if (!vm_object_lock_try_scan(m_object)) {
1430 if (try_failed_count > 20) {
1431 goto reenter_pg_on_q;
1432 }
1433 vm_page_unlock_queues();
1434 mutex_pause(try_failed_count++);
1435 vm_page_lock_queues();
1436 delayed_unlock = 0;
1437
1438 t_object = m_object;
1439 continue;
1440 }
1441 l_object = m_object;
1442 }
1443 if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
1444 /*
1445 * page is not to be cleaned
1446 * put it back on the head of its queue
1447 */
1448 goto reenter_pg_on_q;
1449 }
1450 phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1451
1452 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
1453 refmod_state = pmap_get_refmod(phys_page);
1454
1455 if (refmod_state & VM_MEM_REFERENCED) {
1456 m->vmp_reference = TRUE;
1457 }
1458 if (refmod_state & VM_MEM_MODIFIED) {
1459 SET_PAGE_DIRTY(m, FALSE);
1460 }
1461 }
1462 if (m->vmp_reference == TRUE) {
1463 m->vmp_reference = FALSE;
1464 pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1465 goto reenter_pg_on_q;
1466 }
1467 if (m->vmp_pmapped == TRUE) {
1468 if (m->vmp_dirty || m->vmp_precious) {
1469 pmap_options = PMAP_OPTIONS_COMPRESSOR;
1470 } else {
1471 pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1472 }
1473 refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1474 if (refmod_state & VM_MEM_MODIFIED) {
1475 SET_PAGE_DIRTY(m, FALSE);
1476 }
1477 }
1478
1479 if (!m->vmp_dirty && !m->vmp_precious) {
1480 vm_page_unlock_queues();
1481 VM_PAGE_FREE(m);
1482 vm_page_lock_queues();
1483 delayed_unlock = 0;
1484
1485 goto next_pg;
1486 }
1487 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1488 if (!m_object->pager_initialized) {
1489 vm_page_unlock_queues();
1490
1491 vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1492
1493 if (!m_object->pager_initialized) {
1494 vm_object_compressor_pager_create(m_object);
1495 }
1496
1497 vm_page_lock_queues();
1498 delayed_unlock = 0;
1499 }
1500 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1501 goto reenter_pg_on_q;
1502 }
1503 /*
1504 * vm_object_compressor_pager_create will drop the object lock
1505 * which means 'm' may no longer be valid to use
1506 */
1507 continue;
1508 }
1509
1510 if (!perf_test) {
1511 /*
1512 * we've already factored out pages in the laundry which
1513 * means this page can't be on the pageout queue so it's
1514 * safe to do the vm_page_queues_remove
1515 */
1516 bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
1517 vm_page_queues_remove(m, TRUE);
1518 if (donate) {
1519 /*
1520 * The compressor needs to see this bit to know
1521 * where this page needs to land. Also if stolen,
1522 * this bit helps put the page back in the right
1523 * special queue where it belongs.
1524 */
1525 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
1526 }
1527 } else {
1528 vm_page_queue_remove(q, m, vmp_pageq);
1529 }
1530
1531 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1532
1533 vm_pageout_cluster_to_queue(m, iq);
1534
1535 pages_moved++;
1536 goto next_pg;
1537
1538 reenter_pg_on_q:
1539 vm_page_queue_remove(q, m, vmp_pageq);
1540 vm_page_queue_enter(q, m, vmp_pageq);
1541 next_pg:
1542 qcount--;
1543 try_failed_count = 0;
1544
1545 if (delayed_unlock++ > 128) {
1546 if (l_object != NULL) {
1547 vm_object_unlock(l_object);
1548 l_object = NULL;
1549 }
1550 lck_mtx_yield(&vm_page_queue_lock);
1551 delayed_unlock = 0;
1552 }
1553 }
1554 if (l_object != NULL) {
1555 vm_object_unlock(l_object);
1556 l_object = NULL;
1557 }
1558 vm_page_unlock_queues();
1559 return pages_moved;
1560 }
1561
1562
1563
1564 /*
1565 * function in BSD to apply I/O throttle to the pageout thread
1566 */
1567 extern void vm_pageout_io_throttle(void);
1568
1569 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \
1570 MACRO_BEGIN \
1571 /* \
1572 * If a "reusable" page somehow made it back into \
1573 * the active queue, it's been re-used and is not \
1574 * quite re-usable. \
1575 * If the VM object was "all_reusable", consider it \
1576 * as "all re-used" instead of converting it to \
1577 * "partially re-used", which could be expensive. \
1578 */ \
1579 assert(VM_PAGE_OBJECT((m)) == (obj)); \
1580 if ((m)->vmp_reusable || \
1581 (obj)->all_reusable) { \
1582 vm_object_reuse_pages((obj), \
1583 (m)->vmp_offset, \
1584 (m)->vmp_offset + PAGE_SIZE_64, \
1585 FALSE); \
1586 } \
1587 MACRO_END
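/*
 * Typical invocation (illustrative sketch): the scan loop is expected to
 * use this with the page queues lock held and "obj" locked, e.g.
 *
 *	VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
 *
 * since vm_object_reuse_pages() operates on the locked object.
 */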
1588
1589
1590 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
1591 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024
1592
1593 #define FCS_IDLE 0
1594 #define FCS_DELAYED 1
1595 #define FCS_DEADLOCK_DETECTED 2
1596
1597 struct flow_control {
1598 int state;
1599 mach_timespec_t ts;
1600 };
1601
1602
1603 uint64_t vm_pageout_rejected_bq_internal = 0;
1604 uint64_t vm_pageout_rejected_bq_external = 0;
1605 uint64_t vm_pageout_skipped_bq_internal = 0;
1606 uint64_t vm_pageout_skipped_bq_external = 0;
1607
1608 #define ANONS_GRABBED_LIMIT 2
1609
1610
1611 #if 0
1612 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1613 #endif
1614 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1615
1616 #define VM_PAGEOUT_PB_NO_ACTION 0
1617 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1618 #define VM_PAGEOUT_PB_THREAD_YIELD 2
1619
1620
1621 #if 0
1622 static void
1623 vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
1624 {
1625 if (*local_freeq) {
1626 vm_page_unlock_queues();
1627
1628 VM_DEBUG_CONSTANT_EVENT(
1629 vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1630 vm_page_free_count, 0, 0, 1);
1631
1632 vm_page_free_list(*local_freeq, TRUE);
1633
1634 VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1635 vm_page_free_count, *local_freed, 0, 1);
1636
1637 *local_freeq = NULL;
1638 *local_freed = 0;
1639
1640 vm_page_lock_queues();
1641 } else {
1642 lck_mtx_yield(&vm_page_queue_lock);
1643 }
1644 *delayed_unlock = 1;
1645 }
1646 #endif
1647
1648
1649 static void
1650 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1651 vm_page_t *local_freeq, int *local_freed, int action)
1652 {
1653 vm_page_unlock_queues();
1654
1655 if (*object != NULL) {
1656 vm_object_unlock(*object);
1657 *object = NULL;
1658 }
1659 if (*local_freeq) {
1660 vm_page_free_list(*local_freeq, TRUE);
1661
1662 *local_freeq = NULL;
1663 *local_freed = 0;
1664 }
1665 *delayed_unlock = 1;
1666
1667 switch (action) {
1668 case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1669 vm_consider_waking_compactor_swapper();
1670 break;
1671 case VM_PAGEOUT_PB_THREAD_YIELD:
1672 thread_yield_internal(1);
1673 break;
1674 case VM_PAGEOUT_PB_NO_ACTION:
1675 default:
1676 break;
1677 }
1678 vm_page_lock_queues();
1679 }
1680
1681
1682 static struct vm_pageout_vminfo last;
1683
1684 uint64_t last_vm_page_pages_grabbed = 0;
1685
1686 extern uint32_t c_segment_pages_compressed;
1687
1688 extern uint64_t shared_region_pager_reclaimed;
1689 extern struct memory_object_pager_ops shared_region_pager_ops;
1690
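/*
 * update_vm_info() snapshots the current queue and counter state into
 * vm_pageout_stats[vm_pageout_stat_now], and converts the monotonically
 * increasing vm_pageout_vminfo counters into per-interval deltas by
 * subtracting the values remembered in "last" on the previous call.
 */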
1691 void
1692 update_vm_info(void)
1693 {
1694 unsigned long tmp;
1695 uint64_t tmp64;
1696
1697 vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1698 vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1699 vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1700 vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1701
1702 vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1703 vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1704 vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1705
1706 vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1707 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1708 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1709 vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1710 vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1711
1712 tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1713 vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1714 last.vm_pageout_considered_page = tmp;
1715
1716 tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1717 vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1718 last.vm_pageout_compressions = tmp64;
1719
1720 tmp = vm_pageout_vminfo.vm_compressor_failed;
1721 vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1722 last.vm_compressor_failed = tmp;
1723
1724 tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1725 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1726 last.vm_compressor_pages_grabbed = tmp64;
1727
1728 tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1729 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1730 last.vm_phantom_cache_found_ghost = tmp;
1731
1732 tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1733 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1734 last.vm_phantom_cache_added_ghost = tmp;
1735
1736 tmp64 = counter_load(&vm_page_grab_count);
1737 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1738 last_vm_page_pages_grabbed = tmp64;
1739
1740 tmp = vm_pageout_vminfo.vm_page_pages_freed;
1741 vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1742 last.vm_page_pages_freed = tmp;
1743
1744 if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1745 tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1746 vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1747 last.vm_pageout_pages_evicted = tmp;
1748
1749 tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1750 vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1751 last.vm_pageout_pages_purged = tmp;
1752
1753 tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1754 vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1755 last.vm_pageout_freed_speculative = tmp;
1756
1757 tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1758 vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1759 last.vm_pageout_freed_external = tmp;
1760
1761 tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1762 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1763 last.vm_pageout_inactive_referenced = tmp;
1764
1765 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1766 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1767 last.vm_pageout_scan_inactive_throttled_external = tmp;
1768
1769 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1770 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1771 last.vm_pageout_inactive_dirty_external = tmp;
1772
1773 tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1774 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1775 last.vm_pageout_freed_cleaned = tmp;
1776
1777 tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1778 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1779 last.vm_pageout_inactive_nolock = tmp;
1780
1781 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1782 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1783 last.vm_pageout_scan_inactive_throttled_internal = tmp;
1784
1785 tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1786 vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1787 last.vm_pageout_skipped_external = tmp;
1788
1789 tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1790 vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1791 last.vm_pageout_skipped_internal = tmp;
1792
1793 tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1794 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1795 last.vm_pageout_reactivation_limit_exceeded = tmp;
1796
1797 tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1798 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1799 last.vm_pageout_inactive_force_reclaim = tmp;
1800
1801 tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1802 vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1803 last.vm_pageout_freed_internal = tmp;
1804
1805 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1806 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1807 last.vm_pageout_considered_bq_internal = tmp;
1808
1809 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1810 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1811 last.vm_pageout_considered_bq_external = tmp;
1812
1813 tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1814 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1815 last.vm_pageout_filecache_min_reactivated = tmp;
1816
1817 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1818 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1819 last.vm_pageout_inactive_dirty_internal = tmp;
1820
1821 tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1822 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1823 last.vm_pageout_forcereclaimed_sharedcache = tmp;
1824
1825 tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1826 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1827 last.vm_pageout_forcereclaimed_realtime = tmp;
1828
1829 tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1830 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1831 last.vm_pageout_protected_sharedcache = tmp;
1832
1833 tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1834 vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1835 last.vm_pageout_protected_realtime = tmp;
1836 }
1837
1838 KDBG((VMDBG_CODE(DBG_VM_INFO1)) | DBG_FUNC_NONE,
1839 vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1840 vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1841 vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1842 vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count);
1843
1844 KDBG((VMDBG_CODE(DBG_VM_INFO2)) | DBG_FUNC_NONE,
1845 vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1846 vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1847 vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count);
1848
1849 KDBG((VMDBG_CODE(DBG_VM_INFO3)) | DBG_FUNC_NONE,
1850 vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1851 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1852 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1853 vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count);
1854
1855 if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1856 vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1857 vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1858 KDBG((VMDBG_CODE(DBG_VM_INFO4)) | DBG_FUNC_NONE,
1859 vm_pageout_stats[vm_pageout_stat_now].considered,
1860 vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1861 vm_pageout_stats[vm_pageout_stat_now].freed_external,
1862 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced);
1863
1864 KDBG((VMDBG_CODE(DBG_VM_INFO5)) | DBG_FUNC_NONE,
1865 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1866 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1867 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1868 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock);
1869
1870 KDBG((VMDBG_CODE(DBG_VM_INFO6)) | DBG_FUNC_NONE,
1871 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1872 vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1873 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1874 vm_pageout_stats[vm_pageout_stat_now].skipped_external);
1875
1876 KDBG((VMDBG_CODE(DBG_VM_INFO7)) | DBG_FUNC_NONE,
1877 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1878 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1879 vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1880 vm_pageout_stats[vm_pageout_stat_now].freed_internal);
1881
1882 KDBG((VMDBG_CODE(DBG_VM_INFO8)) | DBG_FUNC_NONE,
1883 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1884 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1885 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1886 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal);
1887
1888 KDBG((VMDBG_CODE(DBG_VM_INFO10)) | DBG_FUNC_NONE,
1889 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1890 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1891 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1892 vm_pageout_stats[vm_pageout_stat_now].protected_realtime);
1893 }
1894 KDBG((VMDBG_CODE(DBG_VM_INFO9)) | DBG_FUNC_NONE,
1895 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1896 vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1897 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1898 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added);
1899
1900 record_memory_pressure();
1901 }
1902
1903 extern boolean_t hibernation_vmqueues_inspection;
1904
1905 /*
1906 * Return values for functions called by vm_pageout_scan
1907 * that control its flow.
1908 *
1909 * PROCEED -- vm_pageout_scan will keep making forward progress.
1910 * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1911 * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1912 */
1913
1914 #define VM_PAGEOUT_SCAN_PROCEED (0)
1915 #define VM_PAGEOUT_SCAN_DONE_RETURN (1)
1916 #define VM_PAGEOUT_SCAN_NEXT_ITERATION (2)
1917
1918 /*
1919 * This function is called only from vm_pageout_scan and
1920 * it moves overflow secluded pages (one at a time) to the
1921 * batched 'local' free Q or active Q.
1922 */
1923 static void
1924 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1925 {
1926 #if CONFIG_SECLUDED_MEMORY
1927 /*
1928 * Deal with secluded_q overflow.
1929 */
1930 if (vm_page_secluded_count > vm_page_secluded_target) {
1931 vm_page_t secluded_page;
1932
1933 /*
1934 * SECLUDED_AGING_BEFORE_ACTIVE:
1935 * Excess secluded pages go to the active queue and
1936 * will later go to the inactive queue.
1937 */
1938 assert((vm_page_secluded_count_free +
1939 vm_page_secluded_count_inuse) ==
1940 vm_page_secluded_count);
1941 secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1942 assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1943
1944 vm_page_queues_remove(secluded_page, FALSE);
1945 assert(!secluded_page->vmp_fictitious);
1946 assert(!VM_PAGE_WIRED(secluded_page));
1947
1948 if (secluded_page->vmp_object == 0) {
1949 /* transfer to free queue */
1950 assert(secluded_page->vmp_busy);
1951 secluded_page->vmp_snext = *local_freeq;
1952 *local_freeq = secluded_page;
1953 *local_freed += 1;
1954 } else {
1955 /* transfer to head of active queue */
1956 vm_page_enqueue_active(secluded_page, FALSE);
1957 secluded_page = VM_PAGE_NULL;
1958 }
1959 }
1960 #else /* CONFIG_SECLUDED_MEMORY */
1961
1962 #pragma unused(local_freeq)
1963 #pragma unused(local_freed)
1964
1965 return;
1966
1967 #endif /* CONFIG_SECLUDED_MEMORY */
1968 }
1969
1970 /*
1971 * This function is called only from vm_pageout_scan and
1972 * it initializes the loop targets for vm_pageout_scan().
1973 */
1974 static void
1975 vps_init_page_targets(void)
1976 {
1977 /*
1978 * LD TODO: Other page targets should be calculated here too.
1979 */
1980 vm_page_anonymous_min = vm_page_inactive_target / 20;
1981
1982 if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1983 vm_pageout_state.vm_page_speculative_percentage = 50;
1984 } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1985 vm_pageout_state.vm_page_speculative_percentage = 1;
1986 }
1987
1988 vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1989 vm_page_inactive_count);
1990 }
1991
1992 /*
1993 * This function is called only from vm_pageout_scan and
1994 * it purges a single VM object at a time and will either
1995 * make vm_pageout_scan() restart the loop or keep moving forward.
1996 */
1997 static int
1998 vps_purge_object()
1999 {
2000 int force_purge;
2001
2002 assert(available_for_purge >= 0);
2003 force_purge = 0; /* no force-purging */
2004
2005 #if VM_PRESSURE_EVENTS
2006 vm_pressure_level_t pressure_level;
2007
2008 pressure_level = memorystatus_vm_pressure_level;
2009
2010 if (pressure_level > kVMPressureNormal) {
2011 if (pressure_level >= kVMPressureCritical) {
2012 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
2013 } else if (pressure_level >= kVMPressureUrgent) {
2014 force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
2015 } else if (pressure_level >= kVMPressureWarning) {
2016 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
2017 }
2018 }
2019 #endif /* VM_PRESSURE_EVENTS */
2020
2021 if (available_for_purge || force_purge) {
2022 memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
2023
2024 VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
2025 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
2026 VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
2027 VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
2028 memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2029
2030 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2031 }
2032 VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2033 memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2034 }
2035
2036 return VM_PAGEOUT_SCAN_PROCEED;
2037 }
2038
2039 /*
2040 * This function is called only from vm_pageout_scan and
2041 * it will try to age the next speculative Q if the oldest
2042 * one is empty.
2043 */
2044 static int
2045 vps_age_speculative_queue(boolean_t force_speculative_aging)
2046 {
2047 #define DELAY_SPECULATIVE_AGE 1000
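/* number of calls to skip (counted by delay_speculative_age below) before re-checking whether the oldest speculative queue has fully aged */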
2048
2049 /*
2050 * try to pull pages from the aging bins...
2051 * see vm_page_internal.h for an explanation of how
2052 * this mechanism works
2053 */
2054 boolean_t can_steal = FALSE;
2055 int num_scanned_queues;
2056 static int delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop. */
2057 mach_timespec_t ts;
2058 struct vm_speculative_age_q *aq;
2059 struct vm_speculative_age_q *sq;
2060
2061 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2062
2063 aq = &vm_page_queue_speculative[speculative_steal_index];
2064
2065 num_scanned_queues = 0;
2066 while (vm_page_queue_empty(&aq->age_q) &&
2067 num_scanned_queues++ != vm_page_max_speculative_age_q) {
2068 speculative_steal_index++;
2069
2070 if (speculative_steal_index > vm_page_max_speculative_age_q) {
2071 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2072 }
2073
2074 aq = &vm_page_queue_speculative[speculative_steal_index];
2075 }
2076
2077 if (num_scanned_queues == vm_page_max_speculative_age_q + 1) {
2078 /*
2079 * XXX We've scanned all the speculative
2080 * queues but still haven't found one
2081 * that is not empty, even though
2082 * vm_page_speculative_count is not 0.
2083 */
2084 if (!vm_page_queue_empty(&sq->age_q)) {
2085 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2086 }
2087 #if DEVELOPMENT || DEBUG
2088 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2089 #endif
2090 /* readjust... */
2091 vm_page_speculative_count = 0;
2092 /* ... and continue */
2093 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2094 }
2095
2096 if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
2097 can_steal = TRUE;
2098 } else {
2099 if (!delay_speculative_age) {
2100 mach_timespec_t ts_fully_aged;
2101
2102 ts_fully_aged.tv_sec = (vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
2103 ts_fully_aged.tv_nsec = ((vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
2104 * 1000 * NSEC_PER_USEC;
2105
2106 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2107
2108 clock_sec_t sec;
2109 clock_nsec_t nsec;
2110 clock_get_system_nanotime(&sec, &nsec);
2111 ts.tv_sec = (unsigned int) sec;
2112 ts.tv_nsec = nsec;
2113
2114 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
2115 can_steal = TRUE;
2116 } else {
2117 delay_speculative_age++;
2118 }
2119 } else {
2120 delay_speculative_age++;
2121 if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
2122 delay_speculative_age = 0;
2123 }
2124 }
2125 }
2126 if (can_steal == TRUE) {
2127 vm_page_speculate_ageit(aq);
2128 }
2129
2130 return VM_PAGEOUT_SCAN_PROCEED;
2131 }
2132
2133 /*
2134 * This function is called only from vm_pageout_scan and
2135 * it evicts a single VM object from the cache.
2136 */
2137 static inline int
2138 vps_object_cache_evict(vm_object_t *object_to_unlock)
2139 {
2140 static int cache_evict_throttle = 0;
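/* when an eviction attempt frees nothing, cache_evict_throttle is set to 1000 and decremented once per call, so eviction is skipped for the next 1000 passes */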
2141 struct vm_speculative_age_q *sq;
2142
2143 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2144
2145 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2146 int pages_evicted;
2147
2148 if (*object_to_unlock != NULL) {
2149 vm_object_unlock(*object_to_unlock);
2150 *object_to_unlock = NULL;
2151 }
2152 KDBG(0x13001ec | DBG_FUNC_START);
2153
2154 pages_evicted = vm_object_cache_evict(100, 10);
2155
2156 KDBG(0x13001ec | DBG_FUNC_END, pages_evicted);
2157
2158 if (pages_evicted) {
2159 vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2160
2161 VM_DEBUG_EVENT(vm_pageout_cache_evict, DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2162 vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2163 memoryshot(DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2164
2165 /*
2166 * we just freed up to 100 pages,
2167 * so go back to the top of the main loop
2168 * and re-evaluate the memory situation
2169 */
2170 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2171 } else {
2172 cache_evict_throttle = 1000;
2173 }
2174 }
2175 if (cache_evict_throttle) {
2176 cache_evict_throttle--;
2177 }
2178
2179 return VM_PAGEOUT_SCAN_PROCEED;
2180 }
2181
2182
2183 /*
2184 * This function is called only from vm_pageout_scan and
2185 * it calculates the filecache minimum that needs to be maintained
2186 * as we start to steal pages.
2187 */
2188 static void
2189 vps_calculate_filecache_min(void)
2190 {
2191 int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
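/* the floor works out to (10 / divisor) of AVAILABLE_NON_COMPRESSED_MEMORY; a divisor of 0 disables it, as does a critically low free page count (checked at the end) */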
2192
2193 #if CONFIG_JETSAM
2194 /*
2195 * don't let the filecache_min fall below 15% of available memory
2196 * on systems with an active compressor that isn't nearing its
2197 * limits w/r to accepting new data
2198 *
2199 * on systems w/o the compressor/swapper, the filecache is always
2200 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2201 * since most (if not all) of the anonymous pages are in the
2202 * throttled queue (which isn't counted as available) which
2203 * effectively disables this filter
2204 */
2205 if (vm_compressor_low_on_space() || divisor == 0) {
2206 vm_pageout_state.vm_page_filecache_min = 0;
2207 } else {
2208 vm_pageout_state.vm_page_filecache_min =
2209 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2210 }
2211 #else
2212 if (vm_compressor_out_of_space() || divisor == 0) {
2213 vm_pageout_state.vm_page_filecache_min = 0;
2214 } else {
2215 /*
2216 * don't let the filecache_min fall below the specified critical level
2217 */
2218 vm_pageout_state.vm_page_filecache_min =
2219 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2220 }
2221 #endif
2222 if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2223 vm_pageout_state.vm_page_filecache_min = 0;
2224 }
2225 }
2226
2227 /*
2228 * This function is called only from vm_pageout_scan and
2229 * it updates the flow control time to detect if the VM pageout scan
2230 * isn't making progress.
2231 */
2232 static void
2233 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2234 {
2235 mach_timespec_t ts;
2236 clock_sec_t sec;
2237 clock_nsec_t nsec;
2238
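/* vm_pageout_deadlock_wait is expressed in milliseconds; convert it to a mach_timespec_t and add it to the current time to form the deadline */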
2239 ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2240 ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2241 clock_get_system_nanotime(&sec, &nsec);
2242 flow_control->ts.tv_sec = (unsigned int) sec;
2243 flow_control->ts.tv_nsec = nsec;
2244 ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2245
2246 flow_control->state = FCS_DELAYED;
2247
2248 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2249 }
2250
2251 /*
2252 * This function is called only from vm_pageout_scan and
2253 * it implements the flow control logic of the VM pageout scan, which
2254 * controls whether it should block and for how long.
2255 * Any blocking of vm_pageout_scan happens ONLY in this function.
2256 */
2257 static int
2258 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2259 vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2260 {
2261 boolean_t exceeded_burst_throttle = FALSE;
2262 unsigned int msecs = 0;
2263 uint32_t inactive_external_count;
2264 mach_timespec_t ts;
2265 struct vm_pageout_queue *iq;
2266 struct vm_pageout_queue *eq;
2267 struct vm_speculative_age_q *sq;
2268
2269 iq = &vm_pageout_queue_internal;
2270 eq = &vm_pageout_queue_external;
2271 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2272
2273 /*
2274 * Sometimes we have to pause:
2275 * 1) No inactive pages - nothing to do.
2276 * 2) Loop control - no acceptable pages found on the inactive queue
2277 * within the last vm_pageout_burst_inactive_throttle iterations
2278 * 3) Flow control - default pageout queue is full
2279 */
2280 if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2281 vm_page_queue_empty(&vm_page_queue_anonymous) &&
2282 vm_page_queue_empty(&vm_page_queue_cleaned) &&
2283 vm_page_queue_empty(&sq->age_q)) {
2284 VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2285 msecs = vm_pageout_state.vm_pageout_empty_wait;
2286 } else if (inactive_burst_count >=
2287 MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2288 (vm_page_inactive_count +
2289 vm_page_speculative_count))) {
2290 VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2291 msecs = vm_pageout_state.vm_pageout_burst_wait;
2292
2293 exceeded_burst_throttle = TRUE;
2294 } else if (VM_PAGE_Q_THROTTLED(iq) &&
2295 VM_DYNAMIC_PAGING_ENABLED()) {
2296 clock_sec_t sec;
2297 clock_nsec_t nsec;
2298
2299 switch (flow_control->state) {
2300 case FCS_IDLE:
2301 if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2302 vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2303 /*
2304 * since the compressor is running independently of vm_pageout_scan
2305 * let's not wait for it just yet... as long as we have a healthy supply
2306 * of filecache pages to work with, let's keep stealing those.
2307 */
2308 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2309
2310 if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2311 (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2312 *anons_grabbed = ANONS_GRABBED_LIMIT;
2313 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2314 return VM_PAGEOUT_SCAN_PROCEED;
2315 }
2316 }
2317
2318 vps_flow_control_reset_deadlock_timer(flow_control);
2319 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2320
2321 break;
2322
2323 case FCS_DELAYED:
2324 clock_get_system_nanotime(&sec, &nsec);
2325 ts.tv_sec = (unsigned int) sec;
2326 ts.tv_nsec = nsec;
2327
2328 if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2329 /*
2330 * the pageout thread for the default pager is potentially
2331 * deadlocked since the
2332 * default pager queue has been throttled for more than the
2333 * allowable time... we need to move some clean pages or dirty
2334 * pages belonging to the external pagers if they aren't throttled
2335 * vm_page_free_wanted represents the number of threads currently
2336 * blocked waiting for pages... we'll move one page for each of
2337 * these plus a fixed amount to break the logjam... once we're done
2338 * moving this number of pages, we'll re-enter the FCS_DELAYED state
2339 * with a new timeout target since we have no way of knowing
2340 * whether we've broken the deadlock except through observation
2341 * of the queue associated with the default pager... we need to
2342 * stop moving pages and allow the system to run to see what
2343 * state it settles into.
2344 */
2345
2346 *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2347 vm_page_free_wanted + vm_page_free_wanted_privileged;
2348 VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2349 flow_control->state = FCS_DEADLOCK_DETECTED;
2350 thread_wakeup(VM_PAGEOUT_GC_EVENT);
2351 return VM_PAGEOUT_SCAN_PROCEED;
2352 }
2353 /*
2354 * just resniff instead of trying
2355 * to compute a new delay time... we're going to be
2356 * awakened immediately upon a laundry completion,
2357 * so we won't wait any longer than necessary
2358 */
2359 msecs = vm_pageout_state.vm_pageout_idle_wait;
2360 break;
2361
2362 case FCS_DEADLOCK_DETECTED:
2363 if (*vm_pageout_deadlock_target) {
2364 return VM_PAGEOUT_SCAN_PROCEED;
2365 }
2366
2367 vps_flow_control_reset_deadlock_timer(flow_control);
2368 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2369
2370 break;
2371 }
2372 } else {
2373 /*
2374 * No need to pause...
2375 */
2376 return VM_PAGEOUT_SCAN_PROCEED;
2377 }
2378
2379 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2380
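/* give back any locally batched free pages and drop the object lock before we potentially block below */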
2381 vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2382 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2383
2384 if (vm_page_free_count >= vm_page_free_target) {
2385 /*
2386 * we're here because
2387 * 1) someone else freed up some pages while we had
2388 * the queues unlocked above
2389 * and we've hit one of the 3 conditions that
2390 * cause us to pause the pageout scan thread
2391 *
2392 * since we already have enough free pages,
2393 * let's avoid stalling and return normally
2394 *
2395 * before we return, make sure the pageout I/O threads
2396 * are running throttled in case there are still requests
2397 * in the laundry... since we have enough free pages
2398 * we don't need the laundry to be cleaned in a timely
2399 * fashion... so let's avoid interfering with foreground
2400 * activity
2401 *
2402 * we don't want to hold vm_page_queue_free_lock when
2403 * calling vm_pageout_adjust_eq_iothrottle (since it
2404 * may cause other locks to be taken), we do the initial
2405 * check outside of the lock. Once we take the lock,
2406 * we recheck the condition since it may have changed.
2407 * if it has, no problem, we will make the threads
2408 * non-throttled before actually blocking
2409 */
2410 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
2411 }
2412 vm_free_page_lock();
2413
2414 if (vm_page_free_count >= vm_page_free_target &&
2415 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2416 return VM_PAGEOUT_SCAN_DONE_RETURN;
2417 }
2418 vm_free_page_unlock();
2419
2420 if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2421 /*
2422 * we're most likely about to block due to one of
2423 * the 3 conditions that cause vm_pageout_scan to
2424 * not be able to make forward progress w/r
2425 * to providing new pages to the free queue,
2426 * so unthrottle the I/O threads in case we
2427 * have laundry to be cleaned... it needs
2428 * to be completed ASAP.
2429 *
2430 * even if we don't block, we want the io threads
2431 * running unthrottled since the sum of free +
2432 * clean pages is still under our free target
2433 */
2434 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2435 }
2436 if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2437 /*
2438 * if we get here we're below our free target and
2439 * we're stalling due to a full laundry queue or
2440 * we don't have any inactive pages other than
2441 * those in the clean queue...
2442 * however, we have pages on the clean queue that
2443 * can be moved to the free queue, so let's not
2444 * stall the pageout scan
2445 */
2446 flow_control->state = FCS_IDLE;
2447 return VM_PAGEOUT_SCAN_PROCEED;
2448 }
2449 if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2450 flow_control->state = FCS_IDLE;
2451 return VM_PAGEOUT_SCAN_PROCEED;
2452 }
2453
2454 VM_CHECK_MEMORYSTATUS;
2455
2456 if (flow_control->state != FCS_IDLE) {
2457 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2458 }
2459
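/* mark the internal pageout queue throttled and wait (up to 'msecs') for a laundry completion to wake us */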
2460 iq->pgo_throttled = TRUE;
2461 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2462
2463 vm_page_unlock_queues();
2464
2465 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2466
2467 VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2468 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2469 memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2470
2471 thread_block(THREAD_CONTINUE_NULL);
2472
2473 VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2474 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2475 memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2476
2477 vm_page_lock_queues();
2478
2479 iq->pgo_throttled = FALSE;
2480
2481 vps_init_page_targets();
2482
2483 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2484 }
2485
2486 extern boolean_t vm_darkwake_mode;
2487 /*
2488 * This function is called only from vm_pageout_scan and
2489 * it will find and return the most appropriate page to be
2490 * reclaimed.
2491 */
2492 static int
2493 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2494 boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2495 {
2496 vm_page_t m = NULL;
2497 vm_object_t m_object = VM_OBJECT_NULL;
2498 uint32_t inactive_external_count;
2499 struct vm_speculative_age_q *sq;
2500 struct vm_pageout_queue *iq;
2501 int retval = VM_PAGEOUT_SCAN_PROCEED;
2502
2503 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2504 iq = &vm_pageout_queue_internal;
2505
2506 *is_page_from_bg_q = FALSE;
2507
2508 m = NULL;
2509 m_object = VM_OBJECT_NULL;
2510
2511 if (VM_DYNAMIC_PAGING_ENABLED()) {
2512 assert(vm_page_throttled_count == 0);
2513 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2514 }
2515
2516 /*
2517 * Try for a clean-queue inactive page.
2518 * These are pages that vm_pageout_scan tried to steal earlier, but
2519 * were dirty and had to be cleaned. Pick them up now that they are clean.
2520 */
2521 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2522 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2523
2524 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2525
2526 goto found_page;
2527 }
2528
2529 /*
2530 * The next most eligible pages are ones we paged in speculatively,
2531 * but which have not yet been touched and have been aged out.
2532 */
2533 if (!vm_page_queue_empty(&sq->age_q)) {
2534 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2535
2536 assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2537
2538 if (!m->vmp_dirty || force_anonymous == FALSE) {
2539 goto found_page;
2540 } else {
2541 m = NULL;
2542 }
2543 }
2544
2545 #if !CONFIG_JETSAM
2546 if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2547 if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2548 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2549 assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2550 goto found_page;
2551 }
2552 }
2553 #endif /* !CONFIG_JETSAM */
2554
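/* next in priority: pages on the background queue, once it has grown past its target */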
2555 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2556 vm_object_t bg_m_object = NULL;
2557
2558 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2559
2560 bg_m_object = VM_PAGE_OBJECT(m);
2561
2562 if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2563 /*
2564 * This page is on the background queue
2565 * but not on a pageable queue OR is busy during
2566 * darkwake mode when the target is artificially lowered.
2567 * If it is busy during darkwake mode, and we don't skip it,
2568 * we will just swing back around and try again with the same
2569 * queue and might hit the same page or its neighbor in a
2570 * similar state. Both of these are transient states and will
2571 * get resolved, but, at this point let's ignore this page.
2572 */
2573 if (vm_darkwake_mode && m->vmp_busy) {
2574 if (bg_m_object->internal) {
2575 vm_pageout_skipped_bq_internal++;
2576 } else {
2577 vm_pageout_skipped_bq_external++;
2578 }
2579 }
2580 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2581 if (bg_m_object->internal &&
2582 (VM_PAGE_Q_THROTTLED(iq) ||
2583 vm_compressor_out_of_space() == TRUE ||
2584 vm_page_free_count < (vm_page_free_reserved / 4))) {
2585 vm_pageout_skipped_bq_internal++;
2586 } else {
2587 *is_page_from_bg_q = TRUE;
2588
2589 if (bg_m_object->internal) {
2590 vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2591 } else {
2592 vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2593 }
2594 goto found_page;
2595 }
2596 }
2597 }
2598
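/* decide whether to prefer anonymous pages, based on how the filecache compares to vm_page_filecache_min and its inactive target (or whether the caller forced it) */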
2599 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2600
2601 if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2602 (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2603 *grab_anonymous = TRUE;
2604 *anons_grabbed = 0;
2605
2606 if (VM_CONFIG_SWAP_IS_ACTIVE) {
2607 vm_pageout_vminfo.vm_pageout_skipped_external++;
2608 } else {
2609 if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2610 /*
2611 * No swap and we are in dangerously low levels of free memory.
2612 * If we keep going ahead with anonymous pages, we are going to run into a situation
2613 * where the compressor will be stuck waiting for free pages (if it isn't already).
2614 *
2615 * So, pick a file backed page...
2616 */
2617 *grab_anonymous = FALSE;
2618 *anons_grabbed = ANONS_GRABBED_LIMIT;
2619 vm_pageout_vminfo.vm_pageout_skipped_internal++;
2620 }
2621 }
2622 goto want_anonymous;
2623 }
2624 *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2625
2626 #if CONFIG_JETSAM
2627 /* If the file-backed pool has accumulated
2628 * significantly more pages than the jetsam
2629 * threshold, prefer to reclaim those
2630 * inline to minimise compute overhead of reclaiming
2631 * anonymous pages.
2632 * This calculation does not account for the CPU local
2633 * external page queues, as those are expected to be
2634 * much smaller relative to the global pools.
2635 */
2636
2637 struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2638
2639 if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2640 if (vm_page_pageable_external_count >
2641 vm_pageout_state.vm_page_filecache_min) {
2642 if ((vm_page_pageable_external_count *
2643 vm_pageout_memorystatus_fb_factor_dr) >
2644 (memorystatus_available_pages_critical *
2645 vm_pageout_memorystatus_fb_factor_nr)) {
2646 *grab_anonymous = FALSE;
2647
2648 VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2649 }
2650 }
2651 if (*grab_anonymous) {
2652 VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2653 }
2654 }
2655 #endif /* CONFIG_JETSAM */
2656
2657 want_anonymous:
2658 if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2659 if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2660 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2661
2662 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2663 *anons_grabbed = 0;
2664
2665 if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2666 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2667 if ((++(*reactivated_this_call) % 100)) {
2668 vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2669
2670 vm_page_activate(m);
2671 counter_inc(&vm_statistics_reactivations);
2672 #if DEVELOPMENT || DEBUG
2673 if (*is_page_from_bg_q == TRUE) {
2674 if (m_object->internal) {
2675 vm_pageout_rejected_bq_internal++;
2676 } else {
2677 vm_pageout_rejected_bq_external++;
2678 }
2679 }
2680 #endif /* DEVELOPMENT || DEBUG */
2681 vm_pageout_state.vm_pageout_inactive_used++;
2682
2683 m = NULL;
2684 retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2685
2686 goto found_page;
2687 }
2688
2689 /*
2690 * steal 1 of the file backed pages even if
2691 * we are under the limit that has been set
2692 * for a healthy filecache
2693 */
2694 }
2695 }
2696 goto found_page;
2697 }
2698 }
2699 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2700 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2701
2702 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2703 *anons_grabbed += 1;
2704
2705 goto found_page;
2706 }
2707
2708 m = NULL;
2709
2710 found_page:
2711 *victim_page = m;
2712
2713 return retval;
2714 }
2715
2716 /*
2717 * This function is called only from vm_pageout_scan and
2718 * it will put a page back on the active/inactive queue
2719 * if we can't reclaim it for some reason.
2720 */
2721 static void
2722 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2723 {
2724 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2725 vm_page_enqueue_inactive(m, FALSE);
2726 } else {
2727 vm_page_activate(m);
2728 }
2729
2730 #if DEVELOPMENT || DEBUG
2731 vm_object_t m_object = VM_PAGE_OBJECT(m);
2732
2733 if (page_from_bg_q == TRUE) {
2734 if (m_object->internal) {
2735 vm_pageout_rejected_bq_internal++;
2736 } else {
2737 vm_pageout_rejected_bq_external++;
2738 }
2739 }
2740 #endif /* DEVELOPMENT || DEBUG */
2741 }
2742
2743 /*
2744 * This function is called only from vm_pageout_scan and
2745 * it will try to grab the victim page's VM object (m_object)
2746 * which differs from the previous victim page's object (object).
2747 */
2748 static int
2749 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2750 {
2751 struct vm_speculative_age_q *sq;
2752
2753 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2754
2755 /*
2756 * the object associated with candidate page is
2757 * different from the one we were just working
2758 * with... dump the lock if we still own it
2759 */
2760 if (*object != NULL) {
2761 vm_object_unlock(*object);
2762 *object = NULL;
2763 }
2764 /*
2765 * Try to lock object; since we've already got the
2766 * page queues lock, we can only 'try' for this one.
2767 * if the 'try' fails, we need to do a mutex_pause
2768 * to allow the owner of the object lock a chance to
2769 * run... otherwise, we're likely to trip over this
2770 * object in the same state as we work our way through
2771 * the queue... clumps of pages associated with the same
2772 * object are fairly typical on the inactive and active queues
2773 */
2774 if (!vm_object_lock_try_scan(m_object)) {
2775 vm_page_t m_want = NULL;
2776
2777 vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2778
2779 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2780 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2781 }
2782
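/* we couldn't take the object lock; clear the hardware reference bit now so that, when we revisit this page, only references made from this point on are visible */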
2783 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2784
2785 m->vmp_reference = FALSE;
2786
2787 if (!m_object->object_is_shared_cache) {
2788 /*
2789 * don't apply this optimization if this is the shared cache
2790 * object, it's too easy to get rid of very hot and important
2791 * pages...
2792 * m->vmp_object must be stable since we hold the page queues lock...
2793 * we can update the scan_collisions field sans the object lock
2794 * since it is a separate field and this is the only spot that does
2795 * a read-modify-write operation and it is never executed concurrently...
2796 * we can asynchronously set this field to 0 when creating a UPL, so it
2797 * is possible for the value to be a bit non-deterministic, but that's ok
2798 * since it's only used as a hint
2799 */
2800 m_object->scan_collisions = 1;
2801 }
2802 if (page_from_bg_q) {
2803 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2804 } else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2805 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2806 } else if (!vm_page_queue_empty(&sq->age_q)) {
2807 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2808 } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2809 !vm_page_queue_empty(&vm_page_queue_inactive)) {
2810 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2811 } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2812 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2813 }
2814
2815 /*
2816 * this is the next object we're going to be interested in
2817 * try to make sure it's available after the mutex_pause
2818 * returns control
2819 */
2820 if (m_want) {
2821 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2822 }
2823
2824 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2825
2826 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2827 } else {
2828 *object = m_object;
2829 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2830 }
2831
2832 return VM_PAGEOUT_SCAN_PROCEED;
2833 }
2834
2835 /*
2836 * This function is called only from vm_pageout_scan and
2837 * it notices that pageout scan may be rendered ineffective
2838 * due to an FS deadlock and will jetsam a process if possible.
2839 * If jetsam isn't supported, it'll move the page to the active
2840 * queue to try to get some different pages pushed onwards so
2841 * we can try to get out of this scenario.
2842 */
2843 static void
2844 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2845 boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2846 {
2847 struct vm_pageout_queue *eq;
2848 vm_object_t cur_object = VM_OBJECT_NULL;
2849
2850 cur_object = *object;
2851
2852 eq = &vm_pageout_queue_external;
2853
2854 if (cur_object->internal == FALSE) {
2855 /*
2856 * we need to break up the following potential deadlock case...
2857 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2858 * b) The thread doing the writing is waiting for pages while holding the truncate lock
2859 * c) Most of the pages in the inactive queue belong to this file.
2860 *
2861 * we are potentially in this deadlock because...
2862 * a) the external pageout queue is throttled
2863 * b) we're done with the active queue and moved on to the inactive queue
2864 * c) we've got a dirty external page
2865 *
2866 * since we don't know the reason for the external pageout queue being throttled we
2867 * must suspect that we are deadlocked, so move the current page onto the active queue
2868 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2869 *
2870 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2871 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2872 * pool the next time we select a victim page... if we can make enough new free pages,
2873 * the deadlock will break, the external pageout queue will empty and it will no longer
2874 * be throttled
2875 *
2876 * if we have jetsam configured, keep a count of the pages reactivated this way so
2877 * that we can try to find clean pages in the active/inactive queues before
2878 * deciding to jetsam a process
2879 */
2880 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2881
2882 vm_page_check_pageable_safe(m);
2883 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2884 vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2885 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2886 vm_page_active_count++;
2887 vm_page_pageable_external_count++;
2888
2889 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2890
2891 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2892
2893 #pragma unused(force_anonymous)
2894
2895 *vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2896
2897 if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2898 *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2899 /*
2900 * Possible deadlock scenario so request jetsam action
2901 */
2902 memorystatus_kill_on_vps_starvation();
2903 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, DBG_VM_PAGEOUT_JETSAM, DBG_FUNC_NONE,
2904 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2905 }
2906 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2907
2908 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2909
2910 *force_anonymous = TRUE;
2911 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2912 } else {
2913 vm_page_activate(m);
2914 counter_inc(&vm_statistics_reactivations);
2915
2916 #if DEVELOPMENT || DEBUG
2917 if (is_page_from_bg_q == TRUE) {
2918 if (cur_object->internal) {
2919 vm_pageout_rejected_bq_internal++;
2920 } else {
2921 vm_pageout_rejected_bq_external++;
2922 }
2923 }
2924 #endif /* DEVELOPMENT || DEBUG */
2925
2926 vm_pageout_state.vm_pageout_inactive_used++;
2927 }
2928 }
2929
2930
2931 void
2932 vm_page_balance_inactive(int max_to_move)
2933 {
2934 vm_page_t m;
2935
2936 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2937
2938 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2939 /*
2940 * It is likely that the hibernation code path is
2941 * dealing with these very queues as we are about
2942 * to move pages around in/from them and completely
2943 * change the linkage of the pages.
2944 *
2945 * And so we skip the rebalancing of these queues.
2946 */
2947 return;
2948 }
2949 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2950 vm_page_inactive_count +
2951 vm_page_speculative_count);
2952
2953 while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2954 VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2955
2956 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2957
2958 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2959 assert(!m->vmp_laundry);
2960 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
2961 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2962
2963 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2964
2965 /*
2966 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2967 *
2968 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2969 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2970 * new reference happens. If no further references happen on the page after that remote TLB flushes
2971 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2972 * by pageout_scan, which is just fine since the last reference would have happened quite far
2973 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2974 * have happened before we moved the page
2975 */
2976 if (m->vmp_pmapped == TRUE) {
2977 /*
2978 * We might be holding the page queue lock as a
2979 * spin lock and clearing the "referenced" bit could
2980 * take a while if there are lots of mappings of
2981 * that page, so make sure we acquire the lock
2982 * as a mutex to avoid a spinlock timeout.
2983 */
2984 vm_page_lockconvert_queues();
2985 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2986 }
2987
2988 /*
2989 * The page might be absent or busy,
2990 * but vm_page_deactivate can handle that.
2991 * FALSE indicates that we don't want a H/W clear reference
2992 */
2993 vm_page_deactivate_internal(m, FALSE);
2994 }
2995 }
2996
2997 /*
2998 * vm_pageout_scan does the dirty work for the pageout daemon.
2999 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
3000 * held and vm_page_free_wanted == 0.
3001 */
3002 void
3003 vm_pageout_scan(void)
3004 {
3005 unsigned int loop_count = 0;
3006 unsigned int inactive_burst_count = 0;
3007 unsigned int reactivated_this_call;
3008 unsigned int reactivate_limit;
3009 vm_page_t local_freeq = NULL;
3010 int local_freed = 0;
3011 int delayed_unlock;
3012 int delayed_unlock_limit = 0;
3013 int refmod_state = 0;
3014 int vm_pageout_deadlock_target = 0;
3015 struct vm_pageout_queue *iq;
3016 struct vm_pageout_queue *eq;
3017 struct vm_speculative_age_q *sq;
3018 struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
3019 boolean_t inactive_throttled = FALSE;
3020 vm_object_t object = NULL;
3021 uint32_t inactive_reclaim_run;
3022 boolean_t grab_anonymous = FALSE;
3023 boolean_t force_anonymous = FALSE;
3024 boolean_t force_speculative_aging = FALSE;
3025 int anons_grabbed = 0;
3026 int page_prev_q_state = 0;
3027 boolean_t page_from_bg_q = FALSE;
3028 uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
3029 vm_object_t m_object = VM_OBJECT_NULL;
3030 int retval = 0;
3031 boolean_t lock_yield_check = FALSE;
3032
3033
3034 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_START,
3035 vm_pageout_vminfo.vm_pageout_freed_speculative,
3036 vm_pageout_state.vm_pageout_inactive_clean,
3037 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3038 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3039
3040 flow_control.state = FCS_IDLE;
3041 iq = &vm_pageout_queue_internal;
3042 eq = &vm_pageout_queue_external;
3043 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3044
3045 /* Ask the pmap layer to return any pages it no longer needs. */
3046 pmap_release_pages_fast();
3047
3048 vm_page_lock_queues();
3049
3050 delayed_unlock = 1;
3051
3052 /*
3053 * Calculate the max number of referenced pages on the inactive
3054 * queue that we will reactivate.
3055 */
3056 reactivated_this_call = 0;
3057 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3058 vm_page_inactive_count);
3059 inactive_reclaim_run = 0;
3060
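/* start with a generous limit on forced reactivations of dirty external pages: roughly one per currently active+inactive page (see vps_deal_with_throttled_queues) */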
3061 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3062
3063 /*
3064 * We must limit the rate at which we send pages to the pagers
3065 * so that we don't tie up too many pages in the I/O queues.
3066 * We implement a throttling mechanism using the laundry count
3067 * to limit the number of pages outstanding to the default
3068 * and external pagers. We can bypass the throttles and look
3069 * for clean pages if the pageout queues don't drain in a timely
3070 * fashion since this may indicate that the pageout paths are
3071 * stalled waiting for memory, which only we can provide.
3072 */
3073
3074 vps_init_page_targets();
3075 assert(object == NULL);
3076 assert(delayed_unlock != 0);
3077
3078 for (;;) {
3079 vm_page_t m;
3080
3081 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3082
3083 if (lock_yield_check) {
3084 lock_yield_check = FALSE;
3085
3086 if (delayed_unlock++ > delayed_unlock_limit) {
3087 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3088 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3089 } else if (vm_pageout_scan_wants_object) {
3090 vm_page_unlock_queues();
3091 mutex_pause(0);
3092 vm_page_lock_queues();
3093 } else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3094 VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3095 }
3096 }
3097
3098 if (vm_upl_wait_for_pages < 0) {
3099 vm_upl_wait_for_pages = 0;
3100 }
3101
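/* the delayed-unlock batching limit (how much work we do before dropping the page queues lock) scales with the number of threads waiting for UPL pages, capped just below */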
3102 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3103
3104 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3105 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3106 }
3107
3108 vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3109
3110 assert(delayed_unlock);
3111
3112 /*
3113 * maintain our balance
3114 */
3115 vm_page_balance_inactive(1);
3116
3117
3118 /**********************************************************************
3119 * above this point we're playing with the active and secluded queues
3120 * below this point we're playing with the throttling mechanisms
3121 * and the inactive queue
3122 **********************************************************************/
3123
3124 if (vm_page_free_count + local_freed >= vm_page_free_target) {
3125 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3126
3127 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3128 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3129 /*
3130 * make sure the pageout I/O threads are running
3131 * throttled in case there are still requests
3132 * in the laundry... since we have met our targets
3133 * we don't need the laundry to be cleaned in a timely
3134 * fashion... so let's avoid interfering with foreground
3135 * activity
3136 */
3137 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3138
3139 vm_free_page_lock();
3140
3141 if ((vm_page_free_count >= vm_page_free_target) &&
3142 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3143 /*
3144 * done - we have met our target *and*
3145 * there is no one waiting for a page.
3146 */
3147 return_from_scan:
3148 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3149
3150 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3151 vm_pageout_state.vm_pageout_inactive,
3152 vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3153 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_END,
3154 vm_pageout_vminfo.vm_pageout_freed_speculative,
3155 vm_pageout_state.vm_pageout_inactive_clean,
3156 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3157 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3158
3159 return;
3160 }
3161 vm_free_page_unlock();
3162 }
3163
3164 /*
3165 * Before anything, we check if we have any ripe volatile
3166 * objects around. If so, try to purge the first object.
3167 * If the purge fails, fall through to reclaim a page instead.
3168 * If the purge succeeds, go back to the top and reevaluate
3169 * the new memory situation.
3170 */
3171 retval = vps_purge_object();
3172
3173 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3174 /*
3175 * Success
3176 */
3177 if (object != NULL) {
3178 vm_object_unlock(object);
3179 object = NULL;
3180 }
3181
3182 lock_yield_check = FALSE;
3183 continue;
3184 }
3185
3186 /*
3187 * If our 'aged' queue is empty and we have some speculative pages
3188 * in the other queues, let's go through and see if we need to age
3189 * them.
3190 *
3191 * If we succeed in aging a speculative Q, or everything simply looks
3192 * normal w.r.t. queue age and queue counts, we keep going onward.
3193 *
3194 * If, for some reason, we seem to have a mismatch between the spec.
3195 * page count and the page queues, we reset those variables and
3196 * restart the loop (LD TODO: Track this better?).
3197 */
3198 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3199 retval = vps_age_speculative_queue(force_speculative_aging);
3200
3201 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3202 lock_yield_check = FALSE;
3203 continue;
3204 }
3205 }
3206 force_speculative_aging = FALSE;
3207
3208 /*
3209 * Check to see if we need to evict objects from the cache.
3210 *
3211 * Note: 'object' here doesn't have anything to do with
3212 * the eviction part. We just need to make sure we have dropped
3213 * any object lock we might be holding if we need to go down
3214 * into the eviction logic.
3215 */
3216 retval = vps_object_cache_evict(&object);
3217
3218 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3219 lock_yield_check = FALSE;
3220 continue;
3221 }
3222
3223
3224 /*
3225 * Calculate our filecache_min that will affect the loop
3226 * going forward.
3227 */
3228 vps_calculate_filecache_min();
3229
3230 /*
3231 * LD TODO: Use a structure to hold all state variables for a single
3232 * vm_pageout_scan iteration and pass that structure to this function instead.
3233 */
3234 retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3235 &delayed_unlock, &local_freeq, &local_freed,
3236 &vm_pageout_deadlock_target, inactive_burst_count);
3237
3238 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3239 if (loop_count >= vm_page_inactive_count) {
3240 loop_count = 0;
3241 }
3242
3243 inactive_burst_count = 0;
3244
3245 assert(object == NULL);
3246 assert(delayed_unlock != 0);
3247
3248 lock_yield_check = FALSE;
3249 continue;
3250 } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3251 goto return_from_scan;
3252 }
3253
3254 flow_control.state = FCS_IDLE;
3255
3256 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3257 vm_pageout_inactive_external_forced_reactivate_limit);
3258 loop_count++;
3259 inactive_burst_count++;
3260 vm_pageout_state.vm_pageout_inactive++;
3261
3262 /*
3263 * Choose a victim.
3264 */
3265
3266 m = NULL;
3267 retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3268
3269 if (m == NULL) {
3270 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3271 inactive_burst_count = 0;
3272
3273 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3274 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3275 }
3276
3277 lock_yield_check = TRUE;
3278 continue;
3279 }
3280
3281 /*
3282 * if we've gotten here, we have no victim page.
3283 * check to see if we've not finished balancing the queues
3284 * or we have a page on the aged speculative queue that we
3285 * skipped due to force_anonymous == TRUE... or we have
3286 * speculative pages that we can prematurely age... if any of
3287 * these cases applies we'll keep going, else panic
3288 */
3289 force_anonymous = FALSE;
3290 VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3291
3292 if (!vm_page_queue_empty(&sq->age_q)) {
3293 lock_yield_check = TRUE;
3294 continue;
3295 }
3296
3297 if (vm_page_speculative_count) {
3298 force_speculative_aging = TRUE;
3299 lock_yield_check = TRUE;
3300 continue;
3301 }
3302 panic("vm_pageout: no victim");
3303
3304 /* NOTREACHED */
3305 }
3306
3307 assert(VM_PAGE_PAGEABLE(m));
3308 m_object = VM_PAGE_OBJECT(m);
3309 force_anonymous = FALSE;
3310
3311 page_prev_q_state = m->vmp_q_state;
3312 /*
3313 * we just found this page on one of our queues...
3314 * it can't also be on the pageout queue, so safe
3315 * to call vm_page_queues_remove
3316 */
3317 bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3318 vm_page_queues_remove(m, TRUE);
3319 if (donate) {
3320 /*
3321 * The compressor needs to see this bit to know
3322 * where this page needs to land. Also if stolen,
3323 * this bit helps put the page back in the right
3324 * special queue where it belongs.
3325 */
3326 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3327 }
3328
3329 assert(!m->vmp_laundry);
3330 assert(!m->vmp_private);
3331 assert(!m->vmp_fictitious);
3332 assert(!is_kernel_object(m_object));
3333 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
3334
3335 vm_pageout_vminfo.vm_pageout_considered_page++;
3336
3337 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3338
3339 /*
3340 * check to see if we currently are working
3341 * with the same object... if so, we've
3342 * already got the lock
3343 */
3344 if (m_object != object) {
3345 boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3346
3347 /*
3348 * vps_switch_object() will always drop the 'object' lock first
3349 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3350 * either 'm_object' or NULL.
3351 */
3352 retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3353
3354 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3355 lock_yield_check = TRUE;
3356 continue;
3357 }
3358 }
3359 assert(m_object == object);
3360 assert(VM_PAGE_OBJECT(m) == m_object);
3361
3362 if (m->vmp_busy) {
3363 /*
3364 * Somebody is already playing with this page.
3365 * Put it back on the appropriate queue
3366 *
3367 */
3368 VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3369
3370 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3371 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3372 }
3373
3374 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3375
3376 lock_yield_check = TRUE;
3377 continue;
3378 }
3379
3380 /*
3381 * if (m->vmp_cleaning && !m->vmp_free_when_done)
3382 * If already cleaning this page in place
3383 * just leave it off the paging queues.
3384 * We can leave the page mapped, and upl_commit_range
3385 * will put it on the clean queue.
3386 *
3387 * if (m->vmp_free_when_done && !m->vmp_cleaning)
3388 * an msync INVALIDATE is in progress...
3389 * this page has been marked for destruction
3390 * after it has been cleaned,
3391 * but not yet gathered into a UPL
3392 * where 'cleaning' will be set...
3393 * just leave it off the paging queues
3394 *
3395 * if (m->vmp_free_when_done && m->vmp_cleaning)
3396 * an msync INVALIDATE is in progress
3397 * and the UPL has already gathered this page...
3398 * just leave it off the paging queues
3399 */
3400 if (m->vmp_free_when_done || m->vmp_cleaning) {
3401 lock_yield_check = TRUE;
3402 continue;
3403 }
3404
3405
3406 /*
3407 * If it's absent, in error or the object is no longer alive,
3408 * we can reclaim the page... in the no longer alive case,
3409 * there are 2 states the page can be in that preclude us
3410 * from reclaiming it - busy or cleaning - that we've already
3411 * dealt with
3412 */
3413 if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3414 (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3415 if (m->vmp_absent) {
3416 VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3417 } else if (!object->alive ||
3418 (!object->internal &&
3419 object->pager == MEMORY_OBJECT_NULL)) {
3420 VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3421 } else {
3422 VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3423 }
3424 reclaim_page:
3425 if (vm_pageout_deadlock_target) {
3426 VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3427 vm_pageout_deadlock_target--;
3428 }
3429
3430 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3431
3432 if (object->internal) {
3433 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3434 } else {
3435 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3436 }
3437 assert(!m->vmp_cleaning);
3438 assert(!m->vmp_laundry);
3439
3440 if (!object->internal &&
3441 object->pager != NULL &&
3442 object->pager->mo_pager_ops == &shared_region_pager_ops) {
3443 shared_region_pager_reclaimed++;
3444 }
3445
3446 m->vmp_busy = TRUE;
3447
3448 /*
3449 * remove page from object here since we're already
3450 * behind the object lock... defer the rest of the work
3451 * we'd normally do in vm_page_free_prepare_object
3452 * until 'vm_page_free_list' is called
3453 */
3454 if (m->vmp_tabled) {
3455 vm_page_remove(m, TRUE);
3456 }
3457
3458 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3459 m->vmp_snext = local_freeq;
3460 local_freeq = m;
3461 local_freed++;
3462
3463 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3464 vm_pageout_vminfo.vm_pageout_freed_speculative++;
3465 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3466 vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3467 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3468 vm_pageout_vminfo.vm_pageout_freed_internal++;
3469 } else {
3470 vm_pageout_vminfo.vm_pageout_freed_external++;
3471 }
3472
3473 inactive_burst_count = 0;
3474
3475 lock_yield_check = TRUE;
3476 continue;
3477 }
3478 if (object->vo_copy == VM_OBJECT_NULL) {
3479 /*
3480 * No one else can have any interest in this page.
3481 * If this is an empty purgable object, the page can be
3482 * reclaimed even if dirty.
3483 * If the page belongs to a volatile purgable object, we
3484 * reactivate it if the compressor isn't active.
3485 */
3486 if (object->purgable == VM_PURGABLE_EMPTY) {
3487 if (m->vmp_pmapped == TRUE) {
3488 /* unmap the page */
3489 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3490 if (refmod_state & VM_MEM_MODIFIED) {
3491 SET_PAGE_DIRTY(m, FALSE);
3492 }
3493 }
3494 if (m->vmp_dirty || m->vmp_precious) {
3495 /* we saved the cost of cleaning this page ! */
3496 vm_page_purged_count++;
3497 }
3498 goto reclaim_page;
3499 }
3500
3501 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3502 /*
3503 * With the VM compressor, the cost of
3504 * reclaiming a page is much lower (no I/O),
3505 * so if we find a "volatile" page, it's better
3506 * to let it get compressed rather than letting
3507 * it occupy a full page until it gets purged.
3508 * So no need to check for "volatile" here.
3509 */
3510 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
3511 /*
3512 * Avoid cleaning a "volatile" page which might
3513 * be purged soon.
3514 */
3515
3516 /* if it's wired, we can't put it on our queue */
3517 assert(!VM_PAGE_WIRED(m));
3518
3519 /* just stick it back on! */
3520 reactivated_this_call++;
3521
3522 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3523 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3524 }
3525
3526 goto reactivate_page;
3527 }
3528 } /* vo_copy NULL */
3529 /*
3530 * If it's being used, reactivate.
3531 * (Fictitious pages are either busy or absent.)
3532 * First, update the reference and dirty bits
3533 * to make sure the page is unreferenced.
3534 */
3535 refmod_state = -1;
3536
3537 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3538 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3539
3540 if (refmod_state & VM_MEM_REFERENCED) {
3541 m->vmp_reference = TRUE;
3542 }
3543 if (refmod_state & VM_MEM_MODIFIED) {
3544 SET_PAGE_DIRTY(m, FALSE);
3545 }
3546 }
3547
3548 if (m->vmp_reference || m->vmp_dirty) {
3549 /* deal with a rogue "reusable" page */
3550 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3551 }
3552
3553 if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3554 vm_pageout_state.vm_page_xpmapped_min = 0;
3555 } else {
3556 vm_pageout_state.vm_page_xpmapped_min = (vm_page_pageable_external_count * 10) /
3557 vm_pageout_state.vm_page_xpmapped_min_divisor;
3558 }
3559
3560 if (!m->vmp_no_cache &&
3561 page_from_bg_q == FALSE &&
3562 (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3563 (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3564 /*
3565 * The page we pulled off the inactive list has
3566 * been referenced. It is possible for other
3567 * processors to be touching pages faster than we
3568 * can clear the referenced bit and traverse the
3569 * inactive queue, so we limit the number of
3570 * reactivations.
3571 */
3572 if (++reactivated_this_call >= reactivate_limit &&
3573 !object->object_is_shared_cache &&
3574 !((m->vmp_realtime ||
3575 object->for_realtime) &&
3576 vm_pageout_protect_realtime)) {
3577 vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3578 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3579 vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3580 if (object->object_is_shared_cache) {
3581 vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3582 } else if (m->vmp_realtime ||
3583 object->for_realtime) {
3584 vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3585 }
3586 } else {
3587 uint32_t isinuse;
3588
3589 if (reactivated_this_call >= reactivate_limit) {
3590 if (object->object_is_shared_cache) {
3591 vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3592 } else if ((m->vmp_realtime ||
3593 object->for_realtime) &&
3594 vm_pageout_protect_realtime) {
3595 vm_pageout_vminfo.vm_pageout_protected_realtime++;
3596 }
3597 }
3598 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3599 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3600 }
3601
3602 vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3603 reactivate_page:
3604 if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3605 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3606 /*
3607 * no explicit mappings of this object exist
3608 * and it's not open via the filesystem
3609 */
3610 vm_page_deactivate(m);
3611 VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3612 } else {
3613 /*
3614 * The page was/is being used, so put back on active list.
3615 */
3616 vm_page_activate(m);
3617 counter_inc(&vm_statistics_reactivations);
3618 inactive_burst_count = 0;
3619 }
3620 #if DEVELOPMENT || DEBUG
3621 if (page_from_bg_q == TRUE) {
3622 if (m_object->internal) {
3623 vm_pageout_rejected_bq_internal++;
3624 } else {
3625 vm_pageout_rejected_bq_external++;
3626 }
3627 }
3628 #endif /* DEVELOPMENT || DEBUG */
3629
3630 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3631 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3632 }
3633 vm_pageout_state.vm_pageout_inactive_used++;
3634
3635 lock_yield_check = TRUE;
3636 continue;
3637 }
3638 /*
3639 * Make sure we call pmap_get_refmod() if it
3640 * wasn't already called just above, to update
3641 * the dirty bit.
3642 */
3643 if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3644 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3645 if (refmod_state & VM_MEM_MODIFIED) {
3646 SET_PAGE_DIRTY(m, FALSE);
3647 }
3648 }
3649 }
3650
3651 /*
3652 * we've got a candidate page to steal...
3653 *
3654 * m->vmp_dirty is up to date courtesy of the
3655 * preceding check for m->vmp_reference... if
3656 * we get here, then m->vmp_reference had to be
3657 * FALSE (or possibly "reactivate_limit" was
3658 * exceeded), but in either case we called
3659 * pmap_get_refmod() and updated both
3660 * m->vmp_reference and m->vmp_dirty
3661 *
3662 * if it's dirty or precious we need to
3663 * see if the target queue is throttled...
3664 * if it is, we need to skip over it by moving it back
3665 * to the end of the inactive queue
3666 */
3667
3668 inactive_throttled = FALSE;
3669
3670 if (m->vmp_dirty || m->vmp_precious) {
3671 if (object->internal) {
3672 if (VM_PAGE_Q_THROTTLED(iq)) {
3673 inactive_throttled = TRUE;
3674 }
3675 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3676 inactive_throttled = TRUE;
3677 }
3678 }
3679 throttle_inactive:
3680 if (!VM_DYNAMIC_PAGING_ENABLED() &&
3681 object->internal && m->vmp_dirty &&
3682 (object->purgable == VM_PURGABLE_DENY ||
3683 object->purgable == VM_PURGABLE_NONVOLATILE ||
3684 object->purgable == VM_PURGABLE_VOLATILE)) {
3685 vm_page_check_pageable_safe(m);
3686 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3687 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3688 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3689 vm_page_throttled_count++;
3690
3691 VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3692
3693 inactive_burst_count = 0;
3694
3695 lock_yield_check = TRUE;
3696 continue;
3697 }
3698 if (inactive_throttled == TRUE) {
3699 vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3700 &force_anonymous, page_from_bg_q);
3701
3702 inactive_burst_count = 0;
3703
3704 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3705 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3706 }
3707
3708 lock_yield_check = TRUE;
3709 continue;
3710 }
3711
3712 /*
3713 * we've got a page that we can steal...
3714 * eliminate all mappings and make sure
3715 * we have the up-to-date modified state
3716 *
3717 * if we need to do a pmap_disconnect then we
3718 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3719 * provides the true state atomically... the
3720 * page was still mapped up to the pmap_disconnect
3721 * and may have been dirtied at the last microsecond
3722 *
3723 * Note that if 'pmapped' is FALSE then the page is not
3724 * and has not been in any map, so there is no point calling
3725 * pmap_disconnect(). m->vmp_dirty could have been set in anticipation
3726 * of likely usage of the page.
3727 */
3728 if (m->vmp_pmapped == TRUE) {
3729 int pmap_options;
3730
3731 /*
3732 * Don't count this page as going into the compressor
3733 * if any of these are true:
3734 * 1) compressed pager isn't enabled
3735 * 2) Freezer enabled device with compressed pager
3736 * backend (exclusive use) i.e. most of the VM system
3737 * (including vm_pageout_scan) has no knowledge of
3738 * the compressor
3739 * 3) This page belongs to a file and hence will not be
3740 * sent into the compressor
3741 */
3742 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3743 object->internal == FALSE) {
3744 pmap_options = 0;
3745 } else if (m->vmp_dirty || m->vmp_precious) {
3746 /*
3747 * VM knows that this page is dirty (or
3748 * precious) and needs to be compressed
3749 * rather than freed.
3750 * Tell the pmap layer to count this page
3751 * as "compressed".
3752 */
3753 pmap_options = PMAP_OPTIONS_COMPRESSOR;
3754 } else {
3755 /*
3756 * VM does not know if the page needs to
3757 * be preserved but the pmap layer might tell
3758 * us if any mapping has "modified" it.
3759 * Let the pmap layer count this page
3760 * as compressed if and only if it has been
3761 * modified.
3762 */
3763 pmap_options =
3764 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3765 }
3766 refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3767 pmap_options,
3768 NULL);
3769 if (refmod_state & VM_MEM_MODIFIED) {
3770 SET_PAGE_DIRTY(m, FALSE);
3771 }
3772 }
3773
3774 /*
3775 * reset our count of pages that have been reclaimed
3776 * since the last page was 'stolen'
3777 */
3778 inactive_reclaim_run = 0;
3779
3780 /*
3781 * If it's clean and not precious, we can free the page.
3782 */
3783 if (!m->vmp_dirty && !m->vmp_precious) {
3784 vm_pageout_state.vm_pageout_inactive_clean++;
3785
3786 /*
3787 * OK, at this point we have found a page we are going to free.
3788 */
3789 #if CONFIG_PHANTOM_CACHE
3790 if (!object->internal) {
3791 vm_phantom_cache_add_ghost(m);
3792 }
3793 #endif
3794 goto reclaim_page;
3795 }
3796
3797 /*
3798 * The page may have been dirtied since the last check
3799 * for a throttled target queue (which may have been skipped
3800 * if the page was clean then). With the dirty page
3801 * disconnected here, we can make one final check.
3802 */
3803 if (object->internal) {
3804 if (VM_PAGE_Q_THROTTLED(iq)) {
3805 inactive_throttled = TRUE;
3806 }
3807 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3808 inactive_throttled = TRUE;
3809 }
3810
3811 if (inactive_throttled == TRUE) {
3812 goto throttle_inactive;
3813 }
3814
3815 #if VM_PRESSURE_EVENTS
3816 #if CONFIG_JETSAM
3817
3818 /*
3819 * If Jetsam is enabled, then the sending
3820 * of memory pressure notifications is handled
3821 * from the same thread that takes care of high-water
3822 * and other jetsams i.e. the memorystatus_thread.
3823 */
3824
3825 #else /* CONFIG_JETSAM */
3826
3827 vm_pressure_response();
3828
3829 #endif /* CONFIG_JETSAM */
3830 #endif /* VM_PRESSURE_EVENTS */
3831
3832 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3833 VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3834 }
3835
3836 if (object->internal) {
3837 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3838 } else {
3839 vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3840 }
3841
3842 /*
3843 * internal pages will go to the compressor...
3844 * external pages will go to the appropriate pager to be cleaned
3845 * and upon completion will end up on 'vm_page_queue_cleaned' which
3846 * is a preferred queue to steal from
3847 */
3848 vm_pageout_cluster(m);
3849 inactive_burst_count = 0;
3850
3851 /*
3852 * back to top of pageout scan loop
3853 */
3854 }
3855 }
3856
3857
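/*
 * vm_page_free_reserve:
 *
 * Grow the number of free pages held in reserve (capped at
 * VM_PAGE_FREE_RESERVED_LIMIT, plus the compressor reserve when a
 * compressor is present) and recompute the derived free-page
 * thresholds: vm_page_free_min, vm_page_free_target and
 * vm_page_throttle_limit.
 */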
3858 void
3859 vm_page_free_reserve(
3860 int pages)
3861 {
3862 int free_after_reserve;
3863
3864 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3865 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3866 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3867 } else {
3868 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3869 }
3870 } else {
3871 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3872 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3873 } else {
3874 vm_page_free_reserved += pages;
3875 }
3876 }
3877 free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3878
3879 vm_page_free_min = vm_page_free_reserved +
3880 VM_PAGE_FREE_MIN(free_after_reserve);
3881
3882 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3883 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3884 }
3885
3886 vm_page_free_target = vm_page_free_reserved +
3887 VM_PAGE_FREE_TARGET(free_after_reserve);
3888
3889 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3890 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3891 }
3892
3893 if (vm_page_free_target < vm_page_free_min + 5) {
3894 vm_page_free_target = vm_page_free_min + 5;
3895 }
3896
3897 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3898 }
3899
3900 /*
3901 * vm_pageout is the high level pageout daemon.
3902 */
3903
3904 void
3905 vm_pageout_continue(void)
3906 {
3907 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3908 VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3909
3910 vm_free_page_lock();
3911 vm_pageout_running = TRUE;
3912 vm_free_page_unlock();
3913
3914 vm_pageout_scan();
3915 /*
3916 * we hold both the vm_page_queue_free_lock
3917 * and the vm_page_queues_lock at this point
3918 */
3919 assert(vm_page_free_wanted == 0);
3920 assert(vm_page_free_wanted_privileged == 0);
3921 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3922
3923 vm_pageout_running = FALSE;
3924 #if XNU_TARGET_OS_OSX
3925 if (vm_pageout_waiter) {
3926 vm_pageout_waiter = FALSE;
3927 thread_wakeup((event_t)&vm_pageout_waiter);
3928 }
3929 #endif /* XNU_TARGET_OS_OSX */
3930
3931 vm_free_page_unlock();
3932 vm_page_unlock_queues();
3933
3934 thread_block((thread_continue_t)vm_pageout_continue);
3935 /*NOTREACHED*/
3936 }
3937
3938 #if XNU_TARGET_OS_OSX
3939 kern_return_t
3940 vm_pageout_wait(uint64_t deadline)
3941 {
3942 kern_return_t kr;
3943
3944 vm_free_page_lock();
3945 for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
3946 vm_pageout_waiter = TRUE;
3947 if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3948 &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3949 (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3950 kr = KERN_OPERATION_TIMED_OUT;
3951 }
3952 }
3953 vm_free_page_unlock();
3954
3955 return kr;
3956 }
3957 #endif /* XNU_TARGET_OS_OSX */
3958
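/*
 * Continuation for the external pageout I/O thread: drain the external
 * pageout queue and hand each page to its pager via
 * memory_object_data_return(), then block until re-signalled.
 */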
3959 OS_NORETURN
3960 static void
3961 vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
3962 {
3963 vm_page_t m = NULL;
3964 vm_object_t object;
3965 vm_object_offset_t offset;
3966 memory_object_t pager;
3967 struct vm_pageout_queue *q = ethr->q;
3968
3969 /* On systems with a compressor, the external IO thread clears its
3970 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3971 * creation)
3972 */
3973 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3974 current_thread()->options &= ~TH_OPT_VMPRIV;
3975 }
3976
3977 sched_cond_ack(&(ethr->pgo_wakeup));
3978
3979 while (true) {
3980 vm_page_lockspin_queues();
3981
3982 while (!vm_page_queue_empty(&q->pgo_pending)) {
3983 q->pgo_busy = TRUE;
3984 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3985
3986 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3987 VM_PAGE_CHECK(m);
3988 /*
3989 * grab a snapshot of the object and offset this
3990 * page is tabled in so that we can relookup this
3991 * page after we've taken the object lock - these
3992 * fields are stable while we hold the page queues lock
3993 * but as soon as we drop it, there is nothing to keep
3994 * this page in this object... we hold an activity_in_progress
3995 * on this object which will keep it from terminating
3996 */
3997 object = VM_PAGE_OBJECT(m);
3998 offset = m->vmp_offset;
3999
4000 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4001 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4002
4003 vm_page_unlock_queues();
4004
4005 vm_object_lock(object);
4006
4007 m = vm_page_lookup(object, offset);
4008
4009 if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
4010 !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
4011 /*
4012 * it's either the same page that someone else has
4013 * started cleaning (or it's finished cleaning or
4014 * been put back on the pageout queue), or
4015 * the page has been freed or we have found a
4016 * new page at this offset... in all of these cases
4017 * we merely need to release the activity_in_progress
4018 * we took when we put the page on the pageout queue
4019 */
4020 vm_object_activity_end(object);
4021 vm_object_unlock(object);
4022
4023 vm_page_lockspin_queues();
4024 continue;
4025 }
4026 pager = object->pager;
4027
4028 if (pager == MEMORY_OBJECT_NULL) {
4029 /*
4030 * This pager has been destroyed by either
4031 * memory_object_destroy or vm_object_destroy, and
4032 * so there is nowhere for the page to go.
4033 */
4034 if (m->vmp_free_when_done) {
4035 /*
4036 * Just free the page... VM_PAGE_FREE takes
4037 * care of cleaning up all the state...
4038 * including doing the vm_pageout_throttle_up
4039 */
4040 VM_PAGE_FREE(m);
4041 } else {
4042 vm_page_lockspin_queues();
4043
4044 vm_pageout_throttle_up(m);
4045 vm_page_activate(m);
4046
4047 vm_page_unlock_queues();
4048
4049 /*
4050 * And we are done with it.
4051 */
4052 }
4053 vm_object_activity_end(object);
4054 vm_object_unlock(object);
4055
4056 vm_page_lockspin_queues();
4057 continue;
4058 }
4059 #if 0
4060 /*
4061 * we don't hold the page queue lock
4062 * so this check isn't safe to make
4063 */
4064 VM_PAGE_CHECK(m);
4065 #endif
4066 /*
4067 * give back the activity_in_progress reference we
4068 * took when we queued up this page and replace it
4069 * with a paging_in_progress reference that will
4070 * also keep the paging offset from changing and
4071 * prevent the object from terminating
4072 */
4073 vm_object_activity_end(object);
4074 vm_object_paging_begin(object);
4075 vm_object_unlock(object);
4076
4077 /*
4078 * Send the data to the pager.
4079 * any pageout clustering happens there
4080 */
4081 memory_object_data_return(pager,
4082 m->vmp_offset + object->paging_offset,
4083 PAGE_SIZE,
4084 NULL,
4085 NULL,
4086 FALSE,
4087 FALSE,
4088 0);
4089
4090 vm_object_lock(object);
4091 vm_object_paging_end(object);
4092 vm_object_unlock(object);
4093
4094 vm_pageout_io_throttle();
4095
4096 vm_page_lockspin_queues();
4097 }
4098 q->pgo_busy = FALSE;
4099
4100 vm_page_unlock_queues();
4101 sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
4102 }
4103 /*NOTREACHED*/
4104 }
4105
4106 uint32_t vm_compressor_time_thread; /* Set via sysctl 'vm.compressor_timing_enabled' to record time accrued by this thread. */
4107
4108 #if DEVELOPMENT || DEBUG
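/*
 * Update per-compressor-thread runtime and page statistics when
 * 'vm.compressor_timing_enabled' has been set via sysctl.
 */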
4109 static void
4110 vm_pageout_record_thread_time(int cqid, int ncomps)
4111 {
4112 if (__improbable(vm_compressor_time_thread)) {
4113 vmct_stats.vmct_runtimes[cqid] = thread_get_runtime_self();
4114 vmct_stats.vmct_pages[cqid] += ncomps;
4115 vmct_stats.vmct_iterations[cqid]++;
4116 if (ncomps > vmct_stats.vmct_maxpages[cqid]) {
4117 vmct_stats.vmct_maxpages[cqid] = ncomps;
4118 }
4119 if (ncomps < vmct_stats.vmct_minpages[cqid]) {
4120 vmct_stats.vmct_minpages[cqid] = ncomps;
4121 }
4122 }
4123 }
4124 #endif
4125
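/*
 * Select which swapout c_seg head this compressor thread should fill:
 * pages marked VM_PAGE_SPECIAL_Q_DONATE are directed to the early (macOS)
 * or late (other targets) swapout head, everything else goes to the
 * regular swapout head.
 */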
4126 static void *
4127 vm_pageout_select_filling_chead(struct pgo_iothread_state *cq, vm_page_t m)
4128 {
4129 /*
4130 * Technically we need the pageq locks to manipulate the vmp_on_specialq field.
4131 * However, this page has been removed from all queues and is only
4132 * known to this compressor thread dealing with this local queue.
4133 *
4134 * TODO: Add a second localq that is the early localq and
4135 * put special pages like this one on that queue in the block above
4136 * under the pageq lock to avoid this 'works but not clean' logic.
4137 */
4138 void *donate_queue_head;
4139 #if XNU_TARGET_OS_OSX /* tag:DONATE */
4140 donate_queue_head = &cq->current_early_swapout_chead;
4141 #else /* XNU_TARGET_OS_OSX */
4142 donate_queue_head = &cq->current_late_swapout_chead;
4143 #endif /* XNU_TARGET_OS_OSX */
4144 if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4145 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4146 return donate_queue_head;
4147 } else {
4148 return &cq->current_regular_swapout_chead;
4149 }
4150 }
4151
4152 #define MAX_FREE_BATCH 32
4153
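/*
 * Continuation for the internal (compressor) pageout threads: pull batches
 * of pages off the pending queue, compress each one and free the originals,
 * waking additional compressor threads when the backlog is large.
 */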
4154 OS_NORETURN
4155 static void
4156 vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
4157 {
4158 struct vm_pageout_queue *q;
4159 vm_page_t m = NULL;
4160 boolean_t pgo_draining;
4161 vm_page_t local_q;
4162 int local_cnt;
4163 vm_page_t local_freeq = NULL;
4164 int local_freed = 0;
4165 int local_batch_size;
4166 #if DEVELOPMENT || DEBUG
4167 int ncomps = 0;
4168 boolean_t marked_active = FALSE;
4169 int num_pages_processed = 0;
4170 #endif
4171 void *chead = NULL;
4172
4173 KDBG_FILTERED(0xe040000c | DBG_FUNC_END);
4174
4175 sched_cond_ack(&(cq->pgo_wakeup));
4176
4177 q = cq->q;
4178
4179 while (true) { /* this top loop is for the compressor_running_perf_test running at full speed without blocking */
4180 #if DEVELOPMENT || DEBUG
4181 bool benchmark_accounting = false;
4182 /* If we're running the compressor perf test, only process the benchmark pages.
4183 * We'll get back to our regular queue once the benchmark is done */
4184 if (compressor_running_perf_test) {
4185 q = cq->benchmark_q;
4186 if (!vm_page_queue_empty(&q->pgo_pending)) {
4187 benchmark_accounting = true;
4188 } else {
4189 q = cq->q;
4190 benchmark_accounting = false;
4191 }
4192 }
4193 #endif /* DEVELOPMENT || DEBUG */
4194
4195 #if __AMP__
4196 if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
4197 local_batch_size = (q->pgo_maxlaundry >> 3);
4198 local_batch_size = MAX(local_batch_size, 16);
4199 } else {
4200 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4201 }
4202 #else
4203 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4204 #endif
4205
4206 #if RECORD_THE_COMPRESSED_DATA
4207 if (q->pgo_laundry) {
4208 c_compressed_record_init();
4209 }
4210 #endif
4211 while (true) { /* this loop is for working through all the pages in the pending queue */
4212 int pages_left_on_q = 0;
4213
4214 local_cnt = 0;
4215 local_q = NULL;
4216
4217 KDBG_FILTERED(0xe0400014 | DBG_FUNC_START);
4218
4219 vm_page_lock_queues();
4220 #if DEVELOPMENT || DEBUG
4221 if (marked_active == FALSE) {
4222 vmct_active++;
4223 vmct_state[cq->id] = VMCT_ACTIVE;
4224 marked_active = TRUE;
4225 if (vmct_active == 1) {
4226 vm_compressor_epoch_start = mach_absolute_time();
4227 }
4228 }
4229 #endif
4230 KDBG_FILTERED(0xe0400014 | DBG_FUNC_END);
4231
4232 KDBG_FILTERED(0xe0400018 | DBG_FUNC_START, q->pgo_laundry);
4233
4234 /* empty the entire content of the thread input q to local_q, but not more than local_batch_size pages */
4235 while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4236 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4237 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4238 VM_PAGE_CHECK(m);
4239
4240 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4241 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4242 m->vmp_laundry = FALSE;
4243
4244 m->vmp_snext = local_q;
4245 local_q = m;
4246 local_cnt++;
4247 }
4248 if (local_q == NULL) {
4249 break;
4250 }
4251
4252 q->pgo_busy = TRUE;
4253
4254 if ((pgo_draining = q->pgo_draining) == FALSE) {
4255 vm_pageout_throttle_up_batch(q, local_cnt);
4256 pages_left_on_q = q->pgo_laundry;
4257 } else {
4258 pages_left_on_q = q->pgo_laundry - local_cnt;
4259 }
4260
4261 vm_page_unlock_queues();
4262
4263 #if !RECORD_THE_COMPRESSED_DATA
4264 /* if we have lots to compress, wake up the next compressor thread to help.
4265 * disabled when recording data since the recorded data is not protected by a mutex, so this could cause races */
4266 if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4267 // wake up the next compressor thread
4268 sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
4269 pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4270 }
4271 #endif
4272 KDBG_FILTERED(0xe0400018 | DBG_FUNC_END, q->pgo_laundry);
4273
4274 while (local_q) {
4275 KDBG_FILTERED(0xe0400024 | DBG_FUNC_START, local_cnt);
4276
4277 m = local_q;
4278 local_q = m->vmp_snext;
4279 m->vmp_snext = NULL;
4280
4281
4282 chead = vm_pageout_select_filling_chead(cq, m);
4283
4284 if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4285 #if DEVELOPMENT || DEBUG
4286 ncomps++;
4287 #endif
4288 KDBG_FILTERED(0xe0400024 | DBG_FUNC_END, local_cnt);
4289
4290 m->vmp_snext = local_freeq;
4291 local_freeq = m;
4292 local_freed++;
4293
4294 /* if we gathered enough free pages, free them now */
4295 if (local_freed >= MAX_FREE_BATCH) {
4296 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4297
4298 vm_page_free_list(local_freeq, TRUE);
4299
4300 local_freeq = NULL;
4301 local_freed = 0;
4302 }
4303 }
4304 #if DEVELOPMENT || DEBUG
4305 num_pages_processed++;
4306 #endif /* DEVELOPMENT || DEBUG */
4307 #if !CONFIG_JETSAM /* Maybe: if there's no JETSAM, be more proactive in waking up anybody that needs free pages */
4308 while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4309 kern_return_t wait_result;
4310 int need_wakeup = 0;
4311
4312 if (local_freeq) {
4313 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4314
4315 vm_page_free_list(local_freeq, TRUE);
4316 local_freeq = NULL;
4317 local_freed = 0;
4318
4319 continue;
4320 }
4321 vm_free_page_lock_spin();
4322
4323 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4324 if (vm_page_free_wanted_privileged++ == 0) {
4325 need_wakeup = 1;
4326 }
4327 wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4328
4329 vm_free_page_unlock();
4330
4331 if (need_wakeup) {
4332 thread_wakeup((event_t)&vm_page_free_wanted);
4333 }
4334
4335 if (wait_result == THREAD_WAITING) {
4336 thread_block(THREAD_CONTINUE_NULL);
4337 }
4338 } else {
4339 vm_free_page_unlock();
4340 }
4341 }
4342 #endif
4343 } /* while (local_q) */
4344 /* free any leftovers in the freeq */
4345 if (local_freeq) {
4346 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4347
4348 vm_page_free_list(local_freeq, TRUE);
4349 local_freeq = NULL;
4350 local_freed = 0;
4351 }
4352 if (pgo_draining == TRUE) {
4353 vm_page_lockspin_queues();
4354 vm_pageout_throttle_up_batch(q, local_cnt);
4355 vm_page_unlock_queues();
4356 }
4357 }
4358 KDBG_FILTERED(0xe040000c | DBG_FUNC_START);
4359
4360 /*
4361 * queue lock is held and our q is empty
4362 */
4363 q->pgo_busy = FALSE;
4364 #if DEVELOPMENT || DEBUG
4365 if (marked_active == TRUE) {
4366 vmct_active--;
4367 vmct_state[cq->id] = VMCT_IDLE;
4368
4369 if (vmct_active == 0) {
4370 vm_compressor_epoch_stop = mach_absolute_time();
4371 assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4372 "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4373 vm_compressor_epoch_start, vm_compressor_epoch_stop);
4374 /* This interval includes intervals where one or more
4375 * compressor threads were pre-empted
4376 */
4377 vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4378 }
4379 }
4380 if (compressor_running_perf_test && benchmark_accounting) {
4381 /*
4382 * We could turn ON compressor_running_perf_test while still processing
4383 * regular non-benchmark pages. We shouldn't count them here else we
4384 * could overshoot. We might also still be populating that benchmark Q
4385 * and be under pressure. So we will go back to the regular queues. And
4386 * benchmark accounting will be off for that case too.
4387 */
4388 compressor_perf_test_pages_processed += num_pages_processed;
4389 thread_wakeup(&compressor_perf_test_pages_processed);
4390 }
4391 #endif
4392 vm_page_unlock_queues();
4393 #if DEVELOPMENT || DEBUG
4394 vm_pageout_record_thread_time(cq->id, ncomps);
4395 #endif
4396
4397 KDBG_FILTERED(0xe0400018 | DBG_FUNC_END);
4398 #if DEVELOPMENT || DEBUG
4399 if (compressor_running_perf_test && benchmark_accounting) {
4400 /*
4401 * We've been exclusively compressing pages from the benchmark queue,
4402 * do 1 pass over the internal queue before blocking.
4403 */
4404 continue;
4405 }
4406 #endif
4407
4408 sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4409 }
4410 /*NOTREACHED*/
4411 }
4412
4413 /* resolves the pager and maintains stats in the pager and in the vm_object */
4414 kern_return_t
4415 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4416 {
4417 vm_object_t object;
4418 memory_object_t pager;
4419 int compressed_count_delta;
4420 kern_return_t retval;
4421
4422 object = VM_PAGE_OBJECT(m);
4423
4424 assert(!m->vmp_free_when_done);
4425 assert(!m->vmp_laundry);
4426
4427 pager = object->pager;
4428
4429 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4430 KDBG_FILTERED(0xe0400010 | DBG_FUNC_START, object, pager);
4431
4432 vm_object_lock(object);
4433
4434 /*
4435 * If there is no memory object for the page, create
4436 * one and hand it to the compression pager.
4437 */
4438
4439 if (!object->pager_initialized) {
4440 vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4441 }
4442 if (!object->pager_initialized) {
4443 vm_object_compressor_pager_create(object);
4444 }
4445
4446 pager = object->pager;
4447
4448 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4449 /*
4450 * Still no pager for the object,
4451 * or the pager has been destroyed.
4452 * Reactivate the page.
4453 *
4454 * Should only happen if there is no
4455 * compression pager
4456 */
4457 vm_page_wakeup_done(object, m);
4458
4459 vm_page_lockspin_queues();
4460 vm_page_activate(m);
4461 VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4462 vm_page_unlock_queues();
4463
4464 /*
4465 * And we are done with it.
4466 */
4467 vm_object_activity_end(object);
4468 vm_object_unlock(object);
4469
4470 return KERN_FAILURE;
4471 }
4472 vm_object_unlock(object);
4473
4474 KDBG_FILTERED(0xe0400010 | DBG_FUNC_END, object, pager);
4475 }
4476 assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4477 assert(object->activity_in_progress > 0);
4478
4479 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4480 if (m->vmp_unmodified_ro == true) {
4481 os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed);
4482 }
4483 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4484
4485 vm_compressor_options_t flags = 0;
4486
4487 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4488 if (m->vmp_unmodified_ro) {
4489 flags |= C_PAGE_UNMODIFIED;
4490 }
4491 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4492
4493
4494 retval = vm_compressor_pager_put(
4495 pager,
4496 m->vmp_offset + object->paging_offset,
4497 VM_PAGE_GET_PHYS_PAGE(m),
4498 current_chead,
4499 scratch_buf,
4500 &compressed_count_delta,
4501 flags);
4502
4503 vm_object_lock(object);
4504
4505 assert(object->activity_in_progress > 0);
4506 assert(VM_PAGE_OBJECT(m) == object);
4507 assert( !VM_PAGE_WIRED(m));
4508
4509 vm_compressor_pager_count(pager,
4510 compressed_count_delta,
4511 FALSE, /* shared_lock */
4512 object);
4513
4514 if (retval == KERN_SUCCESS) {
4515 /*
4516 * If the object is purgeable, its owner's
4517 * purgeable ledgers will be updated in
4518 * vm_page_remove() but the page still
4519 * contributes to the owner's memory footprint,
4520 * so account for it as such.
4521 */
4522 if (m->vmp_tabled) {
4523 vm_page_remove(m, TRUE);
4524 }
4525 if ((object->purgable != VM_PURGABLE_DENY ||
4526 object->vo_ledger_tag) &&
4527 object->vo_owner != NULL) {
4528 /* one more compressed purgeable/tagged page */
4529 vm_object_owner_compressed_update(object,
4530 compressed_count_delta);
4531 }
4532 counter_inc(&vm_statistics_compressions);
4533 } else {
4534 vm_page_wakeup_done(object, m);
4535
4536 vm_page_lockspin_queues();
4537
4538 vm_page_activate(m);
4539 vm_pageout_vminfo.vm_compressor_failed++;
4540
4541 vm_page_unlock_queues();
4542 }
4543 vm_object_activity_end(object);
4544 vm_object_unlock(object);
4545
4546 return retval;
4547 }
4548
4549
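/*
 * Adjust the I/O throttling policy of the external pageout thread.
 * Called with the page queues lock held; the lock is dropped and
 * retaken around the policy change.
 */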
4550 static void
4551 vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4552 {
4553 uint32_t policy;
4554
4555 if (hibernate_cleaning_in_progress == TRUE) {
4556 req_lowpriority = FALSE;
4557 }
4558
4559 if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4560 vm_page_unlock_queues();
4561
4562 if (req_lowpriority == TRUE) {
4563 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4564 DTRACE_VM(laundrythrottle);
4565 } else {
4566 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4567 DTRACE_VM(laundryunthrottle);
4568 }
4569 proc_set_thread_policy(ethr->pgo_iothread,
4570 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4571
4572 vm_page_lock_queues();
4573 ethr->q->pgo_lowpriority = req_lowpriority;
4574 }
4575 }
4576
4577 OS_NORETURN
4578 static void
4579 vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
4580 {
4581 thread_t self = current_thread();
4582
4583 self->options |= TH_OPT_VMPRIV;
4584
4585 DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4586
4587 proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4588 TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4589
4590 vm_page_lock_queues();
4591
4592 vm_pageout_queue_external.pgo_lowpriority = TRUE;
4593 vm_pageout_queue_external.pgo_inited = TRUE;
4594
4595 vm_page_unlock_queues();
4596
4597 #if CONFIG_THREAD_GROUPS
4598 thread_group_vm_add();
4599 #endif /* CONFIG_THREAD_GROUPS */
4600
4601 vm_pageout_iothread_external_continue(ethr, 0);
4602 /*NOTREACHED*/
4603 }
4604
4605
4606 OS_NORETURN
4607 static void
4608 vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
4609 {
4610 thread_t self = current_thread();
4611
4612 self->options |= TH_OPT_VMPRIV;
4613
4614 vm_page_lock_queues();
4615
4616 vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4617 vm_pageout_queue_internal.pgo_inited = TRUE;
4618
4619 #if DEVELOPMENT || DEBUG
4620 vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
4621 vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
4622 vm_pageout_queue_benchmark.pgo_busy = FALSE;
4623 #endif /* DEVELOPMENT || DEBUG */
4624
4625 vm_page_unlock_queues();
4626
4627 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4628 thread_vm_bind_group_add();
4629 }
4630
4631 #if CONFIG_THREAD_GROUPS
4632 thread_group_vm_add();
4633 #endif /* CONFIG_THREAD_GROUPS */
4634
4635 #if __AMP__
4636 if (vm_compressor_ebound) {
4637 /*
4638 * Use the soft bound option for vm_compressor to allow it to run on
4639 * P-cores if E-cluster is unavailable.
4640 */
4641 thread_bind_cluster_type(self, 'E', true);
4642 }
4643 #endif /* __AMP__ */
4644
4645 thread_set_thread_name(current_thread(), "VM_compressor");
4646 #if DEVELOPMENT || DEBUG
4647 vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
4648 #endif
4649 vm_pageout_iothread_internal_continue(cthr, 0);
4650
4651 /*NOTREACHED*/
4652 }
4653
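/*
 * Register the buffer cache cleanup callout used by the garbage
 * collection thread. Only the first registration succeeds.
 */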
4654 kern_return_t
4655 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4656 {
4657 if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4658 return KERN_SUCCESS;
4659 } else {
4660 return KERN_FAILURE; /* Already set */
4661 }
4662 }
4663
4664 extern boolean_t memorystatus_manual_testing_on;
4665 extern unsigned int memorystatus_level;
4666
4667
4668 #if VM_PRESSURE_EVENTS
4669
4670 boolean_t vm_pressure_events_enabled = FALSE;
4671
4672 extern uint64_t next_warning_notification_sent_at_ts;
4673 extern uint64_t next_critical_notification_sent_at_ts;
4674
4675 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */
4676
4677 /*
4678 * The last time there was a change in pressure level OR we forced a check
4679 * because the system is stuck in a non-normal pressure level.
4680 */
4681 uint64_t vm_pressure_last_level_transition_abs = 0;
4682
4683 /*
4684 * This is how long the system waits 'stuck' in an unchanged non-normal pressure
4685 * level before resending notifications for that level again.
4686 */
4687 int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4688
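/*
 * Recompute memorystatus_level from the available memory and, when the
 * pressure level changes (or the system has been stuck at a non-normal
 * level past the transition threshold), wake the pressure thread so
 * notifications get sent.
 */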
4689 void
4690 vm_pressure_response(void)
4691 {
4692 vm_pressure_level_t old_level = kVMPressureNormal;
4693 int new_level = -1;
4694 unsigned int total_pages;
4695 uint64_t available_memory = 0;
4696 uint64_t curr_ts, abs_time_since_level_transition, time_in_ns;
4697 bool force_check = false;
4698 int time_in_mins;
4699
4700
4701 if (vm_pressure_events_enabled == FALSE) {
4702 return;
4703 }
4704
4705 #if !XNU_TARGET_OS_OSX
4706
4707 available_memory = (uint64_t) memorystatus_available_pages;
4708
4709 #else /* !XNU_TARGET_OS_OSX */
4710
4711 available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4712 memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4713
4714 #endif /* !XNU_TARGET_OS_OSX */
4715
4716 total_pages = (unsigned int) atop_64(max_mem);
4717 #if CONFIG_SECLUDED_MEMORY
4718 total_pages -= vm_page_secluded_count;
4719 #endif /* CONFIG_SECLUDED_MEMORY */
4720 memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4721
4722 if (memorystatus_manual_testing_on) {
4723 return;
4724 }
4725
4726 curr_ts = mach_absolute_time();
4727 abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4728
4729 absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4730 time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4731 force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
4732
4733 old_level = memorystatus_vm_pressure_level;
4734
4735 switch (memorystatus_vm_pressure_level) {
4736 case kVMPressureNormal:
4737 {
4738 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4739 new_level = kVMPressureCritical;
4740 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4741 new_level = kVMPressureWarning;
4742 }
4743 break;
4744 }
4745
4746 case kVMPressureWarning:
4747 case kVMPressureUrgent:
4748 {
4749 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4750 new_level = kVMPressureNormal;
4751 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4752 new_level = kVMPressureCritical;
4753 } else if (force_check) {
4754 new_level = kVMPressureWarning;
4755 next_warning_notification_sent_at_ts = curr_ts;
4756 }
4757 break;
4758 }
4759
4760 case kVMPressureCritical:
4761 {
4762 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4763 new_level = kVMPressureNormal;
4764 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4765 new_level = kVMPressureWarning;
4766 } else if (force_check) {
4767 new_level = kVMPressureCritical;
4768 next_critical_notification_sent_at_ts = curr_ts;
4769 }
4770 break;
4771 }
4772
4773 default:
4774 return;
4775 }
4776
4777 if (new_level != -1 || force_check) {
4778 if (new_level != -1) {
4779 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4780
4781 if (new_level != (int) old_level) {
4782 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4783 new_level, old_level, 0, 0);
4784 }
4785 } else {
4786 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4787 new_level, old_level, force_check, 0);
4788 }
4789
4790 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
4791 /*
4792 * We don't want to schedule a wakeup while hibernation is in progress
4793 * because that could collide with checks for non-monotonicity in the scheduler.
4794 * We do however do all the updates to memorystatus_vm_pressure_level because
4795 * we _might_ want to use that for decisions regarding which pages or how
4796 * many pages we want to dump in hibernation.
4797 */
4798 return;
4799 }
4800
4801 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4802 if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4803 thread_wakeup(&vm_pressure_thread);
4804 }
4805
4806 if (old_level != memorystatus_vm_pressure_level) {
4807 thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4808 }
4809 vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4810 }
4811 }
4812 }
4813 #endif /* VM_PRESSURE_EVENTS */
4814
4815
4816 /**
4817 * Called by a kernel thread to ask if a number of pages may be wired.
4818 */
4819 kern_return_t
4820 mach_vm_wire_level_monitor(int64_t requested_pages)
4821 {
4822 if (requested_pages <= 0) {
4823 return KERN_INVALID_ARGUMENT;
4824 }
4825
4826 const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4827 /**
4828 * Available pages can be negative in the case where more system memory is
4829 * wired than the threshold, so we must use a signed integer.
4830 */
4831 const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4832
4833 if (requested_pages > available_pages) {
4834 return KERN_RESOURCE_SHORTAGE;
4835 }
4836 return KERN_SUCCESS;
4837 }
4838
4839 /*
4840 * Function called by a kernel thread to either get the current pressure level or
4841 * wait until memory pressure changes from a given level.
4842 */
4843 kern_return_t
4844 mach_vm_pressure_level_monitor(boolean_t wait_for_pressure, unsigned int *pressure_level)
4845 {
4846 #if !VM_PRESSURE_EVENTS
4847 (void)wait_for_pressure;
4848 (void)pressure_level;
4849 return KERN_NOT_SUPPORTED;
4850 #else /* VM_PRESSURE_EVENTS */
4851
4852 uint32_t *waiters = NULL;
4853 wait_result_t wr = 0;
4854 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4855
4856 if (pressure_level == NULL) {
4857 return KERN_INVALID_ARGUMENT;
4858 }
4859 if (!wait_for_pressure && (*pressure_level == kVMPressureBackgroundJetsam ||
4860 *pressure_level == kVMPressureForegroundJetsam)) {
4861 return KERN_INVALID_ARGUMENT;
4862 }
4863
4864 if (wait_for_pressure) {
4865 switch (*pressure_level) {
4866 case kVMPressureForegroundJetsam:
4867 case kVMPressureBackgroundJetsam:
4868
4869 if (*pressure_level == kVMPressureForegroundJetsam) {
4870 waiters = &memorystatus_jetsam_fg_band_waiters;
4871 } else {
4872 /* kVMPressureBackgroundJetsam */
4873 waiters = &memorystatus_jetsam_bg_band_waiters;
4874 }
4875
4876 lck_mtx_lock(&memorystatus_jetsam_broadcast_lock);
4877 wr = assert_wait((event_t)waiters, THREAD_INTERRUPTIBLE);
4878 if (wr == THREAD_WAITING) {
4879 *waiters += 1;
4880 lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4881 wr = thread_block(THREAD_CONTINUE_NULL);
4882 } else {
4883 lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4884 }
4885
4886 if (wr != THREAD_AWAKENED) {
4887 return KERN_ABORTED;
4888 }
4889
4890 return KERN_SUCCESS;
4891 case kVMPressureNormal:
4892 case kVMPressureWarning:
4893 case kVMPressureUrgent:
4894 case kVMPressureCritical:
4895 while (old_level == *pressure_level) {
4896 wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4897 THREAD_INTERRUPTIBLE);
4898 if (wr == THREAD_WAITING) {
4899 wr = thread_block(THREAD_CONTINUE_NULL);
4900 }
4901 if (wr == THREAD_INTERRUPTED) {
4902 return KERN_ABORTED;
4903 }
4904
4905 if (wr == THREAD_AWAKENED) {
4906 old_level = memorystatus_vm_pressure_level;
4907 }
4908 }
4909 break;
4910 default:
4911 return KERN_INVALID_ARGUMENT;
4912 }
4913 }
4914
4915 *pressure_level = old_level;
4916 return KERN_SUCCESS;
4917 #endif /* VM_PRESSURE_EVENTS */
4918 }
4919
4920 #if VM_PRESSURE_EVENTS
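/*
 * Dedicated thread that dispatches memory pressure notifications.
 * The first invocation only performs setup before blocking; subsequent
 * wakeups call consider_vm_pressure_events().
 */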
4921 void
4922 vm_pressure_thread(void)
4923 {
4924 static boolean_t thread_initialized = FALSE;
4925
4926 if (thread_initialized == TRUE) {
4927 vm_pageout_state.vm_pressure_thread_running = TRUE;
4928 consider_vm_pressure_events();
4929 vm_pageout_state.vm_pressure_thread_running = FALSE;
4930 }
4931
4932 #if CONFIG_THREAD_GROUPS
4933 thread_group_vm_add();
4934 #endif /* CONFIG_THREAD_GROUPS */
4935
4936 thread_set_thread_name(current_thread(), "VM_pressure");
4937 thread_initialized = TRUE;
4938 assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4939 thread_block((thread_continue_t)vm_pressure_thread);
4940 }
4941 #endif /* VM_PRESSURE_EVENTS */
4942
4943
4944 /*
4945 * called once per-second via "compute_averages"
4946 */
4947 void
4948 compute_pageout_gc_throttle(__unused void *arg)
4949 {
4950 if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4951 vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4952
4953 thread_wakeup(VM_PAGEOUT_GC_EVENT);
4954 }
4955 }
4956
4957 /*
4958 * vm_pageout_garbage_collect can also be called when the zone allocator needs
4959 * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4960 * jetsams. We need to check if the zone map size is above its jetsam limit to
4961 * decide if this was indeed the case.
4962 *
4963 * We need to do this on a different thread because of the following reasons:
4964 *
4965 * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4966 * itself causing the system to hang. We perform synchronous jetsams if we're
4967 * leaking in the VM map entries zone, so the leaking process could be doing a
4968 * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4969 * jetsam itself. We also need the vm_map lock on the process termination path,
4970 * which would now lead the dying process to deadlock against itself.
4971 *
4972 * 2. The jetsam path might need to allocate zone memory itself. We could try
4973 * using the non-blocking variant of zalloc for this path, but we can still
4974 * end up trying to do a kmem_alloc when the zone maps are almost full.
4975 */
4976 __dead2
4977 void
4978 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4979 {
4980 assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4981
4982 if (step == VM_PAGEOUT_GC_INIT) {
4983 /* first time being called is not about GC */
4984 #if CONFIG_THREAD_GROUPS
4985 thread_group_vm_add();
4986 #endif /* CONFIG_THREAD_GROUPS */
4987 } else if (zone_map_nearing_exhaustion()) {
4988 /*
4989 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4990 *
4991 * Bail out after calling zone_gc (which triggers the
4992 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4993 * operations that clear out a bunch of caches might allocate zone
4994 * memory themselves (for eg. vm_map operations would need VM map
4995 * entries). Since the zone map is almost full at this point, we
4996 * could end up with a panic. We just need to quickly jetsam a
4997 * process and exit here.
4998 *
4999 * It could so happen that we were woken up to relieve memory
5000 * pressure and the zone map also happened to be near its limit at
5001 * the time, in which case we'll skip out early. But that should be
5002 * ok; if memory pressure persists, the thread will simply be woken
5003 * up again.
5004 */
5005 zone_gc(ZONE_GC_JETSAM);
5006 } else {
5007 /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
5008 boolean_t buf_large_zfree = FALSE;
5009 boolean_t first_try = TRUE;
5010
5011 stack_collect();
5012
5013 consider_machine_collect();
5014 #if CONFIG_MBUF_MCACHE
5015 mbuf_drain(FALSE);
5016 #endif /* CONFIG_MBUF_MCACHE */
5017
5018 do {
5019 if (consider_buffer_cache_collect != NULL) {
5020 buf_large_zfree = (*consider_buffer_cache_collect)(0);
5021 }
5022 if (first_try == TRUE || buf_large_zfree == TRUE) {
5023 /*
5024 * zone_gc should be last, because the other operations
5025 * might return memory to zones.
5026 */
5027 zone_gc(ZONE_GC_TRIM);
5028 }
5029 first_try = FALSE;
5030 } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
5031
5032 consider_machine_adjust();
5033 }
5034
5035 assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);
5036
5037 thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
5038 __builtin_unreachable();
5039 }
5040
5041
5042 #if VM_PAGE_BUCKETS_CHECK
5043 #if VM_PAGE_FAKE_BUCKETS
5044 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
5045 #endif /* VM_PAGE_FAKE_BUCKETS */
5046 #endif /* VM_PAGE_BUCKETS_CHECK */
5047
5048
5049
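/*
 * Decide, based on the CPU count or the 'vm_restricted_to_single_processor'
 * boot-arg, whether the major VM threads should be bound to a single
 * processor.
 */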
5050 void
5051 vm_set_restrictions(unsigned int num_cpus)
5052 {
5053 int vm_restricted_to_single_processor = 0;
5054
5055 if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
5056 kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
5057 vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
5058 } else {
5059 assert(num_cpus > 0);
5060
5061 if (num_cpus <= 3) {
5062 /*
5063 * on systems with a limited number of CPUs, bind the
5064 * 4 major threads that can free memory and that tend to use
5065 * a fair bit of CPU under pressured conditions to a single processor.
5066 * This ensures that these threads don't hog all of the available CPUs
5067 * (important for camera launch), while allowing them to run independently
5068 * w/r to locks... the 4 threads are
5069 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
5070 * vm_compressor_swap_trigger_thread (minor and major compactions),
5071 * memorystatus_thread (jetsams).
5072 *
5073 * the first time the thread is run, it is responsible for checking the
5074 * state of vm_restricted_to_single_processor, and if TRUE it calls
5075 * thread_bind_master... someday this should be replaced with a group
5076 * scheduling mechanism and KPI.
5077 */
5078 vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5079 } else {
5080 vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5081 }
5082 }
5083 }
5084
5085 /*
5086 * Set up vm_config based on the vm_compressor_mode.
5087 * Must run BEFORE the pageout thread starts up.
5088 */
5089 __startup_func
5090 void
5091 vm_config_init(void)
5092 {
5093 bzero(&vm_config, sizeof(vm_config));
5094
5095 switch (vm_compressor_mode) {
5096 case VM_PAGER_DEFAULT:
5097 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
5098 OS_FALLTHROUGH;
5099
5100 case VM_PAGER_COMPRESSOR_WITH_SWAP:
5101 vm_config.compressor_is_present = TRUE;
5102 vm_config.swap_is_present = TRUE;
5103 vm_config.compressor_is_active = TRUE;
5104 vm_config.swap_is_active = TRUE;
5105 break;
5106
5107 case VM_PAGER_COMPRESSOR_NO_SWAP:
5108 vm_config.compressor_is_present = TRUE;
5109 vm_config.swap_is_present = TRUE;
5110 vm_config.compressor_is_active = TRUE;
5111 break;
5112
5113 case VM_PAGER_FREEZER_DEFAULT:
5114 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5115 OS_FALLTHROUGH;
5116
5117 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5118 vm_config.compressor_is_present = TRUE;
5119 vm_config.swap_is_present = TRUE;
5120 break;
5121
5122 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5123 vm_config.compressor_is_present = TRUE;
5124 vm_config.swap_is_present = TRUE;
5125 vm_config.compressor_is_active = TRUE;
5126 vm_config.freezer_swap_is_active = TRUE;
5127 break;
5128
5129 case VM_PAGER_NOT_CONFIGURED:
5130 break;
5131
5132 default:
5133 printf("unknown compressor mode - %x\n", vm_compressor_mode);
5134 break;
5135 }
5136 }
5137
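/*
 * Create the garbage collection thread at early boot; it is actually
 * started later from vm_pageout().
 */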
5138 __startup_func
5139 static void
5140 vm_pageout_create_gc_thread(void)
5141 {
5142 thread_t thread;
5143
5144 if (kernel_thread_create(vm_pageout_garbage_collect,
5145 VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
5146 panic("vm_pageout_garbage_collect: create failed");
5147 }
5148 thread_set_thread_name(thread, "VM_pageout_garbage_collect");
5149 if (thread->reserved_stack == 0) {
5150 assert(thread->kernel_stack);
5151 thread->reserved_stack = thread->kernel_stack;
5152 }
5153
5154 /* thread is started in vm_pageout() */
5155 vm_pageout_gc_thread = thread;
5156 }
5157 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5158
5159 void
5160 vm_pageout(void)
5161 {
5162 thread_t self = current_thread();
5163 thread_t thread;
5164 kern_return_t result;
5165 spl_t s;
5166
5167 /*
5168 * Set thread privileges.
5169 */
5170 s = splsched();
5171
5172 #if CONFIG_VPS_DYNAMIC_PRIO
5173 if (vps_dynamic_priority_enabled) {
5174 sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
5175 thread_set_eager_preempt(self);
5176 } else {
5177 sched_set_kernel_thread_priority(self, BASEPRI_VM);
5178 }
5179 #else /* CONFIG_VPS_DYNAMIC_PRIO */
5180 sched_set_kernel_thread_priority(self, BASEPRI_VM);
5181 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
5182
5183 thread_lock(self);
5184 self->options |= TH_OPT_VMPRIV;
5185 thread_unlock(self);
5186
5187 if (!self->reserved_stack) {
5188 self->reserved_stack = self->kernel_stack;
5189 }
5190
5191 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
5192 !vps_dynamic_priority_enabled) {
5193 thread_vm_bind_group_add();
5194 }
5195
5196
5197 #if CONFIG_THREAD_GROUPS
5198 thread_group_vm_add();
5199 #endif /* CONFIG_THREAD_GROUPS */
5200
5201 #if __AMP__
5202 PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
5203 if (vm_pgo_pbound) {
5204 /*
5205 * Use the soft bound option for vm pageout to allow it to run on
5206 * E-cores if P-cluster is unavailable.
5207 */
5208 thread_bind_cluster_type(self, 'P', true);
5209 }
5210 #endif /* __AMP__ */
5211
5212 PE_parse_boot_argn("vmpgo_protect_realtime",
5213 &vm_pageout_protect_realtime,
5214 sizeof(vm_pageout_protect_realtime));
5215 splx(s);
5216
5217 thread_set_thread_name(current_thread(), "VM_pageout_scan");
5218
5219 /*
5220 * Initialize some paging parameters.
5221 */
5222
5223 vm_pageout_state.vm_pressure_thread_running = FALSE;
5224 vm_pageout_state.vm_pressure_changed = FALSE;
5225 vm_pageout_state.memorystatus_purge_on_warning = 2;
5226 vm_pageout_state.memorystatus_purge_on_urgent = 5;
5227 vm_pageout_state.memorystatus_purge_on_critical = 8;
5228 vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
5229 vm_pageout_state.vm_page_speculative_percentage = 5;
5230 vm_pageout_state.vm_page_speculative_target = 0;
5231
5232 vm_pageout_state.vm_pageout_swap_wait = 0;
5233 vm_pageout_state.vm_pageout_idle_wait = 0;
5234 vm_pageout_state.vm_pageout_empty_wait = 0;
5235 vm_pageout_state.vm_pageout_burst_wait = 0;
5236 vm_pageout_state.vm_pageout_deadlock_wait = 0;
5237 vm_pageout_state.vm_pageout_deadlock_relief = 0;
5238 vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
5239
5240 vm_pageout_state.vm_pageout_inactive = 0;
5241 vm_pageout_state.vm_pageout_inactive_used = 0;
5242 vm_pageout_state.vm_pageout_inactive_clean = 0;
5243
5244 vm_pageout_state.vm_memory_pressure = 0;
5245 vm_pageout_state.vm_page_filecache_min = 0;
5246 #if CONFIG_JETSAM
5247 vm_pageout_state.vm_page_filecache_min_divisor = 70;
5248 vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
5249 #else
5250 vm_pageout_state.vm_page_filecache_min_divisor = 27;
5251 vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
5252 #endif
5253 vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
5254
5255 vm_pageout_state.vm_pageout_considered_page_last = 0;
5256
5257 if (vm_pageout_state.vm_pageout_swap_wait == 0) {
5258 vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
5259 }
5260
5261 if (vm_pageout_state.vm_pageout_idle_wait == 0) {
5262 vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
5263 }
5264
5265 if (vm_pageout_state.vm_pageout_burst_wait == 0) {
5266 vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
5267 }
5268
5269 if (vm_pageout_state.vm_pageout_empty_wait == 0) {
5270 vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
5271 }
5272
5273 if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
5274 vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
5275 }
5276
5277 if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
5278 vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
5279 }
5280
5281 if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
5282 vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
5283 }
5284 /*
5285 * even if we've already called vm_page_free_reserve
5286 * call it again here to ensure that the targets are
5287 * accurately calculated (it uses vm_page_free_count_init)
5288 * calling it with an arg of 0 will not change the reserve
5289 * but will re-calculate free_min and free_target
5290 */
5291 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
5292 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
5293 } else {
5294 vm_page_free_reserve(0);
5295 }
5296
5297 bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
5298 bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5299
5300 vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5301 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5302
5303 vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5304
5305 #if DEVELOPMENT || DEBUG
5306 bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
5307 vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
5308 #endif /* DEVELOPMENT || DEBUG */
5309
5310
5311 /* internal pageout thread started when default pager registered first time */
5312 /* external pageout and garbage collection threads started here */
5313 struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
5314 ethr->id = 0;
5315 ethr->q = &vm_pageout_queue_external;
5316 /* in external_state these cheads are never used; they are only used in internal_state for the compressor */
5317 ethr->current_early_swapout_chead = NULL;
5318 ethr->current_regular_swapout_chead = NULL;
5319 ethr->current_late_swapout_chead = NULL;
5320 ethr->scratch_buf = NULL;
5321 #if DEVELOPMENT || DEBUG
5322 ethr->benchmark_q = NULL;
5323 #endif /* DEVELOPMENT || DEBUG */
5324 sched_cond_init(&(ethr->pgo_wakeup));
5325
5326 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
5327 (void *)ethr, BASEPRI_VM,
5328 &(ethr->pgo_iothread));
5329 if (result != KERN_SUCCESS) {
5330 panic("vm_pageout: Unable to create external thread (%d)\n", result);
5331 }
5332 thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");
5333
5334 thread_mtx_lock(vm_pageout_gc_thread);
5335 thread_start(vm_pageout_gc_thread);
5336 thread_mtx_unlock(vm_pageout_gc_thread);
5337
5338 #if VM_PRESSURE_EVENTS
5339 result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5340 BASEPRI_DEFAULT,
5341 &thread);
5342
5343 if (result != KERN_SUCCESS) {
5344 panic("vm_pressure_thread: create failed");
5345 }
5346
5347 thread_deallocate(thread);
5348 #endif
5349
5350 vm_object_reaper_init();
5351
5352
5353 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5354 vm_compressor_init();
5355 }
5356
5357 #if VM_PRESSURE_EVENTS
5358 vm_pressure_events_enabled = TRUE;
5359 #endif /* VM_PRESSURE_EVENTS */
5360
5361 #if CONFIG_PHANTOM_CACHE
5362 vm_phantom_cache_init();
5363 #endif
5364 #if VM_PAGE_BUCKETS_CHECK
5365 #if VM_PAGE_FAKE_BUCKETS
5366 printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5367 (uint64_t) vm_page_fake_buckets_start,
5368 (uint64_t) vm_page_fake_buckets_end);
5369 pmap_protect(kernel_pmap,
5370 vm_page_fake_buckets_start,
5371 vm_page_fake_buckets_end,
5372 VM_PROT_READ);
5373 // *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
5374 #endif /* VM_PAGE_FAKE_BUCKETS */
5375 #endif /* VM_PAGE_BUCKETS_CHECK */
5376
5377 #if VM_OBJECT_TRACKING
5378 vm_object_tracking_init();
5379 #endif /* VM_OBJECT_TRACKING */
5380
5381 #if __arm64__
5382 // vm_tests();
5383 #endif /* __arm64__ */
5384
5385 vm_pageout_continue();
5386
5387 /*
5388 * Unreached code!
5389 *
5390 * The vm_pageout_continue() call above never returns, so the code below is never
5391 * executed. We take advantage of this to declare several DTrace VM related probe
5392 * points that our kernel doesn't have an analog for. These are probe points that
5393 * exist in Solaris and are in the DTrace documentation, so people may have written
5394 * scripts that use them. Declaring the probe points here means their scripts will
5395 * compile and execute which we want for portability of the scripts, but since this
5396 * section of code is never reached, the probe points will simply never fire. Yes,
5397 * this is basically a hack. The problem is the DTrace probe points were chosen with
5398 * Solaris specific VM events in mind, not portability to different VM implementations.
5399 */
5400
5401 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5402 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5403 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5404 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5405 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5406 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5407 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5408 /*NOTREACHED*/
5409 }
5410
5411
5412
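/*
 * Start the internal (compressor) pageout threads: size the thread count
 * from the CPU count and boot-args, scale the internal queue's maximum
 * laundry accordingly, carve out one scratch buffer per thread and
 * launch the iothreads.
 */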
5413 kern_return_t
5414 vm_pageout_internal_start(void)
5415 {
5416 kern_return_t result = KERN_SUCCESS;
5417 host_basic_info_data_t hinfo;
5418 vm_offset_t buf, bufsize;
5419
5420 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5421
5422 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5423 #define BSD_HOST 1
5424 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5425
5426 assert(hinfo.max_cpus > 0);
5427
5428 #if !XNU_TARGET_OS_OSX
5429 vm_pageout_state.vm_compressor_thread_count = 1;
5430 #else /* !XNU_TARGET_OS_OSX */
5431 if (hinfo.max_cpus > 4) {
5432 vm_pageout_state.vm_compressor_thread_count = 2;
5433 } else {
5434 vm_pageout_state.vm_compressor_thread_count = 1;
5435 }
5436 #endif /* !XNU_TARGET_OS_OSX */
5437 #if __AMP__
5438 if (vm_compressor_ebound) {
5439 vm_pageout_state.vm_compressor_thread_count = 2;
5440 }
5441 #endif
5442 PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5443 sizeof(vm_pageout_state.vm_compressor_thread_count));
5444
5445 /* did the boot-args give us an unreasonable number? */
5446 if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5447 vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5448 }
5449 if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5450 vm_pageout_state.vm_compressor_thread_count = 1;
5451 } else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5452 vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5453 }
5454
5455 vm_pageout_queue_internal.pgo_maxlaundry =
5456 (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5457
5458 PE_parse_boot_argn("vmpgoi_maxlaundry",
5459 &vm_pageout_queue_internal.pgo_maxlaundry,
5460 sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5461
5462 #if DEVELOPMENT || DEBUG
5463 // Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
5464 vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
5465 #endif /* DEVELOPMENT || DEBUG */
5466
5467 bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5468
5469 kmem_alloc(kernel_map, &buf,
5470 bufsize * vm_pageout_state.vm_compressor_thread_count,
5471 KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
5472 VM_KERN_MEMORY_COMPRESSOR);
5473
5474 for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5475 struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
5476 iq->id = i;
5477 iq->q = &vm_pageout_queue_internal;
5478 iq->current_early_swapout_chead = NULL;
5479 iq->current_regular_swapout_chead = NULL;
5480 iq->current_late_swapout_chead = NULL;
5481 iq->scratch_buf = (char *)(buf + i * bufsize);
5482 #if DEVELOPMENT || DEBUG
5483 iq->benchmark_q = &vm_pageout_queue_benchmark;
5484 #endif /* DEVELOPMENT || DEBUG */
5485 sched_cond_init(&(iq->pgo_wakeup));
5486 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5487 (void *)iq, BASEPRI_VM,
5488 &(iq->pgo_iothread));
5489
5490 if (result != KERN_SUCCESS) {
5491 panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
5492 }
5493 }
5494 return result;
5495 }
5496
5497 #if CONFIG_IOSCHED
5498 /*
5499 * To support I/O Expedite for compressed files we mark the upls with special flags.
5500 * The way decmpfs works is that we create a big upl which marks all the pages needed to
5501 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5502 * then issues smaller I/Os for the compressed data, decompresses it and puts the data into the pages
5503 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5504 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5505 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5506 * by the req upl lock (the reverse link doesn't need synchronization since we never inspect this link
5507 * unless the real I/O upl is being destroyed).
5508 */
5509
5510
5511 static void
5512 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5513 {
5514 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5515
5516 upl_lock(src_upl);
5517 if (src_upl->decmp_io_upl) {
5518 /*
5519 * If there is already an alive real I/O UPL, ignore this new UPL.
5520 * This case should rarely happen and even if it does, it just means
5521 * that we might issue a spurious expedite which the driver is expected
5522 * to handle.
5523 */
5524 upl_unlock(src_upl);
5525 return;
5526 }
5527 src_upl->decmp_io_upl = (void *)upl;
5528 src_upl->ref_count++;
5529
5530 upl->flags |= UPL_DECMP_REAL_IO;
5531 upl->decmp_io_upl = (void *)src_upl;
5532 upl_unlock(src_upl);
5533 }
5534 #endif /* CONFIG_IOSCHED */
5535
5536 #if UPL_DEBUG
5537 int upl_debug_enabled = 1;
5538 #else
5539 int upl_debug_enabled = 0;
5540 #endif
5541
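/*
 * Allocate and minimally initialize a UPL: the page-info array is
 * co-allocated for internal UPLs and a "lite" bitmap is allocated when
 * UPL_CREATE_LITE is requested.
 */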
5542 static upl_t
5543 upl_create(int type, int flags, upl_size_t size)
5544 {
5545 uint32_t pages = (uint32_t)atop(round_page_32(size));
5546 upl_t upl;
5547
5548 assert(page_aligned(size));
5549
5550 /*
5551 * FIXME: this code assumes the allocation always succeeds,
5552 * however `pages` can be up to MAX_UPL_SIZE.
5553 *
5554 * The allocation size is above 32k (resp. 128k)
5555 * on 16k pages (resp. 4k), which kalloc might fail
5556 * to allocate.
5557 */
5558 upl = kalloc_type(struct upl, struct upl_page_info,
5559 (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
5560 if (type & UPL_CREATE_INTERNAL) {
5561 flags |= UPL_INTERNAL;
5562 }
5563
5564 if (type & UPL_CREATE_LITE) {
5565 flags |= UPL_LITE;
5566 if (pages) {
5567 upl->lite_list = bitmap_alloc(pages);
5568 }
5569 }
5570
5571 upl->flags = flags;
5572 upl->ref_count = 1;
5573 upl_lock_init(upl);
5574 #if CONFIG_IOSCHED
5575 if (type & UPL_CREATE_IO_TRACKING) {
5576 upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5577 }
5578
5579 if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5580 /* Only support expedite on internal UPLs */
5581 thread_t curthread = current_thread();
5582 upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
5583 Z_WAITOK | Z_ZERO);
5584 upl->flags |= UPL_EXPEDITE_SUPPORTED;
5585 if (curthread->decmp_upl != NULL) {
5586 upl_set_decmp_info(upl, curthread->decmp_upl);
5587 }
5588 }
5589 #endif
5590 #if CONFIG_IOSCHED || UPL_DEBUG
5591 if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5592 upl->upl_creator = current_thread();
5593 upl->flags |= UPL_TRACKED_BY_OBJECT;
5594 }
5595 #endif
5596
5597 #if UPL_DEBUG
5598 upl->uple_create_btref = btref_get(__builtin_frame_address(0), 0);
5599 #endif /* UPL_DEBUG */
5600
5601 return upl;
5602 }
5603
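/*
 * Tear down a UPL once its last reference has been dropped: unlink it
 * from the object's UPL queue if it was tracked, release the map_object
 * reference taken for shadowed UPLs, and free the reprio info, lite
 * bitmap and the UPL structure itself.
 */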
5604 static void
5605 upl_destroy(upl_t upl)
5606 {
5607 uint32_t pages;
5608
5609 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5610
5611 if (upl->ext_ref_count) {
5612 panic("upl(%p) ext_ref_count", upl);
5613 }
5614
5615 #if CONFIG_IOSCHED
5616 if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5617 upl_t src_upl;
5618 src_upl = upl->decmp_io_upl;
5619 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5620 upl_lock(src_upl);
5621 src_upl->decmp_io_upl = NULL;
5622 upl_unlock(src_upl);
5623 upl_deallocate(src_upl);
5624 }
5625 #endif /* CONFIG_IOSCHED */
5626
5627 #if CONFIG_IOSCHED || UPL_DEBUG
5628 if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5629 !(upl->flags & UPL_VECTOR)) {
5630 vm_object_t object;
5631
5632 if (upl->flags & UPL_SHADOWED) {
5633 object = upl->map_object->shadow;
5634 } else {
5635 object = upl->map_object;
5636 }
5637
5638 vm_object_lock(object);
5639 queue_remove(&object->uplq, upl, upl_t, uplq);
5640 vm_object_activity_end(object);
5641 vm_object_collapse(object, 0, TRUE);
5642 vm_object_unlock(object);
5643 }
5644 #endif
5645 /*
5646 * drop a reference on the map_object whether or
5647 * not a pageout object is inserted
5648 */
5649 if (upl->flags & UPL_SHADOWED) {
5650 vm_object_deallocate(upl->map_object);
5651 }
5652
5653 if (upl->flags & UPL_DEVICE_MEMORY) {
5654 pages = 1;
5655 } else {
5656 pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
5657 }
5658
5659 upl_lock_destroy(upl);
5660
5661 #if CONFIG_IOSCHED
5662 if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5663 kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
5664 }
5665 #endif
5666
5667 #if UPL_DEBUG
5668 for (int i = 0; i < upl->upl_commit_index; i++) {
5669 btref_put(upl->upl_commit_records[i].c_btref);
5670 }
5671 btref_put(upl->uple_create_btref);
5672 #endif /* UPL_DEBUG */
5673
5674 if ((upl->flags & UPL_LITE) && pages) {
5675 bitmap_free(upl->lite_list, pages);
5676 }
5677 kfree_type(struct upl, struct upl_page_info,
5678 (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
5679 }
5680
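/*
 * Drop a reference on a UPL; the final reference triggers the iodone
 * callout (if any) and destroys the UPL.
 */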
5681 void
5682 upl_deallocate(upl_t upl)
5683 {
5684 upl_lock(upl);
5685
5686 if (--upl->ref_count == 0) {
5687 if (vector_upl_is_valid(upl)) {
5688 vector_upl_deallocate(upl);
5689 }
5690 upl_unlock(upl);
5691
5692 if (upl->upl_iodone) {
5693 upl_callout_iodone(upl);
5694 }
5695
5696 upl_destroy(upl);
5697 } else {
5698 upl_unlock(upl);
5699 }
5700 }
5701
5702 #if CONFIG_IOSCHED
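/*
 * Tag a tracked UPL as the outstanding decmpfs request for the creating
 * thread, so that real-I/O UPLs created later by that thread can be
 * linked back to it (see upl_set_decmp_info above).
 */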
5703 void
5704 upl_mark_decmp(upl_t upl)
5705 {
5706 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5707 upl->flags |= UPL_DECMP_REQ;
5708 upl->upl_creator->decmp_upl = (void *)upl;
5709 }
5710 }
5711
5712 void
5713 upl_unmark_decmp(upl_t upl)
5714 {
5715 if (upl && (upl->flags & UPL_DECMP_REQ)) {
5716 upl->upl_creator->decmp_upl = NULL;
5717 }
5718 }
5719
5720 #endif /* CONFIG_IOSCHED */
5721
5722 #define VM_PAGE_Q_BACKING_UP(q) \
5723 ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5724
5725 boolean_t must_throttle_writes(void);
5726
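/*
 * Writes must be throttled when the external pageout queue is backing
 * up (laundry at or above 80% of its maximum) and pageable external
 * pages exceed 60% of the available non-compressed memory.
 */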
5727 boolean_t
5728 must_throttle_writes()
5729 {
5730 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5731 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5732 return TRUE;
5733 }
5734
5735 return FALSE;
5736 }
5737
5738 int vm_page_delayed_work_ctx_needed = 0;
5739 KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5740
5741 __startup_func
5742 static void
5743 vm_page_delayed_work_init_ctx(void)
5744 {
5745 uint16_t min_delayed_work_ctx_allocated = 16;
5746
5747 /*
5748 * try really hard to always keep NCPU elements around in the zone
5749 * in order for the UPL code to almost always get an element.
5750 */
5751 if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5752 min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5753 }
5754
5755 zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5756 }
5757 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5758
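/*
 * Grab a delayed-work context without blocking; on failure we count the
 * miss and return NULL, and the caller falls back to a single-entry
 * context on its own stack (see vm_object_upl_request).
 */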
5759 struct vm_page_delayed_work*
5760 vm_page_delayed_work_get_ctx(void)
5761 {
5762 struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5763
5764 dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5765
5766 if (__probable(dw_ctx)) {
5767 dw_ctx->delayed_owner = current_thread();
5768 } else {
5769 vm_page_delayed_work_ctx_needed++;
5770 }
5771 return dw_ctx ? dw_ctx->dwp : NULL;
5772 }
5773
5774 void
5775 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5776 {
5777 struct vm_page_delayed_work_ctx *ldw_ctx;
5778
5779 ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5780 ldw_ctx->delayed_owner = NULL;
5781
5782 zfree(dw_ctx_zone, ldw_ctx);
5783 }
5784
5785 /*
5786 * Routine: vm_object_upl_request
5787 * Purpose:
5788 * Cause the population of a portion of a vm_object.
5789 * Depending on the nature of the request, the pages
5790 * returned may contain valid data or be uninitialized.
5791 * A page list structure, listing the physical pages,
5792 * will be returned upon request.
5793 * This function is called by the file system or any other
5794 * supplier of backing store to a pager.
5795 * IMPORTANT NOTE: The caller must still respect the relationship
5796 * between the vm_object and its backing memory object. The
5797 * caller MUST NOT substitute changes in the backing file
5798 * without first doing a memory_object_lock_request on the
5799 * target range unless it is known that the pages are not
5800 * shared with another entity at the pager level.
5801 * Copy_in_to:
5802 * if a page list structure is present
5803 * return the mapped physical pages, where a
5804 * page is not present, return a non-initialized
5805 * one. If the no_sync bit is turned on, don't
5806 * call the pager unlock to synchronize with other
5807 * possible copies of the page. Leave pages busy
5808 * in the original object, if a page list structure
5809 * was specified. When a commit of the page list
5810 * pages is done, the dirty bit will be set for each one.
5811 * Copy_out_from:
5812 * If a page list structure is present, return
5813 * all mapped pages. Where a page does not exist
5814 * map a zero filled one. Leave pages busy in
5815 * the original object. If a page list structure
5816 * is not specified, this call is a no-op.
5817 *
5818 * Note: access of default pager objects has a rather interesting
5819 * twist. The caller of this routine, presumably the file system
5820 * page cache handling code, will never actually make a request
5821 * against a default pager backed object. Only the default
5822 * pager will make requests on backing store related vm_objects.
5823 * In this way the default pager can maintain the relationship
5824 * between backing store files (abstract memory objects) and
5825 * the vm_objects (cache objects) they support.
5826 *
5827 */
5828
5829 __private_extern__ kern_return_t
5830 vm_object_upl_request(
5831 vm_object_t object,
5832 vm_object_offset_t offset,
5833 upl_size_t size,
5834 upl_t *upl_ptr,
5835 upl_page_info_array_t user_page_list,
5836 unsigned int *page_list_count,
5837 upl_control_flags_t cntrl_flags,
5838 vm_tag_t tag)
5839 {
5840 vm_page_t dst_page = VM_PAGE_NULL;
5841 vm_object_offset_t dst_offset;
5842 upl_size_t xfer_size;
5843 unsigned int size_in_pages;
5844 boolean_t dirty;
5845 boolean_t hw_dirty;
5846 upl_t upl = NULL;
5847 unsigned int entry;
5848 vm_page_t alias_page = NULL;
5849 int refmod_state = 0;
5850 vm_object_t last_copy_object;
5851 uint32_t last_copy_version;
5852 struct vm_page_delayed_work dw_array;
5853 struct vm_page_delayed_work *dwp, *dwp_start;
5854 bool dwp_finish_ctx = TRUE;
5855 int dw_count;
5856 int dw_limit;
5857 int io_tracking_flag = 0;
5858 int grab_options;
5859 int page_grab_count = 0;
5860 ppnum_t phys_page;
5861 pmap_flush_context pmap_flush_context_storage;
5862 boolean_t pmap_flushes_delayed = FALSE;
5863 #if DEVELOPMENT || DEBUG
5864 task_t task = current_task();
5865 #endif /* DEVELOPMENT || DEBUG */
5866
5867 dwp_start = dwp = NULL;
5868
5869 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5870 /*
5871 * For forward compatibility's sake,
5872 * reject any unknown flag.
5873 */
5874 return KERN_INVALID_VALUE;
5875 }
5876 if ((!object->internal) && (object->paging_offset != 0)) {
5877 panic("vm_object_upl_request: external object with non-zero paging offset");
5878 }
5879 if (object->phys_contiguous) {
5880 panic("vm_object_upl_request: contiguous object specified");
5881 }
5882
5883 assertf(page_aligned(offset) && page_aligned(size),
5884 "offset 0x%llx size 0x%x",
5885 offset, size);
5886
5887 VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5888
5889 dw_count = 0;
5890 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5891 dwp_start = vm_page_delayed_work_get_ctx();
5892 if (dwp_start == NULL) {
5893 dwp_start = &dw_array;
5894 dw_limit = 1;
5895 dwp_finish_ctx = FALSE;
5896 }
5897
5898 dwp = dwp_start;
5899
5900 if (size > MAX_UPL_SIZE_BYTES) {
5901 size = MAX_UPL_SIZE_BYTES;
5902 }
5903
5904 if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5905 *page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5906 }
5907
5908 #if CONFIG_IOSCHED || UPL_DEBUG
5909 if (object->io_tracking || upl_debug_enabled) {
5910 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5911 }
5912 #endif
5913 #if CONFIG_IOSCHED
5914 if (object->io_tracking) {
5915 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5916 }
5917 #endif
5918
5919 if (cntrl_flags & UPL_SET_INTERNAL) {
5920 if (cntrl_flags & UPL_SET_LITE) {
5921 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5922 } else {
5923 upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5924 }
5925 user_page_list = size ? upl->page_list : NULL;
5926 } else {
5927 if (cntrl_flags & UPL_SET_LITE) {
5928 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5929 } else {
5930 upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5931 }
5932 }
5933 *upl_ptr = upl;
5934
5935 if (user_page_list) {
5936 user_page_list[0].device = FALSE;
5937 }
5938
5939 if (cntrl_flags & UPL_SET_LITE) {
5940 upl->map_object = object;
5941 } else {
5942 upl->map_object = vm_object_allocate(size);
5943 vm_object_lock(upl->map_object);
5944 /*
5945 * No need to lock the new object: nobody else knows
5946 * about it yet, so it's all ours so far.
5947 */
5948 upl->map_object->shadow = object;
5949 VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
5950 VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
5951 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5952 upl->map_object->vo_shadow_offset = offset;
5953 upl->map_object->wimg_bits = object->wimg_bits;
5954 assertf(page_aligned(upl->map_object->vo_shadow_offset),
5955 "object %p shadow_offset 0x%llx",
5956 upl->map_object, upl->map_object->vo_shadow_offset);
5957 vm_object_unlock(upl->map_object);
5958
5959 alias_page = vm_page_grab_fictitious(TRUE);
5960
5961 upl->flags |= UPL_SHADOWED;
5962 }
5963 if (cntrl_flags & UPL_FOR_PAGEOUT) {
5964 upl->flags |= UPL_PAGEOUT;
5965 }
5966
5967 vm_object_lock(object);
5968 vm_object_activity_begin(object);
5969
5970 grab_options = 0;
5971 #if CONFIG_SECLUDED_MEMORY
5972 if (object->can_grab_secluded) {
5973 grab_options |= VM_PAGE_GRAB_SECLUDED;
5974 }
5975 #endif /* CONFIG_SECLUDED_MEMORY */
5976
5977 /*
5978 * we can lock in the paging_offset once paging_in_progress is set
5979 */
5980 upl->u_size = size;
5981 upl->u_offset = offset + object->paging_offset;
5982
5983 #if CONFIG_IOSCHED || UPL_DEBUG
5984 if (object->io_tracking || upl_debug_enabled) {
5985 vm_object_activity_begin(object);
5986 queue_enter(&object->uplq, upl, upl_t, uplq);
5987 }
5988 #endif
5989 if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
5990 /*
5991 * Honor copy-on-write obligations
5992 *
5993 * The caller is gathering these pages and
5994 * might modify their contents. We need to
5995 * make sure that the copy object has its own
5996 * private copies of these pages before we let
5997 * the caller modify them.
5998 */
5999 vm_object_update(object,
6000 offset,
6001 size,
6002 NULL,
6003 NULL,
6004 FALSE, /* should_return */
6005 MEMORY_OBJECT_COPY_SYNC,
6006 VM_PROT_NO_CHANGE);
6007
6008 VM_PAGEOUT_DEBUG(upl_cow, 1);
6009 VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
6010 }
6011 /*
6012 * remember which copy object we synchronized with
6013 */
6014 last_copy_object = object->vo_copy;
6015 last_copy_version = object->vo_copy_version;
6016 entry = 0;
6017
6018 xfer_size = size;
6019 dst_offset = offset;
6020 size_in_pages = size / PAGE_SIZE;
6021
6022 if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
6023 object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
6024 object->scan_collisions = 0;
6025 }
6026
6027 if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
6028 boolean_t isSSD = FALSE;
6029
6030 #if !XNU_TARGET_OS_OSX
6031 isSSD = TRUE;
6032 #else /* !XNU_TARGET_OS_OSX */
6033 vnode_pager_get_isSSD(object->pager, &isSSD);
6034 #endif /* !XNU_TARGET_OS_OSX */
6035 vm_object_unlock(object);
6036
6037 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6038
6039 if (isSSD == TRUE) {
6040 delay(1000 * size_in_pages);
6041 } else {
6042 delay(5000 * size_in_pages);
6043 }
6044 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6045
6046 vm_object_lock(object);
6047 }
6048
6049 while (xfer_size) {
6050 dwp->dw_mask = 0;
6051
6052 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
6053 vm_object_unlock(object);
6054 alias_page = vm_page_grab_fictitious(TRUE);
6055 vm_object_lock(object);
6056 }
6057 if (cntrl_flags & UPL_COPYOUT_FROM) {
6058 upl->flags |= UPL_PAGE_SYNC_DONE;
6059
6060 if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
6061 dst_page->vmp_fictitious ||
6062 dst_page->vmp_absent ||
6063 VMP_ERROR_GET(dst_page) ||
6064 dst_page->vmp_cleaning ||
6065 (VM_PAGE_WIRED(dst_page))) {
6066 if (user_page_list) {
6067 user_page_list[entry].phys_addr = 0;
6068 }
6069
6070 goto try_next_page;
6071 }
6072 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6073
6074 /*
6075 * grab this up front...
6076 * a high percentage of the time we're going to
6077 * need the hardware modification state a bit later
6078 * anyway... so we can eliminate an extra call into
6079 * the pmap layer by grabbing it here and recording it
6080 */
6081 if (dst_page->vmp_pmapped) {
6082 refmod_state = pmap_get_refmod(phys_page);
6083 } else {
6084 refmod_state = 0;
6085 }
6086
6087 if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
6088 /*
6089 * page is on inactive list and referenced...
6090 * reactivate it now... this gets it out of the
6091 * way of vm_pageout_scan which would have to
6092 * reactivate it upon tripping over it
6093 */
6094 dwp->dw_mask |= DW_vm_page_activate;
6095 }
6096 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
6097 /*
6098 * we're only asking for DIRTY pages to be returned
6099 */
6100 if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
6101 /*
6102 * if we're the page stolen by vm_pageout_scan to be
6103 * cleaned (as opposed to a buddy being clustered in),
6104 * or this request is not being driven by a PAGEOUT cluster,
6105 * then we only need to check for the page being dirty or
6106 * precious to decide whether to return it
6107 */
6108 if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
6109 goto check_busy;
6110 }
6111 goto dont_return;
6112 }
6113 /*
6114 * this is a request for a PAGEOUT cluster and this page
6115 * is merely along for the ride as a 'buddy'... not only
6116 * does it have to be dirty to be returned, but it also
6117 * can't have been referenced recently...
6118 */
6119 if ((hibernate_cleaning_in_progress == TRUE ||
6120 (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
6121 (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
6122 ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
6123 goto check_busy;
6124 }
6125 dont_return:
6126 /*
6127 * if we reach here, we're not to return
6128 * the page... go on to the next one
6129 */
6130 if (dst_page->vmp_laundry == TRUE) {
6131 /*
6132 * if we get here, the page is not 'cleaning' (filtered out above).
6133 * since it has been referenced, remove it from the laundry
6134 * so we don't pay the cost of an I/O to clean a page
6135 * we're just going to take back
6136 */
6137 vm_page_lockspin_queues();
6138
6139 vm_pageout_steal_laundry(dst_page, TRUE);
6140 vm_page_activate(dst_page);
6141
6142 vm_page_unlock_queues();
6143 }
6144 if (user_page_list) {
6145 user_page_list[entry].phys_addr = 0;
6146 }
6147
6148 goto try_next_page;
6149 }
6150 check_busy:
6151 if (dst_page->vmp_busy) {
6152 if (cntrl_flags & UPL_NOBLOCK) {
6153 if (user_page_list) {
6154 user_page_list[entry].phys_addr = 0;
6155 }
6156 dwp->dw_mask = 0;
6157
6158 goto try_next_page;
6159 }
6160 /*
6161 * someone else is playing with the
6162 * page. We will have to wait.
6163 */
6164 vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
6165
6166 continue;
6167 }
6168 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6169 vm_page_lockspin_queues();
6170
6171 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6172 /*
6173 * we've buddied up a page for a clustered pageout
6174 * that has already been moved to the pageout
6175 * queue by pageout_scan... we need to remove
6176 * it from the queue and drop the laundry count
6177 * on that queue
6178 */
6179 vm_pageout_throttle_up(dst_page);
6180 }
6181 vm_page_unlock_queues();
6182 }
6183 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6184 dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6185
6186 if (phys_page > upl->highest_page) {
6187 upl->highest_page = phys_page;
6188 }
6189
6190 assert(!pmap_is_noencrypt(phys_page));
6191
6192 if (cntrl_flags & UPL_SET_LITE) {
6193 unsigned int pg_num;
6194
6195 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6196 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6197 bitmap_set(upl->lite_list, pg_num);
6198
6199 if (hw_dirty) {
6200 if (pmap_flushes_delayed == FALSE) {
6201 pmap_flush_context_init(&pmap_flush_context_storage);
6202 pmap_flushes_delayed = TRUE;
6203 }
6204 pmap_clear_refmod_options(phys_page,
6205 VM_MEM_MODIFIED,
6206 PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6207 &pmap_flush_context_storage);
6208 }
6209
6210 /*
6211 * Mark original page as cleaning
6212 * in place.
6213 */
6214 dst_page->vmp_cleaning = TRUE;
6215 dst_page->vmp_precious = FALSE;
6216 } else {
6217 /*
6218 * use pageclean setup, it is more
6219 * convenient even for the pageout
6220 * cases here
6221 */
6222 vm_object_lock(upl->map_object);
6223 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6224 vm_object_unlock(upl->map_object);
6225
6226 alias_page->vmp_absent = FALSE;
6227 alias_page = NULL;
6228 }
6229 if (dirty) {
6230 SET_PAGE_DIRTY(dst_page, FALSE);
6231 } else {
6232 dst_page->vmp_dirty = FALSE;
6233 }
6234
6235 if (!dirty) {
6236 dst_page->vmp_precious = TRUE;
6237 }
6238
6239 if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6240 if (!VM_PAGE_WIRED(dst_page)) {
6241 dst_page->vmp_free_when_done = TRUE;
6242 }
6243 }
6244 } else {
6245 if ((cntrl_flags & UPL_WILL_MODIFY) &&
6246 (object->vo_copy != last_copy_object ||
6247 object->vo_copy_version != last_copy_version)) {
6248 /*
6249 * Honor copy-on-write obligations
6250 *
6251 * The copy object has changed since we
6252 * last synchronized for copy-on-write.
6253 * Another copy object might have been
6254 * inserted while we released the object's
6255 * lock. Since someone could have seen the
6256 * original contents of the remaining pages
6257 * through that new object, we have to
6258 * synchronize with it again for the remaining
6259 * pages only. The previous pages are "busy"
6260 * so they can not be seen through the new
6261 * mapping. The new mapping will see our
6262 * upcoming changes for those previous pages,
6263 * but that's OK since they couldn't see what
6264 * was there before. It's just a race anyway
6265 * and there's no guarantee of consistency or
6266 * atomicity. We just don't want new mappings
6267 * to see both the *before* and *after* pages.
6268 */
6269 if (object->vo_copy != VM_OBJECT_NULL) {
6270 vm_object_update(
6271 object,
6272 dst_offset,/* current offset */
6273 xfer_size, /* remaining size */
6274 NULL,
6275 NULL,
6276 FALSE, /* should_return */
6277 MEMORY_OBJECT_COPY_SYNC,
6278 VM_PROT_NO_CHANGE);
6279
6280 VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6281 VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6282 }
6283 /*
6284 * remember the copy object we synced with
6285 */
6286 last_copy_object = object->vo_copy;
6287 last_copy_version = object->vo_copy_version;
6288 }
6289 dst_page = vm_page_lookup(object, dst_offset);
6290
6291 if (dst_page != VM_PAGE_NULL) {
6292 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6293 /*
6294 * skip over pages already present in the cache
6295 */
6296 if (user_page_list) {
6297 user_page_list[entry].phys_addr = 0;
6298 }
6299
6300 goto try_next_page;
6301 }
6302 if (dst_page->vmp_fictitious) {
6303 panic("need corner case for fictitious page");
6304 }
6305
6306 if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6307 /*
6308 * someone else is playing with the
6309 * page. We will have to wait.
6310 */
6311 vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
6312
6313 continue;
6314 }
6315 if (dst_page->vmp_laundry) {
6316 vm_pageout_steal_laundry(dst_page, FALSE);
6317 }
6318 } else {
6319 if (object->private) {
6320 /*
6321 * This is a nasty wrinkle for users
6322 * of upl who encounter device or
6323 * private memory; however, it is
6324 * unavoidable: only a fault can
6325 * resolve the actual backing
6326 * physical page by asking the
6327 * backing device.
6328 */
6329 if (user_page_list) {
6330 user_page_list[entry].phys_addr = 0;
6331 }
6332
6333 goto try_next_page;
6334 }
6335 if (object->scan_collisions) {
6336 /*
6337 * the pageout_scan thread is trying to steal
6338 * pages from this object, but has run into our
6339 * lock... grab 2 pages from the head of the object...
6340 * the first is freed on behalf of pageout_scan, the
6341 * 2nd is for our own use... we use vm_object_page_grab
6342 * in both cases to avoid taking pages from the free
6343 * list since we are under memory pressure and our
6344 * lock on this object is getting in the way of
6345 * relieving it
6346 */
6347 dst_page = vm_object_page_grab(object);
6348
6349 if (dst_page != VM_PAGE_NULL) {
6350 vm_page_release(dst_page,
6351 FALSE);
6352 }
6353
6354 dst_page = vm_object_page_grab(object);
6355 }
6356 if (dst_page == VM_PAGE_NULL) {
6357 /*
6358 * need to allocate a page
6359 */
6360 dst_page = vm_page_grab_options(grab_options);
6361 if (dst_page != VM_PAGE_NULL) {
6362 page_grab_count++;
6363 }
6364 }
6365 if (dst_page == VM_PAGE_NULL) {
6366 if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6367 /*
6368 * we don't want to stall waiting for pages to come onto the free list
6369 * while we're already holding absent pages in this UPL
6370 * the caller will deal with the empty slots
6371 */
6372 if (user_page_list) {
6373 user_page_list[entry].phys_addr = 0;
6374 }
6375
6376 goto try_next_page;
6377 }
6378 /*
6379 * no pages available... wait
6380 * then try again for the same
6381 * offset...
6382 */
6383 vm_object_unlock(object);
6384
6385 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6386
6387 VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6388
6389 VM_PAGE_WAIT();
6390 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6391
6392 VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6393
6394 vm_object_lock(object);
6395
6396 continue;
6397 }
6398 vm_page_insert(dst_page, object, dst_offset);
6399
6400 dst_page->vmp_absent = TRUE;
6401 dst_page->vmp_busy = FALSE;
6402
6403 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6404 /*
6405 * if UPL_RET_ONLY_ABSENT was specified,
6406 * then we're definitely setting up a
6407 * upl for a clustered read/pagein
6408 * operation... mark the pages as clustered
6409 * so upl_commit_range can put them on the
6410 * speculative list
6411 */
6412 dst_page->vmp_clustered = TRUE;
6413
6414 if (!(cntrl_flags & UPL_FILE_IO)) {
6415 counter_inc(&vm_statistics_pageins);
6416 }
6417 }
6418 }
6419 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6420
6421 dst_page->vmp_overwriting = TRUE;
6422
6423 if (dst_page->vmp_pmapped) {
6424 if (!(cntrl_flags & UPL_FILE_IO)) {
6425 /*
6426 * eliminate all mappings from the
6427 * original object and its progeny
6428 */
6429 refmod_state = pmap_disconnect(phys_page);
6430 } else {
6431 refmod_state = pmap_get_refmod(phys_page);
6432 }
6433 } else {
6434 refmod_state = 0;
6435 }
6436
6437 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6438 dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6439
6440 if (cntrl_flags & UPL_SET_LITE) {
6441 unsigned int pg_num;
6442
6443 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6444 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6445 bitmap_set(upl->lite_list, pg_num);
6446
6447 if (hw_dirty) {
6448 pmap_clear_modify(phys_page);
6449 }
6450
6451 /*
6452 * Mark original page as cleaning
6453 * in place.
6454 */
6455 dst_page->vmp_cleaning = TRUE;
6456 dst_page->vmp_precious = FALSE;
6457 } else {
6458 /*
6459 * use pageclean setup, it is more
6460 * convenient even for the pageout
6461 * cases here
6462 */
6463 vm_object_lock(upl->map_object);
6464 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6465 vm_object_unlock(upl->map_object);
6466
6467 alias_page->vmp_absent = FALSE;
6468 alias_page = NULL;
6469 }
6470
6471 if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6472 upl->flags &= ~UPL_CLEAR_DIRTY;
6473 upl->flags |= UPL_SET_DIRTY;
6474 dirty = TRUE;
6475 /*
6476 * Page belonging to a code-signed object is about to
6477 * be written. Mark it tainted and disconnect it from
6478 * all pmaps so processes have to fault it back in and
6479 * deal with the tainted bit.
6480 */
6481 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6482 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6483 vm_page_upl_tainted++;
6484 if (dst_page->vmp_pmapped) {
6485 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6486 if (refmod_state & VM_MEM_REFERENCED) {
6487 dst_page->vmp_reference = TRUE;
6488 }
6489 }
6490 }
6491 } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6492 /*
6493 * clean in place for read implies
6494 * that a write will be done on all
6495 * the pages that are dirty before
6496 * a upl commit is done. The caller
6497 * is obligated to preserve the
6498 * contents of all pages marked dirty
6499 */
6500 upl->flags |= UPL_CLEAR_DIRTY;
6501 }
6502 dst_page->vmp_dirty = dirty;
6503
6504 if (!dirty) {
6505 dst_page->vmp_precious = TRUE;
6506 }
6507
6508 if (!VM_PAGE_WIRED(dst_page)) {
6509 /*
6510 * deny access to the target page while
6511 * it is being worked on
6512 */
6513 dst_page->vmp_busy = TRUE;
6514 } else {
6515 dwp->dw_mask |= DW_vm_page_wire;
6516 }
6517
6518 /*
6519 * We might be about to satisfy a fault which has been
6520 * requested. So no need for the "restart" bit.
6521 */
6522 dst_page->vmp_restart = FALSE;
6523 if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6524 /*
6525 * expect the page to be used
6526 */
6527 dwp->dw_mask |= DW_set_reference;
6528 }
6529 if (cntrl_flags & UPL_PRECIOUS) {
6530 if (object->internal) {
6531 SET_PAGE_DIRTY(dst_page, FALSE);
6532 dst_page->vmp_precious = FALSE;
6533 } else {
6534 dst_page->vmp_precious = TRUE;
6535 }
6536 } else {
6537 dst_page->vmp_precious = FALSE;
6538 }
6539 }
6540 if (dst_page->vmp_busy) {
6541 upl->flags |= UPL_HAS_BUSY;
6542 }
6543
6544 if (phys_page > upl->highest_page) {
6545 upl->highest_page = phys_page;
6546 }
6547 assert(!pmap_is_noencrypt(phys_page));
6548 if (user_page_list) {
6549 user_page_list[entry].phys_addr = phys_page;
6550 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
6551 user_page_list[entry].absent = dst_page->vmp_absent;
6552 user_page_list[entry].dirty = dst_page->vmp_dirty;
6553 user_page_list[entry].precious = dst_page->vmp_precious;
6554 user_page_list[entry].device = FALSE;
6555 user_page_list[entry].needed = FALSE;
6556 if (dst_page->vmp_clustered == TRUE) {
6557 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6558 } else {
6559 user_page_list[entry].speculative = FALSE;
6560 }
6561 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6562 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6563 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6564 user_page_list[entry].mark = FALSE;
6565 }
6566 /*
6567 * if UPL_RET_ONLY_ABSENT is set, then
6568 * we are working with a fresh page and we've
6569 * just set the clustered flag on it to
6570 * indicate that it was dragged in as part of a
6571 * speculative cluster... so leave it alone
6572 */
6573 if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6574 /*
6575 * someone is explicitly grabbing this page...
6576 * update clustered and speculative state
6577 *
6578 */
6579 if (dst_page->vmp_clustered) {
6580 VM_PAGE_CONSUME_CLUSTERED(dst_page);
6581 }
6582 }
6583 try_next_page:
6584 if (dwp->dw_mask) {
6585 if (dwp->dw_mask & DW_vm_page_activate) {
6586 counter_inc(&vm_statistics_reactivations);
6587 }
6588
6589 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6590
6591 if (dw_count >= dw_limit) {
6592 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6593
6594 dwp = dwp_start;
6595 dw_count = 0;
6596 }
6597 }
6598 entry++;
6599 dst_offset += PAGE_SIZE_64;
6600 xfer_size -= PAGE_SIZE;
6601 }
6602 if (dw_count) {
6603 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6604 dwp = dwp_start;
6605 dw_count = 0;
6606 }
6607
6608 if (alias_page != NULL) {
6609 VM_PAGE_FREE(alias_page);
6610 }
6611 if (pmap_flushes_delayed == TRUE) {
6612 pmap_flush(&pmap_flush_context_storage);
6613 }
6614
6615 if (page_list_count != NULL) {
6616 if (upl->flags & UPL_INTERNAL) {
6617 *page_list_count = 0;
6618 } else if (*page_list_count > entry) {
6619 *page_list_count = entry;
6620 }
6621 }
6622 #if UPL_DEBUG
6623 upl->upl_state = 1;
6624 #endif
6625 vm_object_unlock(object);
6626
6627 VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6628 #if DEVELOPMENT || DEBUG
6629 if (task != NULL) {
6630 ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
6631 }
6632 #endif /* DEVELOPMENT || DEBUG */
6633
6634 if (dwp_start && dwp_finish_ctx) {
6635 vm_page_delayed_work_finish_ctx(dwp_start);
6636 dwp_start = dwp = NULL;
6637 }
6638
6639 return KERN_SUCCESS;
6640 }
6641
6642 /*
6643 * Routine: vm_object_super_upl_request
6644 * Purpose:
6645 * Cause the population of a portion of a vm_object
6646 * in much the same way as memory_object_upl_request.
6647 * Depending on the nature of the request, the pages
6648 * returned may contain valid data or be uninitialized.
6649 * However, the region may be expanded up to the super
6650 * cluster size provided.
6651 */
6652
6653 __private_extern__ kern_return_t
6654 vm_object_super_upl_request(
6655 vm_object_t object,
6656 vm_object_offset_t offset,
6657 upl_size_t size,
6658 upl_size_t super_cluster,
6659 upl_t *upl,
6660 upl_page_info_t *user_page_list,
6661 unsigned int *page_list_count,
6662 upl_control_flags_t cntrl_flags,
6663 vm_tag_t tag)
6664 {
6665 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
6666 return KERN_FAILURE;
6667 }
6668
6669 assert(object->paging_in_progress);
6670 offset = offset - object->paging_offset;
6671
6672 if (super_cluster > size) {
6673 vm_object_offset_t base_offset;
6674 upl_size_t super_size;
6675 vm_object_size_t super_size_64;
6676
6677 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6678 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
6679 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6680 super_size = (upl_size_t) super_size_64;
6681 assert(super_size == super_size_64);
6682
6683 if (offset > (base_offset + super_size)) {
6684 panic("vm_object_super_upl_request: Missed target pageout"
6685 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6686 offset, base_offset, super_size, super_cluster,
6687 size, object->paging_offset);
6688 }
6689 /*
6690 * apparently there is a case where the vm requests a
6691 * page to be written out whose offset is beyond the
6692 * object size
6693 */
6694 if ((offset + size) > (base_offset + super_size)) {
6695 super_size_64 = (offset + size) - base_offset;
6696 super_size = (upl_size_t) super_size_64;
6697 assert(super_size == super_size_64);
6698 }
6699
6700 offset = base_offset;
6701 size = super_size;
6702 }
6703 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6704 }
6705
6706 int cs_executable_create_upl = 0;
6707 extern int proc_selfpid(void);
6708 extern char *proc_name_address(void *p);
6709
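/*
 * Create a UPL for a range of a VM map: look up the map entry, clip the
 * requested size to the entry and to MAX_UPL_SIZE_BYTES, honor
 * copy-on-write obligations, then build the UPL against the backing
 * VM object.
 */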
6710 kern_return_t
6711 vm_map_create_upl(
6712 vm_map_t map,
6713 vm_map_address_t offset,
6714 upl_size_t *upl_size,
6715 upl_t *upl,
6716 upl_page_info_array_t page_list,
6717 unsigned int *count,
6718 upl_control_flags_t *flags,
6719 vm_tag_t tag)
6720 {
6721 vm_map_entry_t entry;
6722 upl_control_flags_t caller_flags;
6723 int force_data_sync;
6724 int sync_cow_data;
6725 vm_object_t local_object;
6726 vm_map_offset_t local_offset;
6727 vm_map_offset_t local_start;
6728 kern_return_t ret;
6729 vm_map_address_t original_offset;
6730 vm_map_size_t original_size, adjusted_size;
6731 vm_map_offset_t local_entry_start;
6732 vm_object_offset_t local_entry_offset;
6733 vm_object_offset_t offset_in_mapped_page;
6734 boolean_t release_map = FALSE;
6735
6736
6737 start_with_map:
6738
6739 original_offset = offset;
6740 original_size = *upl_size;
6741 adjusted_size = original_size;
6742
6743 caller_flags = *flags;
6744
6745 if (caller_flags & ~UPL_VALID_FLAGS) {
6746 /*
6747 * For forward compatibility's sake,
6748 * reject any unknown flag.
6749 */
6750 ret = KERN_INVALID_VALUE;
6751 goto done;
6752 }
6753 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6754 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6755
6756 if (upl == NULL) {
6757 ret = KERN_INVALID_ARGUMENT;
6758 goto done;
6759 }
6760
6761 REDISCOVER_ENTRY:
6762 vm_map_lock_read(map);
6763
6764 if (!vm_map_lookup_entry(map, offset, &entry)) {
6765 vm_map_unlock_read(map);
6766 ret = KERN_FAILURE;
6767 goto done;
6768 }
6769
6770 local_entry_start = entry->vme_start;
6771 local_entry_offset = VME_OFFSET(entry);
6772
6773 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6774 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6775 }
6776
6777 if (entry->vme_end - original_offset < adjusted_size) {
6778 adjusted_size = entry->vme_end - original_offset;
6779 assert(adjusted_size > 0);
6780 *upl_size = (upl_size_t) adjusted_size;
6781 assert(*upl_size == adjusted_size);
6782 }
6783
6784 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6785 *flags = 0;
6786
6787 if (!entry->is_sub_map &&
6788 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6789 if (VME_OBJECT(entry)->private) {
6790 *flags = UPL_DEV_MEMORY;
6791 }
6792
6793 if (VME_OBJECT(entry)->phys_contiguous) {
6794 *flags |= UPL_PHYS_CONTIG;
6795 }
6796 }
6797 vm_map_unlock_read(map);
6798 ret = KERN_SUCCESS;
6799 goto done;
6800 }
6801
6802 offset_in_mapped_page = 0;
6803 if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6804 offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6805 *upl_size = (upl_size_t)
6806 (vm_map_round_page(original_offset + adjusted_size,
6807 VM_MAP_PAGE_MASK(map))
6808 - offset);
6809
6810 offset_in_mapped_page = original_offset - offset;
6811 assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6812
6813 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6814 }
6815
6816 if (!entry->is_sub_map) {
6817 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6818 !VME_OBJECT(entry)->phys_contiguous) {
6819 if (*upl_size > MAX_UPL_SIZE_BYTES) {
6820 *upl_size = MAX_UPL_SIZE_BYTES;
6821 }
6822 }
6823
6824 /*
6825 * Create an object if necessary.
6826 */
6827 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6828 if (vm_map_lock_read_to_write(map)) {
6829 goto REDISCOVER_ENTRY;
6830 }
6831
6832 VME_OBJECT_SET(entry,
6833 vm_object_allocate((vm_size_t)
6834 vm_object_round_page((entry->vme_end - entry->vme_start))),
6835 false, 0);
6836 VME_OFFSET_SET(entry, 0);
6837 assert(entry->use_pmap);
6838
6839 vm_map_lock_write_to_read(map);
6840 }
6841
6842 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6843 !(entry->protection & VM_PROT_WRITE)) {
6844 vm_map_unlock_read(map);
6845 ret = KERN_PROTECTION_FAILURE;
6846 goto done;
6847 }
6848 }
6849
6850 #if !XNU_TARGET_OS_OSX
6851 if (map->pmap != kernel_pmap &&
6852 (caller_flags & UPL_COPYOUT_FROM) &&
6853 (entry->protection & VM_PROT_EXECUTE) &&
6854 !(entry->protection & VM_PROT_WRITE)) {
6855 vm_offset_t kaddr;
6856 vm_size_t ksize;
6857
6858 /*
6859 * We're about to create a read-only UPL backed by
6860 * memory from an executable mapping.
6861 * Wiring the pages would result in the pages being copied
6862 * (due to the "MAP_PRIVATE" mapping) and no longer
6863 * code-signed, so no longer eligible for execution.
6864 * Instead, let's copy the data into a kernel buffer and
6865 * create the UPL from this kernel buffer.
6866 * The kernel buffer is then freed, leaving the UPL holding
6867 * the last reference on the VM object, so the memory will
6868 * be released when the UPL is committed.
6869 */
6870
6871 vm_map_unlock_read(map);
6872 entry = VM_MAP_ENTRY_NULL;
6873 /* allocate kernel buffer */
6874 ksize = round_page(*upl_size);
6875 kaddr = 0;
6876 ret = kmem_alloc(kernel_map, &kaddr, ksize,
6877 KMA_PAGEABLE | KMA_DATA, tag);
6878 if (ret == KERN_SUCCESS) {
6879 /* copyin the user data */
6880 ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6881 }
6882 if (ret == KERN_SUCCESS) {
6883 if (ksize > *upl_size) {
6884 /* zero out the extra space in kernel buffer */
6885 memset((void *)(kaddr + *upl_size),
6886 0,
6887 ksize - *upl_size);
6888 }
6889 /* create the UPL from the kernel buffer */
6890 vm_object_offset_t offset_in_object;
6891 vm_object_offset_t offset_in_object_page;
6892
6893 offset_in_object = offset - local_entry_start + local_entry_offset;
6894 offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6895 assert(offset_in_object_page < PAGE_SIZE);
6896 assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6897 *upl_size -= offset_in_object_page + offset_in_mapped_page;
6898 ret = vm_map_create_upl(kernel_map,
6899 (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6900 upl_size, upl, page_list, count, flags, tag);
6901 }
6902 if (kaddr != 0) {
6903 /* free the kernel buffer */
6904 kmem_free(kernel_map, kaddr, ksize);
6905 kaddr = 0;
6906 ksize = 0;
6907 }
6908 #if DEVELOPMENT || DEBUG
6909 DTRACE_VM4(create_upl_from_executable,
6910 vm_map_t, map,
6911 vm_map_address_t, offset,
6912 upl_size_t, *upl_size,
6913 kern_return_t, ret);
6914 #endif /* DEVELOPMENT || DEBUG */
6915 goto done;
6916 }
6917 #endif /* !XNU_TARGET_OS_OSX */
6918
6919 if (!entry->is_sub_map) {
6920 local_object = VME_OBJECT(entry);
6921 assert(local_object != VM_OBJECT_NULL);
6922 }
6923
6924 if (!entry->is_sub_map &&
6925 !entry->needs_copy &&
6926 *upl_size != 0 &&
6927 local_object->vo_size > *upl_size && /* partial UPL */
6928 entry->wired_count == 0 && /* No COW for entries that are wired */
6929 (map->pmap != kernel_pmap) && /* alias checks */
6930 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6931 ||
6932 ( /* case 2 */
6933 local_object->internal &&
6934 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6935 local_object->ref_count > 1))) {
6936 vm_prot_t prot;
6937
6938 /*
6939 * Case 1:
6940 * Set up the targeted range for copy-on-write to avoid
6941 * applying true_share/copy_delay to the entire object.
6942 *
6943 * Case 2:
6944 * This map entry covers only part of an internal
6945 * object. There could be other map entries covering
6946 * other areas of this object and some of these map
6947 * entries could be marked as "needs_copy", which
6948 * assumes that the object is COPY_SYMMETRIC.
6949 * To avoid marking this object as COPY_DELAY and
6950 * "true_share", let's shadow it and mark the new
6951 * (smaller) object as "true_share" and COPY_DELAY.
6952 */
6953
6954 if (vm_map_lock_read_to_write(map)) {
6955 goto REDISCOVER_ENTRY;
6956 }
6957 vm_map_lock_assert_exclusive(map);
6958 assert(VME_OBJECT(entry) == local_object);
6959
6960 vm_map_clip_start(map,
6961 entry,
6962 vm_map_trunc_page(offset,
6963 VM_MAP_PAGE_MASK(map)));
6964 vm_map_clip_end(map,
6965 entry,
6966 vm_map_round_page(offset + *upl_size,
6967 VM_MAP_PAGE_MASK(map)));
6968 if ((entry->vme_end - offset) < *upl_size) {
6969 *upl_size = (upl_size_t) (entry->vme_end - offset);
6970 assert(*upl_size == entry->vme_end - offset);
6971 }
6972
6973 prot = entry->protection & ~VM_PROT_WRITE;
6974 if (override_nx(map, VME_ALIAS(entry)) && prot) {
6975 prot |= VM_PROT_EXECUTE;
6976 }
6977 vm_object_pmap_protect(local_object,
6978 VME_OFFSET(entry),
6979 entry->vme_end - entry->vme_start,
6980 ((entry->is_shared ||
6981 map->mapped_in_other_pmaps)
6982 ? PMAP_NULL
6983 : map->pmap),
6984 VM_MAP_PAGE_SIZE(map),
6985 entry->vme_start,
6986 prot);
6987
6988 assert(entry->wired_count == 0);
6989
6990 /*
6991 * Lock the VM object and re-check its status: if it's mapped
6992 * in another address space, we could still be racing with
6993 * another thread holding that other VM map exclusively.
6994 */
6995 vm_object_lock(local_object);
6996 if (local_object->true_share) {
6997 /* object is already in proper state: no COW needed */
6998 assert(local_object->copy_strategy !=
6999 MEMORY_OBJECT_COPY_SYMMETRIC);
7000 } else {
7001 /* not true_share: ask for copy-on-write below */
7002 assert(local_object->copy_strategy ==
7003 MEMORY_OBJECT_COPY_SYMMETRIC);
7004 entry->needs_copy = TRUE;
7005 }
7006 vm_object_unlock(local_object);
7007
7008 vm_map_lock_write_to_read(map);
7009 }
7010
7011 if (entry->needs_copy) {
7012 /*
7013 * Honor copy-on-write for COPY_SYMMETRIC
7014 * strategy.
7015 */
7016 vm_map_t local_map;
7017 vm_object_t object;
7018 vm_object_offset_t new_offset;
7019 vm_prot_t prot;
7020 boolean_t wired;
7021 vm_map_version_t version;
7022 vm_map_t real_map;
7023 vm_prot_t fault_type;
7024
7025 local_map = map;
7026
7027 if (caller_flags & UPL_COPYOUT_FROM) {
7028 fault_type = VM_PROT_READ | VM_PROT_COPY;
7029 vm_counters.create_upl_extra_cow++;
7030 vm_counters.create_upl_extra_cow_pages +=
7031 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
7032 } else {
7033 fault_type = VM_PROT_WRITE;
7034 }
7035 if (vm_map_lookup_and_lock_object(&local_map,
7036 offset, fault_type,
7037 OBJECT_LOCK_EXCLUSIVE,
7038 &version, &object,
7039 &new_offset, &prot, &wired,
7040 NULL,
7041 &real_map, NULL) != KERN_SUCCESS) {
7042 if (fault_type == VM_PROT_WRITE) {
7043 vm_counters.create_upl_lookup_failure_write++;
7044 } else {
7045 vm_counters.create_upl_lookup_failure_copy++;
7046 }
7047 vm_map_unlock_read(local_map);
7048 ret = KERN_FAILURE;
7049 goto done;
7050 }
7051 if (real_map != local_map) {
7052 vm_map_unlock(real_map);
7053 }
7054 vm_map_unlock_read(local_map);
7055
7056 vm_object_unlock(object);
7057
7058 goto REDISCOVER_ENTRY;
7059 }
7060
7061 if (entry->is_sub_map) {
7062 vm_map_t submap;
7063
7064 submap = VME_SUBMAP(entry);
7065 local_start = entry->vme_start;
7066 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7067
7068 vm_map_reference(submap);
7069 vm_map_unlock_read(map);
7070
7071 DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
7072 offset += offset_in_mapped_page;
7073 *upl_size -= offset_in_mapped_page;
7074
7075 if (release_map) {
7076 vm_map_deallocate(map);
7077 }
7078 map = submap;
7079 release_map = TRUE;
7080 offset = local_offset + (offset - local_start);
7081 goto start_with_map;
7082 }
7083
7084 if (sync_cow_data &&
7085 (VME_OBJECT(entry)->shadow ||
7086 VME_OBJECT(entry)->vo_copy)) {
7087 local_object = VME_OBJECT(entry);
7088 local_start = entry->vme_start;
7089 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7090
7091 vm_object_reference(local_object);
7092 vm_map_unlock_read(map);
7093
7094 if (local_object->shadow && local_object->vo_copy) {
7095 vm_object_lock_request(local_object->shadow,
7096 ((vm_object_offset_t)
7097 ((offset - local_start) +
7098 local_offset) +
7099 local_object->vo_shadow_offset),
7100 *upl_size, FALSE,
7101 MEMORY_OBJECT_DATA_SYNC,
7102 VM_PROT_NO_CHANGE);
7103 }
7104 sync_cow_data = FALSE;
7105 vm_object_deallocate(local_object);
7106
7107 goto REDISCOVER_ENTRY;
7108 }
7109 if (force_data_sync) {
7110 local_object = VME_OBJECT(entry);
7111 local_start = entry->vme_start;
7112 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7113
7114 vm_object_reference(local_object);
7115 vm_map_unlock_read(map);
7116
7117 vm_object_lock_request(local_object,
7118 ((vm_object_offset_t)
7119 ((offset - local_start) +
7120 local_offset)),
7121 (vm_object_size_t)*upl_size,
7122 FALSE,
7123 MEMORY_OBJECT_DATA_SYNC,
7124 VM_PROT_NO_CHANGE);
7125
7126 force_data_sync = FALSE;
7127 vm_object_deallocate(local_object);
7128
7129 goto REDISCOVER_ENTRY;
7130 }
7131 if (VME_OBJECT(entry)->private) {
7132 *flags = UPL_DEV_MEMORY;
7133 } else {
7134 *flags = 0;
7135 }
7136
7137 if (VME_OBJECT(entry)->phys_contiguous) {
7138 *flags |= UPL_PHYS_CONTIG;
7139 }
7140
7141 local_object = VME_OBJECT(entry);
7142 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7143 local_start = entry->vme_start;
7144
7145 /*
7146 * Wiring will copy the pages to the shadow object.
7147 * The shadow object will not be code-signed so
7148 * attempting to execute code from these copied pages
7149 * would trigger a code-signing violation.
7150 */
7151 if (entry->protection & VM_PROT_EXECUTE) {
7152 #if MACH_ASSERT
7153 printf("pid %d[%s] create_upl out of executable range from "
7154 "0x%llx to 0x%llx: side effects may include "
7155 "code-signing violations later on\n",
7156 proc_selfpid(),
7157 (get_bsdtask_info(current_task())
7158 ? proc_name_address(get_bsdtask_info(current_task()))
7159 : "?"),
7160 (uint64_t) entry->vme_start,
7161 (uint64_t) entry->vme_end);
7162 #endif /* MACH_ASSERT */
7163 DTRACE_VM2(cs_executable_create_upl,
7164 uint64_t, (uint64_t)entry->vme_start,
7165 uint64_t, (uint64_t)entry->vme_end);
7166 cs_executable_create_upl++;
7167 }
7168
7169 vm_object_lock(local_object);
7170
7171 /*
7172 * Ensure that this object is "true_share" and "copy_delay" now,
7173 * while we're still holding the VM map lock. After we unlock the map,
7174 * anything could happen to that mapping, including some copy-on-write
7175 * activity. We need to make sure that the IOPL will point at the
7176 * same memory as the mapping.
7177 */
7178 if (local_object->true_share) {
7179 assert(local_object->copy_strategy !=
7180 MEMORY_OBJECT_COPY_SYMMETRIC);
7181 } else if (!is_kernel_object(local_object) &&
7182 local_object != compressor_object &&
7183 !local_object->phys_contiguous) {
7184 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7185 if (!local_object->true_share &&
7186 vm_object_tracking_btlog) {
7187 btlog_record(vm_object_tracking_btlog, local_object,
7188 VM_OBJECT_TRACKING_OP_TRUESHARE,
7189 btref_get(__builtin_frame_address(0), 0));
7190 }
7191 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7192 VM_OBJECT_SET_TRUE_SHARE(local_object, TRUE);
7193 if (local_object->copy_strategy ==
7194 MEMORY_OBJECT_COPY_SYMMETRIC) {
7195 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7196 }
7197 }
7198
7199 vm_object_reference_locked(local_object);
7200 vm_object_unlock(local_object);
7201
7202 vm_map_unlock_read(map);
7203
7204 offset += offset_in_mapped_page;
7205 assert(*upl_size > offset_in_mapped_page);
7206 *upl_size -= offset_in_mapped_page;
7207
7208 ret = vm_object_iopl_request(local_object,
7209 ((vm_object_offset_t)
7210 ((offset - local_start) + local_offset)),
7211 *upl_size,
7212 upl,
7213 page_list,
7214 count,
7215 caller_flags,
7216 tag);
7217 vm_object_deallocate(local_object);
7218
7219 done:
7220 if (release_map) {
7221 vm_map_deallocate(map);
7222 }
7223
7224 return ret;
7225 }
7226
7227 /*
7228 * Internal routine to enter a UPL into a VM map.
7229 *
7230 * JMM - This should just be doable through the standard
7231 * vm_map_enter() API.
7232 */
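/*
 * A minimal usage sketch (illustrative only; "upl" stands for a UPL the
 * caller already owns and that is not yet mapped):
 *
 *	vm_map_offset_t kaddr;
 *	kern_return_t kr;
 *
 *	kr = vm_map_enter_upl(kernel_map, upl, &kaddr);
 *	if (kr == KERN_SUCCESS) {
 *		... access the UPL's pages through "kaddr" ...
 *		(void) vm_map_remove_upl(kernel_map, upl);
 *	}
 */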
7233 kern_return_t
7234 vm_map_enter_upl_range(
7235 vm_map_t map,
7236 upl_t upl,
7237 vm_object_offset_t offset_to_map,
7238 vm_size_t size_to_map,
7239 vm_prot_t prot_to_map,
7240 vm_map_offset_t *dst_addr)
7241 {
7242 vm_map_size_t size;
7243 vm_object_offset_t offset;
7244 vm_map_offset_t addr;
7245 vm_page_t m;
7246 kern_return_t kr;
7247 int isVectorUPL = 0, curr_upl = 0;
7248 upl_t vector_upl = NULL;
7249 mach_vm_offset_t vector_upl_dst_addr = 0;
7250 vm_map_t vector_upl_submap = NULL;
7251 upl_offset_t subupl_offset = 0;
7252 upl_size_t subupl_size = 0;
7253
7254 if (upl == UPL_NULL) {
7255 return KERN_INVALID_ARGUMENT;
7256 }
7257
7258 DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%lx (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7259 assert(map == kernel_map);
7260
7261 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7262 int mapped = 0, valid_upls = 0;
7263 vector_upl = upl;
7264
7265 upl_lock(vector_upl);
7266 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7267 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7268 if (upl == NULL) {
7269 continue;
7270 }
7271 valid_upls++;
7272 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7273 mapped++;
7274 }
7275 }
7276
7277 if (mapped) {
7278 if (mapped != valid_upls) {
7279 panic("Only %d of the %d sub-upls within the Vector UPL are already mapped", mapped, valid_upls);
7280 } else {
7281 upl_unlock(vector_upl);
7282 return KERN_FAILURE;
7283 }
7284 }
7285
7286 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7287 panic("TODO4K: vector UPL not implemented");
7288 }
7289
7290 vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7291 vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7292 VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7293 VM_KERN_MEMORY_NONE).kmr_submap;
7294 map = vector_upl_submap;
7295 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7296 curr_upl = 0;
7297 } else {
7298 upl_lock(upl);
7299 }
7300
7301 process_upl_to_enter:
7302 if (isVectorUPL) {
7303 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7304 *dst_addr = vector_upl_dst_addr;
7305 upl_unlock(vector_upl);
7306 return KERN_SUCCESS;
7307 }
7308 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7309 if (upl == NULL) {
7310 goto process_upl_to_enter;
7311 }
7312
7313 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7314 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7315 } else {
7316 /*
7317 * check to see if already mapped
7318 */
7319 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7320 upl_unlock(upl);
7321 return KERN_FAILURE;
7322 }
7323 }
7324
7325 if ((!(upl->flags & UPL_SHADOWED)) &&
7326 ((upl->flags & UPL_HAS_BUSY) ||
7327 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7328 vm_object_t object;
7329 vm_page_t alias_page;
7330 vm_object_offset_t new_offset;
7331 unsigned int pg_num;
7332
7333 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7334 object = upl->map_object;
7335 upl->map_object = vm_object_allocate(vm_object_round_page(size));
7336
7337 vm_object_lock(upl->map_object);
7338
7339 upl->map_object->shadow = object;
7340 VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
7341 VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
7342 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7343 upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7344 assertf(page_aligned(upl->map_object->vo_shadow_offset),
7345 "object %p shadow_offset 0x%llx",
7346 upl->map_object,
7347 (uint64_t)upl->map_object->vo_shadow_offset);
7348 upl->map_object->wimg_bits = object->wimg_bits;
7349 offset = upl->map_object->vo_shadow_offset;
7350 new_offset = 0;
7351
7352 upl->flags |= UPL_SHADOWED;
7353
7354 while (size) {
7355 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7356 assert(pg_num == new_offset / PAGE_SIZE);
7357
7358 if (bitmap_test(upl->lite_list, pg_num)) {
7359 alias_page = vm_page_grab_fictitious(TRUE);
7360
7361 vm_object_lock(object);
7362
7363 m = vm_page_lookup(object, offset);
7364 if (m == VM_PAGE_NULL) {
7365 panic("vm_upl_map: page missing");
7366 }
7367
7368 /*
7369 * Convert the fictitious page to a private
7370 * shadow of the real page.
7371 */
7372 assert(alias_page->vmp_fictitious);
7373 alias_page->vmp_fictitious = FALSE;
7374 alias_page->vmp_private = TRUE;
7375 alias_page->vmp_free_when_done = TRUE;
7376 /*
7377 * since m is a page in the upl it must
7378 * already be wired or BUSY, so it's
7379 * safe to assign the underlying physical
7380 * page to the alias
7381 */
7382 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7383
7384 vm_object_unlock(object);
7385
7386 vm_page_lockspin_queues();
7387 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7388 vm_page_unlock_queues();
7389
7390 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7391
7392 assert(!alias_page->vmp_wanted);
7393 alias_page->vmp_busy = FALSE;
7394 alias_page->vmp_absent = FALSE;
7395 }
7396 size -= PAGE_SIZE;
7397 offset += PAGE_SIZE_64;
7398 new_offset += PAGE_SIZE_64;
7399 }
7400 vm_object_unlock(upl->map_object);
7401 }
7402 if (upl->flags & UPL_SHADOWED) {
7403 if (isVectorUPL) {
7404 offset = 0;
7405 } else {
7406 offset = offset_to_map;
7407 }
7408 } else {
7409 offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7410 if (!isVectorUPL) {
7411 offset += offset_to_map;
7412 }
7413 }
7414
7415 if (isVectorUPL) {
7416 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7417 } else {
7418 size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7419 }
7420
7421 vm_object_reference(upl->map_object);
7422
7423 if (!isVectorUPL) {
7424 *dst_addr = 0;
7425 /*
7426 * NEED A UPL_MAP ALIAS
7427 */
7428 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7429 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7430 upl->map_object, offset, FALSE,
7431 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7432
7433 if (kr != KERN_SUCCESS) {
7434 vm_object_deallocate(upl->map_object);
7435 upl_unlock(upl);
7436 return kr;
7437 }
7438 } else {
7439 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7440 VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7441 upl->map_object, offset, FALSE,
7442 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7443 if (kr) {
7444 panic("vm_map_enter failed for a Vector UPL");
7445 }
7446 }
7447 upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7448 /* this will have to be an increment rather than */
7449 /* an assignment. */
7450 vm_object_lock(upl->map_object);
7451
7452 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7453 m = vm_page_lookup(upl->map_object, offset);
7454
7455 if (m) {
7456 m->vmp_pmapped = TRUE;
7457
7458 /*
7459 * CODE SIGNING ENFORCEMENT: page has been wpmapped,
7460 * but only in kernel space. If this was on a user map,
7461 * we'd have to set the wpmapped bit.
7462 */
7463 /* m->vmp_wpmapped = TRUE; */
7464 assert(map->pmap == kernel_pmap);
7465
7466 kr = pmap_enter_check(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE);
7467
7468 assert(kr == KERN_SUCCESS);
7469 #if KASAN
7470 kasan_notify_address(addr, PAGE_SIZE_64);
7471 #endif
7472 }
7473 offset += PAGE_SIZE_64;
7474 }
7475 vm_object_unlock(upl->map_object);
7476
7477 /*
7478 * hold a reference for the mapping
7479 */
7480 upl->ref_count++;
7481 upl->flags |= UPL_PAGE_LIST_MAPPED;
7482 upl->kaddr = (vm_offset_t) *dst_addr;
7483 assert(upl->kaddr == *dst_addr);
7484
7485 if (isVectorUPL) {
7486 goto process_upl_to_enter;
7487 }
7488
7489 if (!isVectorUPL) {
7490 vm_map_offset_t addr_adjustment;
7491
7492 addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7493 if (addr_adjustment) {
7494 assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7495 DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7496 *dst_addr += addr_adjustment;
7497 }
7498 }
7499
7500 upl_unlock(upl);
7501
7502 return KERN_SUCCESS;
7503 }
7504
7505 kern_return_t
7506 vm_map_enter_upl(
7507 vm_map_t map,
7508 upl_t upl,
7509 vm_map_offset_t *dst_addr)
7510 {
7511 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7512 return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7513 }
7514
7515 /*
7516 * Internal routine to remove a UPL mapping from a VM map.
7517 *
7518 * XXX - This should just be doable through a standard
7519 * vm_map_remove() operation. Otherwise, implicit clean-up
7520 * of the target map won't be able to correctly remove
7521 * these (and release the reference on the UPL). Having
7522 * to do this means we can't map these into user-space
7523 * maps yet.
7524 */
7525 kern_return_t
7526 vm_map_remove_upl_range(
7527 vm_map_t map,
7528 upl_t upl,
7529 __unused vm_object_offset_t offset_to_unmap,
7530 __unused vm_size_t size_to_unmap)
7531 {
7532 vm_address_t addr;
7533 upl_size_t size;
7534 int isVectorUPL = 0, curr_upl = 0;
7535 upl_t vector_upl = NULL;
7536
7537 if (upl == UPL_NULL) {
7538 return KERN_INVALID_ARGUMENT;
7539 }
7540
7541 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7542 int unmapped = 0, valid_upls = 0;
7543 vector_upl = upl;
7544 upl_lock(vector_upl);
7545 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7546 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7547 if (upl == NULL) {
7548 continue;
7549 }
7550 valid_upls++;
7551 if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7552 unmapped++;
7553 }
7554 }
7555
7556 if (unmapped) {
7557 if (unmapped != valid_upls) {
7558 panic("%d of the %d sub-upls within the Vector UPL are not mapped", unmapped, valid_upls);
7559 } else {
7560 upl_unlock(vector_upl);
7561 return KERN_FAILURE;
7562 }
7563 }
7564 curr_upl = 0;
7565 } else {
7566 upl_lock(upl);
7567 }
7568
7569 process_upl_to_remove:
7570 if (isVectorUPL) {
7571 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7572 vm_map_t v_upl_submap;
7573 vm_offset_t v_upl_submap_dst_addr;
7574 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7575
7576 kmem_free_guard(map, v_upl_submap_dst_addr,
7577 vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
7578 vm_map_deallocate(v_upl_submap);
7579 upl_unlock(vector_upl);
7580 return KERN_SUCCESS;
7581 }
7582
7583 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7584 if (upl == NULL) {
7585 goto process_upl_to_remove;
7586 }
7587 }
7588
7589 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7590 addr = upl->kaddr;
7591 size = upl->u_mapped_size;
7592
7593 assert(upl->ref_count > 1);
7594 upl->ref_count--; /* removing mapping ref */
7595
7596 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7597 upl->kaddr = (vm_offset_t) 0;
7598 upl->u_mapped_size = 0;
7599
7600 if (isVectorUPL) {
7601 /*
7602 * If it's a Vectored UPL, we'll be removing the entire
7603 * submap anyway, so there is no need to remove individual UPL
7604 * element mappings from within the submap
7605 */
7606 goto process_upl_to_remove;
7607 }
7608
7609 upl_unlock(upl);
7610
7611 vm_map_remove(map,
7612 vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
7613 vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
7614 return KERN_SUCCESS;
7615 }
7616 upl_unlock(upl);
7617
7618 return KERN_FAILURE;
7619 }
7620
7621 kern_return_t
7622 vm_map_remove_upl(
7623 vm_map_t map,
7624 upl_t upl)
7625 {
7626 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7627 return vm_map_remove_upl_range(map, upl, 0, upl_size);
7628 }
7629
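/*
 * iopl_valid_data:
 *
 * Called once previously absent pages in an IO-wired (UPL_IO_WIRE) UPL
 * have been filled with valid data: clears their "absent"/"busy" state,
 * marks them dirty, wires them and updates the object's wired-page
 * accounting.
 */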
7630 void
7631 iopl_valid_data(
7632 upl_t upl,
7633 vm_tag_t tag)
7634 {
7635 vm_object_t object;
7636 vm_offset_t offset;
7637 vm_page_t m, nxt_page = VM_PAGE_NULL;
7638 upl_size_t size;
7639 int wired_count = 0;
7640
7641 if (upl == NULL) {
7642 panic("iopl_valid_data: NULL upl");
7643 }
7644 if (vector_upl_is_valid(upl)) {
7645 panic("iopl_valid_data: vector upl");
7646 }
7647 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
7648 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
7649 }
7650
7651 object = upl->map_object;
7652
7653 if (is_kernel_object(object) || object == compressor_object) {
7654 panic("iopl_valid_data: object == kernel or compressor");
7655 }
7656
7657 if (object->purgable == VM_PURGABLE_VOLATILE ||
7658 object->purgable == VM_PURGABLE_EMPTY) {
7659 panic("iopl_valid_data: object %p purgable %d",
7660 object, object->purgable);
7661 }
7662
7663 size = upl_adjusted_size(upl, PAGE_MASK);
7664
7665 vm_object_lock(object);
7666 VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
7667
7668 bool whole_object;
7669
7670 if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
7671 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
7672 whole_object = true;
7673 } else {
7674 offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
7675 whole_object = false;
7676 }
7677
7678 while (size) {
7679 if (whole_object) {
7680 if (nxt_page != VM_PAGE_NULL) {
7681 m = nxt_page;
7682 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7683 }
7684 } else {
7685 m = vm_page_lookup(object, offset);
7686 offset += PAGE_SIZE;
7687
7688 if (m == VM_PAGE_NULL) {
7689 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
7690 }
7691 }
7692 if (m->vmp_busy) {
7693 if (!m->vmp_absent) {
7694 panic("iopl_valid_data: busy page w/o absent");
7695 }
7696
7697 if (m->vmp_pageq.next || m->vmp_pageq.prev) {
7698 panic("iopl_valid_data: busy+absent page on page queue");
7699 }
7700 if (m->vmp_reusable) {
7701 panic("iopl_valid_data: %p is reusable", m);
7702 }
7703
7704 m->vmp_absent = FALSE;
7705 m->vmp_dirty = TRUE;
7706 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7707 assert(m->vmp_wire_count == 0);
7708 m->vmp_wire_count++;
7709 assert(m->vmp_wire_count);
7710 if (m->vmp_wire_count == 1) {
7711 m->vmp_q_state = VM_PAGE_IS_WIRED;
7712 wired_count++;
7713 } else {
7714 panic("iopl_valid_data: %p already wired", m);
7715 }
7716
7717 vm_page_wakeup_done(object, m);
7718 }
7719 size -= PAGE_SIZE;
7720 }
7721 if (wired_count) {
7722 VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
7723 assert(object->resident_page_count >= object->wired_page_count);
7724
7725 /* no need to adjust purgeable accounting for this object: */
7726 assert(object->purgable != VM_PURGABLE_VOLATILE);
7727 assert(object->purgable != VM_PURGABLE_EMPTY);
7728
7729 vm_page_lockspin_queues();
7730 vm_page_wire_count += wired_count;
7731 vm_page_unlock_queues();
7732 }
7733 VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
7734 vm_object_unlock(object);
7735 }
7736
7737
7738 void
7739 vm_object_set_pmap_cache_attr(
7740 vm_object_t object,
7741 upl_page_info_array_t user_page_list,
7742 unsigned int num_pages,
7743 boolean_t batch_pmap_op)
7744 {
7745 unsigned int cache_attr = 0;
7746
7747 cache_attr = object->wimg_bits & VM_WIMG_MASK;
7748 assert(user_page_list);
7749 if (cache_attr != VM_WIMG_USE_DEFAULT) {
7750 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
7751 }
7752 }
7753
7754
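/*
 * vm_object_iopl_wire_full:
 *
 * Fast path for an IOPL that covers an entire, fully resident object:
 * wire every resident page and record it in the UPL's lite list and
 * page list.  Returns FALSE if any page is in a state (busy, absent,
 * error, cleaning, restart, laundry) that requires the slow path.
 */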
7755 static bool
7756 vm_object_iopl_wire_full(
7757 vm_object_t object,
7758 upl_t upl,
7759 upl_page_info_array_t user_page_list,
7760 upl_control_flags_t cntrl_flags,
7761 vm_tag_t tag)
7762 {
7763 vm_page_t dst_page;
7764 unsigned int entry;
7765 int page_count;
7766 int delayed_unlock = 0;
7767 boolean_t retval = TRUE;
7768 ppnum_t phys_page;
7769
7770 vm_object_lock_assert_exclusive(object);
7771 assert(object->purgable != VM_PURGABLE_VOLATILE);
7772 assert(object->purgable != VM_PURGABLE_EMPTY);
7773 assert(object->pager == NULL);
7774 assert(object->vo_copy == NULL);
7775 assert(object->shadow == NULL);
7776
7777 page_count = object->resident_page_count;
7778 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
7779
7780 vm_page_lock_queues();
7781
7782 while (page_count--) {
7783 if (dst_page->vmp_busy ||
7784 dst_page->vmp_fictitious ||
7785 dst_page->vmp_absent ||
7786 VMP_ERROR_GET(dst_page) ||
7787 dst_page->vmp_cleaning ||
7788 dst_page->vmp_restart ||
7789 dst_page->vmp_laundry) {
7790 retval = FALSE;
7791 goto done;
7792 }
7793 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
7794 retval = FALSE;
7795 goto done;
7796 }
7797 dst_page->vmp_reference = TRUE;
7798
7799 vm_page_wire(dst_page, tag, FALSE);
7800
7801 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7802 SET_PAGE_DIRTY(dst_page, FALSE);
7803 }
7804 entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
7805 assert(entry >= 0 && entry < object->resident_page_count);
7806 bitmap_set(upl->lite_list, entry);
7807
7808 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7809
7810 if (phys_page > upl->highest_page) {
7811 upl->highest_page = phys_page;
7812 }
7813
7814 if (user_page_list) {
7815 user_page_list[entry].phys_addr = phys_page;
7816 user_page_list[entry].absent = dst_page->vmp_absent;
7817 user_page_list[entry].dirty = dst_page->vmp_dirty;
7818 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
7819 user_page_list[entry].precious = dst_page->vmp_precious;
7820 user_page_list[entry].device = FALSE;
7821 user_page_list[entry].speculative = FALSE;
7822 user_page_list[entry].cs_validated = FALSE;
7823 user_page_list[entry].cs_tainted = FALSE;
7824 user_page_list[entry].cs_nx = FALSE;
7825 user_page_list[entry].needed = FALSE;
7826 user_page_list[entry].mark = FALSE;
7827 }
7828 if (delayed_unlock++ > 256) {
7829 delayed_unlock = 0;
7830 lck_mtx_yield(&vm_page_queue_lock);
7831
7832 VM_CHECK_MEMORYSTATUS;
7833 }
7834 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
7835 }
7836 done:
7837 vm_page_unlock_queues();
7838
7839 VM_CHECK_MEMORYSTATUS;
7840
7841 return retval;
7842 }
7843
7844
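/*
 * vm_object_iopl_wire_empty:
 *
 * Fast path for an IOPL against an object with no resident pages:
 * grab fresh pages (zero-filled unless UPL_NOZEROFILL/UPL_NOZEROFILLIO
 * is set), insert and wire them, and apply the ledger updates in one
 * batch at the end.
 */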
7845 static kern_return_t
7846 vm_object_iopl_wire_empty(
7847 vm_object_t object,
7848 upl_t upl,
7849 upl_page_info_array_t user_page_list,
7850 upl_control_flags_t cntrl_flags,
7851 vm_tag_t tag,
7852 vm_object_offset_t *dst_offset,
7853 int page_count,
7854 int *page_grab_count)
7855 {
7856 vm_page_t dst_page;
7857 boolean_t no_zero_fill = FALSE;
7858 int interruptible;
7859 int pages_wired = 0;
7860 int pages_inserted = 0;
7861 int entry = 0;
7862 uint64_t delayed_ledger_update = 0;
7863 kern_return_t ret = KERN_SUCCESS;
7864 int grab_options;
7865 ppnum_t phys_page;
7866
7867 vm_object_lock_assert_exclusive(object);
7868 assert(object->purgable != VM_PURGABLE_VOLATILE);
7869 assert(object->purgable != VM_PURGABLE_EMPTY);
7870 assert(object->pager == NULL);
7871 assert(object->vo_copy == NULL);
7872 assert(object->shadow == NULL);
7873
7874 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
7875 interruptible = THREAD_ABORTSAFE;
7876 } else {
7877 interruptible = THREAD_UNINT;
7878 }
7879
7880 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
7881 no_zero_fill = TRUE;
7882 }
7883
7884 grab_options = 0;
7885 #if CONFIG_SECLUDED_MEMORY
7886 if (object->can_grab_secluded) {
7887 grab_options |= VM_PAGE_GRAB_SECLUDED;
7888 }
7889 #endif /* CONFIG_SECLUDED_MEMORY */
7890
7891 while (page_count--) {
7892 while ((dst_page = vm_page_grab_options(grab_options))
7893 == VM_PAGE_NULL) {
7894 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
7895
7896 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
7897
7898 if (vm_page_wait(interruptible) == FALSE) {
7899 /*
7900 * interrupted case
7901 */
7902 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7903
7904 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
7905
7906 ret = MACH_SEND_INTERRUPTED;
7907 goto done;
7908 }
7909 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7910
7911 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
7912 }
7913 if (no_zero_fill == FALSE) {
7914 vm_page_zero_fill(dst_page);
7915 } else {
7916 dst_page->vmp_absent = TRUE;
7917 }
7918
7919 dst_page->vmp_reference = TRUE;
7920
7921 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7922 SET_PAGE_DIRTY(dst_page, FALSE);
7923 }
7924 if (dst_page->vmp_absent == FALSE) {
7925 assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
7926 assert(dst_page->vmp_wire_count == 0);
7927 dst_page->vmp_wire_count++;
7928 dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
7929 assert(dst_page->vmp_wire_count);
7930 pages_wired++;
7931 vm_page_wakeup_done(object, dst_page);
7932 }
7933 pages_inserted++;
7934
7935 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
7936
7937 bitmap_set(upl->lite_list, entry);
7938
7939 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7940
7941 if (phys_page > upl->highest_page) {
7942 upl->highest_page = phys_page;
7943 }
7944
7945 if (user_page_list) {
7946 user_page_list[entry].phys_addr = phys_page;
7947 user_page_list[entry].absent = dst_page->vmp_absent;
7948 user_page_list[entry].dirty = dst_page->vmp_dirty;
7949 user_page_list[entry].free_when_done = FALSE;
7950 user_page_list[entry].precious = FALSE;
7951 user_page_list[entry].device = FALSE;
7952 user_page_list[entry].speculative = FALSE;
7953 user_page_list[entry].cs_validated = FALSE;
7954 user_page_list[entry].cs_tainted = FALSE;
7955 user_page_list[entry].cs_nx = FALSE;
7956 user_page_list[entry].needed = FALSE;
7957 user_page_list[entry].mark = FALSE;
7958 }
7959 entry++;
7960 *dst_offset += PAGE_SIZE_64;
7961 }
7962 done:
7963 if (pages_wired) {
7964 vm_page_lockspin_queues();
7965 vm_page_wire_count += pages_wired;
7966 vm_page_unlock_queues();
7967 }
7968 if (pages_inserted) {
7969 if (object->internal) {
7970 OSAddAtomic(pages_inserted, &vm_page_internal_count);
7971 } else {
7972 OSAddAtomic(pages_inserted, &vm_page_external_count);
7973 }
7974 }
7975 if (delayed_ledger_update) {
7976 task_t owner;
7977 int ledger_idx_volatile;
7978 int ledger_idx_nonvolatile;
7979 int ledger_idx_volatile_compressed;
7980 int ledger_idx_nonvolatile_compressed;
7981 int ledger_idx_composite;
7982 int ledger_idx_external_wired;
7983 boolean_t do_footprint;
7984
7985 owner = VM_OBJECT_OWNER(object);
7986 assert(owner);
7987
7988 vm_object_ledger_tag_ledgers(object,
7989 &ledger_idx_volatile,
7990 &ledger_idx_nonvolatile,
7991 &ledger_idx_volatile_compressed,
7992 &ledger_idx_nonvolatile_compressed,
7993 &ledger_idx_composite,
7994 &ledger_idx_external_wired,
7995 &do_footprint);
7996
7997 if (object->internal) {
7998 /* more non-volatile bytes */
7999 ledger_credit(owner->ledger,
8000 ledger_idx_nonvolatile,
8001 delayed_ledger_update);
8002 if (do_footprint) {
8003 /* more footprint */
8004 ledger_credit(owner->ledger,
8005 task_ledgers.phys_footprint,
8006 delayed_ledger_update);
8007 } else if (ledger_idx_composite != -1) {
8008 ledger_credit(owner->ledger,
8009 ledger_idx_composite,
8010 delayed_ledger_update);
8011 }
8012 } else {
8013 /* more external wired bytes */
8014 ledger_credit(owner->ledger,
8015 ledger_idx_external_wired,
8016 delayed_ledger_update);
8017 if (do_footprint) {
8018 /* more footprint */
8019 ledger_credit(owner->ledger,
8020 task_ledgers.phys_footprint,
8021 delayed_ledger_update);
8022 } else if (ledger_idx_composite != -1) {
8023 ledger_credit(owner->ledger,
8024 ledger_idx_composite,
8025 delayed_ledger_update);
8026 }
8027 }
8028 }
8029
8030 assert(page_grab_count);
8031 *page_grab_count = pages_inserted;
8032
8033 return ret;
8034 }
8035
8036
8037
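/*
 * vm_object_iopl_request:
 *
 * Create an I/O page list (IOPL) for the given range of "object": the
 * pages are looked up (or faulted in), wired and recorded in the UPL so
 * they can be used for I/O.  With UPL_BLOCK_ACCESS, the pages are also
 * marked "busy" and unmapped so they can't be accessed until the UPL is
 * committed or aborted.
 */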
8038 kern_return_t
8039 vm_object_iopl_request(
8040 vm_object_t object,
8041 vm_object_offset_t offset,
8042 upl_size_t size,
8043 upl_t *upl_ptr,
8044 upl_page_info_array_t user_page_list,
8045 unsigned int *page_list_count,
8046 upl_control_flags_t cntrl_flags,
8047 vm_tag_t tag)
8048 {
8049 vm_page_t dst_page;
8050 vm_object_offset_t dst_offset;
8051 upl_size_t xfer_size;
8052 upl_t upl = NULL;
8053 unsigned int entry;
8054 int no_zero_fill = FALSE;
8055 unsigned int size_in_pages;
8056 int page_grab_count = 0;
8057 u_int32_t psize;
8058 kern_return_t ret;
8059 vm_prot_t prot;
8060 struct vm_object_fault_info fault_info = {};
8061 struct vm_page_delayed_work dw_array;
8062 struct vm_page_delayed_work *dwp, *dwp_start;
8063 bool dwp_finish_ctx = TRUE;
8064 int dw_count;
8065 int dw_limit;
8066 int dw_index;
8067 boolean_t caller_lookup;
8068 int io_tracking_flag = 0;
8069 int interruptible;
8070 ppnum_t phys_page;
8071
8072 boolean_t set_cache_attr_needed = FALSE;
8073 boolean_t free_wired_pages = FALSE;
8074 boolean_t fast_path_empty_req = FALSE;
8075 boolean_t fast_path_full_req = FALSE;
8076
8077 #if DEVELOPMENT || DEBUG
8078 task_t task = current_task();
8079 #endif /* DEVELOPMENT || DEBUG */
8080
8081 dwp_start = dwp = NULL;
8082
8083 vm_object_offset_t original_offset = offset;
8084 upl_size_t original_size = size;
8085
8086 // DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
8087
8088 size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
8089 offset = vm_object_trunc_page(offset);
8090 if (size != original_size || offset != original_offset) {
8091 DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
8092 }
8093
8094 if (cntrl_flags & ~UPL_VALID_FLAGS) {
8095 /*
8096 * For forward compatibility's sake,
8097 * reject any unknown flag.
8098 */
8099 return KERN_INVALID_VALUE;
8100 }
8101 if (vm_lopage_needed == FALSE) {
8102 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
8103 }
8104
8105 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
8106 if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
8107 return KERN_INVALID_VALUE;
8108 }
8109
8110 if (object->phys_contiguous) {
8111 if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
8112 return KERN_INVALID_ADDRESS;
8113 }
8114
8115 if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
8116 return KERN_INVALID_ADDRESS;
8117 }
8118 }
8119 }
8120 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8121 no_zero_fill = TRUE;
8122 }
8123
8124 if (cntrl_flags & UPL_COPYOUT_FROM) {
8125 prot = VM_PROT_READ;
8126 } else {
8127 prot = VM_PROT_READ | VM_PROT_WRITE;
8128 }
8129
8130 if ((!object->internal) && (object->paging_offset != 0)) {
8131 panic("vm_object_iopl_request: external object with non-zero paging offset");
8132 }
8133
8134
8135 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
8136
8137 #if CONFIG_IOSCHED || UPL_DEBUG
8138 if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) {
8139 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
8140 }
8141 #endif
8142
8143 #if CONFIG_IOSCHED
8144 if (object->io_tracking) {
8145 /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
8146 if (!is_kernel_object(object)) {
8147 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
8148 }
8149 }
8150 #endif
8151
8152 if (object->phys_contiguous) {
8153 psize = PAGE_SIZE;
8154 } else {
8155 psize = size;
8156
8157 dw_count = 0;
8158 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8159 dwp_start = vm_page_delayed_work_get_ctx();
8160 if (dwp_start == NULL) {
8161 dwp_start = &dw_array;
8162 dw_limit = 1;
8163 dwp_finish_ctx = FALSE;
8164 }
8165
8166 dwp = dwp_start;
8167 }
8168
8169 if (cntrl_flags & UPL_SET_INTERNAL) {
8170 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8171 user_page_list = size ? upl->page_list : NULL;
8172 } else {
8173 upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8174 }
8175 if (user_page_list) {
8176 user_page_list[0].device = FALSE;
8177 }
8178 *upl_ptr = upl;
8179
8180 if (cntrl_flags & UPL_NOZEROFILLIO) {
8181 DTRACE_VM4(upl_nozerofillio,
8182 vm_object_t, object,
8183 vm_object_offset_t, offset,
8184 upl_size_t, size,
8185 upl_t, upl);
8186 }
8187
8188 upl->map_object = object;
8189 upl->u_offset = original_offset;
8190 upl->u_size = original_size;
8191
8192 size_in_pages = size / PAGE_SIZE;
8193
8194 if (is_kernel_object(object) &&
8195 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
8196 upl->flags |= UPL_KERNEL_OBJECT;
8197 #if UPL_DEBUG
8198 vm_object_lock(object);
8199 #else
8200 vm_object_lock_shared(object);
8201 #endif
8202 } else {
8203 vm_object_lock(object);
8204 vm_object_activity_begin(object);
8205 }
8206 /*
8207 * paging in progress also protects the paging_offset
8208 */
8209 upl->u_offset = original_offset + object->paging_offset;
8210
8211 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8212 /*
8213 * The user requested that access to the pages in this UPL
8214 * be blocked until the UPL is committed or aborted.
8215 */
8216 upl->flags |= UPL_ACCESS_BLOCKED;
8217 }
8218
8219 #if CONFIG_IOSCHED || UPL_DEBUG
8220 if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8221 vm_object_activity_begin(object);
8222 queue_enter(&object->uplq, upl, upl_t, uplq);
8223 }
8224 #endif
8225
8226 if (object->phys_contiguous) {
8227 if (upl->flags & UPL_ACCESS_BLOCKED) {
8228 assert(!object->blocked_access);
8229 object->blocked_access = TRUE;
8230 }
8231
8232 vm_object_unlock(object);
8233
8234 /*
8235 * don't need any shadow mappings for this one
8236 * since it is already I/O memory
8237 */
8238 upl->flags |= UPL_DEVICE_MEMORY;
8239
8240 upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
8241
8242 if (user_page_list) {
8243 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
8244 user_page_list[0].device = TRUE;
8245 }
8246 if (page_list_count != NULL) {
8247 if (upl->flags & UPL_INTERNAL) {
8248 *page_list_count = 0;
8249 } else {
8250 *page_list_count = 1;
8251 }
8252 }
8253
8254 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8255 #if DEVELOPMENT || DEBUG
8256 if (task != NULL) {
8257 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
8258 }
8259 #endif /* DEVELOPMENT || DEBUG */
8260 return KERN_SUCCESS;
8261 }
8262 if (!is_kernel_object(object) && object != compressor_object) {
8263 /*
8264 * Protect user space from future COW operations
8265 */
8266 #if VM_OBJECT_TRACKING_OP_TRUESHARE
8267 if (!object->true_share &&
8268 vm_object_tracking_btlog) {
8269 btlog_record(vm_object_tracking_btlog, object,
8270 VM_OBJECT_TRACKING_OP_TRUESHARE,
8271 btref_get(__builtin_frame_address(0), 0));
8272 }
8273 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
8274
8275 vm_object_lock_assert_exclusive(object);
8276 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
8277
8278 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
8279 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
8280 }
8281 }
8282
8283 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
8284 object->vo_copy != VM_OBJECT_NULL) {
8285 /*
8286 * Honor copy-on-write obligations
8287 *
8288 * The caller is gathering these pages and
8289 * might modify their contents. We need to
8290 * make sure that the copy object has its own
8291 * private copies of these pages before we let
8292 * the caller modify them.
8293 *
8294 * NOTE: someone else could map the original object
8295 * after we've done this copy-on-write here, and they
8296 * could then see an inconsistent picture of the memory
8297 * while it's being modified via the UPL. To prevent this,
8298 * we would have to block access to these pages until the
8299 * UPL is released. We could use the UPL_BLOCK_ACCESS
8300 * code path for that...
8301 */
8302 vm_object_update(object,
8303 offset,
8304 size,
8305 NULL,
8306 NULL,
8307 FALSE, /* should_return */
8308 MEMORY_OBJECT_COPY_SYNC,
8309 VM_PROT_NO_CHANGE);
8310 VM_PAGEOUT_DEBUG(iopl_cow, 1);
8311 VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
8312 }
8313 if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
8314 object->purgable != VM_PURGABLE_VOLATILE &&
8315 object->purgable != VM_PURGABLE_EMPTY &&
8316 object->vo_copy == NULL &&
8317 size == object->vo_size &&
8318 offset == 0 &&
8319 object->shadow == NULL &&
8320 object->pager == NULL) {
8321 if (object->resident_page_count == size_in_pages) {
8322 assert(object != compressor_object);
8323 assert(!is_kernel_object(object));
8324 fast_path_full_req = TRUE;
8325 } else if (object->resident_page_count == 0) {
8326 assert(object != compressor_object);
8327 assert(!is_kernel_object(object));
8328 fast_path_empty_req = TRUE;
8329 set_cache_attr_needed = TRUE;
8330 }
8331 }
8332
8333 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
8334 interruptible = THREAD_ABORTSAFE;
8335 } else {
8336 interruptible = THREAD_UNINT;
8337 }
8338
8339 entry = 0;
8340
8341 xfer_size = size;
8342 dst_offset = offset;
8343
8344 if (fast_path_full_req) {
8345 if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
8346 goto finish;
8347 }
8348 /*
8349 * we couldn't complete the processing of this request on the fast path
8350 * so fall through to the slow path and finish up
8351 */
8352 } else if (fast_path_empty_req) {
8353 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8354 ret = KERN_MEMORY_ERROR;
8355 goto return_err;
8356 }
8357 ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
8358 cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
8359
8360 if (ret) {
8361 free_wired_pages = TRUE;
8362 goto return_err;
8363 }
8364 goto finish;
8365 }
8366
8367 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
8368 fault_info.lo_offset = offset;
8369 fault_info.hi_offset = offset + xfer_size;
8370 fault_info.mark_zf_absent = TRUE;
8371 fault_info.interruptible = interruptible;
8372 fault_info.batch_pmap_op = TRUE;
8373
8374 while (xfer_size) {
8375 vm_fault_return_t result;
8376
8377 dwp->dw_mask = 0;
8378
8379 if (fast_path_full_req) {
8380 /*
8381 * if we get here, it means that we ran into a page
8382 * state we couldn't handle in the fast path and
8383 * bailed out to the slow path... since the order
8384 * we look at pages is different between the 2 paths,
8385 * the following check is needed to determine whether
8386 * this page was already processed in the fast path
8387 */
8388 if (bitmap_test(upl->lite_list, entry)) {
8389 goto skip_page;
8390 }
8391 }
8392 dst_page = vm_page_lookup(object, dst_offset);
8393
8394 if (dst_page == VM_PAGE_NULL ||
8395 dst_page->vmp_busy ||
8396 VMP_ERROR_GET(dst_page) ||
8397 dst_page->vmp_restart ||
8398 dst_page->vmp_absent ||
8399 dst_page->vmp_fictitious) {
8400 if (is_kernel_object(object)) {
8401 panic("vm_object_iopl_request: missing/bad page in kernel object");
8402 }
8403 if (object == compressor_object) {
8404 panic("vm_object_iopl_request: missing/bad page in compressor object");
8405 }
8406
8407 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8408 ret = KERN_MEMORY_ERROR;
8409 goto return_err;
8410 }
8411 set_cache_attr_needed = TRUE;
8412
8413 /*
8414 * We just looked up the page and the result remains valid
8415 * until the object lock is released, so send it to
8416 * vm_fault_page() (as "dst_page"), to avoid having to
8417 * look it up again there.
8418 */
8419 caller_lookup = TRUE;
8420
8421 do {
8422 vm_page_t top_page;
8423 kern_return_t error_code;
8424
8425 fault_info.cluster_size = xfer_size;
8426
8427 vm_object_paging_begin(object);
8428
8429 result = vm_fault_page(object, dst_offset,
8430 prot | VM_PROT_WRITE, FALSE,
8431 caller_lookup,
8432 &prot, &dst_page, &top_page,
8433 (int *)0,
8434 &error_code, no_zero_fill,
8435 &fault_info);
8436
8437 /* our lookup is no longer valid at this point */
8438 caller_lookup = FALSE;
8439
8440 switch (result) {
8441 case VM_FAULT_SUCCESS:
8442 page_grab_count++;
8443
8444 if (!dst_page->vmp_absent) {
8445 vm_page_wakeup_done(object, dst_page);
8446 } else {
8447 /*
8448 * we only get back an absent page if we
8449 * requested that it not be zero-filled
8450 * because we are about to fill it via I/O
8451 *
8452 * absent pages should be left BUSY
8453 * to prevent them from being faulted
8454 * into an address space before we've
8455 * had a chance to complete the I/O on
8456 * them since they may contain info that
8457 * shouldn't be seen by the faulting task
8458 */
8459 }
8460 /*
8461 * Release paging references and
8462 * top-level placeholder page, if any.
8463 */
8464 if (top_page != VM_PAGE_NULL) {
8465 vm_object_t local_object;
8466
8467 local_object = VM_PAGE_OBJECT(top_page);
8468
8469 /*
8470 * comparing 2 packed pointers
8471 */
8472 if (top_page->vmp_object != dst_page->vmp_object) {
8473 vm_object_lock(local_object);
8474 VM_PAGE_FREE(top_page);
8475 vm_object_paging_end(local_object);
8476 vm_object_unlock(local_object);
8477 } else {
8478 VM_PAGE_FREE(top_page);
8479 vm_object_paging_end(local_object);
8480 }
8481 }
8482 vm_object_paging_end(object);
8483 break;
8484
8485 case VM_FAULT_RETRY:
8486 vm_object_lock(object);
8487 break;
8488
8489 case VM_FAULT_MEMORY_SHORTAGE:
8490 OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
8491
8492 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8493
8494 if (vm_page_wait(interruptible)) {
8495 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8496
8497 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8498 vm_object_lock(object);
8499
8500 break;
8501 }
8502 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8503
8504 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8505 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
8506 OS_FALLTHROUGH;
8507
8508 case VM_FAULT_INTERRUPTED:
8509 error_code = MACH_SEND_INTERRUPTED;
8510 OS_FALLTHROUGH;
8511 case VM_FAULT_MEMORY_ERROR:
8512 memory_error:
8513 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
8514
8515 vm_object_lock(object);
8516 goto return_err;
8517
8518 case VM_FAULT_SUCCESS_NO_VM_PAGE:
8519 /* success but no page: fail */
8520 vm_object_paging_end(object);
8521 vm_object_unlock(object);
8522 goto memory_error;
8523
8524 default:
8525 panic("vm_object_iopl_request: unexpected error"
8526 " 0x%x from vm_fault_page()\n", result);
8527 }
8528 } while (result != VM_FAULT_SUCCESS);
8529 }
8530 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8531
8532 if (upl->flags & UPL_KERNEL_OBJECT) {
8533 goto record_phys_addr;
8534 }
8535
8536 if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
8537 dst_page->vmp_busy = TRUE;
8538 goto record_phys_addr;
8539 }
8540
8541 if (dst_page->vmp_cleaning) {
8542 /*
8543 * Someone else is cleaning this page in place.
8544 * In theory, we should be able to proceed and use this
8545 * page, but they'll probably end up clearing the "busy"
8546 * bit on it in upl_commit_range() even though they didn't
8547 * set it, so they would clear our "busy" bit and open
8548 * us to race conditions.
8549 * We'd better wait for the cleaning to complete and
8550 * then try again.
8551 */
8552 VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
8553 vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
8554 continue;
8555 }
8556 if (dst_page->vmp_laundry) {
8557 vm_pageout_steal_laundry(dst_page, FALSE);
8558 }
8559
8560 if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
8561 phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
8562 vm_page_t low_page;
8563 int refmod;
8564
8565 /*
8566 * Support devices that can't DMA above 32 bits
8567 * by substituting pages from a pool of low-address
8568 * memory for any pages we find above the 4G mark.
8569 * We can't substitute if the page is already wired because
8570 * we don't know whether that physical address has been
8571 * handed out to some other 64-bit capable DMA device to use.
8572 */
8573 if (VM_PAGE_WIRED(dst_page)) {
8574 ret = KERN_PROTECTION_FAILURE;
8575 goto return_err;
8576 }
8577 low_page = vm_page_grablo();
8578
8579 if (low_page == VM_PAGE_NULL) {
8580 ret = KERN_RESOURCE_SHORTAGE;
8581 goto return_err;
8582 }
8583 /*
8584 * from here until the vm_page_replace completes
8585 * we mustn't drop the object lock... we don't
8586 * want anyone refaulting this page in and using
8587 * it after we disconnect it... we want the fault
8588 * to find the new page being substituted.
8589 */
8590 if (dst_page->vmp_pmapped) {
8591 refmod = pmap_disconnect(phys_page);
8592 } else {
8593 refmod = 0;
8594 }
8595
8596 if (!dst_page->vmp_absent) {
8597 vm_page_copy(dst_page, low_page);
8598 }
8599
8600 low_page->vmp_reference = dst_page->vmp_reference;
8601 low_page->vmp_dirty = dst_page->vmp_dirty;
8602 low_page->vmp_absent = dst_page->vmp_absent;
8603
8604 if (refmod & VM_MEM_REFERENCED) {
8605 low_page->vmp_reference = TRUE;
8606 }
8607 if (refmod & VM_MEM_MODIFIED) {
8608 SET_PAGE_DIRTY(low_page, FALSE);
8609 }
8610
8611 vm_page_replace(low_page, object, dst_offset);
8612
8613 dst_page = low_page;
8614 /*
8615 * vm_page_grablo returned the page marked
8616 * BUSY... we don't need a PAGE_WAKEUP_DONE
8617 * here, because we've never dropped the object lock
8618 */
8619 if (!dst_page->vmp_absent) {
8620 dst_page->vmp_busy = FALSE;
8621 }
8622
8623 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8624 }
8625 if (!dst_page->vmp_busy) {
8626 dwp->dw_mask |= DW_vm_page_wire;
8627 }
8628
8629 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8630 /*
8631 * Mark the page "busy" to block any future page fault
8632 * on this page in addition to wiring it.
8633 * We'll also remove the mapping
8634 * of all these pages before leaving this routine.
8635 */
8636 assert(!dst_page->vmp_fictitious);
8637 dst_page->vmp_busy = TRUE;
8638 }
8639 /*
8640 * expect the page to be used;
8641 * the page queues lock must be held to set 'reference'
8642 */
8643 dwp->dw_mask |= DW_set_reference;
8644
8645 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8646 SET_PAGE_DIRTY(dst_page, TRUE);
8647 /*
8648 * Page belonging to a code-signed object is about to
8649 * be written. Mark it tainted and disconnect it from
8650 * all pmaps so processes have to fault it back in and
8651 * deal with the tainted bit.
8652 */
8653 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
8654 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
8655 vm_page_iopl_tainted++;
8656 if (dst_page->vmp_pmapped) {
8657 int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
8658 if (refmod & VM_MEM_REFERENCED) {
8659 dst_page->vmp_reference = TRUE;
8660 }
8661 }
8662 }
8663 }
8664 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8665 pmap_sync_page_attributes_phys(phys_page);
8666 dst_page->vmp_written_by_kernel = FALSE;
8667 }
8668
8669 record_phys_addr:
8670 if (dst_page->vmp_busy) {
8671 upl->flags |= UPL_HAS_BUSY;
8672 }
8673
8674 bitmap_set(upl->lite_list, entry);
8675
8676 if (phys_page > upl->highest_page) {
8677 upl->highest_page = phys_page;
8678 }
8679
8680 if (user_page_list) {
8681 user_page_list[entry].phys_addr = phys_page;
8682 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
8683 user_page_list[entry].absent = dst_page->vmp_absent;
8684 user_page_list[entry].dirty = dst_page->vmp_dirty;
8685 user_page_list[entry].precious = dst_page->vmp_precious;
8686 user_page_list[entry].device = FALSE;
8687 user_page_list[entry].needed = FALSE;
8688 if (dst_page->vmp_clustered == TRUE) {
8689 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
8690 } else {
8691 user_page_list[entry].speculative = FALSE;
8692 }
8693 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
8694 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
8695 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
8696 user_page_list[entry].mark = FALSE;
8697 }
8698 if (!is_kernel_object(object) && object != compressor_object) {
8699 /*
8700 * someone is explicitly grabbing this page...
8701 * update clustered and speculative state
8702 *
8703 */
8704 if (dst_page->vmp_clustered) {
8705 VM_PAGE_CONSUME_CLUSTERED(dst_page);
8706 }
8707 }
8708 skip_page:
8709 entry++;
8710 dst_offset += PAGE_SIZE_64;
8711 xfer_size -= PAGE_SIZE;
8712
8713 if (dwp->dw_mask) {
8714 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
8715
8716 if (dw_count >= dw_limit) {
8717 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8718
8719 dwp = dwp_start;
8720 dw_count = 0;
8721 }
8722 }
8723 }
8724 assert(entry == size_in_pages);
8725
8726 if (dw_count) {
8727 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8728 dwp = dwp_start;
8729 dw_count = 0;
8730 }
8731 finish:
8732 if (user_page_list && set_cache_attr_needed == TRUE) {
8733 vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
8734 }
8735
8736 if (page_list_count != NULL) {
8737 if (upl->flags & UPL_INTERNAL) {
8738 *page_list_count = 0;
8739 } else if (*page_list_count > size_in_pages) {
8740 *page_list_count = size_in_pages;
8741 }
8742 }
8743 vm_object_unlock(object);
8744
8745 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8746 /*
8747 * We've marked all the pages "busy" so that future
8748 * page faults will block.
8749 * Now remove the mapping for these pages, so that they
8750 * can't be accessed without causing a page fault.
8751 */
8752 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
8753 PMAP_NULL,
8754 PAGE_SIZE,
8755 0, VM_PROT_NONE);
8756 assert(!object->blocked_access);
8757 object->blocked_access = TRUE;
8758 }
8759
8760 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8761 #if DEVELOPMENT || DEBUG
8762 if (task != NULL) {
8763 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
8764 }
8765 #endif /* DEVELOPMENT || DEBUG */
8766
8767 if (dwp_start && dwp_finish_ctx) {
8768 vm_page_delayed_work_finish_ctx(dwp_start);
8769 dwp_start = dwp = NULL;
8770 }
8771
8772 return KERN_SUCCESS;
8773
8774 return_err:
8775 dw_index = 0;
8776
8777 for (; offset < dst_offset; offset += PAGE_SIZE) {
8778 boolean_t need_unwire;
8779
8780 dst_page = vm_page_lookup(object, offset);
8781
8782 if (dst_page == VM_PAGE_NULL) {
8783 panic("vm_object_iopl_request: Wired page missing.");
8784 }
8785
8786 /*
8787 * if we've already processed this page in an earlier
8788 * dw_do_work, we need to undo the wiring... we will
8789 * leave the dirty and reference bits on if they
8790 * were set, since we don't have a good way of knowing
8791 * what the previous state was and we won't get here
8792 * under any normal circumstances... we will always
8793 * clear BUSY and wake up any waiters via vm_page_free
8794 * or PAGE_WAKEUP_DONE
8795 */
8796 need_unwire = TRUE;
8797
8798 if (dw_count) {
8799 if ((dwp_start)[dw_index].dw_m == dst_page) {
8800 /*
8801 * still in the deferred work list
8802 * which means we haven't yet called
8803 * vm_page_wire on this page
8804 */
8805 need_unwire = FALSE;
8806
8807 dw_index++;
8808 dw_count--;
8809 }
8810 }
8811 vm_page_lock_queues();
8812
8813 if (dst_page->vmp_absent || free_wired_pages == TRUE) {
8814 vm_page_free(dst_page);
8815
8816 need_unwire = FALSE;
8817 } else {
8818 if (need_unwire == TRUE) {
8819 vm_page_unwire(dst_page, TRUE);
8820 }
8821
8822 vm_page_wakeup_done(object, dst_page);
8823 }
8824 vm_page_unlock_queues();
8825
8826 if (need_unwire == TRUE) {
8827 counter_inc(&vm_statistics_reactivations);
8828 }
8829 }
8830 #if UPL_DEBUG
8831 upl->upl_state = 2;
8832 #endif
8833 if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8834 vm_object_activity_end(object);
8835 vm_object_collapse(object, 0, TRUE);
8836 }
8837 vm_object_unlock(object);
8838 upl_destroy(upl);
8839
8840 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
8841 #if DEVELOPMENT || DEBUG
8842 if (task != NULL) {
8843 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
8844 }
8845 #endif /* DEVELOPMENT || DEBUG */
8846
8847 if (dwp_start && dwp_finish_ctx) {
8848 vm_page_delayed_work_finish_ctx(dwp_start);
8849 dwp_start = dwp = NULL;
8850 }
8851 return ret;
8852 }
8853
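/*
 * upl_transpose:
 *	Exchange the backing store of the two VM objects mapped by upl1 and
 *	upl2, then repoint each UPL at the object that now holds its pages.
 *	Both UPLs must cover their entire object (offset 0, equal sizes) and
 *	neither may be a vector UPL.
 */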
8854 kern_return_t
8855 upl_transpose(
8856 upl_t upl1,
8857 upl_t upl2)
8858 {
8859 kern_return_t retval;
8860 boolean_t upls_locked;
8861 vm_object_t object1, object2;
8862
8863 /* LD: Should mapped UPLs be eligible for a transpose? */
8864 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
8865 return KERN_INVALID_ARGUMENT;
8866 }
8867
8868 upls_locked = FALSE;
8869
8870 /*
8871 * Since we need to lock both UPLs at the same time,
8872 * avoid deadlocks by always taking locks in the same order.
8873 */
8874 if (upl1 < upl2) {
8875 upl_lock(upl1);
8876 upl_lock(upl2);
8877 } else {
8878 upl_lock(upl2);
8879 upl_lock(upl1);
8880 }
8881 upls_locked = TRUE; /* the UPLs will need to be unlocked */
8882
8883 object1 = upl1->map_object;
8884 object2 = upl2->map_object;
8885
8886 if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
8887 upl1->u_size != upl2->u_size) {
8888 /*
8889 * We deal only with full objects, not subsets.
8890 * That's because we exchange the entire backing store info
8891 * for the objects: pager, resident pages, etc... We can't do
8892 * only part of it.
8893 */
8894 retval = KERN_INVALID_VALUE;
8895 goto done;
8896 }
8897
8898 /*
	 * Transpose the VM objects' backing store.
8900 */
8901 retval = vm_object_transpose(object1, object2,
8902 upl_adjusted_size(upl1, PAGE_MASK));
8903
8904 if (retval == KERN_SUCCESS) {
8905 /*
8906 * Make each UPL point to the correct VM object, i.e. the
8907 * object holding the pages that the UPL refers to...
8908 */
8909 #if CONFIG_IOSCHED || UPL_DEBUG
8910 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
8911 vm_object_lock(object1);
8912 vm_object_lock(object2);
8913 }
8914 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8915 queue_remove(&object1->uplq, upl1, upl_t, uplq);
8916 }
8917 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8918 queue_remove(&object2->uplq, upl2, upl_t, uplq);
8919 }
8920 #endif
8921 upl1->map_object = object2;
8922 upl2->map_object = object1;
8923
8924 #if CONFIG_IOSCHED || UPL_DEBUG
8925 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8926 queue_enter(&object2->uplq, upl1, upl_t, uplq);
8927 }
8928 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8929 queue_enter(&object1->uplq, upl2, upl_t, uplq);
8930 }
8931 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
8932 vm_object_unlock(object2);
8933 vm_object_unlock(object1);
8934 }
8935 #endif
8936 }
8937
8938 done:
8939 /*
8940 * Cleanup.
8941 */
8942 if (upls_locked) {
8943 upl_unlock(upl1);
8944 upl_unlock(upl2);
8945 upls_locked = FALSE;
8946 }
8947
8948 return retval;
8949 }
8950
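/*
 * upl_range_needed:
 *	Mark `count` entries of an internal UPL's page list, starting at
 *	`index`, as needed by the caller.  The flag is a hint consulted
 *	later (e.g. when the UPL is committed) to tell pages the caller
 *	actually needed apart from pages that were only brought in as
 *	part of the cluster.
 */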
8951 void
8952 upl_range_needed(
8953 upl_t upl,
8954 int index,
8955 int count)
8956 {
8957 int size_in_pages;
8958
8959 if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
8960 return;
8961 }
8962
8963 size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8964
8965 while (count-- && index < size_in_pages) {
8966 upl->page_list[index++].needed = TRUE;
8967 }
8968 }
8969
8970
8971 /*
8972 * Reserve of virtual addresses in the kernel address space.
8973 * We need to map the physical pages in the kernel, so that we
8974 * can call the code-signing or slide routines with a kernel
8975 * virtual address. We keep this pool of pre-allocated kernel
8976 * virtual addresses so that we don't have to scan the kernel's
 * virtual address space each time we need to work with
8978 * a physical page.
8979 */
8980 SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
8981 #define VM_PAGING_NUM_PAGES 64
8982 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
8983 bool vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
8984 int vm_paging_max_index = 0;
8985 int vm_paging_page_waiter = 0;
8986 int vm_paging_page_waiter_total = 0;
8987
8988 unsigned long vm_paging_no_kernel_page = 0;
8989 unsigned long vm_paging_objects_mapped = 0;
8990 unsigned long vm_paging_pages_mapped = 0;
8991 unsigned long vm_paging_objects_mapped_slow = 0;
8992 unsigned long vm_paging_pages_mapped_slow = 0;
8993
8994 __startup_func
8995 static void
8996 vm_paging_map_init(void)
8997 {
8998 kmem_alloc(kernel_map, &vm_paging_base_address,
8999 ptoa(VM_PAGING_NUM_PAGES),
9000 KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
9001 VM_KERN_MEMORY_NONE);
9002 }
9003 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
9004
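/*
 * Typical caller pattern for vm_paging_map_object()/vm_paging_unmap_object()
 * (illustrative sketch only, not code from this file; error handling elided):
 *
 *	kern_return_t   kr;
 *	vm_map_offset_t kva;
 *	vm_map_size_t   sz = PAGE_SIZE;
 *	boolean_t       need_unmap;
 *
 *	// VM object locked, page kept busy by the caller
 *	kr = vm_paging_map_object(page, object, offset, VM_PROT_READ,
 *	    FALSE, &sz, &kva, &need_unmap);
 *	if (kr == KERN_SUCCESS) {
 *		// ... access the page contents through `kva` ...
 *		if (need_unmap) {
 *			vm_paging_unmap_object(object, kva, kva + sz);
 *		}
 *	}
 */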
9005 /*
9006 * vm_paging_map_object:
9007 * Maps part of a VM object's pages in the kernel
9008 * virtual address space, using the pre-allocated
9009 * kernel virtual addresses, if possible.
9010 * Context:
9011 * The VM object is locked. This lock will get
9012 * dropped and re-acquired though, so the caller
9013 * must make sure the VM object is kept alive
9014 * (by holding a VM map that has a reference
9015 * on it, for example, or taking an extra reference).
9016 * The page should also be kept busy to prevent
9017 * it from being reclaimed.
9018 */
9019 kern_return_t
9020 vm_paging_map_object(
9021 vm_page_t page,
9022 vm_object_t object,
9023 vm_object_offset_t offset,
9024 vm_prot_t protection,
9025 boolean_t can_unlock_object,
9026 vm_map_size_t *size, /* IN/OUT */
9027 vm_map_offset_t *address, /* OUT */
9028 boolean_t *need_unmap) /* OUT */
9029 {
9030 kern_return_t kr;
9031 vm_map_offset_t page_map_offset;
9032 vm_map_size_t map_size;
9033 vm_object_offset_t object_offset;
9034 int i;
9035
9036 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
9037 /* use permanent 1-to-1 kernel mapping of physical memory ? */
9038 *address = (vm_map_offset_t)
9039 phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
9040 *need_unmap = FALSE;
9041 return KERN_SUCCESS;
9042
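		/*
		 * Note: the phystokv() fast path above returns
		 * unconditionally, so the pre-allocated-KVA path that
		 * follows is not reached as this function is currently
		 * written.
		 */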
9043 assert(page->vmp_busy);
9044 /*
9045 * Use one of the pre-allocated kernel virtual addresses
9046 * and just enter the VM page in the kernel address space
9047 * at that virtual address.
9048 */
9049 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9050
9051 /*
9052 * Try and find an available kernel virtual address
9053 * from our pre-allocated pool.
9054 */
9055 page_map_offset = 0;
9056 for (;;) {
9057 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
9058 if (vm_paging_page_inuse[i] == FALSE) {
9059 page_map_offset =
9060 vm_paging_base_address +
9061 (i * PAGE_SIZE);
9062 break;
9063 }
9064 }
9065 if (page_map_offset != 0) {
9066 /* found a space to map our page ! */
9067 break;
9068 }
9069
9070 if (can_unlock_object) {
9071 /*
9072 * If we can afford to unlock the VM object,
9073 * let's take the slow path now...
9074 */
9075 break;
9076 }
9077 /*
9078 * We can't afford to unlock the VM object, so
9079 * let's wait for a space to become available...
9080 */
9081 vm_paging_page_waiter_total++;
9082 vm_paging_page_waiter++;
9083 kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
9084 if (kr == THREAD_WAITING) {
9085 simple_unlock(&vm_paging_lock);
9086 kr = thread_block(THREAD_CONTINUE_NULL);
9087 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9088 }
9089 vm_paging_page_waiter--;
9090 /* ... and try again */
9091 }
9092
9093 if (page_map_offset != 0) {
9094 /*
9095 * We found a kernel virtual address;
9096 * map the physical page to that virtual address.
9097 */
9098 if (i > vm_paging_max_index) {
9099 vm_paging_max_index = i;
9100 }
9101 vm_paging_page_inuse[i] = TRUE;
9102 simple_unlock(&vm_paging_lock);
9103
9104 page->vmp_pmapped = TRUE;
9105
9106 /*
9107 * Keep the VM object locked over the PMAP_ENTER
9108 * and the actual use of the page by the kernel,
9109 * or this pmap mapping might get undone by a
9110 * vm_object_pmap_protect() call...
9111 */
9112 kr = pmap_enter_check(kernel_pmap,
9113 page_map_offset,
9114 page,
9115 protection,
9116 VM_PROT_NONE,
9117 0,
9118 TRUE);
9119 assert(kr == KERN_SUCCESS);
9120 vm_paging_objects_mapped++;
9121 vm_paging_pages_mapped++;
9122 *address = page_map_offset;
9123 *need_unmap = TRUE;
9124
9125 #if KASAN
9126 kasan_notify_address(page_map_offset, PAGE_SIZE);
9127 #endif
9128
9129 /* all done and mapped, ready to use ! */
9130 return KERN_SUCCESS;
9131 }
9132
9133 /*
9134 * We ran out of pre-allocated kernel virtual
9135 * addresses. Just map the page in the kernel
9136 * the slow and regular way.
9137 */
9138 vm_paging_no_kernel_page++;
9139 simple_unlock(&vm_paging_lock);
9140 }
9141
9142 if (!can_unlock_object) {
9143 *address = 0;
9144 *size = 0;
9145 *need_unmap = FALSE;
9146 return KERN_NOT_SUPPORTED;
9147 }
9148
9149 object_offset = vm_object_trunc_page(offset);
9150 map_size = vm_map_round_page(*size,
9151 VM_MAP_PAGE_MASK(kernel_map));
9152
9153 /*
9154 * Try and map the required range of the object
9155 * in the kernel_map. Given that allocation is
9156 * for pageable memory, it shouldn't contain
9157 * pointers and is mapped into the data range.
9158 */
9159
9160 vm_object_reference_locked(object); /* for the map entry */
9161 vm_object_unlock(object);
9162
9163 kr = vm_map_enter(kernel_map,
9164 address,
9165 map_size,
9166 0,
9167 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
9168 object,
9169 object_offset,
9170 FALSE,
9171 protection,
9172 VM_PROT_ALL,
9173 VM_INHERIT_NONE);
9174 if (kr != KERN_SUCCESS) {
9175 *address = 0;
9176 *size = 0;
9177 *need_unmap = FALSE;
9178 vm_object_deallocate(object); /* for the map entry */
9179 vm_object_lock(object);
9180 return kr;
9181 }
9182
9183 *size = map_size;
9184
9185 /*
9186 * Enter the mapped pages in the page table now.
9187 */
9188 vm_object_lock(object);
9189 /*
9190 * VM object must be kept locked from before PMAP_ENTER()
9191 * until after the kernel is done accessing the page(s).
9192 * Otherwise, the pmap mappings in the kernel could be
9193 * undone by a call to vm_object_pmap_protect().
9194 */
9195
9196 for (page_map_offset = 0;
9197 map_size != 0;
9198 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
9199 page = vm_page_lookup(object, offset + page_map_offset);
9200 if (page == VM_PAGE_NULL) {
9201 printf("vm_paging_map_object: no page !?");
9202 vm_object_unlock(object);
9203 vm_map_remove(kernel_map, *address, *size);
9204 *address = 0;
9205 *size = 0;
9206 *need_unmap = FALSE;
9207 vm_object_lock(object);
9208 return KERN_MEMORY_ERROR;
9209 }
9210 page->vmp_pmapped = TRUE;
9211
9212 kr = pmap_enter_check(kernel_pmap,
9213 *address + page_map_offset,
9214 page,
9215 protection,
9216 VM_PROT_NONE,
9217 0,
9218 TRUE);
9219 assert(kr == KERN_SUCCESS);
9220 #if KASAN
9221 kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
9222 #endif
9223 }
9224
9225 vm_paging_objects_mapped_slow++;
9226 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
9227
9228 *need_unmap = TRUE;
9229
9230 return KERN_SUCCESS;
9231 }
9232
9233 /*
9234 * vm_paging_unmap_object:
9235 * Unmaps part of a VM object's pages from the kernel
9236 * virtual address space.
9237 * Context:
 *	The VM object is locked.  This lock may get dropped and
 *	re-acquired, but only when the mapping being removed did
 *	not come from the pre-allocated pool.
9240 */
9241 void
9242 vm_paging_unmap_object(
9243 vm_object_t object,
9244 vm_map_offset_t start,
9245 vm_map_offset_t end)
9246 {
9247 int i;
9248
9249 if ((vm_paging_base_address == 0) ||
9250 (start < vm_paging_base_address) ||
9251 (end > (vm_paging_base_address
9252 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
9253 /*
		 * We didn't use our pre-allocated pool of
		 * kernel virtual addresses. Deallocate the
9256 * virtual memory.
9257 */
9258 if (object != VM_OBJECT_NULL) {
9259 vm_object_unlock(object);
9260 }
9261 vm_map_remove(kernel_map, start, end);
9262 if (object != VM_OBJECT_NULL) {
9263 vm_object_lock(object);
9264 }
9265 } else {
9266 /*
9267 * We used a kernel virtual address from our
9268 * pre-allocated pool. Put it back in the pool
9269 * for next time.
9270 */
9271 assert(end - start == PAGE_SIZE);
9272 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
9273 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
9274
9275 /* undo the pmap mapping */
9276 pmap_remove(kernel_pmap, start, end);
9277
9278 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9279 vm_paging_page_inuse[i] = FALSE;
9280 if (vm_paging_page_waiter) {
9281 thread_wakeup(&vm_paging_page_waiter);
9282 }
9283 simple_unlock(&vm_paging_lock);
9284 }
9285 }
9286
9287
/*
 * vm_pageout_steal_laundry:
 *	Take back a page that has been sent to the laundry so the caller
 *	can reuse it.  The page's VM object must be locked; the page queues
 *	lock is taken here unless the caller already holds it.
 */
9291 void
9292 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
9293 {
9294 if (!queues_locked) {
9295 vm_page_lockspin_queues();
9296 }
9297
9298 page->vmp_free_when_done = FALSE;
	/*
	 * We need to drop the laundry count and may also need to
	 * remove the page from the I/O paging queue;
	 * vm_pageout_throttle_up handles both cases.
	 *
	 * The laundry and pageout_queue flags are cleared as well.
	 */
9307 vm_pageout_throttle_up(page);
9308
9309 if (!queues_locked) {
9310 vm_page_unlock_queues();
9311 }
9312 }
9313
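/*
 * Vector UPLs aggregate several sub-UPLs behind a single upl_t so that a
 * caller (primarily the VFS cluster layer) can issue one I/O spanning
 * multiple discontiguous UPLs.  The helpers below create a vector UPL,
 * attach and detach sub-UPLs, build a flat page list, and translate
 * offsets and iostate between the vector and its elements.
 */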
9314 #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
9315
9316 upl_t
9317 vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
9318 {
9319 int i = 0;
9320 upl_t upl;
9321
9322 assert(max_upls > 0);
9323 if (max_upls == 0) {
9324 return NULL;
9325 }
9326
9327 if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
9328 max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
9329 }
9330 vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
9331
9332 upl = upl_create(0, UPL_VECTOR, 0);
9333 upl->vector_upl = vector_upl;
9334 upl->u_offset = upl_offset;
9335 vector_upl->size = 0;
9336 vector_upl->offset = upl_offset;
9337 vector_upl->invalid_upls = 0;
9338 vector_upl->num_upls = 0;
9339 vector_upl->pagelist = NULL;
9340 vector_upl->max_upls = max_upls;
9341
9342 for (i = 0; i < max_upls; i++) {
9343 vector_upl->upls[i].iostate.size = 0;
9344 vector_upl->upls[i].iostate.offset = 0;
9345 }
9346 return upl;
9347 }
9348
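/*
 * Illustrative lifecycle of a vector UPL (sketch only; the real callers
 * live in the VFS cluster layer and include full error handling):
 *
 *	upl_t vupl = vector_upl_create(offset, n_chunks);
 *	// for each sub-UPL making up the I/O:
 *	vector_upl_set_subupl(vupl, subupl, io_size);
 *	vector_upl_set_iostate(vupl, subupl, sub_offset, io_size);
 *	// once all sub-UPLs are attached:
 *	vector_upl_set_pagelist(vupl);
 *	// ... issue the I/O against vupl; as each sub-UPL is committed or
 *	// aborted it is detached again with an io_size of 0 ...
 *	vector_upl_set_subupl(vupl, subupl, 0);
 */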
9349 upl_size_t
9350 vector_upl_get_size(const upl_t upl)
9351 {
9352 if (!vector_upl_is_valid(upl)) {
9353 return upl_get_size(upl);
9354 } else {
9355 return round_page_32(upl->vector_upl->size);
9356 }
9357 }
9358
9359 uint32_t
9360 vector_upl_max_upls(const upl_t upl)
9361 {
9362 if (!vector_upl_is_valid(upl)) {
9363 return 0;
9364 }
9365 return ((vector_upl_t)(upl->vector_upl))->max_upls;
9366 }
9367
9368 void
9369 vector_upl_deallocate(upl_t upl)
9370 {
9371 vector_upl_t vector_upl = upl->vector_upl;
9372
9373 assert(vector_upl_is_valid(upl));
9374
9375 if (vector_upl->invalid_upls != vector_upl->num_upls) {
9376 panic("Deallocating non-empty Vectored UPL");
9377 }
9378 uint32_t max_upls = vector_upl->max_upls;
9379 kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
9380 kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
9381 upl->vector_upl = NULL;
9382 }
9383
9384 boolean_t
9385 vector_upl_is_valid(upl_t upl)
9386 {
9387 return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
9388 }
9389
9390 boolean_t
9391 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
9392 {
9393 if (vector_upl_is_valid(upl)) {
9394 vector_upl_t vector_upl = upl->vector_upl;
9395
9396 if (vector_upl) {
9397 if (subupl) {
9398 if (io_size) {
9399 if (io_size < PAGE_SIZE) {
9400 io_size = PAGE_SIZE;
9401 }
9402 subupl->vector_upl = (void*)vector_upl;
9403 vector_upl->upls[vector_upl->num_upls++].elem = subupl;
9404 vector_upl->size += io_size;
9405 upl->u_size += io_size;
9406 } else {
9407 uint32_t i = 0, invalid_upls = 0;
9408 for (i = 0; i < vector_upl->num_upls; i++) {
9409 if (vector_upl->upls[i].elem == subupl) {
9410 break;
9411 }
9412 }
9413 if (i == vector_upl->num_upls) {
9414 panic("Trying to remove sub-upl when none exists");
9415 }
9416
9417 vector_upl->upls[i].elem = NULL;
9418 invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
9419 relaxed);
9420 if (invalid_upls == vector_upl->num_upls) {
9421 return TRUE;
9422 } else {
9423 return FALSE;
9424 }
9425 }
9426 } else {
9427 panic("vector_upl_set_subupl was passed a NULL upl element");
9428 }
9429 } else {
9430 panic("vector_upl_set_subupl was passed a non-vectored upl");
9431 }
9432 } else {
9433 panic("vector_upl_set_subupl was passed a NULL upl");
9434 }
9435
9436 return FALSE;
9437 }
9438
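/*
 * Build the vector UPL's flat page list by concatenating the page lists of
 * its sub-UPLs, and track the highest physical page number seen across all
 * of them.
 */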
9439 void
9440 vector_upl_set_pagelist(upl_t upl)
9441 {
9442 if (vector_upl_is_valid(upl)) {
9443 uint32_t i = 0;
9444 vector_upl_t vector_upl = upl->vector_upl;
9445
9446 if (vector_upl) {
9447 vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
9448
9449 vector_upl->pagelist = kalloc_type(struct upl_page_info,
9450 atop(vector_upl->size), Z_WAITOK);
9451
9452 for (i = 0; i < vector_upl->num_upls; i++) {
9453 cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
9454 bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
9455 pagelist_size += cur_upl_pagelist_size;
9456 if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
9457 upl->highest_page = vector_upl->upls[i].elem->highest_page;
9458 }
9459 }
9460 assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
9461 } else {
9462 panic("vector_upl_set_pagelist was passed a non-vectored upl");
9463 }
9464 } else {
9465 panic("vector_upl_set_pagelist was passed a NULL upl");
9466 }
9467 }
9468
9469 upl_t
9470 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
9471 {
9472 if (vector_upl_is_valid(upl)) {
9473 vector_upl_t vector_upl = upl->vector_upl;
9474 if (vector_upl) {
9475 if (index < vector_upl->num_upls) {
9476 return vector_upl->upls[index].elem;
9477 }
9478 } else {
9479 panic("vector_upl_subupl_byindex was passed a non-vectored upl");
9480 }
9481 }
9482 return NULL;
9483 }
9484
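/*
 * Translate an offset/size pair expressed relative to the vector UPL into
 * the sub-UPL that covers it: returns that sub-UPL (or NULL if it has
 * already been committed/aborted) and rewrites *upl_offset and *upl_size
 * to be relative to the sub-UPL's iostate.
 */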
9485 upl_t
9486 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
9487 {
9488 if (vector_upl_is_valid(upl)) {
9489 uint32_t i = 0;
9490 vector_upl_t vector_upl = upl->vector_upl;
9491
9492 if (vector_upl) {
9493 upl_t subupl = NULL;
9494 vector_upl_iostates_t subupl_state;
9495
9496 for (i = 0; i < vector_upl->num_upls; i++) {
9497 subupl = vector_upl->upls[i].elem;
9498 subupl_state = vector_upl->upls[i].iostate;
9499 if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
9500 /* We could have been passed an offset/size pair that belongs
				 * to a UPL element that has already been committed/aborted.
9502 * If so, return NULL.
9503 */
9504 if (subupl == NULL) {
9505 return NULL;
9506 }
9507 if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
9508 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
9509 if (*upl_size > subupl_state.size) {
9510 *upl_size = subupl_state.size;
9511 }
9512 }
9513 if (*upl_offset >= subupl_state.offset) {
9514 *upl_offset -= subupl_state.offset;
9515 } else if (i) {
9516 panic("Vector UPL offset miscalculation");
9517 }
9518 return subupl;
9519 }
9520 }
9521 } else {
9522 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
9523 }
9524 }
9525 return NULL;
9526 }
9527
9528 void
9529 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
9530 {
9531 *v_upl_submap = NULL;
9532
9533 if (vector_upl_is_valid(upl)) {
9534 vector_upl_t vector_upl = upl->vector_upl;
9535 if (vector_upl) {
9536 *v_upl_submap = vector_upl->submap;
9537 *submap_dst_addr = vector_upl->submap_dst_addr;
9538 } else {
9539 panic("vector_upl_get_submap was passed a non-vectored UPL");
9540 }
9541 } else {
9542 panic("vector_upl_get_submap was passed a null UPL");
9543 }
9544 }
9545
9546 void
9547 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
9548 {
9549 if (vector_upl_is_valid(upl)) {
9550 vector_upl_t vector_upl = upl->vector_upl;
9551 if (vector_upl) {
9552 vector_upl->submap = submap;
9553 vector_upl->submap_dst_addr = submap_dst_addr;
9554 } else {
			panic("vector_upl_set_submap was passed a non-vectored UPL");
9556 }
9557 } else {
		panic("vector_upl_set_submap was passed a NULL UPL");
9559 }
9560 }
9561
9562 void
9563 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
9564 {
9565 if (vector_upl_is_valid(upl)) {
9566 uint32_t i = 0;
9567 vector_upl_t vector_upl = upl->vector_upl;
9568
9569 if (vector_upl) {
9570 for (i = 0; i < vector_upl->num_upls; i++) {
9571 if (vector_upl->upls[i].elem == subupl) {
9572 break;
9573 }
9574 }
9575
9576 if (i == vector_upl->num_upls) {
9577 panic("setting sub-upl iostate when none exists");
9578 }
9579
9580 vector_upl->upls[i].iostate.offset = offset;
9581 if (size < PAGE_SIZE) {
9582 size = PAGE_SIZE;
9583 }
9584 vector_upl->upls[i].iostate.size = size;
9585 } else {
9586 panic("vector_upl_set_iostate was passed a non-vectored UPL");
9587 }
9588 } else {
9589 panic("vector_upl_set_iostate was passed a NULL UPL");
9590 }
9591 }
9592
9593 void
9594 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
9595 {
9596 if (vector_upl_is_valid(upl)) {
9597 uint32_t i = 0;
9598 vector_upl_t vector_upl = upl->vector_upl;
9599
9600 if (vector_upl) {
9601 for (i = 0; i < vector_upl->num_upls; i++) {
9602 if (vector_upl->upls[i].elem == subupl) {
9603 break;
9604 }
9605 }
9606
9607 if (i == vector_upl->num_upls) {
9608 panic("getting sub-upl iostate when none exists");
9609 }
9610
9611 *offset = vector_upl->upls[i].iostate.offset;
9612 *size = vector_upl->upls[i].iostate.size;
9613 } else {
9614 panic("vector_upl_get_iostate was passed a non-vectored UPL");
9615 }
9616 } else {
9617 panic("vector_upl_get_iostate was passed a NULL UPL");
9618 }
9619 }
9620
9621 void
9622 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
9623 {
9624 if (vector_upl_is_valid(upl)) {
9625 vector_upl_t vector_upl = upl->vector_upl;
9626 if (vector_upl) {
9627 if (index < vector_upl->num_upls) {
9628 *offset = vector_upl->upls[index].iostate.offset;
9629 *size = vector_upl->upls[index].iostate.size;
9630 } else {
9631 *offset = *size = 0;
9632 }
9633 } else {
9634 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
9635 }
9636 } else {
9637 panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
9638 }
9639 }
9640
9641 void *
9642 upl_get_internal_vectorupl(upl_t upl)
9643 {
9644 return upl->vector_upl;
9645 }
9646
9647 upl_page_info_t *
9648 upl_get_internal_vectorupl_pagelist(upl_t upl)
9649 {
9650 return upl->vector_upl->pagelist;
9651 }
9652
9653 upl_page_info_t *
9654 upl_get_internal_page_list(upl_t upl)
9655 {
9656 return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
9657 }
9658
9659 void
9660 upl_clear_dirty(
9661 upl_t upl,
9662 boolean_t value)
9663 {
9664 if (value) {
9665 upl->flags |= UPL_CLEAR_DIRTY;
9666 } else {
9667 upl->flags &= ~UPL_CLEAR_DIRTY;
9668 }
9669 }
9670
9671 void
9672 upl_set_referenced(
9673 upl_t upl,
9674 boolean_t value)
9675 {
9676 upl_lock(upl);
9677 if (value) {
9678 upl->ext_ref_count++;
9679 } else {
9680 if (!upl->ext_ref_count) {
9681 panic("upl_set_referenced not %p", upl);
9682 }
9683 upl->ext_ref_count--;
9684 }
9685 upl_unlock(upl);
9686 }
9687
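/*
 * upl_set_map_exclusive() / upl_clear_map_exclusive():
 *	Grant one thread at a time exclusive ownership of the UPL's mapping
 *	address.  A thread that finds map_addr_owner already set sleeps on
 *	that field until the current owner clears it and wakes any waiters.
 */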
9688 void
9689 upl_set_map_exclusive(upl_t upl)
9690 {
9691 upl_lock(upl);
9692 while (upl->map_addr_owner) {
9693 upl->flags |= UPL_MAP_EXCLUSIVE_WAIT;
9694 upl_lock_sleep(upl, &upl->map_addr_owner, ctid_get_thread(upl->map_addr_owner));
9695 }
9696 upl->map_addr_owner = thread_get_ctid(current_thread());
9697 upl_unlock(upl);
9698 }
9699
9700 void
9701 upl_clear_map_exclusive(upl_t upl)
9702 {
9703 assert(upl->map_addr_owner == thread_get_ctid(current_thread()));
9704 upl_lock(upl);
9705 if (upl->flags & UPL_MAP_EXCLUSIVE_WAIT) {
9706 upl->flags &= ~UPL_MAP_EXCLUSIVE_WAIT;
9707 upl_wakeup(&upl->map_addr_owner);
9708 }
9709 upl->map_addr_owner = 0;
9710 upl_unlock(upl);
9711 }
9712
9713 #if CONFIG_IOSCHED
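/*
 * Record the disk block number and I/O size for each page of the given UPL
 * range, so the I/O scheduler can re-prioritize expedited requests on a
 * per-page basis (only for UPLs created with UPL_EXPEDITE_SUPPORTED).
 */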
9714 void
9715 upl_set_blkno(
9716 upl_t upl,
9717 vm_offset_t upl_offset,
9718 int io_size,
9719 int64_t blkno)
9720 {
9721 int i, j;
9722 if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
9723 return;
9724 }
9725
9726 assert(upl->upl_reprio_info != 0);
9727 for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
9728 UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
9729 }
9730 }
9731 #endif
9732
9733 void inline
9734 memoryshot(unsigned int event, unsigned int control)
9735 {
9736 if (vm_debug_events) {
9737 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
9738 vm_page_active_count, vm_page_inactive_count,
9739 vm_page_free_count, vm_page_speculative_count,
9740 vm_page_throttled_count);
9741 } else {
9742 (void) event;
9743 (void) control;
9744 }
9745 }
9746
9747 #ifdef MACH_BSD
9748
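/*
 * Thin accessors over upl_page_info_t for BSD-side callers; each simply
 * forwards to the corresponding UPL_*_PAGE macro or page-list field.
 */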
9749 boolean_t
9750 upl_device_page(upl_page_info_t *upl)
9751 {
9752 return UPL_DEVICE_PAGE(upl);
9753 }
9754 boolean_t
9755 upl_page_present(upl_page_info_t *upl, int index)
9756 {
9757 return UPL_PAGE_PRESENT(upl, index);
9758 }
9759 boolean_t
9760 upl_speculative_page(upl_page_info_t *upl, int index)
9761 {
9762 return UPL_SPECULATIVE_PAGE(upl, index);
9763 }
9764 boolean_t
9765 upl_dirty_page(upl_page_info_t *upl, int index)
9766 {
9767 return UPL_DIRTY_PAGE(upl, index);
9768 }
9769 boolean_t
9770 upl_valid_page(upl_page_info_t *upl, int index)
9771 {
9772 return UPL_VALID_PAGE(upl, index);
9773 }
9774 ppnum_t
9775 upl_phys_page(upl_page_info_t *upl, int index)
9776 {
9777 return UPL_PHYS_PAGE(upl, index);
9778 }
9779
9780 void
9781 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
9782 {
9783 upl[index].mark = v;
9784 }
9785
9786 boolean_t
9787 upl_page_get_mark(upl_page_info_t *upl, int index)
9788 {
9789 return upl[index].mark;
9790 }
9791
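/*
 * Debugging aid: walk the inactive, throttled and anonymous page queues,
 * then the active queue, counting dirty, pageout ("free when done") and
 * precious pages, and print the totals for each group.
 */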
9792 void
9793 vm_countdirtypages(void)
9794 {
9795 vm_page_t m;
9796 int dpages;
9797 int pgopages;
9798 int precpages;
9799
9800
9801 dpages = 0;
9802 pgopages = 0;
9803 precpages = 0;
9804
9805 vm_page_lock_queues();
9806 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
9807 do {
9808 if (m == (vm_page_t)0) {
9809 break;
9810 }
9811
9812 if (m->vmp_dirty) {
9813 dpages++;
9814 }
9815 if (m->vmp_free_when_done) {
9816 pgopages++;
9817 }
9818 if (m->vmp_precious) {
9819 precpages++;
9820 }
9821
9822 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9823 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9824 if (m == (vm_page_t)0) {
9825 break;
9826 }
9827 } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
9828 vm_page_unlock_queues();
9829
9830 vm_page_lock_queues();
9831 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
9832 do {
9833 if (m == (vm_page_t)0) {
9834 break;
9835 }
9836
9837 dpages++;
9838 assert(m->vmp_dirty);
9839 assert(!m->vmp_free_when_done);
9840 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9841 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9842 if (m == (vm_page_t)0) {
9843 break;
9844 }
9845 } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
9846 vm_page_unlock_queues();
9847
9848 vm_page_lock_queues();
9849 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
9850 do {
9851 if (m == (vm_page_t)0) {
9852 break;
9853 }
9854
9855 if (m->vmp_dirty) {
9856 dpages++;
9857 }
9858 if (m->vmp_free_when_done) {
9859 pgopages++;
9860 }
9861 if (m->vmp_precious) {
9862 precpages++;
9863 }
9864
9865 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9866 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9867 if (m == (vm_page_t)0) {
9868 break;
9869 }
9870 } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
9871 vm_page_unlock_queues();
9872
9873 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
9874
9875 dpages = 0;
9876 pgopages = 0;
9877 precpages = 0;
9878
9879 vm_page_lock_queues();
9880 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
9881
9882 do {
9883 if (m == (vm_page_t)0) {
9884 break;
9885 }
9886 if (m->vmp_dirty) {
9887 dpages++;
9888 }
9889 if (m->vmp_free_when_done) {
9890 pgopages++;
9891 }
9892 if (m->vmp_precious) {
9893 precpages++;
9894 }
9895
9896 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9897 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9898 if (m == (vm_page_t)0) {
9899 break;
9900 }
9901 } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
9902 vm_page_unlock_queues();
9903
9904 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
9905 }
9906 #endif /* MACH_BSD */
9907
9908
9909 #if CONFIG_IOSCHED
9910 int
9911 upl_get_cached_tier(upl_t upl)
9912 {
9913 assert(upl);
9914 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
9915 return upl->upl_priority;
9916 }
9917 return -1;
9918 }
9919 #endif /* CONFIG_IOSCHED */
9920
9921
9922 void
9923 upl_callout_iodone(upl_t upl)
9924 {
9925 struct upl_io_completion *upl_ctx = upl->upl_iodone;
9926
9927 if (upl_ctx) {
9928 void (*iodone_func)(void *, int) = upl_ctx->io_done;
9929
9930 assert(upl_ctx->io_done);
9931
9932 (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
9933 }
9934 }
9935
9936 void
9937 upl_set_iodone(upl_t upl, void *upl_iodone)
9938 {
9939 upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
9940 }
9941
9942 void
9943 upl_set_iodone_error(upl_t upl, int error)
9944 {
9945 struct upl_io_completion *upl_ctx = upl->upl_iodone;
9946
9947 if (upl_ctx) {
9948 upl_ctx->io_error = error;
9949 }
9950 }
9951
9952
9953 ppnum_t
9954 upl_get_highest_page(
9955 upl_t upl)
9956 {
9957 return upl->highest_page;
9958 }
9959
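/*
 * A UPL's u_offset and u_size need not be aligned to the page mask a caller
 * is interested in.  upl_adjusted_offset()/upl_adjusted_size() return the
 * enclosing aligned range, while upl_get_data_offset() returns how far into
 * the first page the UPL's data actually starts.
 */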
9960 upl_size_t
9961 upl_get_size(
9962 upl_t upl)
9963 {
9964 return upl_adjusted_size(upl, PAGE_MASK);
9965 }
9966
9967 upl_size_t
9968 upl_adjusted_size(
9969 upl_t upl,
9970 vm_map_offset_t pgmask)
9971 {
9972 vm_object_offset_t start_offset, end_offset;
9973
9974 start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
9975 end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
9976
9977 return (upl_size_t)(end_offset - start_offset);
9978 }
9979
9980 vm_object_offset_t
9981 upl_adjusted_offset(
9982 upl_t upl,
9983 vm_map_offset_t pgmask)
9984 {
9985 return trunc_page_mask_64(upl->u_offset, pgmask);
9986 }
9987
9988 vm_object_offset_t
9989 upl_get_data_offset(
9990 upl_t upl)
9991 {
9992 return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
9993 }
9994
9995 upl_t
9996 upl_associated_upl(upl_t upl)
9997 {
9998 return upl->associated_upl;
9999 }
10000
10001 void
10002 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
10003 {
10004 upl->associated_upl = associated_upl;
10005 }
10006
10007 struct vnode *
10008 upl_lookup_vnode(upl_t upl)
10009 {
10010 if (!upl->map_object->internal) {
10011 return vnode_pager_lookup_vnode(upl->map_object->pager);
10012 } else {
10013 return NULL;
10014 }
10015 }
10016
10017 #if UPL_DEBUG
10018 kern_return_t
10019 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
10020 {
10021 upl->ubc_alias1 = alias1;
10022 upl->ubc_alias2 = alias2;
10023 return KERN_SUCCESS;
10024 }
10025 int
10026 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
10027 {
10028 if (al) {
10029 *al = upl->ubc_alias1;
10030 }
10031 if (al2) {
10032 *al2 = upl->ubc_alias2;
10033 }
10034 return KERN_SUCCESS;
10035 }
10036 #endif /* UPL_DEBUG */
10037
10038 #if VM_PRESSURE_EVENTS
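/*
 * Memory pressure state transitions.  The "upward" checks below move the
 * system toward WARNING/CRITICAL as soon as the relevant threshold is
 * crossed, while the "downward" checks require available pages or
 * compressor headroom to recover past the threshold by an extra margin
 * (on the order of 15-20%) so the reported pressure level doesn't flap.
 */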
10039 /*
10040 * Upward trajectory.
10041 */
10042
10043 boolean_t
10044 VM_PRESSURE_NORMAL_TO_WARNING(void)
10045 {
10046 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10047 /* Available pages below our threshold */
10048 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
10049 #if CONFIG_FREEZE
10050 /* No frozen processes to kill */
10051 if (memorystatus_frozen_count == 0) {
10052 /* Not enough suspended processes available. */
10053 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
10054 return TRUE;
10055 }
10056 }
10057 #else /* CONFIG_FREEZE */
10058 return TRUE;
10059 #endif /* CONFIG_FREEZE */
10060 }
10061 return FALSE;
10062 } else {
10063 return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
10064 }
10065 }
10066
10067 boolean_t
10068 VM_PRESSURE_WARNING_TO_CRITICAL(void)
10069 {
10070 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10071 /* Available pages below our threshold */
10072 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
10073 return TRUE;
10074 }
10075 return FALSE;
10076 } else {
10077 return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10078 }
10079 }
10080
10081 /*
10082 * Downward trajectory.
10083 */
10084 boolean_t
10085 VM_PRESSURE_WARNING_TO_NORMAL(void)
10086 {
10087 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10088 /* Available pages above our threshold */
10089 unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
10090 if (memorystatus_available_pages > target_threshold) {
10091 return TRUE;
10092 }
10093 return FALSE;
10094 } else {
10095 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
10096 }
10097 }
10098
10099 boolean_t
10100 VM_PRESSURE_CRITICAL_TO_WARNING(void)
10101 {
10102 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10103 /* Available pages above our threshold */
10104 unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
10105 if (memorystatus_available_pages > target_threshold) {
10106 return TRUE;
10107 }
10108 return FALSE;
10109 } else {
10110 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10111 }
10112 }
10113 #endif /* VM_PRESSURE_EVENTS */
10114
10115 #if DEVELOPMENT || DEBUG
10116 bool compressor_running_perf_test;
10117 uint64_t compressor_perf_test_pages_processed;
10118
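/*
 * move_pages_to_queue:
 *	Collect the resident pages backing [start_addr, start_addr + buffer_size)
 *	in `map` onto `queue`, removing them from their current paging queues so
 *	the benchmark below can feed them to the compressor in isolation.  Only
 *	unwired, top-level anonymous memory is accepted, and the map's page size
 *	must match the kernel's.
 */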
10119 static kern_return_t
10120 move_pages_to_queue(
10121 vm_map_t map,
10122 user_addr_t start_addr,
10123 size_t buffer_size,
10124 vm_page_queue_head_t *queue,
10125 size_t *pages_moved)
10126 {
10127 kern_return_t err = KERN_SUCCESS;
10128 vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
10129 boolean_t addr_in_map = FALSE;
10130 user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
10131 vm_object_t curr_object = VM_OBJECT_NULL;
10132 *pages_moved = 0;
10133
10134
10135 if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
10136 /*
10137 * We don't currently support benchmarking maps with a different page size
10138 * than the kernel.
10139 */
10140 return KERN_INVALID_ARGUMENT;
10141 }
10142
10143 if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
10144 return KERN_INVALID_ARGUMENT;
10145 }
10146
10147 vm_map_lock_read(map);
10148 curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
10149 end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
10150
10151
10152 while (curr_addr < end_addr) {
10153 addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
10154 if (!addr_in_map) {
10155 err = KERN_INVALID_ARGUMENT;
10156 break;
10157 }
10158 curr_object = VME_OBJECT(curr_entry);
10159 if (curr_object) {
10160 vm_object_lock(curr_object);
10161 /* We really only want anonymous memory that's in the top level map and object here. */
10162 if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
10163 curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
10164 err = KERN_INVALID_ARGUMENT;
10165 vm_object_unlock(curr_object);
10166 break;
10167 }
10168 vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
10169 vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
10170 (curr_entry->vme_start + VME_OFFSET(curr_entry));
10171 vm_map_offset_t curr_offset = start_offset;
10172 vm_page_t curr_page;
10173 while (curr_offset < end_offset) {
10174 curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
10175 if (curr_page != VM_PAGE_NULL) {
10176 vm_page_lock_queues();
10177 if (curr_page->vmp_laundry) {
10178 vm_pageout_steal_laundry(curr_page, TRUE);
10179 }
					/*
					 * We've already pulled any laundry pages out above,
					 * which means this page can't be on the pageout queue,
					 * so it's safe to do the vm_page_queues_remove.
					 */
10185 bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
10186 vm_page_queues_remove(curr_page, TRUE);
10187 if (donate) {
10188 /*
10189 * The compressor needs to see this bit to know
10190 * where this page needs to land. Also if stolen,
10191 * this bit helps put the page back in the right
10192 * special queue where it belongs.
10193 */
10194 curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
10195 }
10196 // Clear the referenced bit so we ensure this gets paged out
10197 curr_page->vmp_reference = false;
10198 if (curr_page->vmp_pmapped) {
10199 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
10200 VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
10201 }
10202 vm_page_queue_enter(queue, curr_page, vmp_pageq);
10203 vm_page_unlock_queues();
10204 *pages_moved += 1;
10205 }
10206 curr_offset += PAGE_SIZE_64;
10207 curr_addr += PAGE_SIZE_64;
10208 }
			vm_object_unlock(curr_object);
		} else {
			/*
			 * No VM object backing this entry: skip past it so we
			 * don't spin on the same address.
			 */
			curr_addr = curr_entry->vme_end;
		}
10211 }
10212 vm_map_unlock_read(map);
10213 return err;
10214 }
10215
10216 /*
10217 * Local queue for processing benchmark pages.
10218 * Can't be allocated on the stack because the pointer has to
10219 * be packable.
10220 */
10221 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
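/*
 * run_compressor_perf_test:
 *	DEVELOPMENT/DEBUG benchmark: move the caller's buffer pages onto the
 *	private queue above, hand them to the internal pageout (compressor)
 *	thread, and report the elapsed time, the bytes submitted and the
 *	resulting growth in c_segment_compressed_bytes.
 */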
10222 kern_return_t
10223 run_compressor_perf_test(
10224 user_addr_t buf,
10225 size_t buffer_size,
10226 uint64_t *time,
10227 uint64_t *bytes_compressed,
10228 uint64_t *compressor_growth)
10229 {
10230 kern_return_t err = KERN_SUCCESS;
10231 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10232 return KERN_NOT_SUPPORTED;
10233 }
10234 if (current_task() == kernel_task) {
10235 return KERN_INVALID_ARGUMENT;
10236 }
10237 vm_page_lock_queues();
10238 if (compressor_running_perf_test) {
10239 /* Only run one instance of the benchmark at a time. */
10240 vm_page_unlock_queues();
10241 return KERN_RESOURCE_SHORTAGE;
10242 }
10243 vm_page_unlock_queues();
10244 size_t page_count = 0;
10245 vm_map_t map;
10246 vm_page_t p, next;
10247 uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
10248 uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
10249 *bytes_compressed = *compressor_growth = 0;
10250
10251 vm_page_queue_init(&compressor_perf_test_queue);
10252 map = current_task()->map;
10253 err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
10254 if (err != KERN_SUCCESS) {
10255 goto out;
10256 }
10257
10258 vm_page_lock_queues();
10259 compressor_running_perf_test = true;
10260 compressor_perf_test_pages_processed = 0;
10261 /*
10262 * At this point the compressor threads should only process the benchmark queue
10263 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
10264 * to determine how many compressed bytes we ended up using.
10265 */
10266 compressed_bytes_start = os_atomic_load(&c_segment_compressed_bytes, relaxed);
10267 vm_page_unlock_queues();
10268
10269 page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);
10270
10271 vm_page_lock_queues();
10272 compressor_perf_test_start = mach_absolute_time();
10273
10274 // Wake up the compressor thread(s)
10275 sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
10276 pgo_iothread_internal_state[0].pgo_iothread);
10277
10278 /*
10279 * Depending on when this test is run we could overshoot or be right on the mark
10280 * with our page_count. So the comparison is of the _less than_ variety.
10281 */
10282 while (compressor_perf_test_pages_processed < page_count) {
10283 assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
10284 vm_page_unlock_queues();
10285 thread_block(THREAD_CONTINUE_NULL);
10286 vm_page_lock_queues();
10287 }
10288 compressor_perf_test_end = mach_absolute_time();
10289 compressed_bytes_end = os_atomic_load(&c_segment_compressed_bytes, relaxed);
10290 vm_page_unlock_queues();
10291
10292
10293 out:
10294 /*
10295 * If we errored out above, then we could still have some pages
10296 * on the local queue. Make sure to put them back on the active queue before
10297 * returning so they're not orphaned.
10298 */
10299 vm_page_lock_queues();
10300 absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
10301 p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
10302 while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
10303 next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);
10304
10305 vm_page_enqueue_active(p, FALSE);
10306 p = next;
10307 }
10308
10309 compressor_running_perf_test = false;
10310 vm_page_unlock_queues();
10311 if (err == KERN_SUCCESS) {
10312 *bytes_compressed = page_count * PAGE_SIZE_64;
10313 *compressor_growth = compressed_bytes_end - compressed_bytes_start;
10314 }
10315
10316 /*
10317 * pageout_scan will consider waking the compactor swapper
10318 * before it blocks. Do the same thing here before we return
10319 * to ensure that back to back benchmark runs can't overly fragment the
10320 * compressor pool.
10321 */
10322 vm_consider_waking_compactor_swapper();
10323 return err;
10324 }
10325 #endif /* DEVELOPMENT || DEBUG */
10326