1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <mach/kern_return.h>
67 #include <stdint.h>
68 #include <ptrauth.h>
69
70 #include <debug.h>
71
72 #include <mach/mach_types.h>
73 #include <mach/memory_object.h>
74 #include <mach/mach_host_server.h>
75 #include <mach/upl.h>
76 #include <mach/vm_map.h>
77 #include <mach/vm_param.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/sdt.h>
80
81 #include <kern/kern_types.h>
82 #include <kern/counter.h>
83 #include <kern/host_statistics.h>
84 #include <kern/machine.h>
85 #include <kern/misc_protos.h>
86 #include <kern/sched.h>
87 #include <kern/thread.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/policy_internal.h>
91 #include <kern/thread_group.h>
92
93 #include <os/log.h>
94
95 #include <sys/kdebug_triage.h>
96
97 #include <machine/vm_tuning.h>
98 #include <machine/commpage.h>
99
100 #include <vm/pmap.h>
101 #include <vm/vm_compressor_pager_internal.h>
102 #include <vm/vm_fault_internal.h>
103 #include <vm/vm_map_internal.h>
104 #include <vm/vm_object_internal.h>
105 #include <vm/vm_page_internal.h>
106 #include <vm/vm_pageout_internal.h>
107 #include <vm/vm_protos_internal.h> /* must be last */
108 #include <vm/memory_object.h>
109 #include <vm/vm_purgeable_internal.h>
110 #include <vm/vm_shared_region.h>
111 #include <vm/vm_compressor_internal.h>
112 #include <vm/vm_kern_xnu.h>
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_ubc.h>
115 #include <vm/vm_reclaim_xnu.h>
116
117 #include <san/kasan.h>
118 #include <sys/kern_memorystatus_xnu.h>
119
120 #if CONFIG_PHANTOM_CACHE
121 #include <vm/vm_phantom_cache_internal.h>
122 #endif
123
124
125 #if UPL_DEBUG
126 #include <libkern/OSDebug.h>
127 #endif
128
129 extern int cs_debug;
130
131 #if CONFIG_MBUF_MCACHE
132 extern void mbuf_drain(boolean_t);
133 #endif /* CONFIG_MBUF_MCACHE */
134
135 #if CONFIG_FREEZE
136 extern unsigned int memorystatus_frozen_count;
137 extern unsigned int memorystatus_suspended_count;
138 #endif /* CONFIG_FREEZE */
139 extern vm_pressure_level_t memorystatus_vm_pressure_level;
140
141 extern lck_mtx_t memorystatus_jetsam_broadcast_lock;
142 extern uint32_t memorystatus_jetsam_fg_band_waiters;
143 extern uint32_t memorystatus_jetsam_bg_band_waiters;
144
145 void vm_pressure_response(void);
146 extern void consider_vm_pressure_events(void);
147
148 #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
149
150 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
151 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
152 sched_cond_atomic_t vm_pageout_gc_cond;
153 #if CONFIG_VPS_DYNAMIC_PRIO
154 TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
155 #else
156 const bool vps_dynamic_priority_enabled = false;
157 #endif
158 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
159
160 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
161 #if !XNU_TARGET_OS_OSX
162 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
163 #else /* !XNU_TARGET_OS_OSX */
164 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
165 #endif /* !XNU_TARGET_OS_OSX */
166 #endif
167
168 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
169 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
170 #endif
171
172 #ifndef VM_PAGE_LAUNDRY_MAX
173 #define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */
174 #endif /* VM_PAGE_LAUNDRY_MAX */
175
176 #ifndef VM_PAGEOUT_BURST_WAIT
177 #define VM_PAGEOUT_BURST_WAIT 1 /* milliseconds */
178 #endif /* VM_PAGEOUT_BURST_WAIT */
179
180 #ifndef VM_PAGEOUT_EMPTY_WAIT
181 #define VM_PAGEOUT_EMPTY_WAIT 50 /* milliseconds */
182 #endif /* VM_PAGEOUT_EMPTY_WAIT */
183
184 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
185 #define VM_PAGEOUT_DEADLOCK_WAIT 100 /* milliseconds */
186 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
187
188 #ifndef VM_PAGEOUT_IDLE_WAIT
189 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
190 #endif /* VM_PAGEOUT_IDLE_WAIT */
191
192 #ifndef VM_PAGEOUT_SWAP_WAIT
193 #define VM_PAGEOUT_SWAP_WAIT 10 /* milliseconds */
194 #endif /* VM_PAGEOUT_SWAP_WAIT */
195
196 /*
197 * vm_page_max_speculative_age_q should be less than or equal to
198  * VM_PAGE_RESERVED_SPECULATIVE_AGE_Q, which is the number of allocated
199 * vm_page_queue_speculative entries.
200 */
201
202 TUNABLE_DEV_WRITEABLE(unsigned int, vm_page_max_speculative_age_q, "vm_page_max_speculative_age_q", VM_PAGE_DEFAULT_MAX_SPECULATIVE_AGE_Q);
203 #ifndef VM_PAGE_SPECULATIVE_TARGET
204 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
205 #endif /* VM_PAGE_SPECULATIVE_TARGET */
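/*
 * Illustrative arithmetic (hypothetical numbers, not taken from this
 * file): if vm_page_speculative_percentage were, say, 5, the divisor
 * above is 100 / 5 = 20, so VM_PAGE_SPECULATIVE_TARGET(1000000) works
 * out to 1000000 / 20 = 50000 speculative pages. Because the divisor
 * is computed with integer division, percentages that don't divide
 * 100 evenly are effectively rounded before the target is applied.
 */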
206
207
208 /*
209 * To obtain a reasonable LRU approximation, the inactive queue
210 * needs to be large enough to give pages on it a chance to be
211 * referenced a second time. This macro defines the fraction
212 * of active+inactive pages that should be inactive.
213 * The pageout daemon uses it to update vm_page_inactive_target.
214 *
215 * If vm_page_free_count falls below vm_page_free_target and
216 * vm_page_inactive_count is below vm_page_inactive_target,
217 * then the pageout daemon starts running.
218 */
219
220 #ifndef VM_PAGE_INACTIVE_TARGET
221 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
222 #endif /* VM_PAGE_INACTIVE_TARGET */
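/*
 * Illustrative arithmetic (hypothetical numbers): with 1,000,000
 * pageable (active + inactive) pages, VM_PAGE_INACTIVE_TARGET(1000000)
 * is 500,000, i.e. roughly half of the pool is kept inactive so that
 * pages get a chance to be referenced a second time before reclaim.
 */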
223
224 /*
225 * Once the pageout daemon starts running, it keeps going
226 * until vm_page_free_count meets or exceeds vm_page_free_target.
227 */
228
229 #ifndef VM_PAGE_FREE_TARGET
230 #if !XNU_TARGET_OS_OSX
231 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
232 #else /* !XNU_TARGET_OS_OSX */
233 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
234 #endif /* !XNU_TARGET_OS_OSX */
235 #endif /* VM_PAGE_FREE_TARGET */
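/*
 * Illustrative arithmetic (hypothetical numbers): for a pool of
 * 1,000,000 pages, the macro above yields 15 + 1000000/100 = 10,015
 * pages on the !XNU_TARGET_OS_OSX configuration and
 * 15 + 1000000/80 = 12,515 pages on macOS, i.e. macOS keeps a
 * slightly larger free cushion for the same amount of memory.
 */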
236
237
238 /*
239 * The pageout daemon always starts running once vm_page_free_count
240 * falls below vm_page_free_min.
241 */
242
243 #ifndef VM_PAGE_FREE_MIN
244 #if !XNU_TARGET_OS_OSX
245 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
246 #else /* !XNU_TARGET_OS_OSX */
247 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
248 #endif /* !XNU_TARGET_OS_OSX */
249 #endif /* VM_PAGE_FREE_MIN */
250
251 #if !XNU_TARGET_OS_OSX
252 #define VM_PAGE_FREE_RESERVED_LIMIT 100
253 #define VM_PAGE_FREE_MIN_LIMIT 1500
254 #define VM_PAGE_FREE_TARGET_LIMIT 2000
255 #else /* !XNU_TARGET_OS_OSX */
256 #define VM_PAGE_FREE_RESERVED_LIMIT 1700
257 #define VM_PAGE_FREE_MIN_LIMIT 3500
258 #define VM_PAGE_FREE_TARGET_LIMIT 4000
259 #endif /* !XNU_TARGET_OS_OSX */
260
261 /*
262 * When vm_page_free_count falls below vm_page_free_reserved,
263 * only vm-privileged threads can allocate pages. vm-privilege
264 * allows the pageout daemon and default pager (and any other
265 * associated threads needed for default pageout) to continue
266 * operation by dipping into the reserved pool of pages.
267 */
268
269 #ifndef VM_PAGE_FREE_RESERVED
270 #define VM_PAGE_FREE_RESERVED(n) \
271 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
272 #endif /* VM_PAGE_FREE_RESERVED */
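/*
 * Illustrative arithmetic: with VM_PAGE_LAUNDRY_MAX at its default of
 * 128, VM_PAGE_FREE_RESERVED(n) is 6 * 128 + n = 768 + n pages, i.e.
 * enough reserve for several full laundry batches plus whatever
 * configuration-specific slack "n" the caller adds.
 */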
273
274 /*
275 * When we dequeue pages from the inactive list, they are
276  * reactivated (i.e., put back on the active queue) if referenced.
277 * However, it is possible to starve the free list if other
278 * processors are referencing pages faster than we can turn off
279 * the referenced bit. So we limit the number of reactivations
280 * we will make per call of vm_pageout_scan().
281 */
282 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
283
284 #ifndef VM_PAGE_REACTIVATE_LIMIT
285 #if !XNU_TARGET_OS_OSX
286 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
287 #else /* !XNU_TARGET_OS_OSX */
288 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20, VM_PAGE_REACTIVATE_LIMIT_MAX))
289 #endif /* !XNU_TARGET_OS_OSX */
290 #endif /* VM_PAGE_REACTIVATE_LIMIT */
291 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
292
293 int vm_pageout_protect_realtime = true;
294
295 extern boolean_t hibernate_cleaning_in_progress;
296
297 struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
298 struct pgo_iothread_state pgo_iothread_external_state;
299
300 #if VM_PRESSURE_EVENTS
301 void vm_pressure_thread(void);
302
303 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
304 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
305
306 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
307 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
308 #endif
309
310 static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
311 static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
312 static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);
313
314 extern void vm_pageout_continue(void);
315 extern void vm_pageout_scan(void);
316
317 boolean_t vm_pageout_running = FALSE;
318
319 uint32_t vm_page_upl_tainted = 0;
320 uint32_t vm_page_iopl_tainted = 0;
321
322 #if XNU_TARGET_OS_OSX
323 static boolean_t vm_pageout_waiter = FALSE;
324 #endif /* XNU_TARGET_OS_OSX */
325
326
327 #if DEVELOPMENT || DEBUG
328 struct vm_pageout_debug vm_pageout_debug;
329 #endif
330 struct vm_pageout_vminfo vm_pageout_vminfo;
331 struct vm_pageout_state vm_pageout_state;
332 struct vm_config vm_config;
333
334 struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
335 struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
336 #if DEVELOPMENT || DEBUG
337 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
338 #endif /* DEVELOPMENT || DEBUG */
339
340 int vm_upl_wait_for_pages = 0;
341 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
342
343 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
344
345 int vm_debug_events = 0;
346
347 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
348
349 #if CONFIG_MEMORYSTATUS
350 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
351 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
352 #endif
353
354 #if __AMP__
355
356
357 /*
358 * Bind compressor threads to e-cores unless there are multiple non-e clusters
359 */
360 #if (MAX_CPU_CLUSTERS > 2)
361 #define VM_COMPRESSOR_EBOUND_DEFAULT false
362 #elif defined(XNU_TARGET_OS_XR)
363 #define VM_COMPRESSOR_EBOUND_DEFAULT false
364 #else
365 #define VM_COMPRESSOR_EBOUND_DEFAULT true
366 #endif
367
368 TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
369 int vm_pgo_pbound = 0;
370 extern void thread_soft_bind_cluster_type(thread_t, char);
371
372 #endif /* __AMP__ */
373
374
375 /*
376 * Routine: vm_pageout_object_terminate
377 * Purpose:
378 * Destroy the pageout_object, and perform all of the
379 * required cleanup actions.
380 *
381 * In/Out conditions:
382 * The object must be locked, and will be returned locked.
383 */
384 void
385 vm_pageout_object_terminate(
386 vm_object_t object)
387 {
388 vm_object_t shadow_object;
389
390 /*
391 * Deal with the deallocation (last reference) of a pageout object
392 * (used for cleaning-in-place) by dropping the paging references/
393 * freeing pages in the original object.
394 */
395
396 assert(object->pageout);
397 shadow_object = object->shadow;
398 vm_object_lock(shadow_object);
399
400 while (!vm_page_queue_empty(&object->memq)) {
401 vm_page_t p, m;
402 vm_object_offset_t offset;
403
404 p = (vm_page_t) vm_page_queue_first(&object->memq);
405
406 assert(vm_page_is_private(p));
407 assert(p->vmp_free_when_done);
408 p->vmp_free_when_done = FALSE;
409 assert(!p->vmp_cleaning);
410 assert(!p->vmp_laundry);
411
412 offset = p->vmp_offset;
413 VM_PAGE_FREE(p);
414 p = VM_PAGE_NULL;
415
416 m = vm_page_lookup(shadow_object,
417 offset + object->vo_shadow_offset);
418
419 if (m == VM_PAGE_NULL) {
420 continue;
421 }
422
423 assert((m->vmp_dirty) || (m->vmp_precious) ||
424 (m->vmp_busy && m->vmp_cleaning));
425
426 /*
427 * Handle the trusted pager throttle.
428 * Also decrement the burst throttle (if external).
429 */
430 vm_page_lock_queues();
431 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
432 vm_pageout_throttle_up(m);
433 }
434
435 /*
436 * Handle the "target" page(s). These pages are to be freed if
437 * successfully cleaned. Target pages are always busy, and are
438 * wired exactly once. The initial target pages are not mapped,
439 * (so cannot be referenced or modified) but converted target
440 * pages may have been modified between the selection as an
441 * adjacent page and conversion to a target.
442 */
443 if (m->vmp_free_when_done) {
444 assert(m->vmp_busy);
445 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
446 assert(m->vmp_wire_count == 1);
447 m->vmp_cleaning = FALSE;
448 m->vmp_free_when_done = FALSE;
449 /*
450 * Revoke all access to the page. Since the object is
451 * locked, and the page is busy, this prevents the page
452 * from being dirtied after the pmap_disconnect() call
453 * returns.
454 *
455  * Since the page is left "dirty" but "not modified", we
456 * can detect whether the page was redirtied during
457 * pageout by checking the modify state.
458 */
459 if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
460 SET_PAGE_DIRTY(m, FALSE);
461 } else {
462 m->vmp_dirty = FALSE;
463 }
464
465 if (m->vmp_dirty) {
466 vm_page_unwire(m, TRUE); /* reactivates */
467 counter_inc(&vm_statistics_reactivations);
468 vm_page_wakeup_done(object, m);
469 } else {
470 vm_page_free(m); /* clears busy, etc. */
471 }
472 vm_page_unlock_queues();
473 continue;
474 }
475 /*
476 * Handle the "adjacent" pages. These pages were cleaned in
477 * place, and should be left alone.
478 * If prep_pin_count is nonzero, then someone is using the
479 * page, so make it active.
480 */
481 if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !vm_page_is_private(m)) {
482 if (m->vmp_reference) {
483 vm_page_activate(m);
484 } else {
485 vm_page_deactivate(m);
486 }
487 }
488 if (m->vmp_overwriting) {
489 /*
490 * the (COPY_OUT_FROM == FALSE) request_page_list case
491 */
492 if (m->vmp_busy) {
493 /*
494 * We do not re-set m->vmp_dirty !
495 * The page was busy so no extraneous activity
496 * could have occurred. COPY_INTO is a read into the
497 * new pages. CLEAN_IN_PLACE does actually write
498 * out the pages but handling outside of this code
499 * will take care of resetting dirty. We clear the
500  * modify bit, however, for the Programmed I/O case.
501 */
502 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
503
504 m->vmp_busy = FALSE;
505 m->vmp_absent = FALSE;
506 } else {
507 /*
508 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
509 * Occurs when the original page was wired
510 * at the time of the list request
511 */
512 assert(VM_PAGE_WIRED(m));
513 vm_page_unwire(m, TRUE); /* reactivates */
514 }
515 m->vmp_overwriting = FALSE;
516 } else {
517 m->vmp_dirty = FALSE;
518 }
519 m->vmp_cleaning = FALSE;
520
521 /*
522  * Wake up any thread waiting for the page to finish being cleaned.
523 */
524 vm_page_wakeup(object, m);
525 vm_page_unlock_queues();
526 }
527 /*
528 * Account for the paging reference taken in vm_paging_object_allocate.
529 */
530 vm_object_activity_end(shadow_object);
531 vm_object_unlock(shadow_object);
532
533 assert(os_ref_get_count_raw(&object->ref_count) == 0);
534 assert(object->paging_in_progress == 0);
535 assert(object->activity_in_progress == 0);
536 assert(object->resident_page_count == 0);
537 return;
538 }
539
540 /*
541 * Routine: vm_pageclean_setup
542 *
543  * Purpose:	set up a page to be cleaned (made non-dirty), but not
544 * necessarily flushed from the VM page cache.
545 * This is accomplished by cleaning in place.
546 *
547 * The page must not be busy, and new_object
548 * must be locked.
549 *
550 */
551 static void
552 vm_pageclean_setup(
553 vm_page_t m,
554 vm_page_t new_m,
555 vm_object_t new_object,
556 vm_object_offset_t new_offset)
557 {
558 assert(!m->vmp_busy);
559 #if 0
560 assert(!m->vmp_cleaning);
561 #endif
562
563 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
564
565 /*
566 * Mark original page as cleaning in place.
567 */
568 m->vmp_cleaning = TRUE;
569 SET_PAGE_DIRTY(m, FALSE);
570 m->vmp_precious = FALSE;
571
572 /*
573 * Convert the fictitious page to a private shadow of
574 * the real page.
575 */
576 new_m->vmp_free_when_done = TRUE;
577
578 vm_page_lockspin_queues();
579 vm_page_make_private(new_m, VM_PAGE_GET_PHYS_PAGE(m));
580 vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
581 vm_page_unlock_queues();
582
583 vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
584 assert(!new_m->vmp_wanted);
585 new_m->vmp_busy = FALSE;
586 }
587
588 /*
589 * Routine: vm_pageout_initialize_page
590 * Purpose:
591 * Causes the specified page to be initialized in
592 * the appropriate memory object. This routine is used to push
593 * pages into a copy-object when they are modified in the
594 * permanent object.
595 *
596 * The page is moved to a temporary object and paged out.
597 *
598 * In/out conditions:
599 * The page in question must not be on any pageout queues.
600 * The object to which it belongs must be locked.
601 * The page must be busy, but not hold a paging reference.
602 *
603 * Implementation:
604 * Move this page to a completely new object.
605 */
606 void
607 vm_pageout_initialize_page(
608 vm_page_t m)
609 {
610 vm_object_t object;
611 vm_object_offset_t paging_offset;
612 memory_object_t pager;
613
614 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
615
616 object = VM_PAGE_OBJECT(m);
617
618 assert(m->vmp_busy);
619 assert(object->internal);
620
621 /*
622 * Verify that we really want to clean this page
623 */
624 assert(!m->vmp_absent);
625 assert(m->vmp_dirty);
626
627 /*
628 * Create a paging reference to let us play with the object.
629 */
630 paging_offset = m->vmp_offset + object->paging_offset;
631
632 if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
633 panic("reservation without pageout?"); /* alan */
634
635 VM_PAGE_FREE(m);
636 vm_object_unlock(object);
637
638 return;
639 }
640
641 /*
642 * If there's no pager, then we can't clean the page. This should
643 * never happen since this should be a copy object and therefore not
644 * an external object, so the pager should always be there.
645 */
646
647 pager = object->pager;
648
649 if (pager == MEMORY_OBJECT_NULL) {
650 panic("missing pager for copy object");
651
652 VM_PAGE_FREE(m);
653 return;
654 }
655
656 /*
657 * set the page for future call to vm_fault_list_request
658 */
659 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
660 SET_PAGE_DIRTY(m, FALSE);
661
662 /*
663 * keep the object from collapsing or terminating
664 */
665 vm_object_paging_begin(object);
666 vm_object_unlock(object);
667
668 /*
669 * Write the data to its pager.
670 * Note that the data is passed by naming the new object,
671 * not a virtual address; the pager interface has been
672 * manipulated to use the "internal memory" data type.
673 * [The object reference from its allocation is donated
674 * to the eventual recipient.]
675 */
676 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
677
678 vm_object_lock(object);
679 vm_object_paging_end(object);
680 }
681
682
683 /*
684 * vm_pageout_cluster:
685 *
686 * Given a page, queue it to the appropriate I/O thread,
687 * which will page it out and attempt to clean adjacent pages
688 * in the same operation.
689 *
690 * The object and queues must be locked. We will take a
691 * paging reference to prevent deallocation or collapse when we
692 * release the object lock back at the call site. The I/O thread
693  * is responsible for consuming this reference.
694 *
695 * The page must not be on any pageout queue.
696 */
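/*
 * Minimal call sketch (illustrative only, not copied from an actual
 * call site). It assumes the caller already holds the exclusive
 * object lock and the page-queue lock, and has verified that the page
 * is eligible per the asserts in vm_pageout_cluster_to_queue():
 *
 *	vm_object_lock(object);
 *	vm_page_lockspin_queues();
 *	if ((m->vmp_dirty || m->vmp_precious) &&
 *	    !VM_PAGE_WIRED(m) &&
 *	    m->vmp_q_state == VM_PAGE_NOT_ON_Q) {
 *		vm_pageout_cluster(m);	// queues m, wakes an I/O thread
 *	}
 *	vm_page_unlock_queues();
 *	vm_object_unlock(object);
 */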
697 #if DEVELOPMENT || DEBUG
698 vmct_stats_t vmct_stats;
699
700 int32_t vmct_active = 0;
701 uint64_t vm_compressor_epoch_start = 0;
702 uint64_t vm_compressor_epoch_stop = 0;
703
704 typedef enum vmct_state_t {
705 VMCT_IDLE,
706 VMCT_AWAKENED,
707 VMCT_ACTIVE,
708 } vmct_state_t;
709 vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
710 #endif
711
712
713
714 static void
715 vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
716 {
717 vm_object_t object = VM_PAGE_OBJECT(m);
718
719 VM_PAGE_CHECK(m);
720 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
721 vm_object_lock_assert_exclusive(object);
722
723 /*
724 * Make sure it's OK to page this out.
725 */
726 assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
727 assert(!m->vmp_cleaning && !m->vmp_laundry);
728 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
729
730 /*
731 * protect the object from collapse or termination
732 */
733 vm_object_activity_begin(object);
734
735
736 /*
737 * pgo_laundry count is tied to the laundry bit
738 */
739 m->vmp_laundry = TRUE;
740 q->pgo_laundry++;
741
742 m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
743 vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
744
745 if (object->internal == TRUE) {
746 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
747 m->vmp_busy = TRUE;
748 #if DEVELOPMENT || DEBUG
749 /*
750 * The benchmark queue will be woken up independently by the benchmark
751 * itself.
752 */
753 if (q != &vm_pageout_queue_benchmark) {
754 #else /* DEVELOPMENT || DEBUG */
755 if (true) {
756 #endif /* DEVELOPMENT || DEBUG */
757 /*
758 * Wake up the first compressor thread. It will wake subsequent
759 * threads if necessary.
760 */
761 sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
762 pgo_iothread_internal_state[0].pgo_iothread);
763 }
764 } else {
765 sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
766 }
767 VM_PAGE_CHECK(m);
768 }
769
770 void
771 vm_pageout_cluster(vm_page_t m)
772 {
773 struct vm_pageout_queue *q;
774 vm_object_t object = VM_PAGE_OBJECT(m);
775 if (object->internal) {
776 q = &vm_pageout_queue_internal;
777 } else {
778 q = &vm_pageout_queue_external;
779 }
780 vm_pageout_cluster_to_queue(m, q);
781 }
782
783
784 /*
785 * A page is back from laundry or we are stealing it back from
786 * the laundering state. See if there are some pages waiting to
787 * go to laundry and if we can let some of them go now.
788 *
789 * Object and page queues must be locked.
790 */
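/*
 * Two distinct wakeup events hang off the laundry counter: throttled
 * waiters block on (event_t)&q->pgo_laundry and are woken (and
 * pgo_throttled cleared) as soon as a laundry page completes, while
 * drain waiters block on (event_t)(&q->pgo_laundry + 1) and are only
 * woken once pgo_laundry drops to zero. The "+ 1" simply derives a
 * second, unique event address from the same field.
 */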
791 void
792 vm_pageout_throttle_up(
793 vm_page_t m)
794 {
795 struct vm_pageout_queue *q;
796 vm_object_t m_object;
797
798 m_object = VM_PAGE_OBJECT(m);
799
800 assert(m_object != VM_OBJECT_NULL);
801 assert(!is_kernel_object(m_object));
802
803 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
804 vm_object_lock_assert_exclusive(m_object);
805
806 if (m_object->internal == TRUE) {
807 q = &vm_pageout_queue_internal;
808 } else {
809 q = &vm_pageout_queue_external;
810 }
811
812 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
813 vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
814 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
815
816 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
817
818 vm_object_activity_end(m_object);
819
820 VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
821 }
822 if (m->vmp_laundry == TRUE) {
823 m->vmp_laundry = FALSE;
824 q->pgo_laundry--;
825
826 if (q->pgo_throttled == TRUE) {
827 q->pgo_throttled = FALSE;
828 thread_wakeup((event_t) &q->pgo_laundry);
829 }
830 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
831 q->pgo_draining = FALSE;
832 thread_wakeup((event_t) (&q->pgo_laundry + 1));
833 }
834 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
835 }
836 }
837
838
839 static void
840 vm_pageout_throttle_up_batch(
841 struct vm_pageout_queue *q,
842 int batch_cnt)
843 {
844 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
845
846 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
847
848 q->pgo_laundry -= batch_cnt;
849
850 if (q->pgo_throttled == TRUE) {
851 q->pgo_throttled = FALSE;
852 thread_wakeup((event_t) &q->pgo_laundry);
853 }
854 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
855 q->pgo_draining = FALSE;
856 thread_wakeup((event_t) (&q->pgo_laundry + 1));
857 }
858 }
859
860
861
862 /*
863 * VM memory pressure monitoring.
864 *
865 * vm_pageout_scan() keeps track of the number of pages it considers and
866  * reclaims, in the currently active vm_pageout_stats[vm_pageout_stat_now].
867 *
868 * compute_memory_pressure() is called every second from compute_averages()
869 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
870  * of reclaimed pages in a new vm_pageout_stats[] bucket.
871 *
872 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
873 * The caller provides the number of seconds ("nsecs") worth of statistics
874 * it wants, up to 30 seconds.
875 * It computes the number of pages reclaimed in the past "nsecs" seconds and
876 * also returns the number of pages the system still needs to reclaim at this
877 * moment in time.
878 */
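/*
 * Illustrative sketch of the sampling arithmetic (hypothetical
 * numbers): the stats ring is sized and consumed as if it advanced 8
 * buckets per second (hence the "8 * nsecs" conversion below), so a
 * caller passing nsecs_monitored = 5 to mach_vm_pressure_monitor()
 * gets units_of_monitor = 40 and the sum of the freed_* counters from
 * the most recent 40 vm_pageout_stats[] buckets is reported back as
 * "pages_reclaimed".
 */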
879 #if DEVELOPMENT || DEBUG
880 #define VM_PAGEOUT_STAT_SIZE ((30 * 8) + 1)
881 #else
882 #define VM_PAGEOUT_STAT_SIZE ((1 * 8) + 1)
883 #endif
884 struct vm_pageout_stat {
885 unsigned long vm_page_active_count;
886 unsigned long vm_page_speculative_count;
887 unsigned long vm_page_inactive_count;
888 unsigned long vm_page_anonymous_count;
889
890 unsigned long vm_page_free_count;
891 unsigned long vm_page_wire_count;
892 unsigned long vm_page_compressor_count;
893
894 unsigned long vm_page_pages_compressed;
895 unsigned long vm_page_pageable_internal_count;
896 unsigned long vm_page_pageable_external_count;
897 unsigned long vm_page_xpmapped_external_count;
898
899 unsigned int pages_grabbed;
900 unsigned int pages_freed;
901
902 unsigned int pages_compressed;
903 unsigned int pages_grabbed_by_compressor;
904 unsigned int failed_compressions;
905
906 unsigned int pages_evicted;
907 unsigned int pages_purged;
908
909 unsigned int considered;
910 unsigned int considered_bq_internal;
911 unsigned int considered_bq_external;
912
913 unsigned int skipped_external;
914 unsigned int skipped_internal;
915 unsigned int filecache_min_reactivations;
916
917 unsigned int freed_speculative;
918 unsigned int freed_cleaned;
919 unsigned int freed_internal;
920 unsigned int freed_external;
921
922 unsigned int cleaned_dirty_external;
923 unsigned int cleaned_dirty_internal;
924
925 unsigned int inactive_referenced;
926 unsigned int inactive_nolock;
927 unsigned int reactivation_limit_exceeded;
928 unsigned int forced_inactive_reclaim;
929
930 unsigned int throttled_internal_q;
931 unsigned int throttled_external_q;
932
933 unsigned int phantom_ghosts_found;
934 unsigned int phantom_ghosts_added;
935
936 unsigned int vm_page_realtime_count;
937 unsigned int forcereclaimed_sharedcache;
938 unsigned int forcereclaimed_realtime;
939 unsigned int protected_sharedcache;
940 unsigned int protected_realtime;
941 } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];
942
943 unsigned int vm_pageout_stat_now = 0;
944
945 #define VM_PAGEOUT_STAT_BEFORE(i) \
946 (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
947 #define VM_PAGEOUT_STAT_AFTER(i) \
948 (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
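/*
 * These are plain ring-buffer index helpers. For example, with the
 * DEVELOPMENT/DEBUG size of (30 * 8) + 1 = 241 buckets,
 * VM_PAGEOUT_STAT_BEFORE(0) wraps to 240 and
 * VM_PAGEOUT_STAT_AFTER(240) wraps back to 0; every other index just
 * moves by one.
 */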
949
950 #if VM_PAGE_BUCKETS_CHECK
951 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
952 #endif /* VM_PAGE_BUCKETS_CHECK */
953
954
955 void
956 record_memory_pressure(void);
957 void
958 record_memory_pressure(void)
959 {
960 unsigned int vm_pageout_next;
961
962 #if VM_PAGE_BUCKETS_CHECK
963 /* check the consistency of VM page buckets at regular interval */
964 static int counter = 0;
965 if ((++counter % vm_page_buckets_check_interval) == 0) {
966 vm_page_buckets_check();
967 }
968 #endif /* VM_PAGE_BUCKETS_CHECK */
969
970 vm_pageout_state.vm_memory_pressure =
971 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
972 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
973 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
974 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
975
976 commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
977
978 /* move "now" forward */
979 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
980
981 bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
982
983 vm_pageout_stat_now = vm_pageout_next;
984 }
985
986
987 /*
988 * IMPORTANT
989 * mach_vm_ctl_page_free_wanted() is called indirectly, via
990 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
991 * it must be safe in the restricted stackshot context. Locks and/or
992 * blocking are not allowable.
993 */
994 unsigned int
995 mach_vm_ctl_page_free_wanted(void)
996 {
997 unsigned int page_free_target, page_free_count, page_free_wanted;
998
999 page_free_target = vm_page_free_target;
1000 page_free_count = vm_page_free_count;
1001 if (page_free_target > page_free_count) {
1002 page_free_wanted = page_free_target - page_free_count;
1003 } else {
1004 page_free_wanted = 0;
1005 }
1006
1007 return page_free_wanted;
1008 }
1009
1010
1011 /*
1012 * IMPORTANT:
1013 * mach_vm_pressure_monitor() is called when taking a stackshot, with
1014 * wait_for_pressure FALSE, so that code path must remain safe in the
1015  * restricted stackshot context. No blocking or locks are allowable
1016  * on that code path.
1017 */
1018
1019 kern_return_t
1020 mach_vm_pressure_monitor(
1021 boolean_t wait_for_pressure,
1022 unsigned int nsecs_monitored,
1023 unsigned int *pages_reclaimed_p,
1024 unsigned int *pages_wanted_p)
1025 {
1026 wait_result_t wr;
1027 unsigned int vm_pageout_then, vm_pageout_now;
1028 unsigned int pages_reclaimed;
1029 unsigned int units_of_monitor;
1030
1031 units_of_monitor = 8 * nsecs_monitored;
1032 /*
1033 * We don't take the vm_page_queue_lock here because we don't want
1034 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1035 * thread when it's trying to reclaim memory. We don't need fully
1036 * accurate monitoring anyway...
1037 */
1038
1039 if (wait_for_pressure) {
1040 /* wait until there's memory pressure */
1041 while (vm_page_free_count >= vm_page_free_target) {
1042 wr = assert_wait((event_t) &vm_page_free_wanted,
1043 THREAD_INTERRUPTIBLE);
1044 if (wr == THREAD_WAITING) {
1045 wr = thread_block(THREAD_CONTINUE_NULL);
1046 }
1047 if (wr == THREAD_INTERRUPTED) {
1048 return KERN_ABORTED;
1049 }
1050 if (wr == THREAD_AWAKENED) {
1051 /*
1052 * The memory pressure might have already
1053 * been relieved but let's not block again
1054 * and let's report that there was memory
1055 * pressure at some point.
1056 */
1057 break;
1058 }
1059 }
1060 }
1061
1062 /* provide the number of pages the system wants to reclaim */
1063 if (pages_wanted_p != NULL) {
1064 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1065 }
1066
1067 if (pages_reclaimed_p == NULL) {
1068 return KERN_SUCCESS;
1069 }
1070
1071 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1072 vm_pageout_now = vm_pageout_stat_now;
1073 pages_reclaimed = 0;
1074 for (vm_pageout_then =
1075 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1076 vm_pageout_then != vm_pageout_now &&
1077 units_of_monitor-- != 0;
1078 vm_pageout_then =
1079 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1080 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1081 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1082 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1083 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1084 }
1085 *pages_reclaimed_p = pages_reclaimed;
1086
1087 return KERN_SUCCESS;
1088 }
1089
1090
1091
1092 #if DEVELOPMENT || DEBUG
1093
1094 static void
1095 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1096
1097 /*
1098 * condition variable used to make sure there is
1099 * only a single sweep going on at a time
1100 */
1101 bool vm_pageout_disconnect_all_pages_active = false;
1102
1103 void
1104 vm_pageout_disconnect_all_pages()
1105 {
1106 vm_page_lock_queues();
1107
1108 if (vm_pageout_disconnect_all_pages_active) {
1109 vm_page_unlock_queues();
1110 return;
1111 }
1112 vm_pageout_disconnect_all_pages_active = true;
1113
1114 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled,
1115 vm_page_throttled_count);
1116 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous,
1117 vm_page_anonymous_count);
1118 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive,
1119 (vm_page_inactive_count - vm_page_anonymous_count));
1120 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active,
1121 vm_page_active_count);
1122 #if CONFIG_SECLUDED_MEMORY
1123 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded,
1124 vm_page_secluded_count);
1125 #endif /* CONFIG_SECLUDED_MEMORY */
1126 vm_page_unlock_queues();
1127
1128 vm_pageout_disconnect_all_pages_active = false;
1129 }
1130
1131 /* NB: assumes the page_queues lock is held on entry, returns with page queue lock held */
1132 void
1133 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1134 {
1135 vm_page_t m;
1136 vm_object_t t_object = NULL;
1137 vm_object_t l_object = NULL;
1138 vm_object_t m_object = NULL;
1139 int delayed_unlock = 0;
1140 int try_failed_count = 0;
1141 int disconnected_count = 0;
1142 int paused_count = 0;
1143 int object_locked_count = 0;
1144
1145 KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1146 DBG_FUNC_START),
1147 q, qcount);
1148
1149 while (qcount && !vm_page_queue_empty(q)) {
1150 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1151
1152 m = (vm_page_t) vm_page_queue_first(q);
1153 m_object = VM_PAGE_OBJECT(m);
1154
1155 if (m_object == VM_OBJECT_NULL) {
1156 /*
1157 * Bumped into a free page. This should only happen on the
1158 * secluded queue
1159 */
1160 #if CONFIG_SECLUDED_MEMORY
1161 assert(q == &vm_page_queue_secluded);
1162 #endif /* CONFIG_SECLUDED_MEMORY */
1163 goto reenter_pg_on_q;
1164 }
1165
1166 /*
1167 * check to see if we currently are working
1168 * with the same object... if so, we've
1169 * already got the lock
1170 */
1171 if (m_object != l_object) {
1172 /*
1173  * the object associated with the candidate page is
1174 * different from the one we were just working
1175 * with... dump the lock if we still own it
1176 */
1177 if (l_object != NULL) {
1178 vm_object_unlock(l_object);
1179 l_object = NULL;
1180 }
1181 if (m_object != t_object) {
1182 try_failed_count = 0;
1183 }
1184
1185 /*
1186  * Try to lock object; since we've already got the
1187  * page queues lock, we can only 'try' for this one.
1188  * If the 'try' fails, we need to do a mutex_pause
1189 * to allow the owner of the object lock a chance to
1190 * run...
1191 */
1192 if (!vm_object_lock_try_scan(m_object)) {
1193 if (try_failed_count > 20) {
1194 goto reenter_pg_on_q;
1195 }
1196 vm_page_unlock_queues();
1197 mutex_pause(try_failed_count++);
1198 vm_page_lock_queues();
1199 delayed_unlock = 0;
1200
1201 paused_count++;
1202
1203 t_object = m_object;
1204 continue;
1205 }
1206 object_locked_count++;
1207
1208 l_object = m_object;
1209 }
1210 if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry ||
1211 m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) ||
1212 m->vmp_free_when_done) {
1213 /*
1214 * put it back on the head of its queue
1215 */
1216 goto reenter_pg_on_q;
1217 }
1218 if (m->vmp_pmapped == TRUE) {
1219 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1220
1221 disconnected_count++;
1222 }
1223 reenter_pg_on_q:
1224 vm_page_queue_remove(q, m, vmp_pageq);
1225 vm_page_queue_enter(q, m, vmp_pageq);
1226
1227 qcount--;
1228 try_failed_count = 0;
1229
1230 if (delayed_unlock++ > 128) {
1231 if (l_object != NULL) {
1232 vm_object_unlock(l_object);
1233 l_object = NULL;
1234 }
1235 lck_mtx_yield(&vm_page_queue_lock);
1236 delayed_unlock = 0;
1237 }
1238 }
1239 if (l_object != NULL) {
1240 vm_object_unlock(l_object);
1241 l_object = NULL;
1242 }
1243
1244 KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1245 DBG_FUNC_END),
1246 q, disconnected_count, object_locked_count, paused_count);
1247 }
1248
1249 extern const char *proc_best_name(struct proc* proc);
1250
1251 int
1252 vm_toggle_task_selfdonate_pages(task_t task)
1253 {
1254 int state = 0;
1255 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1256 printf("VM Donation mode is OFF on the system\n");
1257 return state;
1258 }
1259 if (task != kernel_task) {
1260 task_lock(task);
1261 if (!task->donates_own_pages) {
1262 printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1263 task->donates_own_pages = true;
1264 state = 1;
1265 } else if (task->donates_own_pages) {
1266 printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1267 task->donates_own_pages = false;
1268 state = 0;
1269 }
1270 task_unlock(task);
1271 }
1272 return state;
1273 }
1274 #endif /* DEVELOPMENT || DEBUG */
1275
1276 void
1277 vm_task_set_selfdonate_pages(task_t task, bool donate)
1278 {
1279 assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
1280 assert(task != kernel_task);
1281
1282 task_lock(task);
1283 task->donates_own_pages = donate;
1284 task_unlock(task);
1285 }
1286
1287
1288
1289 static size_t
1290 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1291
1292 /*
1293 * condition variable used to make sure there is
1294 * only a single sweep going on at a time
1295 */
1296 boolean_t vm_pageout_anonymous_pages_active = FALSE;
1297
1298
1299 kern_return_t
1300 vm_pageout_anonymous_pages()
1301 {
1302 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1303 size_t throttled_pages_moved, anonymous_pages_moved, active_pages_moved;
1304 vm_page_lock_queues();
1305
1306 if (vm_pageout_anonymous_pages_active == TRUE) {
1307 vm_page_unlock_queues();
1308 return KERN_RESOURCE_SHORTAGE;
1309 }
1310 vm_pageout_anonymous_pages_active = TRUE;
1311 vm_page_unlock_queues();
1312
1313 throttled_pages_moved = vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1314 anonymous_pages_moved = vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1315 active_pages_moved = vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1316
1317 os_log(OS_LOG_DEFAULT,
1318 "%s: throttled pages moved: %zu, anonymous pages moved: %zu, active pages moved: %zu",
1319 __func__, throttled_pages_moved, anonymous_pages_moved, active_pages_moved);
1320
1321 if (VM_CONFIG_SWAP_IS_PRESENT) {
1322 vm_consider_swapping();
1323 }
1324
1325 vm_page_lock_queues();
1326 vm_pageout_anonymous_pages_active = FALSE;
1327 vm_page_unlock_queues();
1328 return KERN_SUCCESS;
1329 } else {
1330 return KERN_NOT_SUPPORTED;
1331 }
1332 }
1333
1334
1335 size_t
1336 vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
1337 {
1338 vm_page_t m;
1339 vm_object_t t_object = NULL;
1340 vm_object_t l_object = NULL;
1341 vm_object_t m_object = NULL;
1342 int delayed_unlock = 0;
1343 int try_failed_count = 0;
1344 int refmod_state;
1345 int pmap_options;
1346 struct vm_pageout_queue *iq;
1347 ppnum_t phys_page;
1348 size_t pages_moved = 0;
1349
1350
1351 iq = &vm_pageout_queue_internal;
1352
1353 vm_page_lock_queues();
1354
1355 #if DEVELOPMENT || DEBUG
1356 if (perf_test) {
1357 iq = &vm_pageout_queue_benchmark;
1358 // ensure the benchmark queue isn't throttled
1359 iq->pgo_maxlaundry = (unsigned int) qcount;
1360 }
1361 #endif /* DEVELOPMENT || DEBUG */
1362
1363 while (qcount && !vm_page_queue_empty(q)) {
1364 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1365
1366 if (VM_PAGE_Q_THROTTLED(iq)) {
1367 if (l_object != NULL) {
1368 vm_object_unlock(l_object);
1369 l_object = NULL;
1370 }
1371 iq->pgo_draining = TRUE;
1372
1373 assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1374 vm_page_unlock_queues();
1375
1376 thread_block(THREAD_CONTINUE_NULL);
1377
1378 vm_page_lock_queues();
1379 delayed_unlock = 0;
1380 continue;
1381 }
1382 m = (vm_page_t) vm_page_queue_first(q);
1383 m_object = VM_PAGE_OBJECT(m);
1384
1385 /*
1386 * check to see if we currently are working
1387 * with the same object... if so, we've
1388 * already got the lock
1389 */
1390 if (m_object != l_object) {
1391 if (!m_object->internal) {
1392 goto reenter_pg_on_q;
1393 }
1394
1395 /*
1396  * the object associated with the candidate page is
1397 * different from the one we were just working
1398 * with... dump the lock if we still own it
1399 */
1400 if (l_object != NULL) {
1401 vm_object_unlock(l_object);
1402 l_object = NULL;
1403 }
1404 if (m_object != t_object) {
1405 try_failed_count = 0;
1406 }
1407
1408 /*
1409  * Try to lock object; since we've already got the
1410  * page queues lock, we can only 'try' for this one.
1411  * If the 'try' fails, we need to do a mutex_pause
1412 * to allow the owner of the object lock a chance to
1413 * run...
1414 */
1415 if (!vm_object_lock_try_scan(m_object)) {
1416 if (try_failed_count > 20) {
1417 goto reenter_pg_on_q;
1418 }
1419 vm_page_unlock_queues();
1420 mutex_pause(try_failed_count++);
1421 vm_page_lock_queues();
1422 delayed_unlock = 0;
1423
1424 t_object = m_object;
1425 continue;
1426 }
1427 l_object = m_object;
1428 }
1429 if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
1430 /*
1431 * page is not to be cleaned
1432 * put it back on the head of its queue
1433 */
1434 goto reenter_pg_on_q;
1435 }
1436 phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1437
1438 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
1439 refmod_state = pmap_get_refmod(phys_page);
1440
1441 if (refmod_state & VM_MEM_REFERENCED) {
1442 m->vmp_reference = TRUE;
1443 }
1444 if (refmod_state & VM_MEM_MODIFIED) {
1445 SET_PAGE_DIRTY(m, FALSE);
1446 }
1447 }
1448 if (m->vmp_reference == TRUE) {
1449 m->vmp_reference = FALSE;
1450 pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1451 goto reenter_pg_on_q;
1452 }
1453 if (m->vmp_pmapped == TRUE) {
1454 if (m->vmp_dirty || m->vmp_precious) {
1455 pmap_options = PMAP_OPTIONS_COMPRESSOR;
1456 } else {
1457 pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1458 }
1459 refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1460 if (refmod_state & VM_MEM_MODIFIED) {
1461 SET_PAGE_DIRTY(m, FALSE);
1462 }
1463 }
1464
1465 if (!m->vmp_dirty && !m->vmp_precious) {
1466 vm_page_unlock_queues();
1467 VM_PAGE_FREE(m);
1468 vm_page_lock_queues();
1469 delayed_unlock = 0;
1470
1471 goto next_pg;
1472 }
1473 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1474 if (!m_object->pager_initialized) {
1475 vm_page_unlock_queues();
1476
1477 vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1478
1479 if (!m_object->pager_initialized) {
1480 vm_object_compressor_pager_create(m_object);
1481 }
1482
1483 vm_page_lock_queues();
1484 delayed_unlock = 0;
1485 }
1486 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1487 /*
1488 * We dropped the page queues lock above, so
1489 * "m" might no longer be on this queue...
1490 */
1491 if (m != (vm_page_t) vm_page_queue_first(q)) {
1492 continue;
1493 }
1494 goto reenter_pg_on_q;
1495 }
1496 /*
1497 * vm_object_compressor_pager_create will drop the object lock
1498 * which means 'm' may no longer be valid to use
1499 */
1500 continue;
1501 }
1502
1503 if (!perf_test) {
1504 /*
1505  * we've already factored out pages in the laundry, which
1506  * means this page can't be on the pageout queue, so it's
1507 * safe to do the vm_page_queues_remove
1508 */
1509 bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
1510 vm_page_queues_remove(m, TRUE);
1511 if (donate) {
1512 /*
1513 * The compressor needs to see this bit to know
1514 * where this page needs to land. Also if stolen,
1515 * this bit helps put the page back in the right
1516 * special queue where it belongs.
1517 */
1518 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
1519 }
1520 } else {
1521 vm_page_queue_remove(q, m, vmp_pageq);
1522 }
1523
1524 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1525
1526 vm_pageout_cluster_to_queue(m, iq);
1527
1528 pages_moved++;
1529 goto next_pg;
1530
1531 reenter_pg_on_q:
1532 vm_page_queue_remove(q, m, vmp_pageq);
1533 vm_page_queue_enter(q, m, vmp_pageq);
1534 next_pg:
1535 qcount--;
1536 try_failed_count = 0;
1537
1538 if (delayed_unlock++ > 128) {
1539 if (l_object != NULL) {
1540 vm_object_unlock(l_object);
1541 l_object = NULL;
1542 }
1543 lck_mtx_yield(&vm_page_queue_lock);
1544 delayed_unlock = 0;
1545 }
1546 }
1547 if (l_object != NULL) {
1548 vm_object_unlock(l_object);
1549 l_object = NULL;
1550 }
1551 vm_page_unlock_queues();
1552 return pages_moved;
1553 }
1554
1555
1556
1557 /*
1558 * function in BSD to apply I/O throttle to the pageout thread
1559 */
1560 extern void vm_pageout_io_throttle(void);
1561
1562 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \
1563 MACRO_BEGIN \
1564 /* \
1565 * If a "reusable" page somehow made it back into \
1566 * the active queue, it's been re-used and is not \
1567 * quite re-usable. \
1568 * If the VM object was "all_reusable", consider it \
1569 * as "all re-used" instead of converting it to \
1570 * "partially re-used", which could be expensive. \
1571 */ \
1572 assert(VM_PAGE_OBJECT((m)) == (obj)); \
1573 if ((m)->vmp_reusable || \
1574 (obj)->all_reusable) { \
1575 vm_object_reuse_pages((obj), \
1576 (m)->vmp_offset, \
1577 (m)->vmp_offset + PAGE_SIZE_64, \
1578 FALSE); \
1579 } \
1580 MACRO_END
1581
1582
1583 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
1584 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024
1585
1586 #define FCS_IDLE 0
1587 #define FCS_DELAYED 1
1588 #define FCS_DEADLOCK_DETECTED 2
1589
1590 struct flow_control {
1591 int state;
1592 mach_timespec_t ts;
1593 };
1594
1595
1596 uint64_t vm_pageout_rejected_bq_internal = 0;
1597 uint64_t vm_pageout_rejected_bq_external = 0;
1598 uint64_t vm_pageout_skipped_bq_internal = 0;
1599 uint64_t vm_pageout_skipped_bq_external = 0;
1600
1601 #define ANONS_GRABBED_LIMIT 2
1602
1603
1604 #if 0
1605 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1606 #endif
1607 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1608
1609 #define VM_PAGEOUT_PB_NO_ACTION 0
1610 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1611 #define VM_PAGEOUT_PB_THREAD_YIELD 2
1612
1613
1614 #if 0
1615 static void
1616 vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
1617 {
1618 if (*local_freeq) {
1619 vm_page_unlock_queues();
1620
1621 VM_DEBUG_CONSTANT_EVENT(
1622 vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1623 vm_page_free_count, 0, 0, 1);
1624
1625 vm_page_free_list(*local_freeq, TRUE);
1626
1627 VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1628 vm_page_free_count, *local_freed, 0, 1);
1629
1630 *local_freeq = NULL;
1631 *local_freed = 0;
1632
1633 vm_page_lock_queues();
1634 } else {
1635 lck_mtx_yield(&vm_page_queue_lock);
1636 }
1637 *delayed_unlock = 1;
1638 }
1639 #endif
1640
1641
1642 static void
1643 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1644 vm_page_t *local_freeq, int *local_freed, int action)
1645 {
1646 vm_page_unlock_queues();
1647
1648 if (*object != NULL) {
1649 vm_object_unlock(*object);
1650 *object = NULL;
1651 }
1652 if (*local_freeq) {
1653 vm_page_free_list(*local_freeq, TRUE);
1654
1655 *local_freeq = NULL;
1656 *local_freed = 0;
1657 }
1658 *delayed_unlock = 1;
1659
1660 switch (action) {
1661 case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1662 vm_consider_waking_compactor_swapper();
1663 break;
1664 case VM_PAGEOUT_PB_THREAD_YIELD:
1665 thread_yield_internal(1);
1666 break;
1667 case VM_PAGEOUT_PB_NO_ACTION:
1668 default:
1669 break;
1670 }
1671 vm_page_lock_queues();
1672 }
1673
1674
1675 static struct vm_pageout_vminfo last;
1676
1677 uint64_t last_vm_page_pages_grabbed = 0;
1678
1679 extern uint32_t c_segment_pages_compressed;
1680
1681 extern uint64_t shared_region_pager_reclaimed;
1682 extern struct memory_object_pager_ops shared_region_pager_ops;
1683
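/*
 * update_vm_info() snapshots each absolute counter and converts it to
 * a per-interval delta with the recurring pattern
 *
 *	tmp = vm_pageout_vminfo.<counter>;
 *	vm_pageout_stats[vm_pageout_stat_now].<field> =
 *	    (unsigned int)(tmp - last.<counter>);
 *	last.<counter> = tmp;
 *
 * so each vm_pageout_stats[] bucket holds only the activity that
 * occurred since the previous sample rather than a running total.
 */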
1684 void
1685 update_vm_info(void)
1686 {
1687 unsigned long tmp;
1688 uint64_t tmp64;
1689
1690 vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1691 vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1692 vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1693 vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1694
1695 vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1696 vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1697 vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1698
1699 vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1700 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1701 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1702 vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1703 vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1704
1705 tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1706 vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1707 last.vm_pageout_considered_page = tmp;
1708
1709 tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1710 vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1711 last.vm_pageout_compressions = tmp64;
1712
1713 tmp = vm_pageout_vminfo.vm_compressor_failed;
1714 vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1715 last.vm_compressor_failed = tmp;
1716
1717 tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1718 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1719 last.vm_compressor_pages_grabbed = tmp64;
1720
1721 tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1722 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1723 last.vm_phantom_cache_found_ghost = tmp;
1724
1725 tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1726 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1727 last.vm_phantom_cache_added_ghost = tmp;
1728
1729 tmp64 = counter_load(&vm_page_grab_count);
1730 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1731 last_vm_page_pages_grabbed = tmp64;
1732
1733 tmp = vm_pageout_vminfo.vm_page_pages_freed;
1734 vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1735 last.vm_page_pages_freed = tmp;
1736
1737 if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1738 tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1739 vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1740 last.vm_pageout_pages_evicted = tmp;
1741
1742 tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1743 vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1744 last.vm_pageout_pages_purged = tmp;
1745
1746 tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1747 vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1748 last.vm_pageout_freed_speculative = tmp;
1749
1750 tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1751 vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1752 last.vm_pageout_freed_external = tmp;
1753
1754 tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1755 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1756 last.vm_pageout_inactive_referenced = tmp;
1757
1758 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1759 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1760 last.vm_pageout_scan_inactive_throttled_external = tmp;
1761
1762 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1763 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1764 last.vm_pageout_inactive_dirty_external = tmp;
1765
1766 tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1767 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1768 last.vm_pageout_freed_cleaned = tmp;
1769
1770 tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1771 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1772 last.vm_pageout_inactive_nolock = tmp;
1773
1774 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1775 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1776 last.vm_pageout_scan_inactive_throttled_internal = tmp;
1777
1778 tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1779 vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1780 last.vm_pageout_skipped_external = tmp;
1781
1782 tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1783 vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1784 last.vm_pageout_skipped_internal = tmp;
1785
1786 tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1787 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1788 last.vm_pageout_reactivation_limit_exceeded = tmp;
1789
1790 tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1791 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1792 last.vm_pageout_inactive_force_reclaim = tmp;
1793
1794 tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1795 vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1796 last.vm_pageout_freed_internal = tmp;
1797
1798 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1799 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1800 last.vm_pageout_considered_bq_internal = tmp;
1801
1802 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1803 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1804 last.vm_pageout_considered_bq_external = tmp;
1805
1806 tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1807 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1808 last.vm_pageout_filecache_min_reactivated = tmp;
1809
1810 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1811 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1812 last.vm_pageout_inactive_dirty_internal = tmp;
1813
1814 tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1815 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1816 last.vm_pageout_forcereclaimed_sharedcache = tmp;
1817
1818 tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1819 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1820 last.vm_pageout_forcereclaimed_realtime = tmp;
1821
1822 tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1823 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1824 last.vm_pageout_protected_sharedcache = tmp;
1825
1826 tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1827 vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1828 last.vm_pageout_protected_realtime = tmp;
1829 }
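/*
 * Emit the sample via kdebug: each DBG_VM_INFOx tracepoint below packs
 * up to four of the per-sample counts/deltas computed above into a
 * single trace record, so a tracing tool can reconstruct the whole
 * sample from a handful of events.
 */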
1830
1831 KDBG((VMDBG_CODE(DBG_VM_INFO1)) | DBG_FUNC_NONE,
1832 vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1833 vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1834 vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1835 vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count);
1836
1837 KDBG((VMDBG_CODE(DBG_VM_INFO2)) | DBG_FUNC_NONE,
1838 vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1839 vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1840 vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count);
1841
1842 KDBG((VMDBG_CODE(DBG_VM_INFO3)) | DBG_FUNC_NONE,
1843 vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1844 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1845 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1846 vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count);
1847
1848 if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1849 vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1850 vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1851 KDBG((VMDBG_CODE(DBG_VM_INFO4)) | DBG_FUNC_NONE,
1852 vm_pageout_stats[vm_pageout_stat_now].considered,
1853 vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1854 vm_pageout_stats[vm_pageout_stat_now].freed_external,
1855 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced);
1856
1857 KDBG((VMDBG_CODE(DBG_VM_INFO5)) | DBG_FUNC_NONE,
1858 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1859 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1860 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1861 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock);
1862
1863 KDBG((VMDBG_CODE(DBG_VM_INFO6)) | DBG_FUNC_NONE,
1864 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1865 vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1866 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1867 vm_pageout_stats[vm_pageout_stat_now].skipped_external);
1868
1869 KDBG((VMDBG_CODE(DBG_VM_INFO7)) | DBG_FUNC_NONE,
1870 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1871 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1872 vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1873 vm_pageout_stats[vm_pageout_stat_now].freed_internal);
1874
1875 KDBG((VMDBG_CODE(DBG_VM_INFO8)) | DBG_FUNC_NONE,
1876 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1877 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1878 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1879 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal);
1880
1881 KDBG((VMDBG_CODE(DBG_VM_INFO10)) | DBG_FUNC_NONE,
1882 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1883 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1884 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1885 vm_pageout_stats[vm_pageout_stat_now].protected_realtime);
1886 }
1887 KDBG((VMDBG_CODE(DBG_VM_INFO9)) | DBG_FUNC_NONE,
1888 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1889 vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1890 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1891 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added);
1892
1893 record_memory_pressure();
1894 }
1895
1896 extern boolean_t hibernation_vmqueues_inspection;
1897
1898 /*
1899 * Return values for functions called by vm_pageout_scan
1900 * that control its flow.
1901 *
1902 * PROCEED -- vm_pageout_scan will keep making forward progress.
1903 * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1904 * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan, i.e. 'continue'.
1905 */
1906
1907 #define VM_PAGEOUT_SCAN_PROCEED (0)
1908 #define VM_PAGEOUT_SCAN_DONE_RETURN (1)
1909 #define VM_PAGEOUT_SCAN_NEXT_ITERATION (2)
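/*
 * A sketch of how these codes are consumed in the main loop of
 * vm_pageout_scan() further below ('vps_some_helper' stands in for any
 * of the vps_* helpers defined in this file):
 *
 *	retval = vps_some_helper(...);
 *	if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
 *		continue;
 *	} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
 *		goto return_from_scan;
 *	}
 *	... VM_PAGEOUT_SCAN_PROCEED falls through to the next stage ...
 */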
1910
1911 /*
1912 * This function is called only from vm_pageout_scan and
1913 * it moves overflow secluded pages (one at a time) to the
1914 * batched 'local' free Q or active Q.
1915 */
1916 static void
1917 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1918 {
1919 #if CONFIG_SECLUDED_MEMORY
1920 /*
1921 * Deal with secluded_q overflow.
1922 */
1923 if (vm_page_secluded_count > vm_page_secluded_target) {
1924 vm_page_t secluded_page;
1925
1926 /*
1927 * SECLUDED_AGING_BEFORE_ACTIVE:
1928 * Excess secluded pages go to the active queue and
1929 * will later go to the inactive queue.
1930 */
1931 assert((vm_page_secluded_count_free +
1932 vm_page_secluded_count_inuse) ==
1933 vm_page_secluded_count);
1934 secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1935 assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1936
1937 vm_page_queues_remove(secluded_page, FALSE);
1938 assert(!vm_page_is_fictitious(secluded_page));
1939 assert(!VM_PAGE_WIRED(secluded_page));
1940
1941 if (secluded_page->vmp_object == 0) {
1942 /* transfer to free queue */
1943 assert(secluded_page->vmp_busy);
1944 secluded_page->vmp_snext = *local_freeq;
1945 *local_freeq = secluded_page;
1946 *local_freed += 1;
1947 } else {
1948 /* transfer to head of active queue */
1949 vm_page_enqueue_active(secluded_page, FALSE);
1950 secluded_page = VM_PAGE_NULL;
1951 }
1952 }
1953 #else /* CONFIG_SECLUDED_MEMORY */
1954
1955 #pragma unused(local_freeq)
1956 #pragma unused(local_freed)
1957
1958 return;
1959
1960 #endif /* CONFIG_SECLUDED_MEMORY */
1961 }
1962
1963 /*
1964 * This function is called only from vm_pageout_scan and
1965 * it initializes the loop targets for vm_pageout_scan().
1966 */
1967 static void
1968 vps_init_page_targets(void)
1969 {
1970 /*
1971 * LD TODO: Other page targets should be calculated here too.
1972 */
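/*
 * vm_page_anonymous_min acts as a floor below which the scan stops
 * preferring anonymous pages (see its use in vps_choose_victim_page());
 * it is set to 1/20th, i.e. 5%, of the current inactive target.
 */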
1973 vm_page_anonymous_min = vm_page_inactive_target / 20;
1974
1975 if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1976 vm_pageout_state.vm_page_speculative_percentage = 50;
1977 } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1978 vm_pageout_state.vm_page_speculative_percentage = 1;
1979 }
1980
1981 vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1982 vm_page_inactive_count);
1983 }
1984
1985 /*
1986 * This function is called only from vm_pageout_scan and
1987 * it purges a single VM object at a time and will either
1988 * make vm_pageout_scan() restart the loop or keep moving forward.
1989 */
1990 static int
1991 vps_purge_object()
1992 {
1993 int force_purge;
1994
1995 assert(available_for_purge >= 0);
1996 force_purge = 0; /* no force-purging */
1997
1998 #if VM_PRESSURE_EVENTS
1999 vm_pressure_level_t pressure_level;
2000
2001 pressure_level = memorystatus_vm_pressure_level;
2002
2003 if (pressure_level > kVMPressureNormal) {
2004 if (pressure_level >= kVMPressureCritical) {
2005 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
2006 } else if (pressure_level >= kVMPressureUrgent) {
2007 force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
2008 } else if (pressure_level >= kVMPressureWarning) {
2009 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
2010 }
2011 }
2012 #endif /* VM_PRESSURE_EVENTS */
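/*
 * When non-zero, force_purge is the purge-aggressiveness value configured
 * for the current memorystatus pressure level (warning/urgent/critical);
 * it is passed straight through to vm_purgeable_object_purge_one() below.
 * Zero leaves only the normally ripe volatile objects eligible.
 */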
2013
2014 if (available_for_purge || force_purge) {
2015 memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
2016
2017 VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
2018 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
2019 VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
2020 VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
2021 memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2022
2023 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2024 }
2025 VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2026 memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2027 }
2028
2029 return VM_PAGEOUT_SCAN_PROCEED;
2030 }
2031
2032 /*
2033 * This function is called only from vm_pageout_scan and
2034 * it will try to age the next speculative Q if the oldest
2035 * one is empty.
2036 */
2037 static int
2038 vps_age_speculative_queue(boolean_t force_speculative_aging)
2039 {
2040 #define DELAY_SPECULATIVE_AGE 1000
2041
2042 /*
2043 * try to pull pages from the aging bins...
2044 * see vm_page_internal.h for an explanation of how
2045 * this mechanism works
2046 */
2047 boolean_t can_steal = FALSE;
2048 int num_scanned_queues;
2049 static int delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop. */
2050 mach_timespec_t ts;
2051 struct vm_speculative_age_q *aq;
2052 struct vm_speculative_age_q *sq;
2053
2054 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2055
2056 aq = &vm_page_queue_speculative[speculative_steal_index];
2057
2058 num_scanned_queues = 0;
2059 while (vm_page_queue_empty(&aq->age_q) &&
2060 num_scanned_queues++ != vm_page_max_speculative_age_q) {
2061 speculative_steal_index++;
2062
2063 if (speculative_steal_index > vm_page_max_speculative_age_q) {
2064 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2065 }
2066
2067 aq = &vm_page_queue_speculative[speculative_steal_index];
2068 }
2069
2070 if (num_scanned_queues == vm_page_max_speculative_age_q + 1) {
2071 /*
2072 * XXX We've scanned all the speculative
2073 * queues but still haven't found one
2074 * that is not empty, even though
2075 * vm_page_speculative_count is not 0.
2076 */
2077 if (!vm_page_queue_empty(&sq->age_q)) {
2078 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2079 }
2080 #if DEVELOPMENT || DEBUG
2081 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2082 #endif
2083 /* readjust... */
2084 vm_page_speculative_count = 0;
2085 /* ... and continue */
2086 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2087 }
2088
2089 if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
2090 can_steal = TRUE;
2091 } else {
2092 if (!delay_speculative_age) {
2093 mach_timespec_t ts_fully_aged;
2094
2095 ts_fully_aged.tv_sec = (vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
2096 ts_fully_aged.tv_nsec = ((vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
2097 * 1000 * NSEC_PER_USEC;
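/*
 * A worked example with hypothetical settings: if
 * vm_page_max_speculative_age_q == 10 and
 * vm_page_speculative_q_age_ms == 500, the product is 5000 ms, so
 * ts_fully_aged is 5 seconds (tv_sec = 5, tv_nsec = 0) and, after the
 * addition below, the deadline is 5 seconds past the timestamp
 * recorded in aq->age_ts.
 */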
2098
2099 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2100
2101 clock_sec_t sec;
2102 clock_nsec_t nsec;
2103 clock_get_system_nanotime(&sec, &nsec);
2104 ts.tv_sec = (unsigned int) sec;
2105 ts.tv_nsec = nsec;
2106
2107 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
2108 can_steal = TRUE;
2109 } else {
2110 delay_speculative_age++;
2111 }
2112 } else {
2113 delay_speculative_age++;
2114 if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
2115 delay_speculative_age = 0;
2116 }
2117 }
2118 }
2119 if (can_steal == TRUE) {
2120 vm_page_speculate_ageit(aq);
2121 }
2122
2123 return VM_PAGEOUT_SCAN_PROCEED;
2124 }
2125
2126 /*
2127 * This function is called only from vm_pageout_scan and
2128 * it evicts a single VM object from the cache.
2129 */
2130 static inline int
2131 vps_object_cache_evict(vm_object_t *object_to_unlock)
2132 {
2133 static int cache_evict_throttle = 0;
2134 struct vm_speculative_age_q *sq;
2135
2136 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2137
2138 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2139 int pages_evicted;
2140
2141 if (*object_to_unlock != NULL) {
2142 vm_object_unlock(*object_to_unlock);
2143 *object_to_unlock = NULL;
2144 }
2145 KDBG(0x13001ec | DBG_FUNC_START);
2146
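/*
 * The two arguments to the call below bound the work done per pass: at
 * most 100 pages are evicted and, presumably, at most 10 cached objects
 * are examined -- the precise meaning of the second argument is an
 * assumption here; see vm_object_cache_evict() for the authoritative
 * semantics.
 */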
2147 pages_evicted = vm_object_cache_evict(100, 10);
2148
2149 KDBG(0x13001ec | DBG_FUNC_END, pages_evicted);
2150
2151 if (pages_evicted) {
2152 vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2153
2154 VM_DEBUG_EVENT(vm_pageout_cache_evict, DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2155 vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2156 memoryshot(DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2157
2158 /*
2159 * we just freed up to 100 pages,
2160 * so go back to the top of the main loop
2161 * and re-evaluate the memory situation
2162 */
2163 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2164 } else {
2165 cache_evict_throttle = 1000;
2166 }
2167 }
2168 if (cache_evict_throttle) {
2169 cache_evict_throttle--;
2170 }
2171
2172 return VM_PAGEOUT_SCAN_PROCEED;
2173 }
2174
2175
2176 /*
2177 * This function is called only from vm_pageout_scan and
2178 * it calculates the filecache minimum that needs to be maintained
2179 * as we start to steal pages.
2180 */
2181 static void
2182 vps_calculate_filecache_min(void)
2183 {
2184 int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
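/*
 * Both branches below compute the same expression whenever a divisor
 * is configured:
 *
 *	filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY * 10) / divisor
 *
 * so the divisor expresses the floor as a fraction of available
 * non-compressed memory; e.g. a (hypothetical) divisor of 66 yields
 * 10/66, roughly the 15% mentioned in the CONFIG_JETSAM comment below.
 */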
2185
2186 #if CONFIG_JETSAM
2187 /*
2188 * don't let the filecache_min fall below 15% of available memory
2189 * on systems with an active compressor that isn't nearing its
2190 * limits w/r to accepting new data
2191 *
2192 * on systems w/o the compressor/swapper, the filecache is always
2193 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2194 * since most (if not all) of the anonymous pages are in the
2195 * throttled queue (which isn't counted as available) which
2196 * effectively disables this filter
2197 */
2198 if (vm_compressor_low_on_space() || divisor == 0) {
2199 vm_pageout_state.vm_page_filecache_min = 0;
2200 } else {
2201 vm_pageout_state.vm_page_filecache_min =
2202 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2203 }
2204 #else
2205 if (vm_compressor_out_of_space() || divisor == 0) {
2206 vm_pageout_state.vm_page_filecache_min = 0;
2207 } else {
2208 /*
2209 * don't let the filecache_min fall below the specified critical level
2210 */
2211 vm_pageout_state.vm_page_filecache_min =
2212 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2213 }
2214 #endif
2215 if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2216 vm_pageout_state.vm_page_filecache_min = 0;
2217 }
2218 }
2219
2220 /*
2221 * This function is called only from vm_pageout_scan and
2222 * it updates the flow control time to detect if VM pageoutscan
2223 * isn't making progress.
2224 */
2225 static void
2226 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2227 {
2228 mach_timespec_t ts;
2229 clock_sec_t sec;
2230 clock_nsec_t nsec;
2231
2232 ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2233 ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
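/*
 * For example, a (hypothetical) vm_pageout_deadlock_wait of 300 ms
 * yields ts = { .tv_sec = 0, .tv_nsec = 300 * 1000 * NSEC_PER_USEC },
 * i.e. 300,000,000 ns; the deadline stored in flow_control->ts below
 * is that far past the current system time.
 */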
2234 clock_get_system_nanotime(&sec, &nsec);
2235 flow_control->ts.tv_sec = (unsigned int) sec;
2236 flow_control->ts.tv_nsec = nsec;
2237 ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2238
2239 flow_control->state = FCS_DELAYED;
2240
2241 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2242 }
2243
2244 /*
2245 * This function is called only from vm_pageout_scan and
2246 * it is the flow control logic of VM pageout scan which
2247 * controls if it should block and for how long.
2248 * Any blocking of vm_pageout_scan happens ONLY in this function.
2249 */
2250 static int
2251 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2252 vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2253 {
2254 boolean_t exceeded_burst_throttle = FALSE;
2255 unsigned int msecs = 0;
2256 uint32_t inactive_external_count;
2257 mach_timespec_t ts;
2258 struct vm_pageout_queue *iq;
2259 struct vm_pageout_queue *eq;
2260 struct vm_speculative_age_q *sq;
2261
2262 iq = &vm_pageout_queue_internal;
2263 eq = &vm_pageout_queue_external;
2264 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2265
2266 /*
2267 * Sometimes we have to pause:
2268 * 1) No inactive pages - nothing to do.
2269 * 2) Loop control - no acceptable pages found on the inactive queue
2270 * within the last vm_pageout_burst_inactive_throttle iterations
2271 * 3) Flow control - default pageout queue is full
2272 */
2273 if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2274 vm_page_queue_empty(&vm_page_queue_anonymous) &&
2275 vm_page_queue_empty(&vm_page_queue_cleaned) &&
2276 vm_page_queue_empty(&sq->age_q)) {
2277 VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2278 msecs = vm_pageout_state.vm_pageout_empty_wait;
2279 } else if (inactive_burst_count >=
2280 MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2281 (vm_page_inactive_count +
2282 vm_page_speculative_count))) {
2283 VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2284 msecs = vm_pageout_state.vm_pageout_burst_wait;
2285
2286 exceeded_burst_throttle = TRUE;
2287 } else if (VM_PAGE_Q_THROTTLED(iq) &&
2288 VM_DYNAMIC_PAGING_ENABLED()) {
2289 clock_sec_t sec;
2290 clock_nsec_t nsec;
2291
2292 switch (flow_control->state) {
2293 case FCS_IDLE:
2294 if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2295 vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2296 /*
2297 * since the compressor is running independently of vm_pageout_scan
2298 * let's not wait for it just yet... as long as we have a healthy supply
2299 * of filecache pages to work with, let's keep stealing those.
2300 */
2301 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2302
2303 if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2304 (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2305 *anons_grabbed = ANONS_GRABBED_LIMIT;
2306 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2307 return VM_PAGEOUT_SCAN_PROCEED;
2308 }
2309 }
2310
2311 vps_flow_control_reset_deadlock_timer(flow_control);
2312 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2313
2314 break;
2315
2316 case FCS_DELAYED:
2317 clock_get_system_nanotime(&sec, &nsec);
2318 ts.tv_sec = (unsigned int) sec;
2319 ts.tv_nsec = nsec;
2320
2321 if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2322 /*
2323 * the pageout thread for the default pager is potentially
2324 * deadlocked since the
2325 * default pager queue has been throttled for more than the
2326 * allowable time... we need to move some clean pages or dirty
2327 * pages belonging to the external pagers if they aren't throttled
2328 * vm_page_free_wanted represents the number of threads currently
2329 * blocked waiting for pages... we'll move one page for each of
2330 * these plus a fixed amount to break the logjam... once we're done
2331 * moving this number of pages, we'll re-enter the FCS_DELAYED state
2332 * with a new timeout target since we have no way of knowing
2333 * whether we've broken the deadlock except through observation
2334 * of the queue associated with the default pager... we need to
2335 * stop moving pages and allow the system to run to see what
2336 * state it settles into.
2337 */
2338
2339 *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2340 vm_page_free_wanted + vm_page_free_wanted_privileged;
2341 VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2342 flow_control->state = FCS_DEADLOCK_DETECTED;
2343 sched_cond_signal(&vm_pageout_gc_cond, vm_pageout_gc_thread);
2344 return VM_PAGEOUT_SCAN_PROCEED;
2345 }
2346 /*
2347 * just resniff instead of trying
2348 * to compute a new delay time... we're going to be
2349 * awakened immediately upon a laundry completion,
2350 * so we won't wait any longer than necessary
2351 */
2352 msecs = vm_pageout_state.vm_pageout_idle_wait;
2353 break;
2354
2355 case FCS_DEADLOCK_DETECTED:
2356 if (*vm_pageout_deadlock_target) {
2357 return VM_PAGEOUT_SCAN_PROCEED;
2358 }
2359
2360 vps_flow_control_reset_deadlock_timer(flow_control);
2361 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2362
2363 break;
2364 }
2365 } else {
2366 /*
2367 * No need to pause...
2368 */
2369 return VM_PAGEOUT_SCAN_PROCEED;
2370 }
2371
2372 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2373
2374 vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2375 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2376
2377 if (vm_page_free_count >= vm_page_free_target) {
2378 /*
2379 * we're here because
2380 * 1) someone else freed up some pages while we had
2381 *    the queues unlocked above, and
2382 * 2) we've hit one of the 3 conditions that cause
2383 *    us to pause the pageout scan thread
2384 *
2385 * since we already have enough free pages,
2386 * let's avoid stalling and return normally
2387 *
2388 * before we return, make sure the pageout I/O threads
2389 * are running throttled in case there are still requests
2390 * in the laundry... since we have enough free pages
2391 * we don't need the laundry to be cleaned in a timely
2392 * fashion... so let's avoid interfering with foreground
2393 * activity
2394 *
2395 * we don't want to hold vm_page_queue_free_lock when
2396 * calling vm_pageout_adjust_eq_iothrottle (since it
2397 * may cause other locks to be taken), we do the initial
2398 * check outside of the lock. Once we take the lock,
2399 * we recheck the condition since it may have changed.
2400 * if it has, no problem, we will make the threads
2401 * non-throttled before actually blocking
2402 */
2403 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
2404 }
2405 vm_free_page_lock();
2406
2407 if (vm_page_free_count >= vm_page_free_target &&
2408 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2409 return VM_PAGEOUT_SCAN_DONE_RETURN;
2410 }
2411 vm_free_page_unlock();
2412
2413 if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2414 /*
2415 * we're most likely about to block due to one of
2416 * the 3 conditions that cause vm_pageout_scan to
2417 * not be able to make forward progress w/r
2418 * to providing new pages to the free queue,
2419 * so unthrottle the I/O threads in case we
2420 * have laundry to be cleaned... it needs
2421 * to be completed ASAP.
2422 *
2423 * even if we don't block, we want the io threads
2424 * running unthrottled since the sum of free +
2425 * clean pages is still under our free target
2426 */
2427 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2428 }
2429 if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2430 /*
2431 * if we get here we're below our free target and
2432 * we're stalling due to a full laundry queue or
2433 * we don't have any inactive pages other than
2434 * those in the clean queue...
2435 * however, we have pages on the clean queue that
2436 * can be moved to the free queue, so let's not
2437 * stall the pageout scan
2438 */
2439 flow_control->state = FCS_IDLE;
2440 return VM_PAGEOUT_SCAN_PROCEED;
2441 }
2442 if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2443 flow_control->state = FCS_IDLE;
2444 return VM_PAGEOUT_SCAN_PROCEED;
2445 }
2446
2447 VM_CHECK_MEMORYSTATUS;
2448
2449 if (flow_control->state != FCS_IDLE) {
2450 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2451 }
2452
2453 iq->pgo_throttled = TRUE;
2454 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2455
2456 vm_page_unlock_queues();
2457
2458 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2459
2460 VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2461 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2462 memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2463
2464 thread_block(THREAD_CONTINUE_NULL);
2465
2466 VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2467 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2468 memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2469
2470 vm_page_lock_queues();
2471
2472 iq->pgo_throttled = FALSE;
2473
2474 vps_init_page_targets();
2475
2476 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2477 }
2478
2479 extern boolean_t vm_darkwake_mode;
2480 /*
2481 * This function is called only from vm_pageout_scan and
2482 * it will find and return the most appropriate page to be
2483 * reclaimed.
2484 */
2485 static int
2486 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2487 boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2488 {
2489 vm_page_t m = NULL;
2490 vm_object_t m_object = VM_OBJECT_NULL;
2491 uint32_t inactive_external_count;
2492 struct vm_speculative_age_q *sq;
2493 struct vm_pageout_queue *iq;
2494 int retval = VM_PAGEOUT_SCAN_PROCEED;
2495
2496 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2497 iq = &vm_pageout_queue_internal;
2498
2499 *is_page_from_bg_q = FALSE;
2500
2501 m = NULL;
2502 m_object = VM_OBJECT_NULL;
2503
2504 if (VM_DYNAMIC_PAGING_ENABLED()) {
2505 assert(vm_page_throttled_count == 0);
2506 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2507 }
2508
2509 /*
2510 * Try for a clean-queue inactive page.
2511 * These are pages that vm_pageout_scan tried to steal earlier, but
2512 * were dirty and had to be cleaned. Pick them up now that they are clean.
2513 */
2514 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2515 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2516
2517 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2518
2519 goto found_page;
2520 }
2521
2522 /*
2523 * The next most eligible pages are ones we paged in speculatively,
2524 * but which have not yet been touched and have been aged out.
2525 */
2526 if (!vm_page_queue_empty(&sq->age_q)) {
2527 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2528
2529 assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2530
2531 if (!m->vmp_dirty || force_anonymous == FALSE) {
2532 goto found_page;
2533 } else {
2534 m = NULL;
2535 }
2536 }
2537
2538 #if !CONFIG_JETSAM
2539 if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2540 if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2541 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2542 assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2543 goto found_page;
2544 }
2545 }
2546 #endif /* !CONFIG_JETSAM */
2547
2548 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2549 vm_object_t bg_m_object = NULL;
2550
2551 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2552
2553 bg_m_object = VM_PAGE_OBJECT(m);
2554
2555 if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2556 /*
2557 * This page is on the background queue
2558 * but not on a pageable queue OR is busy during
2559 * darkwake mode when the target is artificially lowered.
2560 * If it is busy during darkwake mode, and we don't skip it,
2561 * we will just swing back around and try again with the same
2562 * queue and might hit the same page or its neighbor in a
2563 * similar state. Both of these are transient states and will
2564 * get resolved, but, at this point let's ignore this page.
2565 */
2566 if (vm_darkwake_mode && m->vmp_busy) {
2567 if (bg_m_object->internal) {
2568 vm_pageout_skipped_bq_internal++;
2569 } else {
2570 vm_pageout_skipped_bq_external++;
2571 }
2572 }
2573 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2574 if (bg_m_object->internal &&
2575 (VM_PAGE_Q_THROTTLED(iq) ||
2576 vm_compressor_out_of_space() == TRUE ||
2577 vm_page_free_count < (vm_page_free_reserved / 4))) {
2578 vm_pageout_skipped_bq_internal++;
2579 } else {
2580 *is_page_from_bg_q = TRUE;
2581
2582 if (bg_m_object->internal) {
2583 vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2584 } else {
2585 vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2586 }
2587 goto found_page;
2588 }
2589 }
2590 }
2591
2592 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2593
2594 if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2595 (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2596 *grab_anonymous = TRUE;
2597 *anons_grabbed = 0;
2598
2599 if (VM_CONFIG_SWAP_IS_ACTIVE) {
2600 vm_pageout_vminfo.vm_pageout_skipped_external++;
2601 } else {
2602 if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2603 /*
2604 * No swap and we are at dangerously low levels of free memory.
2605 * If we keep going ahead with anonymous pages, we are going to run into a situation
2606 * where the compressor will be stuck waiting for free pages (if it isn't already).
2607 *
2608 * So, pick a file backed page...
2609 */
2610 *grab_anonymous = FALSE;
2611 *anons_grabbed = ANONS_GRABBED_LIMIT;
2612 vm_pageout_vminfo.vm_pageout_skipped_internal++;
2613 }
2614 }
2615 goto want_anonymous;
2616 }
2617 *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2618
2619 #if CONFIG_JETSAM
2620 /* If the file-backed pool has accumulated
2621 * significantly more pages than the jetsam
2622 * threshold, prefer to reclaim those
2623 * inline to minimise compute overhead of reclaiming
2624 * anonymous pages.
2625 * This calculation does not account for the CPU local
2626 * external page queues, as those are expected to be
2627 * much smaller relative to the global pools.
2628 */
2629
2630 struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2631
2632 if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2633 if (vm_page_pageable_external_count >
2634 vm_pageout_state.vm_page_filecache_min) {
2635 if ((vm_page_pageable_external_count *
2636 vm_pageout_memorystatus_fb_factor_dr) >
2637 (memorystatus_get_critical_page_shortage_threshold() *
2638 vm_pageout_memorystatus_fb_factor_nr)) {
2639 *grab_anonymous = FALSE;
2640
2641 VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2642 }
2643 }
2644 if (*grab_anonymous) {
2645 VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2646 }
2647 }
2648 #endif /* CONFIG_JETSAM */
2649
2650 want_anonymous:
2651 if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2652 if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2653 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2654
2655 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2656 *anons_grabbed = 0;
2657
2658 if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2659 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
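/*
 * The modulo below reactivates 99 out of every 100 file-backed
 * candidates while the filecache is under its minimum and anonymous
 * pages are available; every 100th candidate falls through and is
 * stolen anyway (see the comment further down) so the scan keeps
 * making some forward progress against the filecache.
 */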
2660 if ((++(*reactivated_this_call) % 100)) {
2661 vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2662
2663 vm_page_activate(m);
2664 counter_inc(&vm_statistics_reactivations);
2665 #if DEVELOPMENT || DEBUG
2666 if (*is_page_from_bg_q == TRUE) {
2667 if (m_object->internal) {
2668 vm_pageout_rejected_bq_internal++;
2669 } else {
2670 vm_pageout_rejected_bq_external++;
2671 }
2672 }
2673 #endif /* DEVELOPMENT || DEBUG */
2674 vm_pageout_state.vm_pageout_inactive_used++;
2675
2676 m = NULL;
2677 retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2678
2679 goto found_page;
2680 }
2681
2682 /*
2683 * steal 1 of the file backed pages even if
2684 * we are under the limit that has been set
2685 * for a healthy filecache
2686 */
2687 }
2688 }
2689 goto found_page;
2690 }
2691 }
2692 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2693 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2694
2695 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2696 *anons_grabbed += 1;
2697
2698 goto found_page;
2699 }
2700
2701 m = NULL;
2702
2703 found_page:
2704 *victim_page = m;
2705
2706 return retval;
2707 }
2708
2709 /*
2710 * This function is called only from vm_pageout_scan and
2711 * it will put a page back on the active/inactive queue
2712 * if we can't reclaim it for some reason.
2713 */
2714 static void
2715 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2716 {
2717 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2718 vm_page_enqueue_inactive(m, FALSE);
2719 } else {
2720 vm_page_activate(m);
2721 }
2722
2723 #if DEVELOPMENT || DEBUG
2724 vm_object_t m_object = VM_PAGE_OBJECT(m);
2725
2726 if (page_from_bg_q == TRUE) {
2727 if (m_object->internal) {
2728 vm_pageout_rejected_bq_internal++;
2729 } else {
2730 vm_pageout_rejected_bq_external++;
2731 }
2732 }
2733 #endif /* DEVELOPMENT || DEBUG */
2734 }
2735
2736 /*
2737 * This function is called only from vm_pageout_scan and
2738 * it will try to grab the victim page's VM object (m_object)
2739 * which differs from the previous victim page's object (object).
2740 */
2741 static int
2742 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2743 {
2744 struct vm_speculative_age_q *sq;
2745
2746 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2747
2748 /*
2749 * the object associated with the candidate page is
2750 * different from the one we were just working
2751 * with... dump the lock if we still own it
2752 */
2753 if (*object != NULL) {
2754 vm_object_unlock(*object);
2755 *object = NULL;
2756 }
2757 /*
2758 * Try to lock object; since we've already got the
2759 * page queues lock, we can only 'try' for this one.
2760 * if the 'try' fails, we need to do a mutex_pause
2761 * to allow the owner of the object lock a chance to
2762 * run... otherwise, we're likely to trip over this
2763 * object in the same state as we work our way through
2764 * the queue... clumps of pages associated with the same
2765 * object are fairly typical on the inactive and active queues
2766 */
2767 if (!vm_object_lock_try_scan(m_object)) {
2768 vm_page_t m_want = NULL;
2769
2770 vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2771
2772 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2773 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2774 }
2775
2776 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2777
2778 m->vmp_reference = FALSE;
2779
2780 if (!m_object->object_is_shared_cache) {
2781 /*
2782 * don't apply this optimization if this is the shared cache
2783 * object, it's too easy to get rid of very hot and important
2784 * pages...
2785 * m->vmp_object must be stable since we hold the page queues lock...
2786 * we can update the scan_collisions field sans the object lock
2787 * since it is a separate field and this is the only spot that does
2788 * a read-modify-write operation and it is never executed concurrently...
2789 * we can asynchronously set this field to 0 when creating a UPL, so it
2790 * is possible for the value to be a bit non-deterministic, but that's ok
2791 * since it's only used as a hint
2792 */
2793 m_object->scan_collisions = 1;
2794 }
2795 if (page_from_bg_q) {
2796 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2797 } else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2798 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2799 } else if (!vm_page_queue_empty(&sq->age_q)) {
2800 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2801 } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2802 !vm_page_queue_empty(&vm_page_queue_inactive)) {
2803 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2804 } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2805 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2806 }
2807
2808 /*
2809 * this is the next object we're going to be interested in
2810 * try to make sure it's available after the mutex_pause
2811 * returns control
2812 */
2813 if (m_want) {
2814 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2815 }
2816
2817 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2818
2819 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2820 } else {
2821 *object = m_object;
2822 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2823 }
2824
2825 return VM_PAGEOUT_SCAN_PROCEED;
2826 }
2827
2828 /*
2829 * This function is called only from vm_pageout_scan and
2830 * it notices that pageout scan may be rendered ineffective
2831 * due to a FS deadlock and will jetsam a process if possible.
2832 * If jetsam isn't supported, it'll move the page to the active
2833 * queue to try and get some different pages pushed onwards so
2834 * we can try to get out of this scenario.
2835 */
2836 static void
2837 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2838 boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2839 {
2840 struct vm_pageout_queue *eq;
2841 vm_object_t cur_object = VM_OBJECT_NULL;
2842
2843 cur_object = *object;
2844
2845 eq = &vm_pageout_queue_external;
2846
2847 if (cur_object->internal == FALSE) {
2848 /*
2849 * we need to break up the following potential deadlock case...
2850 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2851 * b) The thread doing the writing is waiting for pages while holding the truncate lock
2852 * c) Most of the pages in the inactive queue belong to this file.
2853 *
2854 * we are potentially in this deadlock because...
2855 * a) the external pageout queue is throttled
2856 * b) we're done with the active queue and moved on to the inactive queue
2857 * c) we've got a dirty external page
2858 *
2859 * since we don't know the reason for the external pageout queue being throttled we
2860 * must suspect that we are deadlocked, so move the current page onto the active queue
2861 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2862 *
2863 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2864 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2865 * pool the next time we select a victim page... if we can make enough new free pages,
2866 * the deadlock will break, the external pageout queue will empty and it will no longer
2867 * be throttled
2868 *
2869 * if we have jetsam configured, keep a count of the pages reactivated this way so
2870 * that we can try to find clean pages in the active/inactive queues before
2871 * deciding to jetsam a process
2872 */
2873 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2874
2875 vm_page_check_pageable_safe(m);
2876 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2877 vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2878 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2879 vm_page_active_count++;
2880 vm_page_pageable_external_count++;
2881
2882 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2883
2884 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2885
2886 #pragma unused(force_anonymous)
2887
2888 *vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2889
2890 if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2891 *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2892 /*
2893 * Possible deadlock scenario so request jetsam action
2894 */
2895 memorystatus_kill_on_vps_starvation();
2896 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, DBG_VM_PAGEOUT_JETSAM, DBG_FUNC_NONE,
2897 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2898 }
2899 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2900
2901 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2902
2903 *force_anonymous = TRUE;
2904 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2905 } else {
2906 vm_page_activate(m);
2907 counter_inc(&vm_statistics_reactivations);
2908
2909 #if DEVELOPMENT || DEBUG
2910 if (is_page_from_bg_q == TRUE) {
2911 if (cur_object->internal) {
2912 vm_pageout_rejected_bq_internal++;
2913 } else {
2914 vm_pageout_rejected_bq_external++;
2915 }
2916 }
2917 #endif /* DEVELOPMENT || DEBUG */
2918
2919 vm_pageout_state.vm_pageout_inactive_used++;
2920 }
2921 }
2922
2923
2924 void
2925 vm_page_balance_inactive(int max_to_move)
2926 {
2927 vm_page_t m;
2928
2929 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2930
2931 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2932 /*
2933 * It is likely that the hibernation code path is
2934 * dealing with these very queues as we are about
2935 * to move pages around in/from them and completely
2936 * change the linkage of the pages.
2937 *
2938 * And so we skip the rebalancing of these queues.
2939 */
2940 return;
2941 }
2942 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2943 vm_page_inactive_count +
2944 vm_page_speculative_count);
2945
2946 while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2947 VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2948
2949 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2950
2951 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2952 assert(!m->vmp_laundry);
2953 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
2954 assert(!vm_page_is_guard(m));
2955
2956 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2957
2958 /*
2959 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2960 *
2961 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2962 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2963 * new reference happens. If no further references happen on the page after that remote TLB flushes,
2964 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2965 * by pageout_scan, which is just fine since the last reference would have happened quite far
2966 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2967 * have happened before we moved the page
2968 */
2969 if (m->vmp_pmapped == TRUE) {
2970 /*
2971 * We might be holding the page queue lock as a
2972 * spin lock and clearing the "referenced" bit could
2973 * take a while if there are lots of mappings of
2974 * that page, so make sure we acquire the lock as
2975 * a mutex to avoid a spinlock timeout.
2976 */
2977 vm_page_lockconvert_queues();
2978 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2979 }
2980
2981 /*
2982 * The page might be absent or busy,
2983 * but vm_page_deactivate can handle that.
2984 * FALSE indicates that we don't want a H/W clear reference
2985 */
2986 vm_page_deactivate_internal(m, FALSE);
2987 }
2988 }
2989
2990 /*
2991 * vm_pageout_scan does the dirty work for the pageout daemon.
2992 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2993 * held and vm_page_free_wanted == 0.
2994 */
2995 void
2996 vm_pageout_scan(void)
2997 {
2998 unsigned int loop_count = 0;
2999 unsigned int inactive_burst_count = 0;
3000 unsigned int reactivated_this_call;
3001 unsigned int reactivate_limit;
3002 vm_page_t local_freeq = NULL;
3003 int local_freed = 0;
3004 int delayed_unlock;
3005 int delayed_unlock_limit = 0;
3006 int refmod_state = 0;
3007 int vm_pageout_deadlock_target = 0;
3008 struct vm_pageout_queue *iq;
3009 struct vm_pageout_queue *eq;
3010 struct vm_speculative_age_q *sq;
3011 struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
3012 boolean_t inactive_throttled = FALSE;
3013 vm_object_t object = NULL;
3014 uint32_t inactive_reclaim_run;
3015 boolean_t grab_anonymous = FALSE;
3016 boolean_t force_anonymous = FALSE;
3017 boolean_t force_speculative_aging = FALSE;
3018 int anons_grabbed = 0;
3019 int page_prev_q_state = 0;
3020 boolean_t page_from_bg_q = FALSE;
3021 uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
3022 vm_object_t m_object = VM_OBJECT_NULL;
3023 int retval = 0;
3024 boolean_t lock_yield_check = FALSE;
3025
3026
3027 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_START,
3028 vm_pageout_vminfo.vm_pageout_freed_speculative,
3029 vm_pageout_state.vm_pageout_inactive_clean,
3030 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3031 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3032
3033 flow_control.state = FCS_IDLE;
3034 iq = &vm_pageout_queue_internal;
3035 eq = &vm_pageout_queue_external;
3036 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3037
3038 /* Ask the pmap layer to return any pages it no longer needs. */
3039 pmap_release_pages_fast();
3040
3041 vm_page_lock_queues();
3042
3043 delayed_unlock = 1;
3044
3045 /*
3046 * Calculate the max number of referenced pages on the inactive
3047 * queue that we will reactivate.
3048 */
3049 reactivated_this_call = 0;
3050 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3051 vm_page_inactive_count);
3052 inactive_reclaim_run = 0;
3053
3054 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3055
3056 /*
3057 * We must limit the rate at which we send pages to the pagers
3058 * so that we don't tie up too many pages in the I/O queues.
3059 * We implement a throttling mechanism using the laundry count
3060 * to limit the number of pages outstanding to the default
3061 * and external pagers. We can bypass the throttles and look
3062 * for clean pages if the pageout queues don't drain in a timely
3063 * fashion since this may indicate that the pageout paths are
3064 * stalled waiting for memory, which only we can provide.
3065 */
3066
3067 vps_init_page_targets();
3068 assert(object == NULL);
3069 assert(delayed_unlock != 0);
3070
3071 for (;;) {
3072 vm_page_t m;
3073
3074 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3075
3076 if (lock_yield_check) {
3077 lock_yield_check = FALSE;
3078
3079 if (delayed_unlock++ > delayed_unlock_limit) {
3080 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3081 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3082 } else if (vm_pageout_scan_wants_object) {
3083 vm_page_unlock_queues();
3084 mutex_pause(0);
3085 vm_page_lock_queues();
3086 } else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3087 VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3088 }
3089 }
3090
3091 if (vm_upl_wait_for_pages < 0) {
3092 vm_upl_wait_for_pages = 0;
3093 }
3094
3095 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3096
3097 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3098 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3099 }
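/*
 * delayed_unlock counts loop iterations (pages considered) since the
 * page queues lock was last yielded; once it exceeds
 * delayed_unlock_limit, the lock_yield_check path at the top of the
 * loop calls vm_pageout_prepare_to_block(), which (among other things)
 * processes the locally batched free pages and gives other lock
 * waiters a chance to run. The limit grows with vm_upl_wait_for_pages
 * but is clamped to VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX.
 */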
3100
3101 vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3102
3103 assert(delayed_unlock);
3104
3105 /*
3106 * maintain our balance
3107 */
3108 vm_page_balance_inactive(1);
3109
3110
3111 /**********************************************************************
3112 * above this point we're playing with the active and secluded queues
3113 * below this point we're playing with the throttling mechanisms
3114 * and the inactive queue
3115 **********************************************************************/
3116
3117 if (vm_page_free_count + local_freed >= vm_page_free_target) {
3118 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3119
3120 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3121 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3122 /*
3123 * make sure the pageout I/O threads are running
3124 * throttled in case there are still requests
3125 * in the laundry... since we have met our targets
3126 * we don't need the laundry to be cleaned in a timely
3127 * fashion... so let's avoid interfering with foreground
3128 * activity
3129 */
3130 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3131
3132 vm_free_page_lock();
3133
3134 if ((vm_page_free_count >= vm_page_free_target) &&
3135 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3136 /*
3137 * done - we have met our target *and*
3138 * there is no one waiting for a page.
3139 */
3140 return_from_scan:
3141 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3142
3143 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3144 vm_pageout_state.vm_pageout_inactive,
3145 vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3146 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_END,
3147 vm_pageout_vminfo.vm_pageout_freed_speculative,
3148 vm_pageout_state.vm_pageout_inactive_clean,
3149 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3150 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3151
3152 return;
3153 }
3154 vm_free_page_unlock();
3155 }
3156
3157 /*
3158 * Before anything, we check if we have any ripe volatile
3159 * objects around. If so, try to purge the first object.
3160 * If the purge fails, fall through to reclaim a page instead.
3161 * If the purge succeeds, go back to the top and re-evaluate
3162 * the new memory situation.
3163 */
3164 retval = vps_purge_object();
3165
3166 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3167 /*
3168 * Success
3169 */
3170 if (object != NULL) {
3171 vm_object_unlock(object);
3172 object = NULL;
3173 }
3174
3175 lock_yield_check = FALSE;
3176 continue;
3177 }
3178
3179 /*
3180 * If our 'aged' queue is empty and we have some speculative pages
3181 * in the other queues, let's go through and see if we need to age
3182 * them.
3183 *
3184 * If we succeeded in aging a speculative Q or just that everything
3185 * looks normal w.r.t queue age and queue counts, we keep going onward.
3186 *
3187 * If, for some reason, we seem to have a mismatch between the spec.
3188 * page count and the page queues, we reset those variables and
3189 * restart the loop (LD TODO: Track this better?).
3190 */
3191 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3192 retval = vps_age_speculative_queue(force_speculative_aging);
3193
3194 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3195 lock_yield_check = FALSE;
3196 continue;
3197 }
3198 }
3199 force_speculative_aging = FALSE;
3200
3201 /*
3202 * Check to see if we need to evict objects from the cache.
3203 *
3204 * Note: 'object' here doesn't have anything to do with
3205 * the eviction part. We just need to make sure we have dropped
3206 * any object lock we might be holding if we need to go down
3207 * into the eviction logic.
3208 */
3209 retval = vps_object_cache_evict(&object);
3210
3211 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3212 lock_yield_check = FALSE;
3213 continue;
3214 }
3215
3216
3217 /*
3218 * Calculate our filecache_min that will affect the loop
3219 * going forward.
3220 */
3221 vps_calculate_filecache_min();
3222
3223 /*
3224 * LD TODO: Use a structure to hold all state variables for a single
3225 * vm_pageout_scan iteration and pass that structure to this function instead.
3226 */
3227 retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3228 &delayed_unlock, &local_freeq, &local_freed,
3229 &vm_pageout_deadlock_target, inactive_burst_count);
3230
3231 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3232 if (loop_count >= vm_page_inactive_count) {
3233 loop_count = 0;
3234 }
3235
3236 inactive_burst_count = 0;
3237
3238 assert(object == NULL);
3239 assert(delayed_unlock != 0);
3240
3241 lock_yield_check = FALSE;
3242 continue;
3243 } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3244 goto return_from_scan;
3245 }
3246
3247 flow_control.state = FCS_IDLE;
3248
3249 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3250 vm_pageout_inactive_external_forced_reactivate_limit);
3251 loop_count++;
3252 inactive_burst_count++;
3253 vm_pageout_state.vm_pageout_inactive++;
3254
3255 /*
3256 * Choose a victim.
3257 */
3258
3259 m = NULL;
3260 retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3261
3262 if (m == NULL) {
3263 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3264 inactive_burst_count = 0;
3265
3266 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3267 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3268 }
3269
3270 lock_yield_check = TRUE;
3271 continue;
3272 }
3273
3274 /*
3275 * if we've gotten here, we have no victim page.
3276 * check to see if we've not finished balancing the queues
3277 * or we have a page on the aged speculative queue that we
3278 * skipped due to force_anonymous == TRUE... or we have
3279 * speculative pages that we can prematurely age... if it's
3280 * one of these cases we'll keep going, else panic
3281 */
3282 force_anonymous = FALSE;
3283 VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3284
3285 if (!vm_page_queue_empty(&sq->age_q)) {
3286 lock_yield_check = TRUE;
3287 continue;
3288 }
3289
3290 if (vm_page_speculative_count) {
3291 force_speculative_aging = TRUE;
3292 lock_yield_check = TRUE;
3293 continue;
3294 }
3295 panic("vm_pageout: no victim");
3296
3297 /* NOTREACHED */
3298 }
3299
3300 assert(VM_PAGE_PAGEABLE(m));
3301 m_object = VM_PAGE_OBJECT(m);
3302 force_anonymous = FALSE;
3303
3304 page_prev_q_state = m->vmp_q_state;
3305 /*
3306 * we just found this page on one of our queues...
3307 * it can't also be on the pageout queue, so safe
3308 * to call vm_page_queues_remove
3309 */
3310 bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3311 vm_page_queues_remove(m, TRUE);
3312 if (donate) {
3313 /*
3314 * The compressor needs to see this bit to know
3315 * where this page needs to land. Also if stolen,
3316 * this bit helps put the page back in the right
3317 * special queue where it belongs.
3318 */
3319 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3320 }
3321
3322 assert(!m->vmp_laundry);
3323 assert(vm_page_is_canonical(m));
3324 assert(!is_kernel_object(m_object));
3325
3326 vm_pageout_vminfo.vm_pageout_considered_page++;
3327
3328 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3329
3330 /*
3331 * check to see if we currently are working
3332 * with the same object... if so, we've
3333 * already got the lock
3334 */
3335 if (m_object != object) {
3336 boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3337
3338 /*
3339 * vps_switch_object() will always drop the 'object' lock first
3340 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3341 * either 'm_object' or NULL.
3342 */
3343 retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3344
3345 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3346 lock_yield_check = TRUE;
3347 continue;
3348 }
3349 }
3350 assert(m_object == object);
3351 assert(VM_PAGE_OBJECT(m) == m_object);
3352
3353 if (m->vmp_busy) {
3354 /*
3355 * Somebody is already playing with this page.
3356 * Put it back on the appropriate queue
3357 *
3358 */
3359 VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3360
3361 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3362 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3363 }
3364
3365 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3366
3367 lock_yield_check = TRUE;
3368 continue;
3369 }
3370
3371 /*
3372 * if (m->vmp_cleaning && !m->vmp_free_when_done)
3373 * If already cleaning this page in place
3374 * just leave it off the paging queues.
3375 * We can leave the page mapped, and upl_commit_range
3376 * will put it on the clean queue.
3377 *
3378 * if (m->vmp_free_when_done && !m->vmp_cleaning)
3379 * an msync INVALIDATE is in progress...
3380 * this page has been marked for destruction
3381 * after it has been cleaned,
3382 * but not yet gathered into a UPL
3383 * where 'cleaning' will be set...
3384 * just leave it off the paging queues
3385 *
3386 * if (m->vmp_free_when_done && m->vmp_cleaning)
3387 * an msync INVALIDATE is in progress
3388 * and the UPL has already gathered this page...
3389 * just leave it off the paging queues
3390 */
3391 if (m->vmp_free_when_done || m->vmp_cleaning) {
3392 lock_yield_check = TRUE;
3393 continue;
3394 }
3395
3396
3397 /*
3398 * If it's absent, in error or the object is no longer alive,
3399 * we can reclaim the page... in the no longer alive case,
3400 * there are 2 states the page can be in that preclude us
3401 * from reclaiming it - busy or cleaning - that we've already
3402 * dealt with
3403 */
3404 if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3405 (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3406 if (m->vmp_absent) {
3407 VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3408 } else if (!object->alive ||
3409 (!object->internal &&
3410 object->pager == MEMORY_OBJECT_NULL)) {
3411 VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3412 } else {
3413 VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3414 }
3415 if (m->vmp_pmapped) {
3416 int refmod;
3417
3418 /*
3419 * If this page was file-backed and wired while its pager
3420 * was lost (during a forced unmount, for example), there
3421 * could still be some pmap mappings that need to be
3422 * cleaned up before we can free the page.
3423 */
3424 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3425 if ((refmod & VM_MEM_MODIFIED) &&
3426 !m->vmp_dirty) {
3427 SET_PAGE_DIRTY(m, FALSE);
3428 }
3429 }
3430 reclaim_page:
3431 if (vm_pageout_deadlock_target) {
3432 VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3433 vm_pageout_deadlock_target--;
3434 }
3435
3436 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3437
3438 if (object->internal) {
3439 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3440 } else {
3441 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3442 }
3443 assert(!m->vmp_cleaning);
3444 assert(!m->vmp_laundry);
3445
3446 if (!object->internal &&
3447 object->pager != NULL &&
3448 object->pager->mo_pager_ops == &shared_region_pager_ops) {
3449 shared_region_pager_reclaimed++;
3450 }
3451
3452 m->vmp_busy = TRUE;
3453
3454 /*
3455 * remove page from object here since we're already
3456 * behind the object lock... defer the rest of the work
3457 * we'd normally do in vm_page_free_prepare_object
3458 * until 'vm_page_free_list' is called
3459 */
3460 if (m->vmp_tabled) {
3461 vm_page_remove(m, TRUE);
3462 }
3463
3464 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3465 m->vmp_snext = local_freeq;
3466 local_freeq = m;
3467 local_freed++;
3468
3469 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3470 vm_pageout_vminfo.vm_pageout_freed_speculative++;
3471 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3472 vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3473 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3474 vm_pageout_vminfo.vm_pageout_freed_internal++;
3475 } else {
3476 vm_pageout_vminfo.vm_pageout_freed_external++;
3477 }
3478
3479 inactive_burst_count = 0;
3480
3481 lock_yield_check = TRUE;
3482 continue;
3483 }
3484 if (object->vo_copy == VM_OBJECT_NULL) {
3485 /*
3486 * No one else can have any interest in this page.
3487 * If this is an empty purgable object, the page can be
3488 * reclaimed even if dirty.
3489 * If the page belongs to a volatile purgable object, we
3490 * reactivate it if the compressor isn't active.
3491 */
3492 if (object->purgable == VM_PURGABLE_EMPTY) {
3493 if (m->vmp_pmapped == TRUE) {
3494 /* unmap the page */
3495 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3496 if (refmod_state & VM_MEM_MODIFIED) {
3497 SET_PAGE_DIRTY(m, FALSE);
3498 }
3499 }
3500 if (m->vmp_dirty || m->vmp_precious) {
3501 /* we saved the cost of cleaning this page ! */
3502 vm_page_purged_count++;
3503 }
3504 goto reclaim_page;
3505 }
3506
3507 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3508 /*
3509 * With the VM compressor, the cost of
3510 * reclaiming a page is much lower (no I/O),
3511 * so if we find a "volatile" page, it's better
3512 * to let it get compressed rather than letting
3513 * it occupy a full page until it gets purged.
3514 * So no need to check for "volatile" here.
3515 */
3516 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
3517 /*
3518 * Avoid cleaning a "volatile" page which might
3519 * be purged soon.
3520 */
3521
3522 /* if it's wired, we can't put it on our queue */
3523 assert(!VM_PAGE_WIRED(m));
3524
3525 /* just stick it back on! */
3526 reactivated_this_call++;
3527
3528 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3529 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3530 }
3531
3532 goto reactivate_page;
3533 }
3534 } /* vo_copy NULL */
3535 /*
3536 * If it's being used, reactivate.
3537 * (Fictitious pages are either busy or absent.)
3538 * First, update the reference and dirty bits
3539 * to make sure the page is unreferenced.
3540 */
3541 refmod_state = -1;
3542
3543 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3544 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3545
3546 if (refmod_state & VM_MEM_REFERENCED) {
3547 m->vmp_reference = TRUE;
3548 }
3549 if (refmod_state & VM_MEM_MODIFIED) {
3550 SET_PAGE_DIRTY(m, FALSE);
3551 }
3552 }
3553
3554 if (m->vmp_reference || m->vmp_dirty) {
3555 /* deal with a rogue "reusable" page */
3556 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3557 }
3558
3559 if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3560 vm_pageout_state.vm_page_xpmapped_min = 0;
3561 } else {
3562 vm_pageout_state.vm_page_xpmapped_min = (vm_page_pageable_external_count * 10) /
3563 vm_pageout_state.vm_page_xpmapped_min_divisor;
3564 }
3565
3566 if (!m->vmp_no_cache &&
3567 page_from_bg_q == FALSE &&
3568 (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3569 (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3570 /*
3571 * The page we pulled off the inactive list has
3572 * been referenced. It is possible for other
3573 * processors to be touching pages faster than we
3574 * can clear the referenced bit and traverse the
3575 * inactive queue, so we limit the number of
3576 * reactivations.
3577 */
3578 if (++reactivated_this_call >= reactivate_limit &&
3579 !object->object_is_shared_cache &&
3580 !((m->vmp_realtime ||
3581 object->for_realtime) &&
3582 vm_pageout_protect_realtime)) {
3583 vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3584 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3585 vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3586 if (object->object_is_shared_cache) {
3587 vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3588 } else if (m->vmp_realtime ||
3589 object->for_realtime) {
3590 vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3591 }
3592 } else {
3593 uint32_t isinuse;
3594
3595 if (reactivated_this_call >= reactivate_limit) {
3596 if (object->object_is_shared_cache) {
3597 vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3598 } else if ((m->vmp_realtime ||
3599 object->for_realtime) &&
3600 vm_pageout_protect_realtime) {
3601 vm_pageout_vminfo.vm_pageout_protected_realtime++;
3602 }
3603 }
3604 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3605 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3606 }
3607
3608 vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3609 reactivate_page:
3610 if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3611 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3612 /*
3613 * no explicit mappings of this object exist
3614 * and it's not open via the filesystem
3615 */
3616 vm_page_deactivate(m);
3617 VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3618 } else {
3619 /*
3620 * The page was/is being used, so put back on active list.
3621 */
3622 vm_page_activate(m);
3623 counter_inc(&vm_statistics_reactivations);
3624 inactive_burst_count = 0;
3625 }
3626 #if DEVELOPMENT || DEBUG
3627 if (page_from_bg_q == TRUE) {
3628 if (m_object->internal) {
3629 vm_pageout_rejected_bq_internal++;
3630 } else {
3631 vm_pageout_rejected_bq_external++;
3632 }
3633 }
3634 #endif /* DEVELOPMENT || DEBUG */
3635
3636 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3637 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3638 }
3639 vm_pageout_state.vm_pageout_inactive_used++;
3640
3641 lock_yield_check = TRUE;
3642 continue;
3643 }
3644 /*
3645 * Make sure we call pmap_get_refmod() if it
3646 * wasn't already called just above, to update
3647 * the dirty bit.
3648 */
3649 if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3650 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3651 if (refmod_state & VM_MEM_MODIFIED) {
3652 SET_PAGE_DIRTY(m, FALSE);
3653 }
3654 }
3655 }
3656
3657 /*
3658 * we've got a candidate page to steal...
3659 *
3660 * m->vmp_dirty is up to date courtesy of the
3661 * preceding check for m->vmp_reference... if
3662 * we get here, then m->vmp_reference had to be
3663 * FALSE (or possibly "reactivate_limit" was
3664 * exceeded), but in either case we called
3665 * pmap_get_refmod() and updated both
3666 * m->vmp_reference and m->vmp_dirty
3667 *
3668 * if it's dirty or precious we need to
3669 * see if the target queue is throttled...
3670 * if it is, we need to skip over it by moving it back
3671 * to the end of the inactive queue
3672 */
3673
3674 inactive_throttled = FALSE;
3675
3676 if (m->vmp_dirty || m->vmp_precious) {
3677 if (object->internal) {
3678 if (VM_PAGE_Q_THROTTLED(iq)) {
3679 inactive_throttled = TRUE;
3680 }
3681 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3682 inactive_throttled = TRUE;
3683 }
3684 }
3685 throttle_inactive:
3686 if (!VM_DYNAMIC_PAGING_ENABLED() &&
3687 object->internal && m->vmp_dirty &&
3688 (object->purgable == VM_PURGABLE_DENY ||
3689 object->purgable == VM_PURGABLE_NONVOLATILE ||
3690 object->purgable == VM_PURGABLE_VOLATILE)) {
3691 vm_page_check_pageable_safe(m);
3692 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3693 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3694 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3695 vm_page_throttled_count++;
3696
3697 VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3698
3699 inactive_burst_count = 0;
3700
3701 lock_yield_check = TRUE;
3702 continue;
3703 }
3704 if (inactive_throttled == TRUE) {
3705 vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3706 &force_anonymous, page_from_bg_q);
3707
3708 inactive_burst_count = 0;
3709
3710 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3711 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3712 }
3713
3714 lock_yield_check = TRUE;
3715 continue;
3716 }
3717
3718 /*
3719 * we've got a page that we can steal...
3720 * eliminate all mappings and make sure
3721 * we have the up-to-date modified state
3722 *
3723 * if we need to do a pmap_disconnect then we
3724 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3725 * provides the true state atomically... the
3726 * page was still mapped up to the pmap_disconnect
3727 * and may have been dirtied at the last microsecond
3728 *
3729 * Note that if 'pmapped' is FALSE then the page is not
3730 * and has not been in any map, so there is no point calling
3731 * pmap_disconnect(). m->vmp_dirty could have been set in anticipation
3732 * of likely usage of the page.
3733 */
3734 if (m->vmp_pmapped == TRUE) {
3735 int pmap_options;
3736
3737 /*
3738 * Don't count this page as going into the compressor
3739 * if any of these are true:
3740 * 1) compressed pager isn't enabled
3741 * 2) Freezer enabled device with compressed pager
3742 * backend (exclusive use) i.e. most of the VM system
3743 * (including vm_pageout_scan) has no knowledge of
3744 * the compressor
3745 * 3) This page belongs to a file and hence will not be
3746 * sent into the compressor
3747 */
3748 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3749 object->internal == FALSE) {
3750 pmap_options = 0;
3751 } else if (m->vmp_dirty || m->vmp_precious) {
3752 /*
3753 * VM knows that this page is dirty (or
3754 * precious) and needs to be compressed
3755 * rather than freed.
3756 * Tell the pmap layer to count this page
3757 * as "compressed".
3758 */
3759 pmap_options = PMAP_OPTIONS_COMPRESSOR;
3760 } else {
3761 /*
3762 * VM does not know if the page needs to
3763 * be preserved but the pmap layer might tell
3764 * us if any mapping has "modified" it.
3765 * Let the pmap layer count this page
3766 * as compressed if and only if it has been
3767 * modified.
3768 */
3769 pmap_options =
3770 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3771 }
3772 refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3773 pmap_options,
3774 NULL);
3775 if (refmod_state & VM_MEM_MODIFIED) {
3776 SET_PAGE_DIRTY(m, FALSE);
3777 }
3778 }
3779
3780 /*
3781 * reset our count of pages that have been reclaimed
3782 * since the last page was 'stolen'
3783 */
3784 inactive_reclaim_run = 0;
3785
3786 /*
3787 * If it's clean and not precious, we can free the page.
3788 */
3789 if (!m->vmp_dirty && !m->vmp_precious) {
3790 vm_pageout_state.vm_pageout_inactive_clean++;
3791
3792 /*
3793 * OK, at this point we have found a page we are going to free.
3794 */
3795 #if CONFIG_PHANTOM_CACHE
3796 if (!object->internal) {
3797 vm_phantom_cache_add_ghost(m);
3798 }
3799 #endif
3800 goto reclaim_page;
3801 }
3802
3803 /*
3804 * The page may have been dirtied since the last check
3805 * for a throttled target queue (which may have been skipped
3806 * if the page was clean then). With the dirty page
3807 * disconnected here, we can make one final check.
3808 */
3809 if (object->internal) {
3810 if (VM_PAGE_Q_THROTTLED(iq)) {
3811 inactive_throttled = TRUE;
3812 }
3813 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3814 inactive_throttled = TRUE;
3815 }
3816
3817 if (inactive_throttled == TRUE) {
3818 goto throttle_inactive;
3819 }
3820 #if !CONFIG_JETSAM
3821 memorystatus_update_available_page_count(AVAILABLE_NON_COMPRESSED_MEMORY);
3822 #endif /* !CONFIG_JETSAM */
3823
3824 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3825 VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3826 }
3827
3828 if (object->internal) {
3829 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3830 } else {
3831 vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3832 }
3833
3834 /*
3835 * internal pages will go to the compressor...
3836 * external pages will go to the appropriate pager to be cleaned
3837 * and upon completion will end up on 'vm_page_queue_cleaned' which
3838 * is a preferred queue to steal from
3839 */
3840 vm_pageout_cluster(m);
3841 inactive_burst_count = 0;
3842
3843 /*
3844 * back to top of pageout scan loop
3845 */
3846 }
3847 }
3848
3849
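/*
 * vm_page_free_reserve:
 *
 * Grow the free-page reserve by 'pages' (plus an extra
 * COMPRESSOR_FREE_RESERVED_LIMIT when a compressor is configured),
 * capping it at the compile-time limits, then recompute the derived
 * thresholds: vm_page_free_min, vm_page_free_target and
 * vm_page_throttle_limit.
 */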
3850 void
3851 vm_page_free_reserve(
3852 int pages)
3853 {
3854 int free_after_reserve;
3855
3856 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3857 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3858 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3859 } else {
3860 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3861 }
3862 } else {
3863 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3864 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3865 } else {
3866 vm_page_free_reserved += pages;
3867 }
3868 }
3869 free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3870
3871 vm_page_free_min = vm_page_free_reserved +
3872 VM_PAGE_FREE_MIN(free_after_reserve);
3873
3874 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3875 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3876 }
3877
3878 vm_page_free_target = vm_page_free_reserved +
3879 VM_PAGE_FREE_TARGET(free_after_reserve);
3880
3881 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3882 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3883 }
3884
3885 if (vm_page_free_target < vm_page_free_min + 5) {
3886 vm_page_free_target = vm_page_free_min + 5;
3887 }
3888
3889 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3890 }
3891
3892 /*
3893 * vm_pageout is the high level pageout daemon.
3894 */
3895
3896 void
3897 vm_pageout_continue(void)
3898 {
3899 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3900 VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3901
3902 vm_free_page_lock();
3903 vm_pageout_running = TRUE;
3904 vm_free_page_unlock();
3905
3906 vm_pageout_scan();
3907 /*
3908 * we hold both the vm_page_queue_free_lock
3909 * and the vm_page_queues_lock at this point
3910 */
3911 assert(vm_page_free_wanted == 0);
3912 assert(vm_page_free_wanted_privileged == 0);
3913 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3914
3915 vm_pageout_running = FALSE;
3916 #if XNU_TARGET_OS_OSX
3917 if (vm_pageout_waiter) {
3918 vm_pageout_waiter = FALSE;
3919 thread_wakeup((event_t)&vm_pageout_waiter);
3920 }
3921 #endif /* XNU_TARGET_OS_OSX */
3922
3923 vm_free_page_unlock();
3924 vm_page_unlock_queues();
3925
3926 thread_block((thread_continue_t)vm_pageout_continue);
3927 /*NOTREACHED*/
3928 }
3929
3930 #if XNU_TARGET_OS_OSX
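/*
 * vm_pageout_wait:
 *
 * Block until the in-flight vm_pageout_scan pass finishes (the waiter is
 * woken from vm_pageout_continue) or until 'deadline' expires, in which
 * case KERN_OPERATION_TIMED_OUT is returned.
 */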
3931 kern_return_t
3932 vm_pageout_wait(uint64_t deadline)
3933 {
3934 kern_return_t kr;
3935
3936 vm_free_page_lock();
3937 for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
3938 vm_pageout_waiter = TRUE;
3939 if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3940 &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3941 (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3942 kr = KERN_OPERATION_TIMED_OUT;
3943 }
3944 }
3945 vm_free_page_unlock();
3946
3947 return kr;
3948 }
3949 #endif /* XNU_TARGET_OS_OSX */
3950
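/*
 * vm_pageout_iothread_external_continue:
 *
 * Main loop of the external pageout thread: pull pages off this thread's
 * pgo_pending queue, re-look each one up under its object lock, and hand
 * it to the object's pager via memory_object_data_return(). When the
 * queue drains, the thread parks on its sched_cond until more laundry
 * arrives.
 */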
3951 OS_NORETURN
3952 static void
3953 vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
3954 {
3955 vm_page_t m = NULL;
3956 vm_object_t object;
3957 vm_object_offset_t offset;
3958 memory_object_t pager;
3959 struct vm_pageout_queue *q = ethr->q;
3960
3961 /* On systems with a compressor, the external IO thread clears its
3962 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3963 * creation)
3964 */
3965 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3966 current_thread()->options &= ~TH_OPT_VMPRIV;
3967 }
3968
3969 sched_cond_ack(&(ethr->pgo_wakeup));
3970
3971 while (true) {
3972 vm_page_lockspin_queues();
3973
3974 while (!vm_page_queue_empty(&q->pgo_pending)) {
3975 q->pgo_busy = TRUE;
3976 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3977
3978 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3979 VM_PAGE_CHECK(m);
3980 /*
3981 * grab a snapshot of the object and offset this
3982 * page is tabled in so that we can relookup this
3983 * page after we've taken the object lock - these
3984 * fields are stable while we hold the page queues lock
3985 * but as soon as we drop it, there is nothing to keep
3986 * this page in this object... we hold an activity_in_progress
3987 * on this object which will keep it from terminating
3988 */
3989 object = VM_PAGE_OBJECT(m);
3990 offset = m->vmp_offset;
3991
3992 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3993 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3994
3995 vm_page_unlock_queues();
3996
3997 vm_object_lock(object);
3998
3999 m = vm_page_lookup(object, offset);
4000
4001 if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
4002 !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
4003 /*
4004 * it's either the same page that someone else has
4005 * started cleaning (or it's finished cleaning or
4006 * been put back on the pageout queue), or
4007 * the page has been freed or we have found a
4008 * new page at this offset... in all of these cases
4009 * we merely need to release the activity_in_progress
4010 * we took when we put the page on the pageout queue
4011 */
4012 vm_object_activity_end(object);
4013 vm_object_unlock(object);
4014
4015 vm_page_lockspin_queues();
4016 continue;
4017 }
4018 pager = object->pager;
4019
4020 if (pager == MEMORY_OBJECT_NULL) {
4021 /*
4022 * This pager has been destroyed by either
4023 * memory_object_destroy or vm_object_destroy, and
4024 * so there is nowhere for the page to go.
4025 */
4026 if (m->vmp_free_when_done) {
4027 /*
4028 * Just free the page... VM_PAGE_FREE takes
4029 * care of cleaning up all the state...
4030 * including doing the vm_pageout_throttle_up
4031 */
4032 VM_PAGE_FREE(m);
4033 } else {
4034 vm_page_lockspin_queues();
4035
4036 vm_pageout_throttle_up(m);
4037 vm_page_activate(m);
4038
4039 vm_page_unlock_queues();
4040
4041 /*
4042 * And we are done with it.
4043 */
4044 }
4045 vm_object_activity_end(object);
4046 vm_object_unlock(object);
4047
4048 vm_page_lockspin_queues();
4049 continue;
4050 }
4051 #if 0
4052 /*
4053 * we don't hold the page queue lock
4054 * so this check isn't safe to make
4055 */
4056 VM_PAGE_CHECK(m);
4057 #endif
4058 /*
4059 * give back the activity_in_progress reference we
4060 * took when we queued up this page and replace it
4061 * it with a paging_in_progress reference that will
4062 * also hold the paging offset from changing and
4063 * prevent the object from terminating
4064 */
4065 vm_object_activity_end(object);
4066 vm_object_paging_begin(object);
4067 vm_object_unlock(object);
4068
4069 /*
4070 * Send the data to the pager.
4071 * any pageout clustering happens there
4072 */
4073 memory_object_data_return(pager,
4074 m->vmp_offset + object->paging_offset,
4075 PAGE_SIZE,
4076 NULL,
4077 NULL,
4078 FALSE,
4079 FALSE,
4080 0);
4081
4082 vm_object_lock(object);
4083 vm_object_paging_end(object);
4084 vm_object_unlock(object);
4085
4086 vm_pageout_io_throttle();
4087
4088 vm_page_lockspin_queues();
4089 }
4090 q->pgo_busy = FALSE;
4091
4092 vm_page_unlock_queues();
4093 sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
4094 }
4095 /*NOTREACHED*/
4096 }
4097
4098 uint32_t vm_compressor_time_thread; /* Set via sysctl 'vm.compressor_timing_enabled' to record time accrued by this thread. */
4099
4100 #if DEVELOPMENT || DEBUG
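/*
 * Record per-compressor-thread runtime and page counts; only collected
 * when 'vm_compressor_time_thread' has been enabled via sysctl.
 */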
4101 static void
4102 vm_pageout_record_thread_time(int cqid, int ncomps)
4103 {
4104 if (__improbable(vm_compressor_time_thread)) {
4105 vmct_stats.vmct_runtimes[cqid] = thread_get_runtime_self();
4106 vmct_stats.vmct_pages[cqid] += ncomps;
4107 vmct_stats.vmct_iterations[cqid]++;
4108 if (ncomps > vmct_stats.vmct_maxpages[cqid]) {
4109 vmct_stats.vmct_maxpages[cqid] = ncomps;
4110 }
4111 if (ncomps < vmct_stats.vmct_minpages[cqid]) {
4112 vmct_stats.vmct_minpages[cqid] = ncomps;
4113 }
4114 }
4115 }
4116 #endif
4117
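/*
 * Pick the compressor segment head this page should fill: donated pages go
 * to the early (macOS) or late swapout head, everything else to the regular
 * head for this compressor thread.
 */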
4118 static void *
4119 vm_pageout_select_filling_chead(struct pgo_iothread_state *cq, vm_page_t m)
4120 {
4121 /*
4122 * Technically we need the pageq locks to manipulate the vmp_on_specialq field.
4123 * However, this page has been removed from all queues and is only
4124 * known to this compressor thread dealing with this local queue.
4125 *
4126 * TODO: Add a second localq that is the early localq and
4127 * put special pages like this one on that queue in the block above
4128 * under the pageq lock to avoid this 'works but not clean' logic.
4129 */
4130 void *donate_queue_head;
4131 #if XNU_TARGET_OS_OSX /* tag:DONATE */
4132 donate_queue_head = &cq->current_early_swapout_chead;
4133 #else /* XNU_TARGET_OS_OSX */
4134 donate_queue_head = &cq->current_late_swapout_chead;
4135 #endif /* XNU_TARGET_OS_OSX */
4136 if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4137 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4138 return donate_queue_head;
4139 } else {
4140 return &cq->current_regular_swapout_chead;
4141 }
4142 }
4143
4144 #define MAX_FREE_BATCH 32
4145
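/*
 * vm_pageout_iothread_internal_continue:
 *
 * Main loop of a compressor thread: drain up to local_batch_size pages at a
 * time from the internal pageout queue onto a local list, compress each one
 * via vm_pageout_compress_page(), and free the source pages in batches of
 * MAX_FREE_BATCH. The thread parks on its sched_cond when its queue is empty.
 */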
4146 OS_NORETURN
4147 static void
4148 vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
4149 {
4150 struct vm_pageout_queue *q;
4151 vm_page_t m = NULL;
4152 boolean_t pgo_draining;
4153 vm_page_t local_q;
4154 int local_cnt;
4155 vm_page_t local_freeq = NULL;
4156 int local_freed = 0;
4157 int local_batch_size;
4158 #if DEVELOPMENT || DEBUG
4159 int ncomps = 0;
4160 boolean_t marked_active = FALSE;
4161 int num_pages_processed = 0;
4162 #endif
4163 void *chead = NULL;
4164
4165 KDBG_FILTERED(0xe040000c | DBG_FUNC_END);
4166
4167 sched_cond_ack(&(cq->pgo_wakeup));
4168
4169 q = cq->q;
4170
4171 while (true) { /* this top loop is for the compressor_running_perf_test running at full speed without blocking */
4172 #if DEVELOPMENT || DEBUG
4173 bool benchmark_accounting = false;
4174 /* If we're running the compressor perf test, only process the benchmark pages.
4175 * We'll get back to our regular queue once the benchmark is done */
4176 if (compressor_running_perf_test) {
4177 q = cq->benchmark_q;
4178 if (!vm_page_queue_empty(&q->pgo_pending)) {
4179 benchmark_accounting = true;
4180 } else {
4181 q = cq->q;
4182 benchmark_accounting = false;
4183 }
4184 }
4185 #endif /* DEVELOPMENT || DEBUG */
4186
4187 #if __AMP__
4188 if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
4189 local_batch_size = (q->pgo_maxlaundry >> 3);
4190 local_batch_size = MAX(local_batch_size, 16);
4191 } else {
4192 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4193 }
4194 #else
4195 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4196 #endif
4197
4198 #if RECORD_THE_COMPRESSED_DATA
4199 if (q->pgo_laundry) {
4200 c_compressed_record_init();
4201 }
4202 #endif
4203 while (true) { /* this loop is for working through all the pages in the pending queue */
4204 int pages_left_on_q = 0;
4205
4206 local_cnt = 0;
4207 local_q = NULL;
4208
4209 KDBG_FILTERED(0xe0400014 | DBG_FUNC_START);
4210
4211 vm_page_lock_queues();
4212 #if DEVELOPMENT || DEBUG
4213 if (marked_active == FALSE) {
4214 vmct_active++;
4215 vmct_state[cq->id] = VMCT_ACTIVE;
4216 marked_active = TRUE;
4217 if (vmct_active == 1) {
4218 vm_compressor_epoch_start = mach_absolute_time();
4219 }
4220 }
4221 #endif
4222 KDBG_FILTERED(0xe0400014 | DBG_FUNC_END);
4223
4224 KDBG_FILTERED(0xe0400018 | DBG_FUNC_START, q->pgo_laundry);
4225
4226 /* empty the entire content of the thread input q to local_q, but not more than local_batch_size pages */
4227 while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4228 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4229 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4230 VM_PAGE_CHECK(m);
4231
4232 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4233 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4234 m->vmp_laundry = FALSE;
4235
4236 m->vmp_snext = local_q;
4237 local_q = m;
4238 local_cnt++;
4239 }
4240 if (local_q == NULL) {
4241 break;
4242 }
4243
4244 q->pgo_busy = TRUE;
4245
4246 if ((pgo_draining = q->pgo_draining) == FALSE) {
4247 vm_pageout_throttle_up_batch(q, local_cnt);
4248 pages_left_on_q = q->pgo_laundry;
4249 } else {
4250 pages_left_on_q = q->pgo_laundry - local_cnt;
4251 }
4252
4253 vm_page_unlock_queues();
4254
4255 #if !RECORD_THE_COMPRESSED_DATA
4256 /* if we have lots to compress, wake up the other thread to help.
4257 * disabled when recording data since the recorded data is not protected by a mutex, so this may cause races */
4258 if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4259 // wake up the next compressor thread
4260 sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
4261 pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4262 }
4263 #endif
4264 KDBG_FILTERED(0xe0400018 | DBG_FUNC_END, q->pgo_laundry);
4265
4266 while (local_q) {
4267 KDBG_FILTERED(0xe0400024 | DBG_FUNC_START, local_cnt);
4268
4269 m = local_q;
4270 local_q = m->vmp_snext;
4271 m->vmp_snext = NULL;
4272
4273
4274 chead = vm_pageout_select_filling_chead(cq, m);
4275
4276 if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4277 #if DEVELOPMENT || DEBUG
4278 ncomps++;
4279 #endif
4280 KDBG_FILTERED(0xe0400024 | DBG_FUNC_END, local_cnt);
4281
4282 m->vmp_snext = local_freeq;
4283 local_freeq = m;
4284 local_freed++;
4285
4286 /* if we gathered enough free pages, free them now */
4287 if (local_freed >= MAX_FREE_BATCH) {
4288 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4289
4290 vm_page_free_list(local_freeq, TRUE);
4291
4292 local_freeq = NULL;
4293 local_freed = 0;
4294 }
4295 }
4296 #if DEVELOPMENT || DEBUG
4297 num_pages_processed++;
4298 #endif /* DEVELOPMENT || DEBUG */
4299 #if !CONFIG_JETSAM /* Maybe: if there's no JETSAM, be more proactive in waking up anybody that needs free pages */
4300 while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4301 kern_return_t wait_result;
4302 int need_wakeup = 0;
4303
4304 if (local_freeq) {
4305 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4306
4307 vm_page_free_list(local_freeq, TRUE);
4308 local_freeq = NULL;
4309 local_freed = 0;
4310
4311 continue;
4312 }
4313 vm_free_page_lock_spin();
4314
4315 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4316 if (vm_page_free_wanted_privileged++ == 0) {
4317 need_wakeup = 1;
4318 }
4319 wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4320
4321 vm_free_page_unlock();
4322
4323 if (need_wakeup) {
4324 thread_wakeup((event_t)&vm_page_free_wanted);
4325 }
4326
4327 if (wait_result == THREAD_WAITING) {
4328 thread_block(THREAD_CONTINUE_NULL);
4329 }
4330 } else {
4331 vm_free_page_unlock();
4332 }
4333 }
4334 #endif
4335 } /* while (local_q) */
4336 /* free any leftovers in the freeq */
4337 if (local_freeq) {
4338 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4339
4340 vm_page_free_list(local_freeq, TRUE);
4341 local_freeq = NULL;
4342 local_freed = 0;
4343 }
4344 if (pgo_draining == TRUE) {
4345 vm_page_lockspin_queues();
4346 vm_pageout_throttle_up_batch(q, local_cnt);
4347 vm_page_unlock_queues();
4348 }
4349 }
4350 KDBG_FILTERED(0xe040000c | DBG_FUNC_START);
4351
4352 /*
4353 * queue lock is held and our q is empty
4354 */
4355 q->pgo_busy = FALSE;
4356 #if DEVELOPMENT || DEBUG
4357 if (marked_active == TRUE) {
4358 vmct_active--;
4359 vmct_state[cq->id] = VMCT_IDLE;
4360
4361 if (vmct_active == 0) {
4362 vm_compressor_epoch_stop = mach_absolute_time();
4363 assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4364 "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4365 vm_compressor_epoch_start, vm_compressor_epoch_stop);
4366 /* This interval includes intervals where one or more
4367 * compressor threads were pre-empted
4368 */
4369 vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4370 }
4371 }
4372 if (compressor_running_perf_test && benchmark_accounting) {
4373 /*
4374 * compressor_running_perf_test could be turned ON while we are still
4375 * processing regular non-benchmark pages. We shouldn't count those here
4376 * or we could overshoot. We might also still be populating the benchmark
4377 * queue while under pressure, in which case we go back to the regular
4378 * queues and benchmark accounting stays off for that pass too.
4379 */
4380 compressor_perf_test_pages_processed += num_pages_processed;
4381 thread_wakeup(&compressor_perf_test_pages_processed);
4382 }
4383 #endif
4384 vm_page_unlock_queues();
4385 #if DEVELOPMENT || DEBUG
4386 vm_pageout_record_thread_time(cq->id, ncomps);
4387 #endif
4388
4389 KDBG_FILTERED(0xe0400018 | DBG_FUNC_END);
4390 #if DEVELOPMENT || DEBUG
4391 if (compressor_running_perf_test && benchmark_accounting) {
4392 /*
4393 * We've been exclusively compressing pages from the benchmark queue,
4394 * do 1 pass over the internal queue before blocking.
4395 */
4396 continue;
4397 }
4398 #endif
4399
4400 sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4401 }
4402 /*NOTREACHED*/
4403 }
4404
4405 /* resolves the pager and maintains stats in the pager and in the vm_object */
4406 kern_return_t
4407 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4408 {
4409 vm_object_t object;
4410 memory_object_t pager;
4411 int compressed_count_delta;
4412 kern_return_t retval;
4413
4414 object = VM_PAGE_OBJECT(m);
4415
4416 assert(!m->vmp_free_when_done);
4417 assert(!m->vmp_laundry);
4418
4419 pager = object->pager;
4420
4421 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4422 KDBG_FILTERED(0xe0400010 | DBG_FUNC_START, object, pager);
4423
4424 vm_object_lock(object);
4425
4426 /*
4427 * If there is no memory object for the page, create
4428 * one and hand it to the compression pager.
4429 */
4430
4431 if (!object->pager_initialized) {
4432 vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4433 }
4434 if (!object->pager_initialized) {
4435 vm_object_compressor_pager_create(object);
4436 }
4437
4438 pager = object->pager;
4439
4440 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4441 /*
4442 * Still no pager for the object,
4443 * or the pager has been destroyed.
4444 * Reactivate the page.
4445 *
4446 * Should only happen if there is no
4447 * compression pager
4448 */
4449 vm_page_wakeup_done(object, m);
4450
4451 vm_page_lockspin_queues();
4452 vm_page_activate(m);
4453 VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4454 vm_page_unlock_queues();
4455
4456 /*
4457 * And we are done with it.
4458 */
4459 vm_object_activity_end(object);
4460 vm_object_unlock(object);
4461
4462 return KERN_FAILURE;
4463 }
4464 vm_object_unlock(object);
4465
4466 KDBG_FILTERED(0xe0400010 | DBG_FUNC_END, object, pager);
4467 }
4468 assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4469 assert(object->activity_in_progress > 0);
4470
4471 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4472 if (m->vmp_unmodified_ro == true) {
4473 os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed);
4474 }
4475 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4476
4477 vm_compressor_options_t flags = 0;
4478
4479 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4480 if (m->vmp_unmodified_ro) {
4481 flags |= C_PAGE_UNMODIFIED;
4482 }
4483 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4484
4485
4486 retval = vm_compressor_pager_put(
4487 pager,
4488 m->vmp_offset + object->paging_offset,
4489 VM_PAGE_GET_PHYS_PAGE(m),
4490 current_chead,
4491 scratch_buf,
4492 &compressed_count_delta,
4493 flags);
4494
4495 vm_object_lock(object);
4496
4497 assert(object->activity_in_progress > 0);
4498 assert(VM_PAGE_OBJECT(m) == object);
4499 assert( !VM_PAGE_WIRED(m));
4500
4501 vm_compressor_pager_count(pager,
4502 compressed_count_delta,
4503 FALSE, /* shared_lock */
4504 object);
4505
4506 if (retval == KERN_SUCCESS) {
4507 /*
4508 * If the object is purgeable, its owner's
4509 * purgeable ledgers will be updated in
4510 * vm_page_remove() but the page still
4511 * contributes to the owner's memory footprint,
4512 * so account for it as such.
4513 */
4514 if (m->vmp_tabled) {
4515 vm_page_remove(m, TRUE);
4516 }
4517 if ((object->purgable != VM_PURGABLE_DENY ||
4518 object->vo_ledger_tag) &&
4519 object->vo_owner != NULL) {
4520 /* one more compressed purgeable/tagged page */
4521 vm_object_owner_compressed_update(object,
4522 compressed_count_delta);
4523 }
4524 counter_inc(&vm_statistics_compressions);
4525 } else {
4526 vm_page_wakeup_done(object, m);
4527
4528 vm_page_lockspin_queues();
4529
4530 vm_page_activate(m);
4531 vm_pageout_vminfo.vm_compressor_failed++;
4532
4533 vm_page_unlock_queues();
4534 }
4535 vm_object_activity_end(object);
4536 vm_object_unlock(object);
4537
4538 return retval;
4539 }
4540
4541
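/*
 * Switch the external pageout thread between the throttled and unthrottled
 * I/O tiers. The request is forced to "unthrottled" while hibernate cleaning
 * is in progress. Expects to be called with the page queues lock held.
 */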
4542 static void
4543 vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4544 {
4545 uint32_t policy;
4546
4547 if (hibernate_cleaning_in_progress == TRUE) {
4548 req_lowpriority = FALSE;
4549 }
4550
4551 if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4552 vm_page_unlock_queues();
4553
4554 if (req_lowpriority == TRUE) {
4555 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4556 DTRACE_VM(laundrythrottle);
4557 } else {
4558 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4559 DTRACE_VM(laundryunthrottle);
4560 }
4561 proc_set_thread_policy(ethr->pgo_iothread,
4562 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4563
4564 vm_page_lock_queues();
4565 ethr->q->pgo_lowpriority = req_lowpriority;
4566 }
4567 }
4568
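/*
 * One-time start-up for the external pageout thread: grant TH_OPT_VMPRIV,
 * start out in the throttled I/O tier, mark the external queue initialized,
 * then enter vm_pageout_iothread_external_continue() and never return.
 */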
4569 OS_NORETURN
4570 static void
4571 vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
4572 {
4573 thread_t self = current_thread();
4574
4575 self->options |= TH_OPT_VMPRIV;
4576
4577 DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4578
4579 proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4580 TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4581
4582 vm_page_lock_queues();
4583
4584 vm_pageout_queue_external.pgo_lowpriority = TRUE;
4585 vm_pageout_queue_external.pgo_inited = TRUE;
4586
4587 vm_page_unlock_queues();
4588
4589 #if CONFIG_THREAD_GROUPS
4590 thread_group_vm_add();
4591 #endif /* CONFIG_THREAD_GROUPS */
4592
4593 vm_pageout_iothread_external_continue(ethr, 0);
4594 /*NOTREACHED*/
4595 }
4596
4597
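/*
 * One-time start-up for a compressor thread: grant TH_OPT_VMPRIV, mark the
 * internal queue initialized, apply any single-processor or E-core binding,
 * then enter vm_pageout_iothread_internal_continue() and never return.
 */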
4598 OS_NORETURN
4599 static void
4600 vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
4601 {
4602 thread_t self = current_thread();
4603
4604 self->options |= TH_OPT_VMPRIV;
4605
4606 vm_page_lock_queues();
4607
4608 vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4609 vm_pageout_queue_internal.pgo_inited = TRUE;
4610
4611 #if DEVELOPMENT || DEBUG
4612 vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
4613 vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
4614 vm_pageout_queue_benchmark.pgo_busy = FALSE;
4615 #endif /* DEVELOPMENT || DEBUG */
4616
4617 vm_page_unlock_queues();
4618
4619 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4620 thread_vm_bind_group_add();
4621 }
4622
4623 #if CONFIG_THREAD_GROUPS
4624 thread_group_vm_add();
4625 #endif /* CONFIG_THREAD_GROUPS */
4626
4627 #if __AMP__
4628 if (vm_compressor_ebound) {
4629 /*
4630 * Use the soft bound option for vm_compressor to allow it to run on
4631 * P-cores if E-cluster is unavailable.
4632 */
4633 thread_soft_bind_cluster_type(self, 'E');
4634 }
4635 #endif /* __AMP__ */
4636
4637 thread_set_thread_name(current_thread(), "VM_compressor");
4638 #if DEVELOPMENT || DEBUG
4639 vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
4640 #endif
4641 vm_pageout_iothread_internal_continue(cthr, 0);
4642
4643 /*NOTREACHED*/
4644 }
4645
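/*
 * Register the buffer-cache cleanup callout used by the garbage-collect
 * thread (see consider_buffer_cache_collect). The callout can only be
 * installed once; subsequent attempts return KERN_FAILURE.
 */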
4646 kern_return_t
4647 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4648 {
4649 if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4650 return KERN_SUCCESS;
4651 } else {
4652 return KERN_FAILURE; /* Already set */
4653 }
4654 }
4655
4656 extern boolean_t memorystatus_manual_testing_on;
4657 extern unsigned int memorystatus_level;
4658
4659
4660 #if VM_PRESSURE_EVENTS
4661
4662 boolean_t vm_pressure_events_enabled = FALSE;
4663
4664 extern uint64_t next_warning_notification_sent_at_ts;
4665 extern uint64_t next_critical_notification_sent_at_ts;
4666
4667 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */
4668
4669 /*
4670 * The last time there was a change in pressure level OR we forced a check
4671 * because the system is stuck in a non-normal pressure level.
4672 */
4673 uint64_t vm_pressure_last_level_transition_abs = 0;
4674
4675 /*
4676 * This is how long the system waits 'stuck' in an unchanged non-normal pressure
4677 * level before resending notifications for that level.
4678 */
4679 int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4680
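/*
 * Recompute memorystatus_level from the currently available page count and
 * walk the pressure-level state machine. On a level change (or when the
 * system has been stuck at a non-normal level for longer than
 * vm_pressure_level_transition_threshold) wake the pressure thread so that
 * notifications are sent again.
 */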
4681 void
4682 vm_pressure_response(void)
4683 {
4684 vm_pressure_level_t old_level = kVMPressureNormal;
4685 int new_level = -1;
4686 unsigned int total_pages;
4687 uint64_t available_memory = 0;
4688 uint64_t curr_ts, abs_time_since_level_transition, time_in_ns;
4689 bool force_check = false;
4690 int time_in_mins;
4691
4692
4693 if (vm_pressure_events_enabled == FALSE) {
4694 return;
4695 }
4696
4697 available_memory = (uint64_t) memorystatus_get_available_page_count();
4698
4699 total_pages = (unsigned int) atop_64(max_mem);
4700 #if CONFIG_SECLUDED_MEMORY
4701 total_pages -= vm_page_secluded_count;
4702 #endif /* CONFIG_SECLUDED_MEMORY */
4703 memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4704
4705 if (memorystatus_manual_testing_on) {
4706 return;
4707 }
4708
4709 curr_ts = mach_absolute_time();
4710 abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4711
4712 absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4713 time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4714 force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
4715
4716 old_level = memorystatus_vm_pressure_level;
4717
4718 switch (memorystatus_vm_pressure_level) {
4719 case kVMPressureNormal:
4720 {
4721 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4722 new_level = kVMPressureCritical;
4723 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4724 new_level = kVMPressureWarning;
4725 }
4726 break;
4727 }
4728
4729 case kVMPressureWarning:
4730 case kVMPressureUrgent:
4731 {
4732 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4733 new_level = kVMPressureNormal;
4734 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4735 new_level = kVMPressureCritical;
4736 } else if (force_check) {
4737 new_level = kVMPressureWarning;
4738 next_warning_notification_sent_at_ts = curr_ts;
4739 }
4740 break;
4741 }
4742
4743 case kVMPressureCritical:
4744 {
4745 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4746 new_level = kVMPressureNormal;
4747 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4748 new_level = kVMPressureWarning;
4749 } else if (force_check) {
4750 new_level = kVMPressureCritical;
4751 next_critical_notification_sent_at_ts = curr_ts;
4752 }
4753 break;
4754 }
4755
4756 default:
4757 return;
4758 }
4759
4760 if (new_level != -1 || force_check) {
4761 if (new_level != -1) {
4762 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4763
4764 if (new_level != (int) old_level) {
4765 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4766 new_level, old_level, 0, 0);
4767 }
4768 } else {
4769 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4770 new_level, old_level, force_check, 0);
4771 }
4772
4773 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
4774 /*
4775 * We don't want to schedule a wakeup while hibernation is in progress
4776 * because that could collide with checks for non-monotonicity in the scheduler.
4777 * We do however do all the updates to memorystatus_vm_pressure_level because
4778 * we _might_ want to use that for decisions regarding which pages or how
4779 * many pages we want to dump in hibernation.
4780 */
4781 return;
4782 }
4783
4784 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4785 if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4786 thread_wakeup(&vm_pressure_thread);
4787 }
4788
4789 if (old_level != memorystatus_vm_pressure_level) {
4790 thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4791 }
4792 vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4793 }
4794 }
4795 }
4796 #endif /* VM_PRESSURE_EVENTS */
4797
4798
4799 /**
4800 * Called by a kernel thread to ask if a number of pages may be wired.
4801 */
4802 kern_return_t
4803 mach_vm_wire_level_monitor(int64_t requested_pages)
4804 {
4805 if (requested_pages <= 0) {
4806 return KERN_INVALID_ARGUMENT;
4807 }
4808
4809 const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4810 /**
4811 * Available pages can be negative in the case where more system memory is
4812 * wired than the threshold, so we must use a signed integer.
4813 */
4814 const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4815
4816 if (requested_pages > available_pages) {
4817 return KERN_RESOURCE_SHORTAGE;
4818 }
4819 return KERN_SUCCESS;
4820 }
4821
4822 /*
4823 * Function called by a kernel thread to either get the current pressure level or
4824 * wait until memory pressure changes from a given level.
4825 */
4826 kern_return_t
4827 mach_vm_pressure_level_monitor(boolean_t wait_for_pressure, unsigned int *pressure_level)
4828 {
4829 #if !VM_PRESSURE_EVENTS
4830 (void)wait_for_pressure;
4831 (void)pressure_level;
4832 return KERN_NOT_SUPPORTED;
4833 #else /* VM_PRESSURE_EVENTS */
4834
4835 uint32_t *waiters = NULL;
4836 wait_result_t wr = 0;
4837 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4838
4839 if (pressure_level == NULL) {
4840 return KERN_INVALID_ARGUMENT;
4841 }
4842 if (!wait_for_pressure && (*pressure_level == kVMPressureBackgroundJetsam ||
4843 *pressure_level == kVMPressureForegroundJetsam)) {
4844 return KERN_INVALID_ARGUMENT;
4845 }
4846
4847 if (wait_for_pressure) {
4848 switch (*pressure_level) {
4849 case kVMPressureForegroundJetsam:
4850 case kVMPressureBackgroundJetsam:
4851
4852 if (*pressure_level == kVMPressureForegroundJetsam) {
4853 waiters = &memorystatus_jetsam_fg_band_waiters;
4854 } else {
4855 /* kVMPressureBackgroundJetsam */
4856 waiters = &memorystatus_jetsam_bg_band_waiters;
4857 }
4858
4859 lck_mtx_lock(&memorystatus_jetsam_broadcast_lock);
4860 wr = assert_wait((event_t)waiters, THREAD_INTERRUPTIBLE);
4861 if (wr == THREAD_WAITING) {
4862 *waiters += 1;
4863 lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4864 wr = thread_block(THREAD_CONTINUE_NULL);
4865 } else {
4866 lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4867 }
4868
4869 if (wr != THREAD_AWAKENED) {
4870 return KERN_ABORTED;
4871 }
4872
4873 return KERN_SUCCESS;
4874 case kVMPressureNormal:
4875 case kVMPressureWarning:
4876 case kVMPressureUrgent:
4877 case kVMPressureCritical:
4878 while (old_level == *pressure_level) {
4879 wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4880 THREAD_INTERRUPTIBLE);
4881 if (wr == THREAD_WAITING) {
4882 wr = thread_block(THREAD_CONTINUE_NULL);
4883 }
4884 if (wr == THREAD_INTERRUPTED) {
4885 return KERN_ABORTED;
4886 }
4887
4888 if (wr == THREAD_AWAKENED) {
4889 old_level = memorystatus_vm_pressure_level;
4890 }
4891 }
4892 break;
4893 default:
4894 return KERN_INVALID_ARGUMENT;
4895 }
4896 }
4897
4898 *pressure_level = old_level;
4899 return KERN_SUCCESS;
4900 #endif /* VM_PRESSURE_EVENTS */
4901 }
4902
4903 #if VM_PRESSURE_EVENTS
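/*
 * Continuation-style thread that delivers pressure notifications: each time
 * it is woken it calls consider_vm_pressure_events() and then blocks again
 * on &vm_pressure_thread. The first invocation only performs setup.
 */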
4904 void
4905 vm_pressure_thread(void)
4906 {
4907 static boolean_t thread_initialized = FALSE;
4908
4909 if (thread_initialized == TRUE) {
4910 vm_pageout_state.vm_pressure_thread_running = TRUE;
4911 consider_vm_pressure_events();
4912 vm_pageout_state.vm_pressure_thread_running = FALSE;
4913 }
4914
4915 #if CONFIG_THREAD_GROUPS
4916 thread_group_vm_add();
4917 #endif /* CONFIG_THREAD_GROUPS */
4918
4919 thread_set_thread_name(current_thread(), "VM_pressure");
4920 thread_initialized = TRUE;
4921 assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4922 thread_block((thread_continue_t)vm_pressure_thread);
4923 }
4924 #endif /* VM_PRESSURE_EVENTS */
4925
4926
4927 /*
4928 * called once per second via "compute_averages"
4929 */
4930 void
4931 compute_pageout_gc_throttle(__unused void *arg)
4932 {
4933 if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4934 vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4935 sched_cond_signal(&vm_pageout_gc_cond, vm_pageout_gc_thread);
4936 }
4937 }
4938
4939 /*
4940 * vm_pageout_garbage_collect can also be called when the zone allocator needs
4941 * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4942 * jetsams. We need to check if the zone map size is above its jetsam limit to
4943 * decide if this was indeed the case.
4944 *
4945 * We need to do this on a different thread because of the following reasons:
4946 *
4947 * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4948 * itself causing the system to hang. We perform synchronous jetsams if we're
4949 * leaking in the VM map entries zone, so the leaking process could be doing a
4950 * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4951 * jetsam itself. We also need the vm_map lock on the process termination path,
4952 * which would now lead the dying process to deadlock against itself.
4953 *
4954 * 2. The jetsam path might need to allocate zone memory itself. We could try
4955 * using the non-blocking variant of zalloc for this path, but we can still
4956 * end up trying to do a kmem_alloc when the zone maps are almost full.
4957 */
4958 __dead2
4959 void
4960 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4961 {
4962 assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4963
4964 if (step != VM_PAGEOUT_GC_INIT) {
4965 sched_cond_ack(&vm_pageout_gc_cond);
4966 }
4967
4968 while (true) {
4969 if (step == VM_PAGEOUT_GC_INIT) {
4970 /* first time being called is not about GC */
4971 #if CONFIG_THREAD_GROUPS
4972 thread_group_vm_add();
4973 #endif /* CONFIG_THREAD_GROUPS */
4974 step = VM_PAGEOUT_GC_COLLECT;
4975 } else if (zone_map_nearing_exhaustion()) {
4976 /*
4977 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4978 *
4979 * Bail out after calling zone_gc (which triggers the
4980 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4981 * operations that clear out a bunch of caches might allocate zone
4982 * memory themselves (e.g. vm_map operations would need VM map
4983 * entries). Since the zone map is almost full at this point, we
4984 * could end up with a panic. We just need to quickly jetsam a
4985 * process and exit here.
4986 *
4987 * It could so happen that we were woken up to relieve memory
4988 * pressure and the zone map also happened to be near its limit at
4989 * the time, in which case we'll skip out early. But that should be
4990 * ok; if memory pressure persists, the thread will simply be woken
4991 * up again.
4992 */
4993
4994 zone_gc(ZONE_GC_JETSAM);
4995 } else {
4996 /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4997 boolean_t buf_large_zfree = FALSE;
4998 boolean_t first_try = TRUE;
4999
5000 stack_collect();
5001
5002 consider_machine_collect();
5003 #if CONFIG_DEFERRED_RECLAIM
5004 vm_deferred_reclamation_gc(RECLAIM_GC_TRIM, RECLAIM_OPTIONS_NONE);
5005 #endif /* CONFIG_DEFERRED_RECLAIM */
5006 #if CONFIG_MBUF_MCACHE
5007 mbuf_drain(FALSE);
5008 #endif /* CONFIG_MBUF_MCACHE */
5009
5010 do {
5011 if (consider_buffer_cache_collect != NULL) {
5012 buf_large_zfree = (*consider_buffer_cache_collect)(0);
5013 }
5014 if (first_try == TRUE || buf_large_zfree == TRUE) {
5015 /*
5016 * zone_gc should be last, because the other operations
5017 * might return memory to zones.
5018 */
5019 zone_gc(ZONE_GC_TRIM);
5020 }
5021 first_try = FALSE;
5022 } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
5023
5024 consider_machine_adjust();
5025 }
5026
5027 sched_cond_wait_parameter(&vm_pageout_gc_cond, THREAD_UNINT, vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
5028 }
5029 __builtin_unreachable();
5030 }
5031
5032
5033 #if VM_PAGE_BUCKETS_CHECK
5034 #if VM_PAGE_FAKE_BUCKETS
5035 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
5036 #endif /* VM_PAGE_FAKE_BUCKETS */
5037 #endif /* VM_PAGE_BUCKETS_CHECK */
5038
5039
5040
5041 void
5042 vm_set_restrictions(unsigned int num_cpus)
5043 {
5044 int vm_restricted_to_single_processor = 0;
5045
5046 if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
5047 kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
5048 vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
5049 } else {
5050 assert(num_cpus > 0);
5051
5052 if (num_cpus <= 3) {
5053 /*
5054 * on systems with a limited number of CPUs, bind the
5055 * 4 major threads that can free memory and that tend to use
5056 * a fair bit of CPU under pressured conditions to a single processor.
5057 * This ensures that these threads don't hog all of the available CPUs
5058 * (important for camera launch), while allowing them to run independently
5059 * with respect to locks... the 4 threads are
5060 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
5061 * vm_compressor_swap_trigger_thread (minor and major compactions),
5062 * memorystatus_thread (jetsams).
5063 *
5064 * the first time the thread is run, it is responsible for checking the
5065 * state of vm_restricted_to_single_processor, and if TRUE it calls
5066 * thread_bind_master... someday this should be replaced with a group
5067 * scheduling mechanism and KPI.
5068 */
5069 vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5070 } else {
5071 vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5072 }
5073 }
5074 }
5075
5076 /*
5077 * Set up vm_config based on the vm_compressor_mode.
5078 * Must run BEFORE the pageout thread starts up.
5079 */
5080 __startup_func
5081 void
5082 vm_config_init(void)
5083 {
5084 bzero(&vm_config, sizeof(vm_config));
5085
5086 switch (vm_compressor_mode) {
5087 case VM_PAGER_DEFAULT:
5088 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
5089 OS_FALLTHROUGH;
5090
5091 case VM_PAGER_COMPRESSOR_WITH_SWAP:
5092 vm_config.compressor_is_present = TRUE;
5093 vm_config.swap_is_present = TRUE;
5094 vm_config.compressor_is_active = TRUE;
5095 vm_config.swap_is_active = TRUE;
5096 break;
5097
5098 case VM_PAGER_COMPRESSOR_NO_SWAP:
5099 vm_config.compressor_is_present = TRUE;
5100 vm_config.swap_is_present = TRUE;
5101 vm_config.compressor_is_active = TRUE;
5102 break;
5103
5104 case VM_PAGER_FREEZER_DEFAULT:
5105 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5106 OS_FALLTHROUGH;
5107
5108 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5109 vm_config.compressor_is_present = TRUE;
5110 vm_config.swap_is_present = TRUE;
5111 break;
5112
5113 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5114 vm_config.compressor_is_present = TRUE;
5115 vm_config.swap_is_present = TRUE;
5116 vm_config.compressor_is_active = TRUE;
5117 vm_config.freezer_swap_is_active = TRUE;
5118 break;
5119
5120 case VM_PAGER_NOT_CONFIGURED:
5121 break;
5122
5123 default:
5124 printf("unknown compressor mode - %x\n", vm_compressor_mode);
5125 break;
5126 }
5127 }
5128
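/*
 * Create the garbage-collect thread at early boot and record its initial
 * kernel stack as a reserved stack (see vm_pageout(), where the thread is
 * eventually started).
 */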
5129 __startup_func
5130 static void
5131 vm_pageout_create_gc_thread(void)
5132 {
5133 thread_t thread;
5134
5135 sched_cond_init(&vm_pageout_gc_cond);
5136 if (kernel_thread_create(vm_pageout_garbage_collect,
5137 VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
5138 panic("vm_pageout_garbage_collect: create failed");
5139 }
5140 thread_set_thread_name(thread, "VM_pageout_garbage_collect");
5141 if (thread->reserved_stack == 0) {
5142 assert(thread->kernel_stack);
5143 thread->reserved_stack = thread->kernel_stack;
5144 }
5145
5146 /* thread is started in vm_pageout() */
5147 vm_pageout_gc_thread = thread;
5148 }
5149 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5150
5151 void
5152 vm_pageout(void)
5153 {
5154 thread_t self = current_thread();
5155 thread_t thread;
5156 kern_return_t result;
5157 spl_t s;
5158
5159 /*
5160 * Set thread privileges.
5161 */
5162 s = splsched();
5163
5164 #if CONFIG_VPS_DYNAMIC_PRIO
5165 if (vps_dynamic_priority_enabled) {
5166 sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
5167 thread_set_eager_preempt(self);
5168 } else {
5169 sched_set_kernel_thread_priority(self, BASEPRI_VM);
5170 }
5171 #else /* CONFIG_VPS_DYNAMIC_PRIO */
5172 sched_set_kernel_thread_priority(self, BASEPRI_VM);
5173 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
5174
5175 thread_lock(self);
5176 self->options |= TH_OPT_VMPRIV;
5177 thread_unlock(self);
5178
5179 if (!self->reserved_stack) {
5180 self->reserved_stack = self->kernel_stack;
5181 }
5182
5183 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
5184 !vps_dynamic_priority_enabled) {
5185 thread_vm_bind_group_add();
5186 }
5187
5188
5189 #if CONFIG_THREAD_GROUPS
5190 thread_group_vm_add();
5191 #endif /* CONFIG_THREAD_GROUPS */
5192
5193 #if __AMP__
5194 PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
5195 if (vm_pgo_pbound) {
5196 /*
5197 * Use the soft bound option for vm pageout to allow it to run on
5198 * E-cores if P-cluster is unavailable.
5199 */
5200 thread_soft_bind_cluster_type(self, 'P');
5201 }
5202 #endif /* __AMP__ */
5203
5204 PE_parse_boot_argn("vmpgo_protect_realtime",
5205 &vm_pageout_protect_realtime,
5206 sizeof(vm_pageout_protect_realtime));
5207 splx(s);
5208
5209 thread_set_thread_name(current_thread(), "VM_pageout_scan");
5210
5211 /*
5212 * Initialize some paging parameters.
5213 */
5214
5215 vm_pageout_state.vm_pressure_thread_running = FALSE;
5216 vm_pageout_state.vm_pressure_changed = FALSE;
5217 vm_pageout_state.memorystatus_purge_on_warning = 2;
5218 vm_pageout_state.memorystatus_purge_on_urgent = 5;
5219 vm_pageout_state.memorystatus_purge_on_critical = 8;
5220 vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
5221 vm_pageout_state.vm_page_speculative_percentage = 5;
5222 vm_pageout_state.vm_page_speculative_target = 0;
5223
5224 vm_pageout_state.vm_pageout_swap_wait = 0;
5225 vm_pageout_state.vm_pageout_idle_wait = 0;
5226 vm_pageout_state.vm_pageout_empty_wait = 0;
5227 vm_pageout_state.vm_pageout_burst_wait = 0;
5228 vm_pageout_state.vm_pageout_deadlock_wait = 0;
5229 vm_pageout_state.vm_pageout_deadlock_relief = 0;
5230 vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
5231
5232 vm_pageout_state.vm_pageout_inactive = 0;
5233 vm_pageout_state.vm_pageout_inactive_used = 0;
5234 vm_pageout_state.vm_pageout_inactive_clean = 0;
5235
5236 vm_pageout_state.vm_memory_pressure = 0;
5237 vm_pageout_state.vm_page_filecache_min = 0;
5238 #if CONFIG_JETSAM
5239 vm_pageout_state.vm_page_filecache_min_divisor = 70;
5240 vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
5241 #else
5242 vm_pageout_state.vm_page_filecache_min_divisor = 27;
5243 vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
5244 #endif
5245 vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
5246
5247 vm_pageout_state.vm_pageout_considered_page_last = 0;
5248
5249 if (vm_pageout_state.vm_pageout_swap_wait == 0) {
5250 vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
5251 }
5252
5253 if (vm_pageout_state.vm_pageout_idle_wait == 0) {
5254 vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
5255 }
5256
5257 if (vm_pageout_state.vm_pageout_burst_wait == 0) {
5258 vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
5259 }
5260
5261 if (vm_pageout_state.vm_pageout_empty_wait == 0) {
5262 vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
5263 }
5264
5265 if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
5266 vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
5267 }
5268
5269 if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
5270 vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
5271 }
5272
5273 if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
5274 vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
5275 }
5276 /*
5277 * even if we've already called vm_page_free_reserve
5278 * call it again here to ensure that the targets are
5279 * accurately calculated (it uses vm_page_free_count_init)
5280 * calling it with an arg of 0 will not change the reserve
5281 * but will re-calculate free_min and free_target
5282 */
5283 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
5284 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
5285 } else {
5286 vm_page_free_reserve(0);
5287 }
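/*
 * Illustration of the adjustment above, with hypothetical numbers: if
 * VM_PAGE_FREE_RESERVED(processor_count) works out to 800 pages while
 * vm_page_free_reserved currently stands at 500, we pass 300 and the reserve
 * is raised to the target; otherwise we pass 0, which leaves the reserve
 * unchanged but still re-derives free_min and free_target from
 * vm_page_free_count_init.
 */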
5288
5289 bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
5290 bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5291
5292 vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5293 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5294
5295 vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5296
5297 #if DEVELOPMENT || DEBUG
5298 bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
5299 vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
5300 #endif /* DEVELOPMENT || DEBUG */
5301
5302
5303 /* the internal pageout threads are started when the default pager is registered for the first time */
5304 /* external pageout and garbage collection threads started here */
5305 struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
5306 ethr->id = 0;
5307 ethr->q = &vm_pageout_queue_external;
5308 /* in external_state these chead pointers are never used; they are only used in internal_state, by the compressor */
5309 ethr->current_early_swapout_chead = NULL;
5310 ethr->current_regular_swapout_chead = NULL;
5311 ethr->current_late_swapout_chead = NULL;
5312 ethr->scratch_buf = NULL;
5313 #if DEVELOPMENT || DEBUG
5314 ethr->benchmark_q = NULL;
5315 #endif /* DEVELOPMENT || DEBUG */
5316 sched_cond_init(&(ethr->pgo_wakeup));
5317
5318 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
5319 (void *)ethr, BASEPRI_VM,
5320 &(ethr->pgo_iothread));
5321 if (result != KERN_SUCCESS) {
5322 panic("vm_pageout: Unable to create external thread (%d)\n", result);
5323 }
5324 thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");
5325
5326 thread_mtx_lock(vm_pageout_gc_thread);
5327 thread_start(vm_pageout_gc_thread);
5328 thread_mtx_unlock(vm_pageout_gc_thread);
5329
5330 #if VM_PRESSURE_EVENTS
5331 result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5332 BASEPRI_DEFAULT,
5333 &thread);
5334
5335 if (result != KERN_SUCCESS) {
5336 panic("vm_pressure_thread: create failed");
5337 }
5338
5339 thread_deallocate(thread);
5340 #endif
5341
5342 vm_object_reaper_init();
5343
5344
5345 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5346 vm_compressor_init();
5347 }
5348
5349 #if VM_PRESSURE_EVENTS
5350 vm_pressure_events_enabled = TRUE;
5351 #endif /* VM_PRESSURE_EVENTS */
5352
5353 #if CONFIG_PHANTOM_CACHE
5354 vm_phantom_cache_init();
5355 #endif
5356 #if VM_PAGE_BUCKETS_CHECK
5357 #if VM_PAGE_FAKE_BUCKETS
5358 printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5359 (uint64_t) vm_page_fake_buckets_start,
5360 (uint64_t) vm_page_fake_buckets_end);
5361 pmap_protect(kernel_pmap,
5362 vm_page_fake_buckets_start,
5363 vm_page_fake_buckets_end,
5364 VM_PROT_READ);
5365 // *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
5366 #endif /* VM_PAGE_FAKE_BUCKETS */
5367 #endif /* VM_PAGE_BUCKETS_CHECK */
5368
5369 #if VM_OBJECT_TRACKING
5370 vm_object_tracking_init();
5371 #endif /* VM_OBJECT_TRACKING */
5372
5373 #if __arm64__
5374 // vm_tests();
5375 #endif /* __arm64__ */
5376
5377 vm_pageout_continue();
5378
5379 /*
5380 * Unreached code!
5381 *
5382 * The vm_pageout_continue() call above never returns, so the code below is never
5383 * executed. We take advantage of this to declare several DTrace VM related probe
5384 * points that our kernel doesn't have an analog for. These are probe points that
5385 * exist in Solaris and are in the DTrace documentation, so people may have written
5386 * scripts that use them. Declaring the probe points here means their scripts will
5387 * compile and execute which we want for portability of the scripts, but since this
5388 * section of code is never reached, the probe points will simply never fire. Yes,
5389 * this is basically a hack. The problem is the DTrace probe points were chosen with
5390 * Solaris specific VM events in mind, not portability to different VM implementations.
5391 */
5392
5393 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5394 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5395 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5396 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5397 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5398 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5399 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5400 /*NOTREACHED*/
5401 }
5402
5403
5404
5405 kern_return_t
5406 vm_pageout_internal_start(void)
5407 {
5408 kern_return_t result = KERN_SUCCESS;
5409 host_basic_info_data_t hinfo;
5410 vm_offset_t buf, bufsize;
5411
5412 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5413
5414 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5415 #define BSD_HOST 1
5416 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5417
5418 assert(hinfo.max_cpus > 0);
5419
5420 #if !XNU_TARGET_OS_OSX
5421 vm_pageout_state.vm_compressor_thread_count = 1;
5422 #else /* !XNU_TARGET_OS_OSX */
5423 if (hinfo.max_cpus > 4) {
5424 vm_pageout_state.vm_compressor_thread_count = 2;
5425 } else {
5426 vm_pageout_state.vm_compressor_thread_count = 1;
5427 }
5428 #endif /* !XNU_TARGET_OS_OSX */
5429 #if __AMP__
5430 if (vm_compressor_ebound) {
5431 vm_pageout_state.vm_compressor_thread_count = 2;
5432 }
5433 #endif
5434 PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5435 sizeof(vm_pageout_state.vm_compressor_thread_count));
5436
5437 /* did the boot-args give us an unreasonable number of threads? */
5438 if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5439 vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5440 }
5441 if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5442 vm_pageout_state.vm_compressor_thread_count = 1;
5443 } else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5444 vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5445 }
5446
5447 vm_pageout_queue_internal.pgo_maxlaundry =
5448 (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5449
5450 PE_parse_boot_argn("vmpgoi_maxlaundry",
5451 &vm_pageout_queue_internal.pgo_maxlaundry,
5452 sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5453
5454 #if DEVELOPMENT || DEBUG
5455 // Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
5456 vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
5457 #endif /* DEVELOPMENT || DEBUG */
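/*
 * Example of the default sizing above: with vm_compressor_thread_count == 2
 * and no "vmpgoi_maxlaundry" boot-arg, pgo_maxlaundry ends up at
 * (2 * 4) * VM_PAGE_LAUNDRY_MAX, i.e. roughly 8 laundry batches worth of
 * pages may be queued to the compressor threads at once.
 */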
5458
5459 bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5460
5461 kmem_alloc(kernel_map, &buf,
5462 bufsize * vm_pageout_state.vm_compressor_thread_count,
5463 KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
5464 VM_KERN_MEMORY_COMPRESSOR);
5465
5466 for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5467 struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
5468 iq->id = i;
5469 iq->q = &vm_pageout_queue_internal;
5470 iq->current_early_swapout_chead = NULL;
5471 iq->current_regular_swapout_chead = NULL;
5472 iq->current_late_swapout_chead = NULL;
5473 iq->scratch_buf = (char *)(buf + i * bufsize);
5474 #if DEVELOPMENT || DEBUG
5475 iq->benchmark_q = &vm_pageout_queue_benchmark;
5476 #endif /* DEVELOPMENT || DEBUG */
5477 sched_cond_init(&(iq->pgo_wakeup));
5478 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5479 (void *)iq, BASEPRI_VM,
5480 &(iq->pgo_iothread));
5481
5482 if (result != KERN_SUCCESS) {
5483 panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
5484 }
5485 }
5486 return result;
5487 }
5488
5489 #if CONFIG_IOSCHED
5490 /*
5491 * To support I/O Expedite for compressed files we mark the upls with special flags.
5492 * The way decmpfs works is that we create a big upl which marks all the pages needed to
5493 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5494 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5495 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5496 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5497 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5498 * by the req upl lock (the reverse link doesnt need synch. since we never inspect this link
5499 * unless the real I/O upl is being destroyed).
5500 */
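/*
 * Rough sketch of the linkage described above:
 *
 *   request UPL (UPL_DECMP_REQ)  --decmp_io_upl-->  real-I/O UPL (UPL_DECMP_REAL_IO)
 *   request UPL                  <--decmp_io_upl--  real-I/O UPL
 *
 * At most one real-I/O UPL is linked at a time; the forward link is set and
 * cleared under the request UPL's lock (upl_set_decmp_info() / upl_destroy()),
 * while the back link is only read when the real-I/O UPL is torn down, so it
 * needs no additional synchronization.
 */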
5501
5502
5503 static void
5504 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5505 {
5506 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5507
5508 upl_lock(src_upl);
5509 if (src_upl->decmp_io_upl) {
5510 /*
5511 * If there is already an alive real I/O UPL, ignore this new UPL.
5512 * This case should rarely happen and even if it does, it just means
5513 * that we might issue a spurious expedite which the driver is expected
5514 * to handle.
5515 */
5516 upl_unlock(src_upl);
5517 return;
5518 }
5519 src_upl->decmp_io_upl = (void *)upl;
5520 src_upl->ref_count++;
5521
5522 upl->flags |= UPL_DECMP_REAL_IO;
5523 upl->decmp_io_upl = (void *)src_upl;
5524 upl_unlock(src_upl);
5525 }
5526 #endif /* CONFIG_IOSCHED */
5527
5528 #if UPL_DEBUG
5529 int upl_debug_enabled = 1;
5530 #else
5531 int upl_debug_enabled = 0;
5532 #endif
5533
5534 static upl_t
5535 upl_create(int type, int flags, upl_size_t size)
5536 {
5537 uint32_t pages = (uint32_t)atop(round_page_32(size));
5538 upl_t upl;
5539
5540 assert(page_aligned(size));
5541
5542 /*
5543 * FIXME: this code assumes the allocation always succeeds,
5544 * but `pages` can be as large as MAX_UPL_SIZE.
5545 *
5546 * The allocation size exceeds 32k (resp. 128k)
5547 * on 16k-page (resp. 4k-page) systems, which kalloc
5548 * might fail to satisfy.
5549 */
5550 upl = kalloc_type(struct upl, struct upl_page_info,
5551 (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
5552 if (type & UPL_CREATE_INTERNAL) {
5553 flags |= UPL_INTERNAL;
5554 }
5555
5556 if (type & UPL_CREATE_LITE) {
5557 flags |= UPL_LITE;
5558 if (pages) {
5559 upl->lite_list = bitmap_alloc(pages);
5560 }
5561 }
5562
5563 upl->flags = flags;
5564 upl->ref_count = 1;
5565 upl_lock_init(upl);
5566 #if CONFIG_IOSCHED
5567 if (type & UPL_CREATE_IO_TRACKING) {
5568 upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5569 }
5570
5571 if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5572 /* Only support expedite on internal UPLs */
5573 thread_t curthread = current_thread();
5574 upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
5575 Z_WAITOK | Z_ZERO);
5576 upl->flags |= UPL_EXPEDITE_SUPPORTED;
5577 if (curthread->decmp_upl != NULL) {
5578 upl_set_decmp_info(upl, curthread->decmp_upl);
5579 }
5580 }
5581 #endif
5582 #if CONFIG_IOSCHED || UPL_DEBUG
5583 if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5584 upl->upl_creator = current_thread();
5585 upl->flags |= UPL_TRACKED_BY_OBJECT;
5586 }
5587 #endif
5588
5589 #if UPL_DEBUG
5590 upl->upl_create_btref = btref_get(__builtin_frame_address(0), 0);
5591 #endif /* UPL_DEBUG */
5592
5593 return upl;
5594 }
5595
5596 static void
5597 upl_destroy(upl_t upl)
5598 {
5599 uint32_t pages;
5600
5601 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5602
5603 if (upl->ext_ref_count) {
5604 panic("upl(%p) ext_ref_count", upl);
5605 }
5606
5607 #if CONFIG_IOSCHED
5608 if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5609 upl_t src_upl;
5610 src_upl = upl->decmp_io_upl;
5611 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5612 upl_lock(src_upl);
5613 src_upl->decmp_io_upl = NULL;
5614 upl_unlock(src_upl);
5615 upl_deallocate(src_upl);
5616 }
5617 #endif /* CONFIG_IOSCHED */
5618
5619 #if CONFIG_IOSCHED || UPL_DEBUG
5620 if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5621 !(upl->flags & UPL_VECTOR)) {
5622 vm_object_t object;
5623
5624 if (upl->flags & UPL_SHADOWED) {
5625 object = upl->map_object->shadow;
5626 } else {
5627 object = upl->map_object;
5628 }
5629
5630 vm_object_lock(object);
5631 queue_remove(&object->uplq, upl, upl_t, uplq);
5632 vm_object_activity_end(object);
5633 vm_object_collapse(object, 0, TRUE);
5634 vm_object_unlock(object);
5635 }
5636 #endif
5637 /*
5638 * drop a reference on the map_object whether or
5639 * not a pageout object is inserted
5640 */
5641 if (upl->flags & UPL_SHADOWED) {
5642 vm_object_deallocate(upl->map_object);
5643 }
5644
5645 if (upl->flags & UPL_DEVICE_MEMORY) {
5646 pages = 1;
5647 } else {
5648 pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
5649 }
5650
5651 upl_lock_destroy(upl);
5652
5653 #if CONFIG_IOSCHED
5654 if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5655 kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
5656 }
5657 #endif
5658
5659 #if UPL_DEBUG
5660 for (int i = 0; i < upl->upl_commit_index; i++) {
5661 btref_put(upl->upl_commit_records[i].c_btref);
5662 }
5663 btref_put(upl->upl_create_btref);
5664 #endif /* UPL_DEBUG */
5665
5666 if ((upl->flags & UPL_LITE) && pages) {
5667 bitmap_free(upl->lite_list, pages);
5668 }
5669 kfree_type(struct upl, struct upl_page_info,
5670 (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
5671 }
5672
5673 void
5674 upl_deallocate(upl_t upl)
5675 {
5676 upl_lock(upl);
5677
5678 if (--upl->ref_count == 0) {
5679 if (vector_upl_is_valid(upl)) {
5680 vector_upl_deallocate(upl);
5681 }
5682 upl_unlock(upl);
5683
5684 if (upl->upl_iodone) {
5685 upl_callout_iodone(upl);
5686 }
5687
5688 upl_destroy(upl);
5689 } else {
5690 upl_unlock(upl);
5691 }
5692 }
5693
5694 #if CONFIG_IOSCHED
5695 void
5696 upl_mark_decmp(upl_t upl)
5697 {
5698 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5699 upl->flags |= UPL_DECMP_REQ;
5700 upl->upl_creator->decmp_upl = (void *)upl;
5701 }
5702 }
5703
5704 void
5705 upl_unmark_decmp(upl_t upl)
5706 {
5707 if (upl && (upl->flags & UPL_DECMP_REQ)) {
5708 upl->upl_creator->decmp_upl = NULL;
5709 }
5710 }
5711
5712 #endif /* CONFIG_IOSCHED */
5713
5714 #define VM_PAGE_Q_BACKING_UP(q) \
5715 ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
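/* e.g. with pgo_maxlaundry == 120, the queue counts as backing up once pgo_laundry reaches 96 (80%) */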
5716
5717 boolean_t must_throttle_writes(void);
5718
5719 boolean_t
5720 must_throttle_writes()
5721 {
5722 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5723 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5724 return TRUE;
5725 }
5726
5727 return FALSE;
5728 }
5729
5730 int vm_page_delayed_work_ctx_needed = 0;
5731 KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5732
5733 __startup_func
5734 static void
5735 vm_page_delayed_work_init_ctx(void)
5736 {
5737 uint16_t min_delayed_work_ctx_allocated = 16;
5738
5739 /*
5740 * try really hard to always keep NCPU elements around in the zone
5741 * in order for the UPL code to almost always get an element.
5742 */
5743 if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5744 min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5745 }
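/*
 * e.g. on a machine where zpercpu_count() reports 24 CPUs, the reserve below
 * is raised from the default of 16 to 24 elements.
 */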
5746
5747 zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5748 }
5749 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5750
5751 struct vm_page_delayed_work*
5752 vm_page_delayed_work_get_ctx(void)
5753 {
5754 struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5755
5756 dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5757
5758 if (__probable(dw_ctx)) {
5759 dw_ctx->delayed_owner = current_thread();
5760 } else {
5761 vm_page_delayed_work_ctx_needed++;
5762 }
5763 return dw_ctx ? dw_ctx->dwp : NULL;
5764 }
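/*
 * Note: with Z_NOWAIT this can return NULL under memory pressure; callers are
 * expected to fall back to a single on-stack vm_page_delayed_work entry with a
 * delayed-work limit of 1, as vm_object_upl_request() does below.
 */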
5765
5766 void
5767 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5768 {
5769 struct vm_page_delayed_work_ctx *ldw_ctx;
5770
5771 ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5772 ldw_ctx->delayed_owner = NULL;
5773
5774 zfree(dw_ctx_zone, ldw_ctx);
5775 }
5776
5777 /*
5778 * Routine: vm_object_upl_request
5779 * Purpose:
5780 * Cause the population of a portion of a vm_object.
5781 * Depending on the nature of the request, the pages
5782 * returned may contain valid data or be uninitialized.
5783 * A page list structure, listing the physical pages,
5784 * will be returned upon request.
5785 * This function is called by the file system or any other
5786 * supplier of backing store to a pager.
5787 * IMPORTANT NOTE: The caller must still respect the relationship
5788 * between the vm_object and its backing memory object. The
5789 * caller MUST NOT substitute changes in the backing file
5790 * without first doing a memory_object_lock_request on the
5791 * target range unless it is known that the pages are not
5792 * shared with another entity at the pager level.
5793 * Copy_in_to:
5794 * if a page list structure is present
5795 * return the mapped physical pages, where a
5796 * page is not present, return a non-initialized
5797 * one. If the no_sync bit is turned on, don't
5798 * call the pager unlock to synchronize with other
5799 * possible copies of the page. Leave pages busy
5800 * in the original object, if a page list structure
5801 * was specified. When a commit of the page list
5802 * pages is done, the dirty bit will be set for each one.
5803 * Copy_out_from:
5804 * If a page list structure is present, return
5805 * all mapped pages. Where a page does not exist
5806 * map a zero filled one. Leave pages busy in
5807 * the original object. If a page list structure
5808 * is not specified, this call is a no-op.
5809 *
5810 * Note: access of default pager objects has a rather interesting
5811 * twist. The caller of this routine, presumably the file system
5812 * page cache handling code, will never actually make a request
5813 * against a default pager backed object. Only the default
5814 * pager will make requests on backing store related vm_objects.
5815 * In this way the default pager can maintain the relationship
5816 * between backing store files (abstract memory objects) and
5817 * the vm_objects (cache objects) they support.
5818 *
5819 */
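/*
 * A minimal sketch of a typical internal-lite request (hypothetical object,
 * offset and tag; real callers are file systems and other backing-store
 * suppliers):
 *
 *	kern_return_t	kr;
 *	upl_t		upl = NULL;
 *	unsigned int	count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
 *
 *	kr = vm_object_upl_request(object, offset, 4 * PAGE_SIZE,
 *	        &upl, NULL, &count,
 *	        UPL_SET_INTERNAL | UPL_SET_LITE | UPL_COPYOUT_FROM,
 *	        VM_KERN_MEMORY_NONE);
 *
 * With UPL_SET_INTERNAL the page list is embedded in the UPL itself
 * (upl->page_list), so the user_page_list argument can be NULL.
 */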
5820
5821 __private_extern__ kern_return_t
5822 vm_object_upl_request(
5823 vm_object_t object,
5824 vm_object_offset_t offset,
5825 upl_size_t size,
5826 upl_t *upl_ptr,
5827 upl_page_info_array_t user_page_list,
5828 unsigned int *page_list_count,
5829 upl_control_flags_t cntrl_flags,
5830 vm_tag_t tag)
5831 {
5832 vm_page_t dst_page = VM_PAGE_NULL;
5833 vm_object_offset_t dst_offset;
5834 upl_size_t xfer_size;
5835 unsigned int size_in_pages;
5836 boolean_t dirty;
5837 boolean_t hw_dirty;
5838 upl_t upl = NULL;
5839 unsigned int entry;
5840 vm_page_t alias_page = NULL;
5841 int refmod_state = 0;
5842 vm_object_t last_copy_object;
5843 uint32_t last_copy_version;
5844 struct vm_page_delayed_work dw_array;
5845 struct vm_page_delayed_work *dwp, *dwp_start;
5846 bool dwp_finish_ctx = TRUE;
5847 int dw_count;
5848 int dw_limit;
5849 int io_tracking_flag = 0;
5850 vm_grab_options_t grab_options;
5851 int page_grab_count = 0;
5852 ppnum_t phys_page;
5853 pmap_flush_context pmap_flush_context_storage;
5854 boolean_t pmap_flushes_delayed = FALSE;
5855 task_t task = current_task();
5856
5857 dwp_start = dwp = NULL;
5858
5859 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5860 /*
5861 * For forward compatibility's sake,
5862 * reject any unknown flag.
5863 */
5864 return KERN_INVALID_VALUE;
5865 }
5866 if ((!object->internal) && (object->paging_offset != 0)) {
5867 panic("vm_object_upl_request: external object with non-zero paging offset");
5868 }
5869 if (object->phys_contiguous) {
5870 panic("vm_object_upl_request: contiguous object specified");
5871 }
5872
5873 assertf(page_aligned(offset) && page_aligned(size),
5874 "offset 0x%llx size 0x%x",
5875 offset, size);
5876
5877 VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5878
5879 dw_count = 0;
5880 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5881 dwp_start = vm_page_delayed_work_get_ctx();
5882 if (dwp_start == NULL) {
5883 dwp_start = &dw_array;
5884 dw_limit = 1;
5885 dwp_finish_ctx = FALSE;
5886 }
5887
5888 dwp = dwp_start;
5889
5890 if (size > MAX_UPL_SIZE_BYTES) {
5891 size = MAX_UPL_SIZE_BYTES;
5892 }
5893
5894 if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5895 *page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5896 }
5897
5898 #if CONFIG_IOSCHED || UPL_DEBUG
5899 if (object->io_tracking || upl_debug_enabled) {
5900 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5901 }
5902 #endif
5903 #if CONFIG_IOSCHED
5904 if (object->io_tracking) {
5905 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5906 }
5907 #endif
5908
5909 if (cntrl_flags & UPL_SET_INTERNAL) {
5910 if (cntrl_flags & UPL_SET_LITE) {
5911 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5912 } else {
5913 upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5914 }
5915 user_page_list = size ? upl->page_list : NULL;
5916 } else {
5917 if (cntrl_flags & UPL_SET_LITE) {
5918 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5919 } else {
5920 upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5921 }
5922 }
5923 *upl_ptr = upl;
5924
5925 if (user_page_list) {
5926 user_page_list[0].device = FALSE;
5927 }
5928
5929 if (cntrl_flags & UPL_SET_LITE) {
5930 upl->map_object = object;
5931 } else {
5932 upl->map_object = vm_object_allocate(size, object->vmo_provenance);
5933 vm_object_lock(upl->map_object);
5934 /*
5935 * No need to lock the new object: nobody else knows
5936 * about it yet, so it's all ours so far.
5937 */
5938 upl->map_object->shadow = object;
5939 VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
5940 VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
5941 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5942 upl->map_object->vo_shadow_offset = offset;
5943 upl->map_object->wimg_bits = object->wimg_bits;
5944 assertf(page_aligned(upl->map_object->vo_shadow_offset),
5945 "object %p shadow_offset 0x%llx",
5946 upl->map_object, upl->map_object->vo_shadow_offset);
5947 vm_object_unlock(upl->map_object);
5948
5949 alias_page = vm_page_create_fictitious();
5950
5951 upl->flags |= UPL_SHADOWED;
5952 }
5953 if (cntrl_flags & UPL_FOR_PAGEOUT) {
5954 upl->flags |= UPL_PAGEOUT;
5955 }
5956
5957 vm_object_lock(object);
5958 vm_object_activity_begin(object);
5959
5960 grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
5961 #if CONFIG_SECLUDED_MEMORY
5962 if (object->can_grab_secluded) {
5963 grab_options |= VM_PAGE_GRAB_SECLUDED;
5964 }
5965 #endif /* CONFIG_SECLUDED_MEMORY */
5966
5967 /*
5968 * we can lock in the paging_offset once paging_in_progress is set
5969 */
5970 upl->u_size = size;
5971 upl->u_offset = offset + object->paging_offset;
5972
5973 #if CONFIG_IOSCHED || UPL_DEBUG
5974 if (object->io_tracking || upl_debug_enabled) {
5975 vm_object_activity_begin(object);
5976 queue_enter(&object->uplq, upl, upl_t, uplq);
5977 }
5978 #endif
5979 if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
5980 /*
5981 * Honor copy-on-write obligations
5982 *
5983 * The caller is gathering these pages and
5984 * might modify their contents. We need to
5985 * make sure that the copy object has its own
5986 * private copies of these pages before we let
5987 * the caller modify them.
5988 */
5989 vm_object_update(object,
5990 offset,
5991 size,
5992 NULL,
5993 NULL,
5994 FALSE, /* should_return */
5995 MEMORY_OBJECT_COPY_SYNC,
5996 VM_PROT_NO_CHANGE);
5997
5998 VM_PAGEOUT_DEBUG(upl_cow, 1);
5999 VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
6000 }
6001 /*
6002 * remember which copy object we synchronized with
6003 */
6004 last_copy_object = object->vo_copy;
6005 last_copy_version = object->vo_copy_version;
6006 entry = 0;
6007
6008 xfer_size = size;
6009 dst_offset = offset;
6010 size_in_pages = size / PAGE_SIZE;
6011
6012 if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
6013 object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
6014 object->scan_collisions = 0;
6015 }
6016
6017 if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
6018 boolean_t isSSD = FALSE;
6019
6020 #if !XNU_TARGET_OS_OSX
6021 isSSD = TRUE;
6022 #else /* !XNU_TARGET_OS_OSX */
6023 vnode_pager_get_isSSD(object->pager, &isSSD);
6024 #endif /* !XNU_TARGET_OS_OSX */
6025 vm_object_unlock(object);
6026
6027 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6028
6029 if (isSSD == TRUE) {
6030 delay(1000 * size_in_pages);
6031 } else {
6032 delay(5000 * size_in_pages);
6033 }
6034 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6035
6036 vm_object_lock(object);
6037 }
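/*
 * Note: delay() here appears to be in microseconds, i.e. roughly 1ms per page
 * for SSD-backed objects and 5ms per page otherwise; a 32-page request would
 * stall for about 32ms / 160ms while the external pageout queue drains.
 */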
6038
6039 while (xfer_size) {
6040 dwp->dw_mask = 0;
6041
6042 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
6043 vm_object_unlock(object);
6044 alias_page = vm_page_create_fictitious();
6045 vm_object_lock(object);
6046 }
6047 if (cntrl_flags & UPL_COPYOUT_FROM) {
6048 upl->flags |= UPL_PAGE_SYNC_DONE;
6049
6050 if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
6051 vm_page_is_fictitious(dst_page) ||
6052 dst_page->vmp_absent ||
6053 VMP_ERROR_GET(dst_page) ||
6054 dst_page->vmp_cleaning ||
6055 (VM_PAGE_WIRED(dst_page))) {
6056 if (user_page_list) {
6057 user_page_list[entry].phys_addr = 0;
6058 }
6059
6060 goto try_next_page;
6061 }
6062 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6063
6064 /*
6065 * grab this up front...
6066 * a high percentage of the time we're going to
6067 * need the hardware modification state a bit later
6068 * anyway... so we can eliminate an extra call into
6069 * the pmap layer by grabbing it here and recording it
6070 */
6071 if (dst_page->vmp_pmapped) {
6072 refmod_state = pmap_get_refmod(phys_page);
6073 } else {
6074 refmod_state = 0;
6075 }
6076
6077 if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
6078 /*
6079 * page is on inactive list and referenced...
6080 * reactivate it now... this gets it out of the
6081 * way of vm_pageout_scan which would have to
6082 * reactivate it upon tripping over it
6083 */
6084 dwp->dw_mask |= DW_vm_page_activate;
6085 }
6086 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
6087 /*
6088 * we're only asking for DIRTY pages to be returned
6089 */
6090 if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
6091 /*
6092 * if this is the page stolen by vm_pageout_scan to be
6093 * cleaned (as opposed to a buddy being clustered in),
6094 * or this request is not being driven by a PAGEOUT cluster,
6095 * then we only need to check for the page being dirty or
6096 * precious to decide whether to return it
6097 */
6098 if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
6099 goto check_busy;
6100 }
6101 goto dont_return;
6102 }
6103 /*
6104 * this is a request for a PAGEOUT cluster and this page
6105 * is merely along for the ride as a 'buddy'... not only
6106 * does it have to be dirty to be returned, but it also
6107 * can't have been referenced recently...
6108 */
6109 if ((hibernate_cleaning_in_progress == TRUE ||
6110 (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
6111 (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
6112 ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
6113 goto check_busy;
6114 }
6115 dont_return:
6116 /*
6117 * if we reach here, we're not to return
6118 * the page... go on to the next one
6119 */
6120 if (dst_page->vmp_laundry == TRUE) {
6121 /*
6122 * if we get here, the page is not 'cleaning' (filtered out above).
6123 * since it has been referenced, remove it from the laundry
6124 * so we don't pay the cost of an I/O to clean a page
6125 * we're just going to take back
6126 */
6127 vm_page_lockspin_queues();
6128
6129 vm_pageout_steal_laundry(dst_page, TRUE);
6130 vm_page_activate(dst_page);
6131
6132 vm_page_unlock_queues();
6133 }
6134 if (user_page_list) {
6135 user_page_list[entry].phys_addr = 0;
6136 }
6137
6138 goto try_next_page;
6139 }
6140 check_busy:
6141 if (dst_page->vmp_busy) {
6142 if (cntrl_flags & UPL_NOBLOCK) {
6143 if (user_page_list) {
6144 user_page_list[entry].phys_addr = 0;
6145 }
6146 dwp->dw_mask = 0;
6147
6148 goto try_next_page;
6149 }
6150 /*
6151 * someone else is playing with the
6152 * page. We will have to wait.
6153 */
6154 vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
6155
6156 continue;
6157 }
6158 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6159 vm_page_lockspin_queues();
6160
6161 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6162 /*
6163 * we've buddied up a page for a clustered pageout
6164 * that has already been moved to the pageout
6165 * queue by pageout_scan... we need to remove
6166 * it from the queue and drop the laundry count
6167 * on that queue
6168 */
6169 vm_pageout_throttle_up(dst_page);
6170 }
6171 vm_page_unlock_queues();
6172 }
6173 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6174 dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6175
6176 if (phys_page > upl->highest_page) {
6177 upl->highest_page = phys_page;
6178 }
6179
6180 assert(!pmap_is_noencrypt(phys_page));
6181
6182 if (cntrl_flags & UPL_SET_LITE) {
6183 unsigned int pg_num;
6184
6185 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6186 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6187 bitmap_set(upl->lite_list, pg_num);
6188
6189 if (hw_dirty) {
6190 if (pmap_flushes_delayed == FALSE) {
6191 pmap_flush_context_init(&pmap_flush_context_storage);
6192 pmap_flushes_delayed = TRUE;
6193 }
6194 pmap_clear_refmod_options(phys_page,
6195 VM_MEM_MODIFIED,
6196 PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6197 &pmap_flush_context_storage);
6198 }
6199
6200 /*
6201 * Mark original page as cleaning
6202 * in place.
6203 */
6204 dst_page->vmp_cleaning = TRUE;
6205 dst_page->vmp_precious = FALSE;
6206 } else {
6207 /*
6208 * use pageclean setup, it is more
6209 * convenient even for the pageout
6210 * cases here
6211 */
6212 vm_object_lock(upl->map_object);
6213 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6214 vm_object_unlock(upl->map_object);
6215
6216 alias_page->vmp_absent = FALSE;
6217 alias_page = NULL;
6218 }
6219 if (dirty) {
6220 SET_PAGE_DIRTY(dst_page, FALSE);
6221 } else {
6222 dst_page->vmp_dirty = FALSE;
6223 }
6224
6225 if (!dirty) {
6226 dst_page->vmp_precious = TRUE;
6227 }
6228
6229 if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6230 if (!VM_PAGE_WIRED(dst_page)) {
6231 dst_page->vmp_free_when_done = TRUE;
6232 }
6233 }
6234 } else {
6235 if ((cntrl_flags & UPL_WILL_MODIFY) &&
6236 (object->vo_copy != last_copy_object ||
6237 object->vo_copy_version != last_copy_version)) {
6238 /*
6239 * Honor copy-on-write obligations
6240 *
6241 * The copy object has changed since we
6242 * last synchronized for copy-on-write.
6243 * Another copy object might have been
6244 * inserted while we released the object's
6245 * lock. Since someone could have seen the
6246 * original contents of the remaining pages
6247 * through that new object, we have to
6248 * synchronize with it again for the remaining
6249 * pages only. The previous pages are "busy"
6250 * so they can not be seen through the new
6251 * mapping. The new mapping will see our
6252 * upcoming changes for those previous pages,
6253 * but that's OK since they couldn't see what
6254 * was there before. It's just a race anyway
6255 * and there's no guarantee of consistency or
6256 * atomicity. We just don't want new mappings
6257 * to see both the *before* and *after* pages.
6258 */
6259 if (object->vo_copy != VM_OBJECT_NULL) {
6260 vm_object_update(
6261 object,
6262 dst_offset,/* current offset */
6263 xfer_size, /* remaining size */
6264 NULL,
6265 NULL,
6266 FALSE, /* should_return */
6267 MEMORY_OBJECT_COPY_SYNC,
6268 VM_PROT_NO_CHANGE);
6269
6270 VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6271 VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6272 }
6273 /*
6274 * remember the copy object we synced with
6275 */
6276 last_copy_object = object->vo_copy;
6277 last_copy_version = object->vo_copy_version;
6278 }
6279 dst_page = vm_page_lookup(object, dst_offset);
6280
6281 if (dst_page != VM_PAGE_NULL) {
6282 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6283 /*
6284 * skip over pages already present in the cache
6285 */
6286 if (user_page_list) {
6287 user_page_list[entry].phys_addr = 0;
6288 }
6289
6290 goto try_next_page;
6291 }
6292 if (vm_page_is_fictitious(dst_page)) {
6293 panic("need corner case for fictitious page");
6294 }
6295
6296 if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6297 /*
6298 * someone else is playing with the
6299 * page. We will have to wait.
6300 */
6301 vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
6302
6303 continue;
6304 }
6305 if (dst_page->vmp_laundry) {
6306 vm_pageout_steal_laundry(dst_page, FALSE);
6307 }
6308 } else {
6309 if (object->private) {
6310 /*
6311 * This is a nasty wrinkle for users
6312 * of UPLs who encounter device or
6313 * private memory. However, it is
6314 * unavoidable: only a fault can
6315 * resolve the actual backing
6316 * physical page by asking the
6317 * backing device.
6318 */
6319 if (user_page_list) {
6320 user_page_list[entry].phys_addr = 0;
6321 }
6322
6323 goto try_next_page;
6324 }
6325 if (object->scan_collisions) {
6326 /*
6327 * the pageout_scan thread is trying to steal
6328 * pages from this object, but has run into our
6329 * lock... grab 2 pages from the head of the object...
6330 * the first is freed on behalf of pageout_scan, the
6331 * 2nd is for our own use... we use vm_object_page_grab
6332 * in both cases to avoid taking pages from the free
6333 * list since we are under memory pressure and our
6334 * lock on this object is getting in the way of
6335 * relieving it
6336 */
6337 dst_page = vm_object_page_grab(object);
6338
6339 if (dst_page != VM_PAGE_NULL) {
6340 vm_page_release(dst_page,
6341 VMP_RELEASE_NONE);
6342 }
6343
6344 dst_page = vm_object_page_grab(object);
6345 }
6346 if (dst_page == VM_PAGE_NULL) {
6347 /*
6348 * need to allocate a page
6349 */
6350 dst_page = vm_page_grab_options(grab_options);
6351 if (dst_page != VM_PAGE_NULL) {
6352 page_grab_count++;
6353 }
6354 }
6355 if (dst_page == VM_PAGE_NULL) {
6356 if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6357 /*
6358 * we don't want to stall waiting for pages to come onto the free list
6359 * while we're already holding absent pages in this UPL
6360 * the caller will deal with the empty slots
6361 */
6362 if (user_page_list) {
6363 user_page_list[entry].phys_addr = 0;
6364 }
6365
6366 goto try_next_page;
6367 }
6368 /*
6369 * no pages available... wait
6370 * then try again for the same
6371 * offset...
6372 */
6373 vm_object_unlock(object);
6374
6375 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6376
6377 VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6378
6379 VM_PAGE_WAIT();
6380 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6381
6382 VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6383
6384 vm_object_lock(object);
6385
6386 continue;
6387 }
6388 vm_page_insert(dst_page, object, dst_offset);
6389
6390 dst_page->vmp_absent = TRUE;
6391 dst_page->vmp_busy = FALSE;
6392
6393 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6394 /*
6395 * if UPL_RET_ONLY_ABSENT was specified,
6396 * then we're definitely setting up a
6397 * UPL for a clustered read/pagein
6398 * operation... mark the pages as clustered
6399 * so upl_commit_range can put them on the
6400 * speculative list
6401 */
6402 dst_page->vmp_clustered = TRUE;
6403
6404 if (!(cntrl_flags & UPL_FILE_IO)) {
6405 counter_inc(&vm_statistics_pageins);
6406 }
6407 }
6408 }
6409 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6410
6411 dst_page->vmp_overwriting = TRUE;
6412
6413 if (dst_page->vmp_pmapped) {
6414 if (!(cntrl_flags & UPL_FILE_IO)) {
6415 /*
6416 * eliminate all mappings from the
6417 * original object and its progeny
6418 */
6419 refmod_state = pmap_disconnect(phys_page);
6420 } else {
6421 refmod_state = pmap_get_refmod(phys_page);
6422 }
6423 } else {
6424 refmod_state = 0;
6425 }
6426
6427 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6428 dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6429
6430 if (cntrl_flags & UPL_SET_LITE) {
6431 unsigned int pg_num;
6432
6433 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6434 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6435 bitmap_set(upl->lite_list, pg_num);
6436
6437 if (hw_dirty) {
6438 pmap_clear_modify(phys_page);
6439 }
6440
6441 /*
6442 * Mark original page as cleaning
6443 * in place.
6444 */
6445 dst_page->vmp_cleaning = TRUE;
6446 dst_page->vmp_precious = FALSE;
6447 } else {
6448 /*
6449 * use pageclean setup, it is more
6450 * convenient even for the pageout
6451 * cases here
6452 */
6453 vm_object_lock(upl->map_object);
6454 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6455 vm_object_unlock(upl->map_object);
6456
6457 alias_page->vmp_absent = FALSE;
6458 alias_page = NULL;
6459 }
6460
6461 if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6462 upl->flags &= ~UPL_CLEAR_DIRTY;
6463 upl->flags |= UPL_SET_DIRTY;
6464 dirty = TRUE;
6465 /*
6466 * Page belonging to a code-signed object is about to
6467 * be written. Mark it tainted and disconnect it from
6468 * all pmaps so processes have to fault it back in and
6469 * deal with the tainted bit.
6470 */
6471 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6472 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6473 vm_page_upl_tainted++;
6474 if (dst_page->vmp_pmapped) {
6475 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6476 if (refmod_state & VM_MEM_REFERENCED) {
6477 dst_page->vmp_reference = TRUE;
6478 }
6479 }
6480 }
6481 } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6482 /*
6483 * clean in place for read implies
6484 * that a write will be done on all
6485 * the pages that are dirty before
6486 * a upl commit is done. The caller
6487 * is obligated to preserve the
6488 * contents of all pages marked dirty
6489 */
6490 upl->flags |= UPL_CLEAR_DIRTY;
6491 }
6492 dst_page->vmp_dirty = dirty;
6493
6494 if (!dirty) {
6495 dst_page->vmp_precious = TRUE;
6496 }
6497
6498 if (!VM_PAGE_WIRED(dst_page)) {
6499 /*
6500 * deny access to the target page while
6501 * it is being worked on
6502 */
6503 dst_page->vmp_busy = TRUE;
6504 } else {
6505 dwp->dw_mask |= DW_vm_page_wire;
6506 }
6507
6508 /*
6509 * We might be about to satisfy a fault which has been
6510 * requested. So no need for the "restart" bit.
6511 */
6512 dst_page->vmp_restart = FALSE;
6513 if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6514 /*
6515 * expect the page to be used
6516 */
6517 dwp->dw_mask |= DW_set_reference;
6518 }
6519 if (cntrl_flags & UPL_PRECIOUS) {
6520 if (object->internal) {
6521 SET_PAGE_DIRTY(dst_page, FALSE);
6522 dst_page->vmp_precious = FALSE;
6523 } else {
6524 dst_page->vmp_precious = TRUE;
6525 }
6526 } else {
6527 dst_page->vmp_precious = FALSE;
6528 }
6529 }
6530 if (dst_page->vmp_busy) {
6531 upl->flags |= UPL_HAS_BUSY;
6532 }
6533 if (VM_PAGE_WIRED(dst_page)) {
6534 upl->flags |= UPL_HAS_WIRED;
6535 }
6536
6537 if (phys_page > upl->highest_page) {
6538 upl->highest_page = phys_page;
6539 }
6540 assert(!pmap_is_noencrypt(phys_page));
6541 if (user_page_list) {
6542 user_page_list[entry].phys_addr = phys_page;
6543 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
6544 user_page_list[entry].absent = dst_page->vmp_absent;
6545 user_page_list[entry].dirty = dst_page->vmp_dirty;
6546 user_page_list[entry].precious = dst_page->vmp_precious;
6547 user_page_list[entry].device = FALSE;
6548 user_page_list[entry].needed = FALSE;
6549 if (dst_page->vmp_clustered == TRUE) {
6550 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6551 } else {
6552 user_page_list[entry].speculative = FALSE;
6553 }
6554 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6555 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6556 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6557 user_page_list[entry].mark = FALSE;
6558 }
6559 /*
6560 * if UPL_RET_ONLY_ABSENT is set, then
6561 * we are working with a fresh page and we've
6562 * just set the clustered flag on it to
6563 * indicate that it was dragged in as part of a
6564 * speculative cluster... so leave it alone
6565 */
6566 if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6567 /*
6568 * someone is explicitly grabbing this page...
6569 * update clustered and speculative state
6570 *
6571 */
6572 if (dst_page->vmp_clustered) {
6573 VM_PAGE_CONSUME_CLUSTERED(dst_page);
6574 }
6575 }
6576 try_next_page:
6577 if (dwp->dw_mask) {
6578 if (dwp->dw_mask & DW_vm_page_activate) {
6579 counter_inc(&vm_statistics_reactivations);
6580 }
6581
6582 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6583
6584 if (dw_count >= dw_limit) {
6585 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6586
6587 dwp = dwp_start;
6588 dw_count = 0;
6589 }
6590 }
6591 entry++;
6592 dst_offset += PAGE_SIZE_64;
6593 xfer_size -= PAGE_SIZE;
6594 }
6595 if (dw_count) {
6596 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6597 dwp = dwp_start;
6598 dw_count = 0;
6599 }
6600
6601 if (alias_page != NULL) {
6602 VM_PAGE_FREE(alias_page);
6603 }
6604 if (pmap_flushes_delayed == TRUE) {
6605 pmap_flush(&pmap_flush_context_storage);
6606 }
6607
6608 if (page_list_count != NULL) {
6609 if (upl->flags & UPL_INTERNAL) {
6610 *page_list_count = 0;
6611 } else if (*page_list_count > entry) {
6612 *page_list_count = entry;
6613 }
6614 }
6615 #if UPL_DEBUG
6616 upl->upl_state = 1;
6617 #endif
6618 vm_object_unlock(object);
6619
6620 VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6621 if (task != NULL) {
6622 counter_add(&task->pages_grabbed_upl, page_grab_count);
6623 }
6624
6625 if (dwp_start && dwp_finish_ctx) {
6626 vm_page_delayed_work_finish_ctx(dwp_start);
6627 dwp_start = dwp = NULL;
6628 }
6629
6630 return KERN_SUCCESS;
6631 }
6632
6633 int cs_executable_create_upl = 0;
6634 extern int proc_selfpid(void);
6635 extern char *proc_name_address(void *p);
6636
6637 kern_return_t
6638 vm_map_create_upl(
6639 vm_map_t map,
6640 vm_map_address_t offset,
6641 upl_size_t *upl_size,
6642 upl_t *upl,
6643 upl_page_info_array_t page_list,
6644 unsigned int *count,
6645 upl_control_flags_t *flags,
6646 vm_tag_t tag)
6647 {
6648 vm_map_entry_t entry;
6649 upl_control_flags_t caller_flags;
6650 int force_data_sync;
6651 int sync_cow_data;
6652 vm_object_t local_object;
6653 vm_map_offset_t local_offset;
6654 vm_map_offset_t local_start;
6655 kern_return_t ret;
6656 vm_map_address_t original_offset;
6657 vm_map_size_t original_size, adjusted_size;
6658 vm_map_offset_t local_entry_start;
6659 vm_object_offset_t local_entry_offset;
6660 vm_object_offset_t offset_in_mapped_page;
6661 boolean_t release_map = FALSE;
6662
6663 start_with_map:
6664 caller_flags = *flags;
6665
6666 if (caller_flags & ~UPL_VALID_FLAGS) {
6667 /*
6668 * For forward compatibility's sake,
6669 * reject any unknown flag.
6670 */
6671 ret = KERN_INVALID_VALUE;
6672 goto done;
6673 }
6674
6675 if (upl == NULL) {
6676 ret = KERN_INVALID_ARGUMENT;
6677 goto done;
6678 }
6679
6680
6681 original_offset = offset;
6682 original_size = *upl_size;
6683 adjusted_size = original_size;
6684
6685 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6686 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6687
6688 REDISCOVER_ENTRY:
6689 vm_map_lock_read(map);
6690
6691 if (!vm_map_lookup_entry(map, offset, &entry)) {
6692 vm_map_unlock_read(map);
6693 ret = KERN_FAILURE;
6694 goto done;
6695 }
6696
6697 local_entry_start = entry->vme_start;
6698 local_entry_offset = VME_OFFSET(entry);
6699
6700 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6701 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6702 }
6703
6704 if (entry->vme_end - original_offset < adjusted_size) {
6705 adjusted_size = entry->vme_end - original_offset;
6706 assert(adjusted_size > 0);
6707 *upl_size = (upl_size_t) adjusted_size;
6708 assert(*upl_size == adjusted_size);
6709 }
6710
6711 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6712 *flags = 0;
6713
6714 if (!entry->is_sub_map &&
6715 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6716 if (VME_OBJECT(entry)->private) {
6717 *flags = UPL_DEV_MEMORY;
6718 }
6719
6720 if (VME_OBJECT(entry)->phys_contiguous) {
6721 *flags |= UPL_PHYS_CONTIG;
6722 }
6723 }
6724 vm_map_unlock_read(map);
6725 ret = KERN_SUCCESS;
6726 goto done;
6727 }
6728
6729 offset_in_mapped_page = 0;
6730 if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6731 offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6732 *upl_size = (upl_size_t)
6733 (vm_map_round_page(original_offset + adjusted_size,
6734 VM_MAP_PAGE_MASK(map))
6735 - offset);
6736
6737 offset_in_mapped_page = original_offset - offset;
6738 assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6739
6740 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6741 }
6742
6743 if (!entry->is_sub_map) {
6744 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6745 !VME_OBJECT(entry)->phys_contiguous) {
6746 if (*upl_size > MAX_UPL_SIZE_BYTES) {
6747 *upl_size = MAX_UPL_SIZE_BYTES;
6748 }
6749 }
6750
6751 /*
6752 * Create an object if necessary.
6753 */
6754 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6755 if (entry->max_protection == VM_PROT_NONE) {
6756 /* don't create an object for a reserved range */
6757 vm_map_unlock_read(map);
6758 ret = KERN_PROTECTION_FAILURE;
6759 goto done;
6760 }
6761
6762 if (vm_map_lock_read_to_write(map)) {
6763 goto REDISCOVER_ENTRY;
6764 }
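/*
 * vm_map_lock_read_to_write() returns non-zero when it cannot upgrade to the
 * exclusive lock; in that case the map lock has been dropped, so the entry
 * may have changed and the lookup must be redone from REDISCOVER_ENTRY.
 */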
6765
6766 VME_OBJECT_SET(entry,
6767 vm_object_allocate((vm_size_t)
6768 vm_object_round_page((entry->vme_end - entry->vme_start)), map->serial_id),
6769 false, 0);
6770 VME_OFFSET_SET(entry, 0);
6771 assert(entry->use_pmap);
6772
6773 vm_map_lock_write_to_read(map);
6774 }
6775
6776 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6777 !(entry->protection & VM_PROT_WRITE)) {
6778 vm_map_unlock_read(map);
6779 ret = KERN_PROTECTION_FAILURE;
6780 goto done;
6781 }
6782 }
6783
6784 #if !XNU_TARGET_OS_OSX
6785 if (map->pmap != kernel_pmap &&
6786 (caller_flags & UPL_COPYOUT_FROM) &&
6787 (entry->protection & VM_PROT_EXECUTE) &&
6788 !(entry->protection & VM_PROT_WRITE)) {
6789 vm_offset_t kaddr;
6790 vm_size_t ksize;
6791
6792 /*
6793 * We're about to create a read-only UPL backed by
6794 * memory from an executable mapping.
6795 * Wiring the pages would result in the pages being copied
6796 * (due to the "MAP_PRIVATE" mapping) and no longer
6797 * code-signed, so no longer eligible for execution.
6798 * Instead, let's copy the data into a kernel buffer and
6799 * create the UPL from this kernel buffer.
6800 * The kernel buffer is then freed, leaving the UPL holding
6801 * the last reference on the VM object, so the memory will
6802 * be released when the UPL is committed.
6803 */
6804
6805 vm_map_unlock_read(map);
6806 entry = VM_MAP_ENTRY_NULL;
6807 /* allocate kernel buffer */
6808 ksize = round_page(*upl_size);
6809 kaddr = 0;
6810 ret = kmem_alloc(kernel_map, &kaddr, ksize,
6811 KMA_PAGEABLE | KMA_DATA, tag);
6812 if (ret == KERN_SUCCESS) {
6813 /* copyin the user data */
6814 ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6815 }
6816 if (ret == KERN_SUCCESS) {
6817 if (ksize > *upl_size) {
6818 /* zero out the extra space in kernel buffer */
6819 memset((void *)(kaddr + *upl_size),
6820 0,
6821 ksize - *upl_size);
6822 }
6823 /* create the UPL from the kernel buffer */
6824 vm_object_offset_t offset_in_object;
6825 vm_object_offset_t offset_in_object_page;
6826
6827 offset_in_object = offset - local_entry_start + local_entry_offset;
6828 offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6829 assert(offset_in_object_page < PAGE_SIZE);
6830 assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6831 *upl_size -= offset_in_object_page + offset_in_mapped_page;
6832 ret = vm_map_create_upl(kernel_map,
6833 (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6834 upl_size, upl, page_list, count, flags, tag);
6835 }
6836 if (kaddr != 0) {
6837 /* free the kernel buffer */
6838 kmem_free(kernel_map, kaddr, ksize);
6839 kaddr = 0;
6840 ksize = 0;
6841 }
6842 #if DEVELOPMENT || DEBUG
6843 DTRACE_VM4(create_upl_from_executable,
6844 vm_map_t, map,
6845 vm_map_address_t, offset,
6846 upl_size_t, *upl_size,
6847 kern_return_t, ret);
6848 #endif /* DEVELOPMENT || DEBUG */
6849 goto done;
6850 }
6851 #endif /* !XNU_TARGET_OS_OSX */
6852
6853 if (!entry->is_sub_map) {
6854 local_object = VME_OBJECT(entry);
6855 assert(local_object != VM_OBJECT_NULL);
6856 }
6857
6858 if (!entry->is_sub_map &&
6859 !entry->needs_copy &&
6860 *upl_size != 0 &&
6861 local_object->vo_size > *upl_size && /* partial UPL */
6862 entry->wired_count == 0 && /* No COW for entries that are wired */
6863 (map->pmap != kernel_pmap) && /* alias checks */
6864 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6865 ||
6866 ( /* case 2 */
6867 local_object->internal &&
6868 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6869 os_ref_get_count_raw(&local_object->ref_count) > 1))) {
6870 vm_prot_t prot;
6871
6872 /*
6873 * Case 1:
6874 * Set up the targeted range for copy-on-write to avoid
6875 * applying true_share/copy_delay to the entire object.
6876 *
6877 * Case 2:
6878 * This map entry covers only part of an internal
6879 * object. There could be other map entries covering
6880 * other areas of this object and some of these map
6881 * entries could be marked as "needs_copy", which
6882 * assumes that the object is COPY_SYMMETRIC.
6883 * To avoid marking this object as COPY_DELAY and
6884 * "true_share", let's shadow it and mark the new
6885 * (smaller) object as "true_share" and COPY_DELAY.
6886 */
6887
6888 if (vm_map_lock_read_to_write(map)) {
6889 goto REDISCOVER_ENTRY;
6890 }
6891 vm_map_lock_assert_exclusive(map);
6892 assert(VME_OBJECT(entry) == local_object);
6893
6894 vm_map_clip_start(map,
6895 entry,
6896 vm_map_trunc_page(offset,
6897 VM_MAP_PAGE_MASK(map)));
6898 vm_map_clip_end(map,
6899 entry,
6900 vm_map_round_page(offset + *upl_size,
6901 VM_MAP_PAGE_MASK(map)));
6902 if ((entry->vme_end - offset) < *upl_size) {
6903 *upl_size = (upl_size_t) (entry->vme_end - offset);
6904 assert(*upl_size == entry->vme_end - offset);
6905 }
6906
6907 prot = entry->protection & ~VM_PROT_WRITE;
6908 if (override_nx(map, VME_ALIAS(entry)) && prot) {
6909 prot |= VM_PROT_EXECUTE;
6910 }
6911 vm_object_pmap_protect(local_object,
6912 VME_OFFSET(entry),
6913 entry->vme_end - entry->vme_start,
6914 ((entry->is_shared ||
6915 map->mapped_in_other_pmaps)
6916 ? PMAP_NULL
6917 : map->pmap),
6918 VM_MAP_PAGE_SIZE(map),
6919 entry->vme_start,
6920 prot);
6921
6922 assert(entry->wired_count == 0);
6923
6924 /*
6925 * Lock the VM object and re-check its status: if it's mapped
6926 * in another address space, we could still be racing with
6927 * another thread holding that other VM map exclusively.
6928 */
6929 vm_object_lock(local_object);
6930 if (local_object->true_share) {
6931 /* object is already in proper state: no COW needed */
6932 assert(local_object->copy_strategy !=
6933 MEMORY_OBJECT_COPY_SYMMETRIC);
6934 } else {
6935 /* not true_share: ask for copy-on-write below */
6936 assert(local_object->copy_strategy ==
6937 MEMORY_OBJECT_COPY_SYMMETRIC);
6938 entry->needs_copy = TRUE;
6939 }
6940 vm_object_unlock(local_object);
6941
6942 vm_map_lock_write_to_read(map);
6943 }
6944
6945 if (entry->needs_copy) {
6946 /*
6947 * Honor copy-on-write for COPY_SYMMETRIC
6948 * strategy.
6949 */
6950 vm_map_t local_map;
6951 vm_object_t object;
6952 vm_object_offset_t new_offset;
6953 vm_prot_t prot;
6954 boolean_t wired;
6955 vm_map_version_t version;
6956 vm_map_t real_map;
6957 vm_prot_t fault_type;
6958
6959 local_map = map;
6960
6961 if (caller_flags & UPL_COPYOUT_FROM) {
6962 fault_type = VM_PROT_READ | VM_PROT_COPY;
6963 vm_counters.create_upl_extra_cow++;
6964 vm_counters.create_upl_extra_cow_pages +=
6965 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6966 } else {
6967 fault_type = VM_PROT_WRITE;
6968 }
6969 if (vm_map_lookup_and_lock_object(&local_map,
6970 offset, fault_type,
6971 OBJECT_LOCK_EXCLUSIVE,
6972 &version, &object,
6973 &new_offset, &prot, &wired,
6974 NULL,
6975 &real_map, NULL) != KERN_SUCCESS) {
6976 if (fault_type == VM_PROT_WRITE) {
6977 vm_counters.create_upl_lookup_failure_write++;
6978 } else {
6979 vm_counters.create_upl_lookup_failure_copy++;
6980 }
6981 vm_map_unlock_read(local_map);
6982 ret = KERN_FAILURE;
6983 goto done;
6984 }
6985 if (real_map != local_map) {
6986 vm_map_unlock(real_map);
6987 }
6988 vm_map_unlock_read(local_map);
6989
6990 vm_object_unlock(object);
6991
6992 goto REDISCOVER_ENTRY;
6993 }
6994
6995 if (entry->is_sub_map) {
6996 vm_map_t submap;
6997
6998 submap = VME_SUBMAP(entry);
6999 local_start = entry->vme_start;
7000 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7001
7002 vm_map_reference(submap);
7003 vm_map_unlock_read(map);
7004
7005 DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
7006 offset += offset_in_mapped_page;
7007 *upl_size -= offset_in_mapped_page;
7008
7009 if (release_map) {
7010 vm_map_deallocate(map);
7011 }
7012 map = submap;
7013 release_map = TRUE;
7014 offset = local_offset + (offset - local_start);
7015 goto start_with_map;
7016 }
7017
7018 if (sync_cow_data &&
7019 (VME_OBJECT(entry)->shadow ||
7020 VME_OBJECT(entry)->vo_copy)) {
7021 local_object = VME_OBJECT(entry);
7022 local_start = entry->vme_start;
7023 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7024
7025 vm_object_reference(local_object);
7026 vm_map_unlock_read(map);
7027
7028 if (local_object->shadow && local_object->vo_copy) {
7029 vm_object_lock_request(local_object->shadow,
7030 ((vm_object_offset_t)
7031 ((offset - local_start) +
7032 local_offset) +
7033 local_object->vo_shadow_offset),
7034 *upl_size, FALSE,
7035 MEMORY_OBJECT_DATA_SYNC,
7036 VM_PROT_NO_CHANGE);
7037 }
7038 sync_cow_data = FALSE;
7039 vm_object_deallocate(local_object);
7040
7041 goto REDISCOVER_ENTRY;
7042 }
7043 if (force_data_sync) {
7044 local_object = VME_OBJECT(entry);
7045 local_start = entry->vme_start;
7046 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7047
7048 vm_object_reference(local_object);
7049 vm_map_unlock_read(map);
7050
7051 vm_object_lock_request(local_object,
7052 ((vm_object_offset_t)
7053 ((offset - local_start) +
7054 local_offset)),
7055 (vm_object_size_t)*upl_size,
7056 FALSE,
7057 MEMORY_OBJECT_DATA_SYNC,
7058 VM_PROT_NO_CHANGE);
7059
7060 force_data_sync = FALSE;
7061 vm_object_deallocate(local_object);
7062
7063 goto REDISCOVER_ENTRY;
7064 }
7065 if (VME_OBJECT(entry)->private) {
7066 *flags = UPL_DEV_MEMORY;
7067 } else {
7068 *flags = 0;
7069 }
7070
7071 if (VME_OBJECT(entry)->phys_contiguous) {
7072 *flags |= UPL_PHYS_CONTIG;
7073 }
7074
7075 local_object = VME_OBJECT(entry);
7076 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7077 local_start = entry->vme_start;
7078
7079
7080 /*
7081 * Wiring will copy the pages to the shadow object.
7082 * The shadow object will not be code-signed so
7083 * attempting to execute code from these copied pages
7084 * would trigger a code-signing violation.
7085 */
7086 if (entry->protection & VM_PROT_EXECUTE) {
7087 #if MACH_ASSERT
7088 printf("pid %d[%s] create_upl out of executable range from "
7089 "0x%llx to 0x%llx: side effects may include "
7090 "code-signing violations later on\n",
7091 proc_selfpid(),
7092 (get_bsdtask_info(current_task())
7093 ? proc_name_address(get_bsdtask_info(current_task()))
7094 : "?"),
7095 (uint64_t) entry->vme_start,
7096 (uint64_t) entry->vme_end);
7097 #endif /* MACH_ASSERT */
7098 DTRACE_VM2(cs_executable_create_upl,
7099 uint64_t, (uint64_t)entry->vme_start,
7100 uint64_t, (uint64_t)entry->vme_end);
7101 cs_executable_create_upl++;
7102 }
7103
7104 vm_object_lock(local_object);
7105
7106 /*
7107 * Ensure that this object is "true_share" and "copy_delay" now,
7108 * while we're still holding the VM map lock. After we unlock the map,
7109 * anything could happen to that mapping, including some copy-on-write
7110 * activity. We need to make sure that the IOPL will point at the
7111 * same memory as the mapping.
7112 */
7113 if (local_object->true_share) {
7114 assert(local_object->copy_strategy !=
7115 MEMORY_OBJECT_COPY_SYMMETRIC);
7116 } else if (!is_kernel_object(local_object) &&
7117 local_object != compressor_object &&
7118 !local_object->phys_contiguous) {
7119 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7120 if (!local_object->true_share &&
7121 vm_object_tracking_btlog) {
7122 btlog_record(vm_object_tracking_btlog, local_object,
7123 VM_OBJECT_TRACKING_OP_TRUESHARE,
7124 btref_get(__builtin_frame_address(0), 0));
7125 }
7126 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7127 VM_OBJECT_SET_TRUE_SHARE(local_object, TRUE);
7128 if (local_object->copy_strategy ==
7129 MEMORY_OBJECT_COPY_SYMMETRIC) {
7130 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7131 }
7132 }
7133
7134 vm_object_reference_locked(local_object);
7135 vm_object_unlock(local_object);
7136
7137 vm_map_unlock_read(map);
7138
7139 offset += offset_in_mapped_page;
7140 assert(*upl_size > offset_in_mapped_page);
7141 *upl_size -= offset_in_mapped_page;
7142
7143 ret = vm_object_iopl_request(local_object,
7144 ((vm_object_offset_t)
7145 ((offset - local_start) + local_offset)),
7146 *upl_size,
7147 upl,
7148 page_list,
7149 count,
7150 caller_flags,
7151 tag);
7152 vm_object_deallocate(local_object);
7153
7154 done:
7155 if (release_map) {
7156 vm_map_deallocate(map);
7157 }
7158
7159 return ret;
7160 }
7161
7162 /*
7163 * Internal routine to enter a UPL into a VM map.
7164 *
7165 * JMM - This should just be doable through the standard
7166 * vm_map_enter() API.
7167 */
7168 kern_return_t
7169 vm_map_enter_upl_range(
7170 vm_map_t map,
7171 upl_t upl,
7172 vm_object_offset_t offset_to_map,
7173 vm_size_t size_to_map,
7174 vm_prot_t prot_to_map,
7175 vm_map_offset_t *dst_addr)
7176 {
7177 vm_map_size_t size;
7178 vm_object_offset_t offset;
7179 vm_map_offset_t addr;
7180 vm_page_t m;
7181 kern_return_t kr;
7182 int isVectorUPL = 0, curr_upl = 0;
7183 upl_t vector_upl = NULL;
7184 mach_vm_offset_t vector_upl_dst_addr = 0;
7185 vm_map_t vector_upl_submap = NULL;
7186 upl_offset_t subupl_offset = 0;
7187 upl_size_t subupl_size = 0;
7188
7189 if (upl == UPL_NULL) {
7190 return KERN_INVALID_ARGUMENT;
7191 }
7192
7193 DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%lx (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7194 assert(map == kernel_map);
7195
7196 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7197 int mapped = 0, valid_upls = 0;
7198 vector_upl = upl;
7199
7200 upl_lock(vector_upl);
7201 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7202 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7203 if (upl == NULL) {
7204 continue;
7205 }
7206 valid_upls++;
7207 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7208 mapped++;
7209 }
7210 }
7211
7212 if (mapped) {
7213 if (mapped != valid_upls) {
panic("Only %d of the %d sub-upls within the Vector UPL are already mapped", mapped, valid_upls);
7215 } else {
7216 upl_unlock(vector_upl);
7217 return KERN_FAILURE;
7218 }
7219 }
7220
7221 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7222 panic("TODO4K: vector UPL not implemented");
7223 }
7224
7225 vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7226 vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7227 VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7228 VM_KERN_MEMORY_NONE).kmr_submap;
7229 map = vector_upl_submap;
7230 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7231 curr_upl = 0;
7232 } else {
7233 upl_lock(upl);
7234 }
7235
7236 process_upl_to_enter:
7237 if (isVectorUPL) {
7238 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7239 *dst_addr = vector_upl_dst_addr;
7240 upl_unlock(vector_upl);
7241 return KERN_SUCCESS;
7242 }
7243 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7244 if (upl == NULL) {
7245 goto process_upl_to_enter;
7246 }
7247
7248 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7249 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7250 } else {
7251 /*
7252 * check to see if already mapped
7253 */
7254 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7255 upl_unlock(upl);
7256 return KERN_FAILURE;
7257 }
7258 }
7259
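/*
 * If this UPL has not already been shadowed and either contains busy
 * pages or is a plain UPL (not device memory, not IO-wired, not backed
 * by a physically contiguous object), interpose a shadow object: for
 * each page in the lite list, create a wired fictitious "alias" page
 * sharing the real page's physical page, so the mapping below operates
 * on the aliases instead of the original object's pages.
 */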
7260 if ((!(upl->flags & UPL_SHADOWED)) &&
7261 ((upl->flags & UPL_HAS_BUSY) ||
7262 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7263 vm_object_t object;
7264 vm_page_t alias_page;
7265 vm_object_offset_t new_offset;
7266 unsigned int pg_num;
7267
7268 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7269 object = upl->map_object;
7270 upl->map_object = vm_object_allocate(
7271 vm_object_round_page(size),
7272 /* Provenance is copied from the object we're shadowing */
7273 object->vmo_provenance);
7274
7275 vm_object_lock(upl->map_object);
7276
7277 upl->map_object->shadow = object;
7278 VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
7279 VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
7280 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7281 upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7282 assertf(page_aligned(upl->map_object->vo_shadow_offset),
7283 "object %p shadow_offset 0x%llx",
7284 upl->map_object,
7285 (uint64_t)upl->map_object->vo_shadow_offset);
7286 upl->map_object->wimg_bits = object->wimg_bits;
7287 offset = upl->map_object->vo_shadow_offset;
7288 new_offset = 0;
7289
7290 upl->flags |= UPL_SHADOWED;
7291
7292 while (size) {
7293 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7294 assert(pg_num == new_offset / PAGE_SIZE);
7295
7296 if (bitmap_test(upl->lite_list, pg_num)) {
7297 alias_page = vm_page_create_fictitious();
7298
7299 vm_object_lock(object);
7300
7301 m = vm_page_lookup(object, offset);
7302 if (m == VM_PAGE_NULL) {
7303 panic("vm_upl_map: page missing");
7304 }
7305
7306 /*
7307 * Convert the fictitious page to a private
7308 * shadow of the real page.
7309 */
7310 alias_page->vmp_free_when_done = TRUE;
7311 /*
7312 * since m is a page in the upl it must
7313 * already be wired or BUSY, so it's
7314 * safe to assign the underlying physical
7315 * page to the alias
7316 */
7317
7318 vm_object_unlock(object);
7319
7320 vm_page_lockspin_queues();
7321 vm_page_make_private(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7322 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7323 vm_page_unlock_queues();
7324
7325 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7326
7327 assert(!alias_page->vmp_wanted);
7328 alias_page->vmp_busy = FALSE;
7329 alias_page->vmp_absent = FALSE;
7330 }
7331 size -= PAGE_SIZE;
7332 offset += PAGE_SIZE_64;
7333 new_offset += PAGE_SIZE_64;
7334 }
7335 vm_object_unlock(upl->map_object);
7336 }
7337 if (upl->flags & UPL_SHADOWED) {
7338 if (isVectorUPL) {
7339 offset = 0;
7340 } else {
7341 offset = offset_to_map;
7342 }
7343 } else {
7344 offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7345 if (!isVectorUPL) {
7346 offset += offset_to_map;
7347 }
7348 }
7349
7350 if (isVectorUPL) {
7351 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7352 } else {
7353 size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7354 }
7355
7356 vm_object_reference(upl->map_object);
7357
7358 if (!isVectorUPL) {
7359 *dst_addr = 0;
7360 /*
7361 * NEED A UPL_MAP ALIAS
7362 */
7363 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7364 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7365 upl->map_object, offset, FALSE,
7366 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7367
7368 if (kr != KERN_SUCCESS) {
7369 vm_object_deallocate(upl->map_object);
7370 upl_unlock(upl);
7371 return kr;
7372 }
7373 } else {
7374 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7375 VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7376 upl->map_object, offset, FALSE,
7377 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7378 if (kr) {
7379 panic("vm_map_enter failed for a Vector UPL");
7380 }
7381 }
7382 upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7383 /* this will have to be an increment rather than */
7384 /* an assignment. */
7385 vm_object_lock(upl->map_object);
7386
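	/*
	 * Enter each resident page of the object being mapped into the
	 * kernel pmap at its address within the new mapping.
	 */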
7387 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7388 m = vm_page_lookup(upl->map_object, offset);
7389
7390 if (m) {
7391 m->vmp_pmapped = TRUE;
7392
7393 /*
7394 * CODE SIGNING ENFORCEMENT: page has been wpmapped,
7395 * but only in kernel space. If this was on a user map,
7396 * we'd have to set the wpmapped bit.
7397 */
7398 /* m->vmp_wpmapped = TRUE; */
7399 assert(map->pmap == kernel_pmap);
7400
7401 kr = pmap_enter_check(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, TRUE);
7402
7403 assert(kr == KERN_SUCCESS);
7404 #if KASAN
7405 kasan_notify_address(addr, PAGE_SIZE_64);
7406 #endif
7407 }
7408 offset += PAGE_SIZE_64;
7409 }
7410 vm_object_unlock(upl->map_object);
7411
7412 /*
7413 * hold a reference for the mapping
7414 */
7415 upl->ref_count++;
7416 upl->flags |= UPL_PAGE_LIST_MAPPED;
7417 upl->kaddr = (vm_offset_t) *dst_addr;
7418 assert(upl->kaddr == *dst_addr);
7419
7420 if (isVectorUPL) {
7421 goto process_upl_to_enter;
7422 }
7423
7424 if (!isVectorUPL) {
7425 vm_map_offset_t addr_adjustment;
7426
7427 addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7428 if (addr_adjustment) {
7429 assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7430 DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7431 *dst_addr += addr_adjustment;
7432 }
7433 }
7434
7435 upl_unlock(upl);
7436
7437 return KERN_SUCCESS;
7438 }
7439
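/*
 * Convenience wrapper: map the entire UPL (offset 0, default protection)
 * into "map".
 *
 * Illustrative caller sketch (hedged: the surrounding error handling and
 * the way "upl" is obtained are assumptions, not taken from this file):
 *
 *	upl_t upl;
 *	vm_map_offset_t kva;
 *
 *	// obtain "upl", e.g. via vm_map_create_upl() or
 *	// vm_object_iopl_request() ...
 *	if (vm_map_enter_upl(kernel_map, upl, &kva) == KERN_SUCCESS) {
 *		// ... operate on the pages through "kva" ...
 *		vm_map_remove_upl(kernel_map, upl);
 *	}
 */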
7440 kern_return_t
7441 vm_map_enter_upl(
7442 vm_map_t map,
7443 upl_t upl,
7444 vm_map_offset_t *dst_addr)
7445 {
7446 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7447 return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7448 }
7449
7450 /*
7451 * Internal routine to remove a UPL mapping from a VM map.
7452 *
7453 * XXX - This should just be doable through a standard
7454 * vm_map_remove() operation. Otherwise, implicit clean-up
7455 * of the target map won't be able to correctly remove
7456 * these (and release the reference on the UPL). Having
7457 * to do this means we can't map these into user-space
7458 * maps yet.
7459 */
7460 kern_return_t
7461 vm_map_remove_upl_range(
7462 vm_map_t map,
7463 upl_t upl,
7464 __unused vm_object_offset_t offset_to_unmap,
7465 __unused vm_size_t size_to_unmap)
7466 {
7467 vm_address_t addr;
7468 upl_size_t size;
7469 int isVectorUPL = 0, curr_upl = 0;
7470 upl_t vector_upl = NULL;
7471
7472 if (upl == UPL_NULL) {
7473 return KERN_INVALID_ARGUMENT;
7474 }
7475
7476 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7477 int unmapped = 0, valid_upls = 0;
7478 vector_upl = upl;
7479 upl_lock(vector_upl);
7480 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7481 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7482 if (upl == NULL) {
7483 continue;
7484 }
7485 valid_upls++;
7486 if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7487 unmapped++;
7488 }
7489 }
7490
7491 if (unmapped) {
7492 if (unmapped != valid_upls) {
panic("%d of the %d sub-upls within the Vector UPL are not mapped", unmapped, valid_upls);
7494 } else {
7495 upl_unlock(vector_upl);
7496 return KERN_FAILURE;
7497 }
7498 }
7499 curr_upl = 0;
7500 } else {
7501 upl_lock(upl);
7502 }
7503
7504 process_upl_to_remove:
7505 if (isVectorUPL) {
7506 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7507 vm_map_t v_upl_submap;
7508 vm_offset_t v_upl_submap_dst_addr;
7509 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7510
7511 kmem_free_guard(map, v_upl_submap_dst_addr,
7512 vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
7513 vm_map_deallocate(v_upl_submap);
7514 upl_unlock(vector_upl);
7515 return KERN_SUCCESS;
7516 }
7517
7518 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7519 if (upl == NULL) {
7520 goto process_upl_to_remove;
7521 }
7522 }
7523
7524 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7525 addr = upl->kaddr;
7526 size = upl->u_mapped_size;
7527
7528 assert(upl->ref_count > 1);
7529 upl->ref_count--; /* removing mapping ref */
7530
7531 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7532 upl->kaddr = (vm_offset_t) 0;
7533 upl->u_mapped_size = 0;
7534
7535 if (isVectorUPL) {
7536 /*
7537 * If it's a Vectored UPL, we'll be removing the entire
 * submap anyway, so no need to remove individual UPL
7539 * element mappings from within the submap
7540 */
7541 goto process_upl_to_remove;
7542 }
7543
7544 upl_unlock(upl);
7545
7546 vm_map_remove(map,
7547 vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
7548 vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
7549 return KERN_SUCCESS;
7550 }
7551 upl_unlock(upl);
7552
7553 return KERN_FAILURE;
7554 }
7555
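/*
 * Convenience wrapper: remove the UPL's kernel mapping (its full mapped
 * range) from "map".
 */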
7556 kern_return_t
7557 vm_map_remove_upl(
7558 vm_map_t map,
7559 upl_t upl)
7560 {
7561 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7562 return vm_map_remove_upl_range(map, upl, 0, upl_size);
7563 }
7564
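/*
 * Mark the busy/absent pages of an IO-wired UPL as valid, typically once
 * the I/O that populates them has completed: clear their "absent" state,
 * mark them dirty, wire them and update the wired-page accounting.
 */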
7565 void
7566 iopl_valid_data(
7567 upl_t upl,
7568 vm_tag_t tag)
7569 {
7570 vm_object_t object;
7571 vm_offset_t offset;
7572 vm_page_t m, nxt_page = VM_PAGE_NULL;
7573 upl_size_t size;
7574 int wired_count = 0;
7575
7576 if (upl == NULL) {
7577 panic("iopl_valid_data: NULL upl");
7578 }
7579 if (vector_upl_is_valid(upl)) {
7580 panic("iopl_valid_data: vector upl");
7581 }
7582 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
7583 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
7584 }
7585
7586 object = upl->map_object;
7587
7588 if (is_kernel_object(object) || object == compressor_object) {
7589 panic("iopl_valid_data: object == kernel or compressor");
7590 }
7591
7592 if (object->purgable == VM_PURGABLE_VOLATILE ||
7593 object->purgable == VM_PURGABLE_EMPTY) {
7594 panic("iopl_valid_data: object %p purgable %d",
7595 object, object->purgable);
7596 }
7597
7598 size = upl_adjusted_size(upl, PAGE_MASK);
7599
7600 vm_object_lock(object);
7601 VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
7602
7603 bool whole_object;
7604
7605 if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
7606 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
7607 whole_object = true;
7608 } else {
7609 offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
7610 whole_object = false;
7611 }
7612
7613 while (size) {
7614 if (whole_object) {
7615 if (nxt_page != VM_PAGE_NULL) {
7616 m = nxt_page;
7617 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7618 }
7619 } else {
7620 m = vm_page_lookup(object, offset);
7621 offset += PAGE_SIZE;
7622
7623 if (m == VM_PAGE_NULL) {
7624 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
7625 }
7626 }
7627 if (m->vmp_busy) {
7628 if (!m->vmp_absent) {
7629 panic("iopl_valid_data: busy page w/o absent");
7630 }
7631
7632 if (m->vmp_pageq.next || m->vmp_pageq.prev) {
7633 panic("iopl_valid_data: busy+absent page on page queue");
7634 }
7635 if (m->vmp_reusable) {
7636 panic("iopl_valid_data: %p is reusable", m);
7637 }
7638
7639 m->vmp_absent = FALSE;
7640 m->vmp_dirty = TRUE;
7641 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7642 assert(m->vmp_wire_count == 0);
7643 m->vmp_wire_count++;
7644 assert(m->vmp_wire_count);
7645 if (m->vmp_wire_count == 1) {
7646 m->vmp_q_state = VM_PAGE_IS_WIRED;
7647 wired_count++;
7648 } else {
7649 panic("iopl_valid_data: %p already wired", m);
7650 }
7651
7652
7653 vm_page_wakeup_done(object, m);
7654 }
7655 size -= PAGE_SIZE;
7656 }
7657 if (wired_count) {
7658 VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
7659 assert(object->resident_page_count >= object->wired_page_count);
7660
7661 /* no need to adjust purgeable accounting for this object: */
7662 assert(object->purgable != VM_PURGABLE_VOLATILE);
7663 assert(object->purgable != VM_PURGABLE_EMPTY);
7664
7665 vm_page_lockspin_queues();
7666 vm_page_wire_count += wired_count;
7667 vm_page_unlock_queues();
7668 }
7669 VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
7670 vm_object_unlock(object);
7671 }
7672
7673
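/*
 * Propagate the object's cache attributes (WIMG bits) to the pages listed
 * in "user_page_list" via a batched pmap operation, unless the object
 * already has the default cacheability.
 */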
7674 void
7675 vm_object_set_pmap_cache_attr(
7676 vm_object_t object,
7677 upl_page_info_array_t user_page_list,
7678 unsigned int num_pages,
7679 boolean_t batch_pmap_op)
7680 {
7681 unsigned int cache_attr = 0;
7682
7683 cache_attr = object->wimg_bits & VM_WIMG_MASK;
7684 assert(user_page_list);
7685 if (!HAS_DEFAULT_CACHEABILITY(cache_attr)) {
7686 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
7687 }
7688 }
7689
7690
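/*
 * Fast path for vm_object_iopl_request() when every page of the object is
 * already resident: wire each resident page in place and record it in the
 * UPL.  Returns FALSE, so the caller falls back to the slow path, if any
 * page is busy, fictitious, absent, in error, being cleaned or otherwise
 * not immediately wireable.
 */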
7691 static bool
7692 vm_object_iopl_wire_full(
7693 vm_object_t object,
7694 upl_t upl,
7695 upl_page_info_array_t user_page_list,
7696 upl_control_flags_t cntrl_flags,
7697 vm_tag_t tag)
7698 {
7699 vm_page_t dst_page;
7700 unsigned int entry;
7701 int page_count;
7702 int delayed_unlock = 0;
7703 boolean_t retval = TRUE;
7704 ppnum_t phys_page;
7705
7706 vm_object_lock_assert_exclusive(object);
7707 assert(object->purgable != VM_PURGABLE_VOLATILE);
7708 assert(object->purgable != VM_PURGABLE_EMPTY);
7709 assert(object->pager == NULL);
7710 assert(object->vo_copy == NULL);
7711 assert(object->shadow == NULL);
7712
7713 page_count = object->resident_page_count;
7714 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
7715
7716 vm_page_lock_queues();
7717
7718 while (page_count--) {
7719 if (dst_page->vmp_busy ||
7720 vm_page_is_fictitious(dst_page) ||
7721 dst_page->vmp_absent ||
7722 VMP_ERROR_GET(dst_page) ||
7723 dst_page->vmp_cleaning ||
7724 dst_page->vmp_restart ||
7725 dst_page->vmp_laundry) {
7726 retval = FALSE;
7727 goto done;
7728 }
7729 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
7730 retval = FALSE;
7731 goto done;
7732 }
7733 dst_page->vmp_reference = TRUE;
7734
7735 vm_page_wire(dst_page, tag, FALSE);
7736
7737 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7738 SET_PAGE_DIRTY(dst_page, FALSE);
7739 }
7740 entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
7741 assert(entry >= 0 && entry < object->resident_page_count);
7742 bitmap_set(upl->lite_list, entry);
7743
7744 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7745
7746 if (phys_page > upl->highest_page) {
7747 upl->highest_page = phys_page;
7748 }
7749
7750 if (user_page_list) {
7751 user_page_list[entry].phys_addr = phys_page;
7752 user_page_list[entry].absent = dst_page->vmp_absent;
7753 user_page_list[entry].dirty = dst_page->vmp_dirty;
7754 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
7755 user_page_list[entry].precious = dst_page->vmp_precious;
7756 user_page_list[entry].device = FALSE;
7757 user_page_list[entry].speculative = FALSE;
7758 user_page_list[entry].cs_validated = FALSE;
7759 user_page_list[entry].cs_tainted = FALSE;
7760 user_page_list[entry].cs_nx = FALSE;
7761 user_page_list[entry].needed = FALSE;
7762 user_page_list[entry].mark = FALSE;
7763 }
7764 if (delayed_unlock++ > 256) {
7765 delayed_unlock = 0;
7766 lck_mtx_yield(&vm_page_queue_lock);
7767
7768 VM_CHECK_MEMORYSTATUS;
7769 }
7770 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
7771 }
7772 done:
7773 vm_page_unlock_queues();
7774
7775 VM_CHECK_MEMORYSTATUS;
7776
7777 return retval;
7778 }
7779
7780
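/*
 * Fast path for vm_object_iopl_request() when the object has no resident
 * pages: grab fresh pages, insert them starting at *dst_offset, wire them
 * (unless they are left "absent" for no-zero-fill I/O) and record them in
 * the UPL, deferring the ledger update until the end.
 */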
7781 static kern_return_t
7782 vm_object_iopl_wire_empty(
7783 vm_object_t object,
7784 upl_t upl,
7785 upl_page_info_array_t user_page_list,
7786 upl_control_flags_t cntrl_flags,
7787 vm_tag_t tag,
7788 vm_object_offset_t *dst_offset,
7789 int page_count,
7790 int *page_grab_count)
7791 {
7792 vm_page_t dst_page;
7793 boolean_t no_zero_fill = FALSE;
7794 int interruptible;
7795 int pages_wired = 0;
7796 int pages_inserted = 0;
7797 int entry = 0;
7798 uint64_t delayed_ledger_update = 0;
7799 kern_return_t ret = KERN_SUCCESS;
7800 vm_grab_options_t grab_options;
7801 ppnum_t phys_page;
7802
7803 vm_object_lock_assert_exclusive(object);
7804 assert(object->purgable != VM_PURGABLE_VOLATILE);
7805 assert(object->purgable != VM_PURGABLE_EMPTY);
7806 assert(object->pager == NULL);
7807 assert(object->vo_copy == NULL);
7808 assert(object->shadow == NULL);
7809
7810 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
7811 interruptible = THREAD_ABORTSAFE;
7812 } else {
7813 interruptible = THREAD_UNINT;
7814 }
7815
7816 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
7817 no_zero_fill = TRUE;
7818 }
7819
7820 grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
7821 #if CONFIG_SECLUDED_MEMORY
7822 if (object->can_grab_secluded) {
7823 grab_options |= VM_PAGE_GRAB_SECLUDED;
7824 }
7825 #endif /* CONFIG_SECLUDED_MEMORY */
7826
7827 while (page_count--) {
7828 while ((dst_page = vm_page_grab_options(grab_options))
7829 == VM_PAGE_NULL) {
7830 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
7831
7832 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
7833
7834 if (vm_page_wait(interruptible) == FALSE) {
7835 /*
7836 * interrupted case
7837 */
7838 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7839
7840 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
7841
7842 ret = MACH_SEND_INTERRUPTED;
7843 goto done;
7844 }
7845 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7846
7847 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
7848 }
7849
7850 dst_page->vmp_absent = no_zero_fill;
7851 dst_page->vmp_reference = TRUE;
7852
7853 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7854 SET_PAGE_DIRTY(dst_page, FALSE);
7855 }
7856 if (dst_page->vmp_absent == FALSE) {
7857 assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
7858 assert(dst_page->vmp_wire_count == 0);
7859 dst_page->vmp_wire_count++;
7860 dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
7861 assert(dst_page->vmp_wire_count);
7862 pages_wired++;
7863
7864
7865 vm_page_wakeup_done(object, dst_page);
7866 }
7867 pages_inserted++;
7868
7869 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
7870
7871 if (no_zero_fill == FALSE) {
7872 vm_page_zero_fill(
7873 dst_page
7874 );
7875 }
7876
7877 bitmap_set(upl->lite_list, entry);
7878
7879 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7880
7881 if (phys_page > upl->highest_page) {
7882 upl->highest_page = phys_page;
7883 }
7884
7885 if (user_page_list) {
7886 user_page_list[entry].phys_addr = phys_page;
7887 user_page_list[entry].absent = dst_page->vmp_absent;
7888 user_page_list[entry].dirty = dst_page->vmp_dirty;
7889 user_page_list[entry].free_when_done = FALSE;
7890 user_page_list[entry].precious = FALSE;
7891 user_page_list[entry].device = FALSE;
7892 user_page_list[entry].speculative = FALSE;
7893 user_page_list[entry].cs_validated = FALSE;
7894 user_page_list[entry].cs_tainted = FALSE;
7895 user_page_list[entry].cs_nx = FALSE;
7896 user_page_list[entry].needed = FALSE;
7897 user_page_list[entry].mark = FALSE;
7898 }
7899 entry++;
7900 *dst_offset += PAGE_SIZE_64;
7901 }
7902 done:
7903 if (pages_wired) {
7904 vm_page_lockspin_queues();
7905 vm_page_wire_count += pages_wired;
7906 vm_page_unlock_queues();
7907 }
7908 if (pages_inserted) {
7909 if (object->internal) {
7910 OSAddAtomic(pages_inserted, &vm_page_internal_count);
7911 } else {
7912 OSAddAtomic(pages_inserted, &vm_page_external_count);
7913 }
7914 }
7915 if (delayed_ledger_update) {
7916 task_t owner;
7917 int ledger_idx_volatile;
7918 int ledger_idx_nonvolatile;
7919 int ledger_idx_volatile_compressed;
7920 int ledger_idx_nonvolatile_compressed;
7921 int ledger_idx_composite;
7922 int ledger_idx_external_wired;
7923 boolean_t do_footprint;
7924
7925 owner = VM_OBJECT_OWNER(object);
7926 assert(owner);
7927
7928 vm_object_ledger_tag_ledgers(object,
7929 &ledger_idx_volatile,
7930 &ledger_idx_nonvolatile,
7931 &ledger_idx_volatile_compressed,
7932 &ledger_idx_nonvolatile_compressed,
7933 &ledger_idx_composite,
7934 &ledger_idx_external_wired,
7935 &do_footprint);
7936
7937 if (object->internal) {
7938 /* more non-volatile bytes */
7939 ledger_credit(owner->ledger,
7940 ledger_idx_nonvolatile,
7941 delayed_ledger_update);
7942 if (do_footprint) {
7943 /* more footprint */
7944 ledger_credit(owner->ledger,
7945 task_ledgers.phys_footprint,
7946 delayed_ledger_update);
7947 } else if (ledger_idx_composite != -1) {
7948 ledger_credit(owner->ledger,
7949 ledger_idx_composite,
7950 delayed_ledger_update);
7951 }
7952 } else {
7953 /* more external wired bytes */
7954 ledger_credit(owner->ledger,
7955 ledger_idx_external_wired,
7956 delayed_ledger_update);
7957 if (do_footprint) {
7958 /* more footprint */
7959 ledger_credit(owner->ledger,
7960 task_ledgers.phys_footprint,
7961 delayed_ledger_update);
7962 } else if (ledger_idx_composite != -1) {
7963 ledger_credit(owner->ledger,
7964 ledger_idx_composite,
7965 delayed_ledger_update);
7966 }
7967 }
7968 }
7969
7970 assert(page_grab_count);
7971 *page_grab_count = pages_inserted;
7972
7973 return ret;
7974 }
7975
7976
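/*
 * Create an I/O page list (IOPL) for the given range of a VM object:
 * fault in and wire the pages as needed, record their physical addresses
 * in the returned UPL (and, optionally, in the caller's page list), and
 * apply the copy-on-write and code-signing side effects needed to keep
 * the caller's view of the pages coherent while the I/O is in flight.
 */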
7977 kern_return_t
7978 vm_object_iopl_request(
7979 vm_object_t object,
7980 vm_object_offset_t offset,
7981 upl_size_t size,
7982 upl_t *upl_ptr,
7983 upl_page_info_array_t user_page_list,
7984 unsigned int *page_list_count,
7985 upl_control_flags_t cntrl_flags,
7986 vm_tag_t tag)
7987 {
7988 vm_page_t dst_page;
7989 vm_object_offset_t dst_offset;
7990 upl_size_t xfer_size;
7991 upl_t upl = NULL;
7992 unsigned int entry;
7993 int no_zero_fill = FALSE;
7994 unsigned int size_in_pages;
7995 int page_grab_count = 0;
7996 u_int32_t psize;
7997 kern_return_t ret;
7998 vm_prot_t prot;
7999 struct vm_object_fault_info fault_info = {};
8000 struct vm_page_delayed_work dw_array;
8001 struct vm_page_delayed_work *dwp, *dwp_start;
8002 bool dwp_finish_ctx = TRUE;
8003 int dw_count;
8004 int dw_limit;
8005 int dw_index;
8006 boolean_t caller_lookup;
8007 int io_tracking_flag = 0;
8008 int interruptible;
8009 ppnum_t phys_page;
8010
8011 boolean_t set_cache_attr_needed = FALSE;
8012 boolean_t free_wired_pages = FALSE;
8013 boolean_t fast_path_empty_req = FALSE;
8014 boolean_t fast_path_full_req = FALSE;
8015
8016 task_t task = current_task();
8017
8018 dwp_start = dwp = NULL;
8019
8020 vm_object_offset_t original_offset = offset;
8021 upl_size_t original_size = size;
8022
8023 // DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
8024
8025 size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
8026 offset = vm_object_trunc_page(offset);
8027 if (size != original_size || offset != original_offset) {
8028 DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
8029 }
8030
8031 if (cntrl_flags & ~UPL_VALID_FLAGS) {
8032 /*
8033 * For forward compatibility's sake,
8034 * reject any unknown flag.
8035 */
8036 return KERN_INVALID_VALUE;
8037 }
8038 if (!vm_lopage_needed) {
8039 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
8040 }
8041
8042 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
8043 if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
8044 return KERN_INVALID_VALUE;
8045 }
8046
8047 if (object->phys_contiguous) {
8048 if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
8049 return KERN_INVALID_ADDRESS;
8050 }
8051
8052 if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
8053 return KERN_INVALID_ADDRESS;
8054 }
8055 }
8056 }
8057 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8058 no_zero_fill = TRUE;
8059 }
8060
8061 if (cntrl_flags & UPL_COPYOUT_FROM) {
8062 prot = VM_PROT_READ;
8063 } else {
8064 prot = VM_PROT_READ | VM_PROT_WRITE;
8065 }
8066
8067 if ((!object->internal) && (object->paging_offset != 0)) {
8068 panic("vm_object_iopl_request: external object with non-zero paging offset");
8069 }
8070
8071 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
8072
8073 #if CONFIG_IOSCHED || UPL_DEBUG
8074 if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) {
8075 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
8076 }
8077 #endif
8078
8079 #if CONFIG_IOSCHED
8080 if (object->io_tracking) {
8081 /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
8082 if (!is_kernel_object(object)) {
8083 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
8084 }
8085 }
8086 #endif
8087
8088 if (object->phys_contiguous) {
8089 psize = PAGE_SIZE;
8090 } else {
8091 psize = size;
8092
8093 dw_count = 0;
8094 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8095 dwp_start = vm_page_delayed_work_get_ctx();
8096 if (dwp_start == NULL) {
8097 dwp_start = &dw_array;
8098 dw_limit = 1;
8099 dwp_finish_ctx = FALSE;
8100 }
8101
8102 dwp = dwp_start;
8103 }
8104
8105 if (cntrl_flags & UPL_SET_INTERNAL) {
8106 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8107 user_page_list = size ? upl->page_list : NULL;
8108 } else {
8109 upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8110 }
8111 if (user_page_list) {
8112 user_page_list[0].device = FALSE;
8113 }
8114 *upl_ptr = upl;
8115
8116 if (cntrl_flags & UPL_NOZEROFILLIO) {
8117 DTRACE_VM4(upl_nozerofillio,
8118 vm_object_t, object,
8119 vm_object_offset_t, offset,
8120 upl_size_t, size,
8121 upl_t, upl);
8122 }
8123
8124 upl->map_object = object;
8125 upl->u_offset = original_offset;
8126 upl->u_size = original_size;
8127
8128 size_in_pages = size / PAGE_SIZE;
8129
8130 if (is_kernel_object(object) &&
8131 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
8132 upl->flags |= UPL_KERNEL_OBJECT;
8133 #if UPL_DEBUG
8134 vm_object_lock(object);
8135 #else
8136 vm_object_lock_shared(object);
8137 #endif
8138 } else {
8139 vm_object_lock(object);
8140 vm_object_activity_begin(object);
8141 }
8142 /*
8143 * paging in progress also protects the paging_offset
8144 */
8145 upl->u_offset = original_offset + object->paging_offset;
8146
8147 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8148 /*
8149 * The user requested that access to the pages in this UPL
 * be blocked until the UPL is committed or aborted.
8151 */
8152 upl->flags |= UPL_ACCESS_BLOCKED;
8153 }
8154
8155 #if CONFIG_IOSCHED || UPL_DEBUG
8156 if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8157 vm_object_activity_begin(object);
8158 queue_enter(&object->uplq, upl, upl_t, uplq);
8159 }
8160 #endif
8161
8162 if (object->phys_contiguous) {
8163 if (upl->flags & UPL_ACCESS_BLOCKED) {
8164 assert(!object->blocked_access);
8165 object->blocked_access = TRUE;
8166 }
8167
8168 vm_object_unlock(object);
8169
8170 /*
8171 * don't need any shadow mappings for this one
8172 * since it is already I/O memory
8173 */
8174 upl->flags |= UPL_DEVICE_MEMORY;
8175
8176 upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
8177
8178 if (user_page_list) {
8179 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
8180 user_page_list[0].device = TRUE;
8181 }
8182 if (page_list_count != NULL) {
8183 if (upl->flags & UPL_INTERNAL) {
8184 *page_list_count = 0;
8185 } else {
8186 *page_list_count = 1;
8187 }
8188 }
8189
8190 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8191 if (task != NULL) {
8192 counter_add(&task->pages_grabbed_iopl, page_grab_count);
8193 }
8194 return KERN_SUCCESS;
8195 }
8196 if (!is_kernel_object(object) && object != compressor_object) {
8197 /*
8198 * Protect user space from future COW operations
8199 */
8200 #if VM_OBJECT_TRACKING_OP_TRUESHARE
8201 if (!object->true_share &&
8202 vm_object_tracking_btlog) {
8203 btlog_record(vm_object_tracking_btlog, object,
8204 VM_OBJECT_TRACKING_OP_TRUESHARE,
8205 btref_get(__builtin_frame_address(0), 0));
8206 }
8207 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
8208
8209 vm_object_lock_assert_exclusive(object);
8210 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
8211
8212 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
8213 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
8214 }
8215 }
8216
8217 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
8218 object->vo_copy != VM_OBJECT_NULL) {
8219 /*
8220 * Honor copy-on-write obligations
8221 *
8222 * The caller is gathering these pages and
8223 * might modify their contents. We need to
8224 * make sure that the copy object has its own
8225 * private copies of these pages before we let
8226 * the caller modify them.
8227 *
8228 * NOTE: someone else could map the original object
8229 * after we've done this copy-on-write here, and they
8230 * could then see an inconsistent picture of the memory
8231 * while it's being modified via the UPL. To prevent this,
8232 * we would have to block access to these pages until the
8233 * UPL is released. We could use the UPL_BLOCK_ACCESS
8234 * code path for that...
8235 */
8236 vm_object_update(object,
8237 offset,
8238 size,
8239 NULL,
8240 NULL,
8241 FALSE, /* should_return */
8242 MEMORY_OBJECT_COPY_SYNC,
8243 VM_PROT_NO_CHANGE);
8244 VM_PAGEOUT_DEBUG(iopl_cow, 1);
8245 VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
8246 }
8247 if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
8248 object->purgable != VM_PURGABLE_VOLATILE &&
8249 object->purgable != VM_PURGABLE_EMPTY &&
8250 object->vo_copy == NULL &&
8251 size == object->vo_size &&
8252 offset == 0 &&
8253 object->shadow == NULL &&
8254 object->pager == NULL) {
8255 if (object->resident_page_count == size_in_pages) {
8256 assert(object != compressor_object);
8257 assert(!is_kernel_object(object));
8258 fast_path_full_req = TRUE;
8259 } else if (object->resident_page_count == 0) {
8260 assert(object != compressor_object);
8261 assert(!is_kernel_object(object));
8262 fast_path_empty_req = TRUE;
8263 set_cache_attr_needed = TRUE;
8264 }
8265 }
8266
8267 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
8268 interruptible = THREAD_ABORTSAFE;
8269 } else {
8270 interruptible = THREAD_UNINT;
8271 }
8272
8273 entry = 0;
8274
8275 xfer_size = size;
8276 dst_offset = offset;
8277
8278 if (fast_path_full_req) {
8279 if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
8280 goto finish;
8281 }
8282 /*
8283 * we couldn't complete the processing of this request on the fast path
8284 * so fall through to the slow path and finish up
8285 */
8286 } else if (fast_path_empty_req) {
8287 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8288 ret = KERN_MEMORY_ERROR;
8289 goto return_err;
8290 }
8291 ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
8292 cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
8293
8294 if (ret) {
8295 free_wired_pages = TRUE;
8296 goto return_err;
8297 }
8298 goto finish;
8299 }
8300
8301 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
8302 fault_info.lo_offset = offset;
8303 fault_info.hi_offset = offset + xfer_size;
8304 fault_info.mark_zf_absent = TRUE;
8305 fault_info.interruptible = interruptible;
8306 fault_info.batch_pmap_op = TRUE;
8307
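	/*
	 * Slow path: walk the range page by page, faulting in anything that
	 * isn't resident (via vm_fault_page()), wiring each page and
	 * recording it in the lite list / page list, and batching the page
	 * queue manipulations through the delayed-work mechanism.
	 */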
8308 while (xfer_size) {
8309 vm_fault_return_t result;
8310
8311 dwp->dw_mask = 0;
8312
8313 if (fast_path_full_req) {
8314 /*
8315 * if we get here, it means that we ran into a page
8316 * state we couldn't handle in the fast path and
8317 * bailed out to the slow path... since the order
8318 * we look at pages is different between the 2 paths,
8319 * the following check is needed to determine whether
8320 * this page was already processed in the fast path
8321 */
8322 if (bitmap_test(upl->lite_list, entry)) {
8323 goto skip_page;
8324 }
8325 }
8326 dst_page = vm_page_lookup(object, dst_offset);
8327
8328 if (dst_page == VM_PAGE_NULL ||
8329 dst_page->vmp_busy ||
8330 VMP_ERROR_GET(dst_page) ||
8331 dst_page->vmp_restart ||
8332 dst_page->vmp_absent ||
8333 vm_page_is_fictitious(dst_page)) {
8334 if (is_kernel_object(object)) {
8335 panic("vm_object_iopl_request: missing/bad page in kernel object");
8336 }
8337 if (object == compressor_object) {
8338 panic("vm_object_iopl_request: missing/bad page in compressor object");
8339 }
8340
8341 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8342 ret = KERN_MEMORY_ERROR;
8343 goto return_err;
8344 }
8345
8346 if (dst_page != VM_PAGE_NULL &&
8347 dst_page->vmp_busy) {
8348 wait_result_t wait_result;
8349 vm_object_lock_assert_exclusive(object);
8350 wait_result = vm_page_sleep(object, dst_page,
8351 interruptible, LCK_SLEEP_DEFAULT);
8352 if (wait_result == THREAD_AWAKENED ||
8353 wait_result == THREAD_RESTART) {
8354 continue;
8355 }
8356 ret = MACH_SEND_INTERRUPTED;
8357 goto return_err;
8358 }
8359
8360 set_cache_attr_needed = TRUE;
8361
8362 /*
8363 * We just looked up the page and the result remains valid
 * until the object lock is released, so send it to
8365 * vm_fault_page() (as "dst_page"), to avoid having to
8366 * look it up again there.
8367 */
8368 caller_lookup = TRUE;
8369
8370 do {
8371 vm_page_t top_page;
8372 kern_return_t error_code;
8373
8374 fault_info.cluster_size = xfer_size;
8375 vm_object_paging_begin(object);
8376
8377 result = vm_fault_page(object, dst_offset,
8378 prot | VM_PROT_WRITE, FALSE,
8379 caller_lookup,
8380 &prot, &dst_page, &top_page,
8381 (int *)0,
8382 &error_code, no_zero_fill,
8383 &fault_info);
8384
8385 /* our lookup is no longer valid at this point */
8386 caller_lookup = FALSE;
8387
8388 switch (result) {
8389 case VM_FAULT_SUCCESS:
8390 page_grab_count++;
8391
8392 if (!dst_page->vmp_absent) {
8393 vm_page_wakeup_done(object, dst_page);
8394 } else {
8395 /*
8396 * we only get back an absent page if we
8397 * requested that it not be zero-filled
8398 * because we are about to fill it via I/O
8399 *
8400 * absent pages should be left BUSY
8401 * to prevent them from being faulted
8402 * into an address space before we've
8403 * had a chance to complete the I/O on
8404 * them since they may contain info that
8405 * shouldn't be seen by the faulting task
8406 */
8407 }
8408 /*
8409 * Release paging references and
8410 * top-level placeholder page, if any.
8411 */
8412 if (top_page != VM_PAGE_NULL) {
8413 vm_object_t local_object;
8414
8415 local_object = VM_PAGE_OBJECT(top_page);
8416
8417 /*
8418 * comparing 2 packed pointers
8419 */
8420 if (top_page->vmp_object != dst_page->vmp_object) {
8421 vm_object_lock(local_object);
8422 VM_PAGE_FREE(top_page);
8423 vm_object_paging_end(local_object);
8424 vm_object_unlock(local_object);
8425 } else {
8426 VM_PAGE_FREE(top_page);
8427 vm_object_paging_end(local_object);
8428 }
8429 }
8430 vm_object_paging_end(object);
8431 break;
8432
8433 case VM_FAULT_RETRY:
8434 vm_object_lock(object);
8435 break;
8436
8437 case VM_FAULT_MEMORY_SHORTAGE:
8438 OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
8439
8440 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8441
8442 if (vm_page_wait(interruptible)) {
8443 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8444
8445 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8446 vm_object_lock(object);
8447
8448 break;
8449 }
8450 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8451
8452 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8453 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
8454 OS_FALLTHROUGH;
8455
8456 case VM_FAULT_INTERRUPTED:
8457 error_code = MACH_SEND_INTERRUPTED;
8458 OS_FALLTHROUGH;
8459 case VM_FAULT_MEMORY_ERROR:
8460 memory_error:
8461 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
8462
8463 vm_object_lock(object);
8464 goto return_err;
8465
8466 case VM_FAULT_SUCCESS_NO_VM_PAGE:
8467 /* success but no page: fail */
8468 vm_object_paging_end(object);
8469 vm_object_unlock(object);
8470 goto memory_error;
8471
8472 default:
8473 panic("vm_object_iopl_request: unexpected error"
8474 " 0x%x from vm_fault_page()\n", result);
8475 }
8476 } while (result != VM_FAULT_SUCCESS);
8477 }
8478 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8479
8480 if (upl->flags & UPL_KERNEL_OBJECT) {
8481 goto record_phys_addr;
8482 }
8483
8484 if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
8485 dst_page->vmp_busy = TRUE;
8486 goto record_phys_addr;
8487 }
8488
8489 if (dst_page->vmp_cleaning) {
8490 /*
8491 * Someone else is cleaning this page in place.
 * In theory, we should be able to proceed and use this
 * page, but they'll probably end up clearing the "busy"
 * bit in upl_commit_range() even though they didn't set
 * it, which would clear our "busy" bit and open
 * us to race conditions.
8497 * We'd better wait for the cleaning to complete and
8498 * then try again.
8499 */
8500 VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
8501 vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
8502 continue;
8503 }
8504 if (dst_page->vmp_laundry) {
8505 vm_pageout_steal_laundry(dst_page, FALSE);
8506 }
8507
8508 if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
8509 phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
8510 vm_page_t new_page;
8511 int refmod;
8512
8513 /*
 * Support devices that can't DMA above 32 bits
 * by substituting pages from a pool of low-address
 * memory for any pages we find above the 4G mark.
 * We can't substitute if the page is already wired, because
 * we don't know whether that physical address has been
 * handed out to some other 64-bit-capable DMA device to use.
8520 */
8521 if (VM_PAGE_WIRED(dst_page)) {
8522 ret = KERN_PROTECTION_FAILURE;
8523 goto return_err;
8524 }
8525
8526 new_page = vm_page_grablo(VM_PAGE_GRAB_OPTIONS_NONE);
8527
8528 if (new_page == VM_PAGE_NULL) {
8529 ret = KERN_RESOURCE_SHORTAGE;
8530 goto return_err;
8531 }
8532 /*
8533 * from here until the vm_page_replace completes
 * we mustn't drop the object lock... we don't
8535 * want anyone refaulting this page in and using
8536 * it after we disconnect it... we want the fault
8537 * to find the new page being substituted.
8538 */
8539 if (dst_page->vmp_pmapped) {
8540 refmod = pmap_disconnect(phys_page);
8541 } else {
8542 refmod = 0;
8543 }
8544
8545 if (!dst_page->vmp_absent) {
8546 vm_page_copy(dst_page, new_page);
8547 }
8548
8549 new_page->vmp_reference = dst_page->vmp_reference;
8550 new_page->vmp_dirty = dst_page->vmp_dirty;
8551 new_page->vmp_absent = dst_page->vmp_absent;
8552
8553 if (refmod & VM_MEM_REFERENCED) {
8554 new_page->vmp_reference = TRUE;
8555 }
8556 if (refmod & VM_MEM_MODIFIED) {
8557 SET_PAGE_DIRTY(new_page, FALSE);
8558 }
8559
8560 vm_page_replace(new_page, object, dst_offset);
8561
8562 dst_page = new_page;
8563 /*
8564 * vm_page_grablo returned the page marked
8565 * BUSY... we don't need a PAGE_WAKEUP_DONE
8566 * here, because we've never dropped the object lock
8567 */
8568 if (!dst_page->vmp_absent) {
8569 dst_page->vmp_busy = FALSE;
8570 }
8571
8572 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8573 }
8574 if (!dst_page->vmp_busy) {
8575 dwp->dw_mask |= DW_vm_page_wire;
8576 }
8577
8578 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8579 /*
8580 * Mark the page "busy" to block any future page fault
8581 * on this page in addition to wiring it.
8582 * We'll also remove the mapping
8583 * of all these pages before leaving this routine.
8584 */
8585 assert(!vm_page_is_fictitious(dst_page));
8586 dst_page->vmp_busy = TRUE;
8587 }
8588 /*
 * Expect the page to be used.
 * The page queues lock must be held to set 'reference'.
8591 */
8592 dwp->dw_mask |= DW_set_reference;
8593
8594 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8595 SET_PAGE_DIRTY(dst_page, TRUE);
8596 /*
8597 * Page belonging to a code-signed object is about to
8598 * be written. Mark it tainted and disconnect it from
8599 * all pmaps so processes have to fault it back in and
8600 * deal with the tainted bit.
8601 */
8602 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
8603 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
8604 vm_page_iopl_tainted++;
8605 if (dst_page->vmp_pmapped) {
8606 int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
8607 if (refmod & VM_MEM_REFERENCED) {
8608 dst_page->vmp_reference = TRUE;
8609 }
8610 }
8611 }
8612 }
8613 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8614 pmap_sync_page_attributes_phys(phys_page);
8615 dst_page->vmp_written_by_kernel = FALSE;
8616 }
8617
8618 record_phys_addr:
8619 if (dst_page->vmp_busy) {
8620 upl->flags |= UPL_HAS_BUSY;
8621 }
8622
8623 bitmap_set(upl->lite_list, entry);
8624
8625 if (phys_page > upl->highest_page) {
8626 upl->highest_page = phys_page;
8627 }
8628
8629 if (user_page_list) {
8630 user_page_list[entry].phys_addr = phys_page;
8631 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
8632 user_page_list[entry].absent = dst_page->vmp_absent;
8633 user_page_list[entry].dirty = dst_page->vmp_dirty;
8634 user_page_list[entry].precious = dst_page->vmp_precious;
8635 user_page_list[entry].device = FALSE;
8636 user_page_list[entry].needed = FALSE;
8637 if (dst_page->vmp_clustered == TRUE) {
8638 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
8639 } else {
8640 user_page_list[entry].speculative = FALSE;
8641 }
8642 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
8643 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
8644 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
8645 user_page_list[entry].mark = FALSE;
8646 }
8647 if (!is_kernel_object(object) && object != compressor_object) {
8648 /*
8649 * someone is explicitly grabbing this page...
8650 * update clustered and speculative state
8651 *
8652 */
8653 if (dst_page->vmp_clustered) {
8654 VM_PAGE_CONSUME_CLUSTERED(dst_page);
8655 }
8656 }
8657 skip_page:
8658 entry++;
8659 dst_offset += PAGE_SIZE_64;
8660 xfer_size -= PAGE_SIZE;
8661
8662 if (dwp->dw_mask) {
8663 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
8664
8665 if (dw_count >= dw_limit) {
8666 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8667
8668 dwp = dwp_start;
8669 dw_count = 0;
8670 }
8671 }
8672 }
8673 assert(entry == size_in_pages);
8674
8675 if (dw_count) {
8676 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8677 dwp = dwp_start;
8678 dw_count = 0;
8679 }
8680 finish:
8681 if (user_page_list && set_cache_attr_needed == TRUE) {
8682 vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
8683 }
8684
8685 if (page_list_count != NULL) {
8686 if (upl->flags & UPL_INTERNAL) {
8687 *page_list_count = 0;
8688 } else if (*page_list_count > size_in_pages) {
8689 *page_list_count = size_in_pages;
8690 }
8691 }
8692 vm_object_unlock(object);
8693
8694 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8695 /*
8696 * We've marked all the pages "busy" so that future
8697 * page faults will block.
8698 * Now remove the mapping for these pages, so that they
8699 * can't be accessed without causing a page fault.
8700 */
8701 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
8702 PMAP_NULL,
8703 PAGE_SIZE,
8704 0, VM_PROT_NONE);
8705 assert(!object->blocked_access);
8706 object->blocked_access = TRUE;
8707 }
8708
8709 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8710 if (task != NULL) {
8711 counter_add(&task->pages_grabbed_iopl, page_grab_count);
8712 }
8713
8714 if (dwp_start && dwp_finish_ctx) {
8715 vm_page_delayed_work_finish_ctx(dwp_start);
8716 dwp_start = dwp = NULL;
8717 }
8718
8719 return KERN_SUCCESS;
8720
8721 return_err:
8722 dw_index = 0;
8723
8724 for (; offset < dst_offset; offset += PAGE_SIZE) {
8725 boolean_t need_unwire;
8726 bool need_wakeup;
8727
8728 dst_page = vm_page_lookup(object, offset);
8729
8730 if (dst_page == VM_PAGE_NULL) {
8731 panic("vm_object_iopl_request: Wired page missing.");
8732 }
8733
8734 /*
8735 * if we've already processed this page in an earlier
8736 * dw_do_work, we need to undo the wiring... we will
8737 * leave the dirty and reference bits on if they
8738 * were set, since we don't have a good way of knowing
8739 * what the previous state was and we won't get here
8740 * under any normal circumstances... we will always
 * clear BUSY and wake up any waiters via vm_page_free
8742 * or PAGE_WAKEUP_DONE
8743 */
8744 need_unwire = TRUE;
8745
8746 need_wakeup = false;
8747 if (dw_count) {
8748 if ((dwp_start)[dw_index].dw_m == dst_page) {
8749 /*
8750 * still in the deferred work list
8751 * which means we haven't yet called
8752 * vm_page_wire on this page
8753 */
8754 need_unwire = FALSE;
8755
8756 if (dst_page->vmp_busy &&
8757 ((dwp_start)[dw_index].dw_mask & DW_clear_busy)) {
8758 /*
8759 * It's our own "busy" bit, so we need to clear it
8760 * now and wake up waiters below.
8761 */
8762 dst_page->vmp_busy = false;
8763 need_wakeup = true;
8764 }
8765
8766 dw_index++;
8767 dw_count--;
8768 }
8769 }
8770 vm_page_lock_queues();
8771
8772 if (dst_page->vmp_absent || free_wired_pages == TRUE) {
8773 vm_page_free(dst_page);
8774
8775 need_unwire = FALSE;
8776 } else {
8777 if (need_unwire == TRUE) {
8778 vm_page_unwire(dst_page, TRUE);
8779 }
8780 if (dst_page->vmp_busy) {
8781 /* not our "busy" or we would have cleared it above */
8782 assert(!need_wakeup);
8783 }
8784 if (need_wakeup) {
8785 assert(!dst_page->vmp_busy);
8786 vm_page_wakeup(object, dst_page);
8787 }
8788 }
8789 vm_page_unlock_queues();
8790
8791 if (need_unwire == TRUE) {
8792 counter_inc(&vm_statistics_reactivations);
8793 }
8794 }
8795 #if UPL_DEBUG
8796 upl->upl_state = 2;
8797 #endif
8798 if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8799 vm_object_activity_end(object);
8800 vm_object_collapse(object, 0, TRUE);
8801 }
8802 vm_object_unlock(object);
8803 upl_destroy(upl);
8804
8805 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
8806 if (task != NULL) {
8807 counter_add(&task->pages_grabbed_iopl, page_grab_count);
8808 }
8809
8810 if (dwp_start && dwp_finish_ctx) {
8811 vm_page_delayed_work_finish_ctx(dwp_start);
8812 dwp_start = dwp = NULL;
8813 }
8814 return ret;
8815 }
8816
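/*
 * Swap the backing VM objects of two (full-object, non-vector) UPLs:
 * transpose the objects' backing store and re-point each UPL at the
 * object that now holds its pages.
 */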
8817 kern_return_t
8818 upl_transpose(
8819 upl_t upl1,
8820 upl_t upl2)
8821 {
8822 kern_return_t retval;
8823 boolean_t upls_locked;
8824 vm_object_t object1, object2;
8825
8826 /* LD: Should mapped UPLs be eligible for a transpose? */
8827 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
8828 return KERN_INVALID_ARGUMENT;
8829 }
8830
8831 upls_locked = FALSE;
8832
8833 /*
8834 * Since we need to lock both UPLs at the same time,
8835 * avoid deadlocks by always taking locks in the same order.
8836 */
8837 if (upl1 < upl2) {
8838 upl_lock(upl1);
8839 upl_lock(upl2);
8840 } else {
8841 upl_lock(upl2);
8842 upl_lock(upl1);
8843 }
8844 upls_locked = TRUE; /* the UPLs will need to be unlocked */
8845
8846 object1 = upl1->map_object;
8847 object2 = upl2->map_object;
8848
8849 if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
8850 upl1->u_size != upl2->u_size) {
8851 /*
8852 * We deal only with full objects, not subsets.
8853 * That's because we exchange the entire backing store info
8854 * for the objects: pager, resident pages, etc... We can't do
8855 * only part of it.
8856 */
8857 retval = KERN_INVALID_VALUE;
8858 goto done;
8859 }
8860
8861 /*
8862 * Transpose the VM objects' backing store.
8863 */
8864 retval = vm_object_transpose(object1, object2,
8865 upl_adjusted_size(upl1, PAGE_MASK));
8866
8867 if (retval == KERN_SUCCESS) {
8868 /*
8869 * Make each UPL point to the correct VM object, i.e. the
8870 * object holding the pages that the UPL refers to...
8871 */
8872 #if CONFIG_IOSCHED || UPL_DEBUG
8873 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
8874 vm_object_lock(object1);
8875 vm_object_lock(object2);
8876 }
8877 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8878 queue_remove(&object1->uplq, upl1, upl_t, uplq);
8879 }
8880 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8881 queue_remove(&object2->uplq, upl2, upl_t, uplq);
8882 }
8883 #endif
8884 upl1->map_object = object2;
8885 upl2->map_object = object1;
8886
8887 #if CONFIG_IOSCHED || UPL_DEBUG
8888 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8889 queue_enter(&object2->uplq, upl1, upl_t, uplq);
8890 }
8891 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8892 queue_enter(&object1->uplq, upl2, upl_t, uplq);
8893 }
8894 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
8895 vm_object_unlock(object2);
8896 vm_object_unlock(object1);
8897 }
8898 #endif
8899 }
8900
8901 done:
8902 /*
8903 * Cleanup.
8904 */
8905 if (upls_locked) {
8906 upl_unlock(upl1);
8907 upl_unlock(upl2);
8908 upls_locked = FALSE;
8909 }
8910
8911 return retval;
8912 }
8913
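/*
 * upl_range_needed:
 *	Mark "count" entries of an internal UPL's page list, starting at
 *	"index", as needed. Callers (e.g. via ubc_upl_range_needed()) use
 *	this to flag the pages they actually intend to access; it is a
 *	no-op for non-internal UPLs.
 */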
8914 void
8915 upl_range_needed(
8916 upl_t upl,
8917 int index,
8918 int count)
8919 {
8920 int size_in_pages;
8921
8922 if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
8923 return;
8924 }
8925
8926 size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8927
8928 while (count-- && index < size_in_pages) {
8929 upl->page_list[index++].needed = TRUE;
8930 }
8931 }
8932
8933
8934 /*
8935 * Reserve of virtual addresses in the kernel address space.
8936 * We need to map the physical pages in the kernel, so that we
8937 * can call the code-signing or slide routines with a kernel
8938 * virtual address. We keep this pool of pre-allocated kernel
8939 * virtual addresses so that we don't have to scan the kernel's
8940 * virtual address space each time we need to work with
8941 * a physical page.
8942 */
8943 SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
8944 #define VM_PAGING_NUM_PAGES 64
8945 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
8946 bool vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
8947 int vm_paging_max_index = 0;
8948 int vm_paging_page_waiter = 0;
8949 int vm_paging_page_waiter_total = 0;
8950
8951 unsigned long vm_paging_no_kernel_page = 0;
8952 unsigned long vm_paging_objects_mapped = 0;
8953 unsigned long vm_paging_pages_mapped = 0;
8954 unsigned long vm_paging_objects_mapped_slow = 0;
8955 unsigned long vm_paging_pages_mapped_slow = 0;
8956
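/*
 * Carve out the reserve of pageable kernel virtual addresses described
 * above, once at startup.
 */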
8957 __startup_func
8958 static void
8959 vm_paging_map_init(void)
8960 {
8961 kmem_alloc(kernel_map, &vm_paging_base_address,
8962 ptoa(VM_PAGING_NUM_PAGES),
8963 KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
8964 VM_KERN_MEMORY_NONE);
8965 }
8966 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
8967
8968 /*
8969 * vm_paging_map_object:
8970 * Maps part of a VM object's pages in the kernel
8971 * virtual address space, using the pre-allocated
8972 * kernel virtual addresses, if possible.
8973 * Context:
8974 * The VM object is locked. This lock will get
8975 * dropped and re-acquired though, so the caller
8976 * must make sure the VM object is kept alive
8977 * (by holding a VM map that has a reference
8978 * on it, for example, or taking an extra reference).
8979 * The page should also be kept busy to prevent
8980 * it from being reclaimed.
8981 */
8982 kern_return_t
8983 vm_paging_map_object(
8984 vm_page_t page,
8985 vm_object_t object,
8986 vm_object_offset_t offset,
8987 vm_prot_t protection,
8988 boolean_t can_unlock_object,
8989 vm_map_size_t *size, /* IN/OUT */
8990 vm_map_offset_t *address, /* OUT */
8991 boolean_t *need_unmap) /* OUT */
8992 {
8993 kern_return_t kr;
8994 vm_map_offset_t page_map_offset;
8995 vm_map_size_t map_size;
8996 vm_object_offset_t object_offset;
8997 int i;
8998
8999 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
9000 /* use permanent 1-to-1 kernel mapping of physical memory ? */
9001 *address = (vm_map_offset_t)
9002 phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
9003 *need_unmap = FALSE;
9004 return KERN_SUCCESS;
9005
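/*
 * NOTE: the unconditional return above makes the remainder of this
 * block (the pre-allocated kernel VA pool path below) unreachable;
 * it is left in place, presumably as the fallback for configurations
 * without a usable physical aperture.
 */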
9006 assert(page->vmp_busy);
9007 /*
9008 * Use one of the pre-allocated kernel virtual addresses
9009 * and just enter the VM page in the kernel address space
9010 * at that virtual address.
9011 */
9012 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9013
9014 /*
9015 * Try and find an available kernel virtual address
9016 * from our pre-allocated pool.
9017 */
9018 page_map_offset = 0;
9019 for (;;) {
9020 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
9021 if (vm_paging_page_inuse[i] == FALSE) {
9022 page_map_offset =
9023 vm_paging_base_address +
9024 (i * PAGE_SIZE);
9025 break;
9026 }
9027 }
9028 if (page_map_offset != 0) {
9029 /* found a space to map our page ! */
9030 break;
9031 }
9032
9033 if (can_unlock_object) {
9034 /*
9035 * If we can afford to unlock the VM object,
9036 * let's take the slow path now...
9037 */
9038 break;
9039 }
9040 /*
9041 * We can't afford to unlock the VM object, so
9042 * let's wait for a space to become available...
9043 */
9044 vm_paging_page_waiter_total++;
9045 vm_paging_page_waiter++;
9046 kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
9047 if (kr == THREAD_WAITING) {
9048 simple_unlock(&vm_paging_lock);
9049 kr = thread_block(THREAD_CONTINUE_NULL);
9050 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9051 }
9052 vm_paging_page_waiter--;
9053 /* ... and try again */
9054 }
9055
9056 if (page_map_offset != 0) {
9057 /*
9058 * We found a kernel virtual address;
9059 * map the physical page to that virtual address.
9060 */
9061 if (i > vm_paging_max_index) {
9062 vm_paging_max_index = i;
9063 }
9064 vm_paging_page_inuse[i] = TRUE;
9065 simple_unlock(&vm_paging_lock);
9066
9067 page->vmp_pmapped = TRUE;
9068
9069 /*
9070 * Keep the VM object locked over the PMAP_ENTER
9071 * and the actual use of the page by the kernel,
9072 * or this pmap mapping might get undone by a
9073 * vm_object_pmap_protect() call...
9074 */
9075 kr = pmap_enter_check(kernel_pmap,
9076 page_map_offset,
9077 page,
9078 protection,
9079 VM_PROT_NONE,
9080 TRUE);
9081 assert(kr == KERN_SUCCESS);
9082 vm_paging_objects_mapped++;
9083 vm_paging_pages_mapped++;
9084 *address = page_map_offset;
9085 *need_unmap = TRUE;
9086
9087 #if KASAN
9088 kasan_notify_address(page_map_offset, PAGE_SIZE);
9089 #endif
9090
9091 /* all done and mapped, ready to use ! */
9092 return KERN_SUCCESS;
9093 }
9094
9095 /*
9096 * We ran out of pre-allocated kernel virtual
9097 * addresses. Just map the page in the kernel
9098 * the slow and regular way.
9099 */
9100 vm_paging_no_kernel_page++;
9101 simple_unlock(&vm_paging_lock);
9102 }
9103
9104 if (!can_unlock_object) {
9105 *address = 0;
9106 *size = 0;
9107 *need_unmap = FALSE;
9108 return KERN_NOT_SUPPORTED;
9109 }
9110
9111 object_offset = vm_object_trunc_page(offset);
9112 map_size = vm_map_round_page(*size,
9113 VM_MAP_PAGE_MASK(kernel_map));
9114
9115 /*
9116 * Try and map the required range of the object
9117 * in the kernel_map. Given that allocation is
9118 * for pageable memory, it shouldn't contain
9119 * pointers and is mapped into the data range.
9120 */
9121
9122 vm_object_reference_locked(object); /* for the map entry */
9123 vm_object_unlock(object);
9124
9125 kr = vm_map_enter(kernel_map,
9126 address,
9127 map_size,
9128 0,
9129 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
9130 object,
9131 object_offset,
9132 FALSE,
9133 protection,
9134 VM_PROT_ALL,
9135 VM_INHERIT_NONE);
9136 if (kr != KERN_SUCCESS) {
9137 *address = 0;
9138 *size = 0;
9139 *need_unmap = FALSE;
9140 vm_object_deallocate(object); /* for the map entry */
9141 vm_object_lock(object);
9142 return kr;
9143 }
9144
9145 *size = map_size;
9146
9147 /*
9148 * Enter the mapped pages in the page table now.
9149 */
9150 vm_object_lock(object);
9151 /*
9152 * VM object must be kept locked from before PMAP_ENTER()
9153 * until after the kernel is done accessing the page(s).
9154 * Otherwise, the pmap mappings in the kernel could be
9155 * undone by a call to vm_object_pmap_protect().
9156 */
9157
9158 for (page_map_offset = 0;
9159 map_size != 0;
9160 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
9161 page = vm_page_lookup(object, offset + page_map_offset);
9162 if (page == VM_PAGE_NULL) {
9163 printf("vm_paging_map_object: no page !?");
9164 vm_object_unlock(object);
9165 vm_map_remove(kernel_map, *address, *size);
9166 *address = 0;
9167 *size = 0;
9168 *need_unmap = FALSE;
9169 vm_object_lock(object);
9170 return KERN_MEMORY_ERROR;
9171 }
9172 page->vmp_pmapped = TRUE;
9173
9174 kr = pmap_enter_check(kernel_pmap,
9175 *address + page_map_offset,
9176 page,
9177 protection,
9178 VM_PROT_NONE,
9179 TRUE);
9180 assert(kr == KERN_SUCCESS);
9181 #if KASAN
9182 kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
9183 #endif
9184 }
9185
9186 vm_paging_objects_mapped_slow++;
9187 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
9188
9189 *need_unmap = TRUE;
9190
9191 return KERN_SUCCESS;
9192 }
9193
9194 /*
9195 * vm_paging_unmap_object:
9196 * Unmaps part of a VM object's pages from the kernel
9197 * virtual address space.
9198 * Context:
9199 * The VM object is locked. This lock will get
9200 * dropped and re-acquired though.
9201 */
9202 void
9203 vm_paging_unmap_object(
9204 vm_object_t object,
9205 vm_map_offset_t start,
9206 vm_map_offset_t end)
9207 {
9208 int i;
9209
9210 if ((vm_paging_base_address == 0) ||
9211 (start < vm_paging_base_address) ||
9212 (end > (vm_paging_base_address
9213 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
9214 /*
9215 * We didn't use our pre-allocated pool of
9216 * kernel virtual address. Deallocate the
9217 * virtual memory.
9218 */
9219 if (object != VM_OBJECT_NULL) {
9220 vm_object_unlock(object);
9221 }
9222 vm_map_remove(kernel_map, start, end);
9223 if (object != VM_OBJECT_NULL) {
9224 vm_object_lock(object);
9225 }
9226 } else {
9227 /*
9228 * We used a kernel virtual address from our
9229 * pre-allocated pool. Put it back in the pool
9230 * for next time.
9231 */
9232 assert(end - start == PAGE_SIZE);
9233 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
9234 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
9235
9236 /* undo the pmap mapping */
9237 pmap_remove(kernel_pmap, start, end);
9238
9239 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9240 vm_paging_page_inuse[i] = FALSE;
9241 if (vm_paging_page_waiter) {
9242 thread_wakeup(&vm_paging_page_waiter);
9243 }
9244 simple_unlock(&vm_paging_lock);
9245 }
9246 }
9247
9248
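/*
 * vm_pageout_steal_laundry:
 *	Take a page back out of the laundry so the caller can reclaim or
 *	reuse it: clear vmp_free_when_done and let vm_pageout_throttle_up
 *	drop the laundry count and pull the page off the I/O paging queue.
 */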
9249 /*
9250 * page->vmp_object must be locked
9251 */
9252 void
9253 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
9254 {
9255 if (!queues_locked) {
9256 vm_page_lockspin_queues();
9257 }
9258
9259 page->vmp_free_when_done = FALSE;
9260 /*
9261 * need to drop the laundry count...
9262 * we may also need to remove it
9263 * from the I/O paging queue...
9264 * vm_pageout_throttle_up handles both cases
9265 *
9266 * the laundry and pageout_queue flags are cleared...
9267 */
9268 vm_pageout_throttle_up(page);
9269
9270 if (!queues_locked) {
9271 vm_page_unlock_queues();
9272 }
9273 }
9274
9275 #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
9276
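/*
 * vector_upl_create:
 *	Create a "vector" UPL: a container UPL that aggregates up to
 *	max_upls sub-UPLs (clamped to VECTOR_UPL_ELEMENTS_UPPER_LIMIT) so
 *	that one logical I/O can span several independently created UPLs.
 */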
9277 upl_t
9278 vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
9279 {
9280 int i = 0;
9281 upl_t upl;
9282
9283 assert(max_upls > 0);
9284 if (max_upls == 0) {
9285 return NULL;
9286 }
9287
9288 if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
9289 max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
9290 }
9291 vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
9292
9293 upl = upl_create(0, UPL_VECTOR, 0);
9294 upl->vector_upl = vector_upl;
9295 upl->u_offset = upl_offset;
9296 vector_upl->size = 0;
9297 vector_upl->offset = upl_offset;
9298 vector_upl->invalid_upls = 0;
9299 vector_upl->num_upls = 0;
9300 vector_upl->pagelist = NULL;
9301 vector_upl->max_upls = max_upls;
9302
9303 for (i = 0; i < max_upls; i++) {
9304 vector_upl->upls[i].iostate.size = 0;
9305 vector_upl->upls[i].iostate.offset = 0;
9306 }
9307 return upl;
9308 }
9309
9310 upl_size_t
9311 vector_upl_get_size(const upl_t upl)
9312 {
9313 if (!vector_upl_is_valid(upl)) {
9314 return upl_get_size(upl);
9315 } else {
9316 return round_page_32(upl->vector_upl->size);
9317 }
9318 }
9319
9320 uint32_t
9321 vector_upl_max_upls(const upl_t upl)
9322 {
9323 if (!vector_upl_is_valid(upl)) {
9324 return 0;
9325 }
9326 return ((vector_upl_t)(upl->vector_upl))->max_upls;
9327 }
9328
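/*
 * vector_upl_deallocate:
 *	Release the vector UPL's bookkeeping (consolidated page list and
 *	sub-UPL array). All sub-UPLs must already have been invalidated
 *	via vector_upl_set_subupl(), otherwise we panic.
 */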
9329 void
9330 vector_upl_deallocate(upl_t upl)
9331 {
9332 vector_upl_t vector_upl = upl->vector_upl;
9333
9334 assert(vector_upl_is_valid(upl));
9335
9336 if (vector_upl->invalid_upls != vector_upl->num_upls) {
9337 panic("Deallocating non-empty Vectored UPL");
9338 }
9339 uint32_t max_upls = vector_upl->max_upls;
9340 kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
9341 kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
9342 upl->vector_upl = NULL;
9343 }
9344
9345 boolean_t
9346 vector_upl_is_valid(upl_t upl)
9347 {
9348 return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
9349 }
9350
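/*
 * vector_upl_set_subupl:
 *	With a non-zero io_size, append "subupl" to the vector UPL and grow
 *	the vector's size accordingly. With io_size == 0, mark the sub-UPL's
 *	slot invalid (it has been committed/aborted); return TRUE once the
 *	last sub-UPL has been invalidated, so the caller knows the vector
 *	itself can be torn down.
 */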
9351 boolean_t
9352 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
9353 {
9354 if (vector_upl_is_valid(upl)) {
9355 vector_upl_t vector_upl = upl->vector_upl;
9356
9357 if (vector_upl) {
9358 if (subupl) {
9359 if (io_size) {
9360 if (io_size < PAGE_SIZE) {
9361 io_size = PAGE_SIZE;
9362 }
9363 subupl->vector_upl = (void*)vector_upl;
9364 vector_upl->upls[vector_upl->num_upls++].elem = subupl;
9365 vector_upl->size += io_size;
9366 upl->u_size += io_size;
9367 } else {
9368 uint32_t i = 0, invalid_upls = 0;
9369 for (i = 0; i < vector_upl->num_upls; i++) {
9370 if (vector_upl->upls[i].elem == subupl) {
9371 break;
9372 }
9373 }
9374 if (i == vector_upl->num_upls) {
9375 panic("Trying to remove sub-upl when none exists");
9376 }
9377
9378 vector_upl->upls[i].elem = NULL;
9379 invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
9380 relaxed);
9381 if (invalid_upls == vector_upl->num_upls) {
9382 return TRUE;
9383 } else {
9384 return FALSE;
9385 }
9386 }
9387 } else {
9388 panic("vector_upl_set_subupl was passed a NULL upl element");
9389 }
9390 } else {
9391 panic("vector_upl_set_subupl was passed a non-vectored upl");
9392 }
9393 } else {
9394 panic("vector_upl_set_subupl was passed a NULL upl");
9395 }
9396
9397 return FALSE;
9398 }
9399
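/*
 * vector_upl_set_pagelist:
 *	Build the vector UPL's consolidated page list by concatenating the
 *	page lists of its sub-UPLs, and track the highest physical page
 *	seen across all of them.
 */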
9400 void
9401 vector_upl_set_pagelist(upl_t upl)
9402 {
9403 if (vector_upl_is_valid(upl)) {
9404 uint32_t i = 0;
9405 vector_upl_t vector_upl = upl->vector_upl;
9406
9407 if (vector_upl) {
9408 vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
9409
9410 vector_upl->pagelist = kalloc_type(struct upl_page_info,
9411 atop(vector_upl->size), Z_WAITOK);
9412
9413 for (i = 0; i < vector_upl->num_upls; i++) {
9414 cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
9415 bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
9416 pagelist_size += cur_upl_pagelist_size;
9417 if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
9418 upl->highest_page = vector_upl->upls[i].elem->highest_page;
9419 }
9420 }
9421 assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
9422 } else {
9423 panic("vector_upl_set_pagelist was passed a non-vectored upl");
9424 }
9425 } else {
9426 panic("vector_upl_set_pagelist was passed a NULL upl");
9427 }
9428 }
9429
9430 upl_t
9431 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
9432 {
9433 if (vector_upl_is_valid(upl)) {
9434 vector_upl_t vector_upl = upl->vector_upl;
9435 if (vector_upl) {
9436 if (index < vector_upl->num_upls) {
9437 return vector_upl->upls[index].elem;
9438 }
9439 } else {
9440 panic("vector_upl_subupl_byindex was passed a non-vectored upl");
9441 }
9442 }
9443 return NULL;
9444 }
9445
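/*
 * vector_upl_subupl_byoffset:
 *	Translate an offset/size within the vector UPL into the sub-UPL
 *	that covers it. On success *upl_offset and *upl_size are adjusted
 *	to be relative to the returned sub-UPL; returns NULL if the range
 *	falls in a slot that has already been committed/aborted.
 */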
9446 upl_t
9447 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
9448 {
9449 if (vector_upl_is_valid(upl)) {
9450 uint32_t i = 0;
9451 vector_upl_t vector_upl = upl->vector_upl;
9452
9453 if (vector_upl) {
9454 upl_t subupl = NULL;
9455 vector_upl_iostates_t subupl_state;
9456
9457 for (i = 0; i < vector_upl->num_upls; i++) {
9458 subupl = vector_upl->upls[i].elem;
9459 subupl_state = vector_upl->upls[i].iostate;
9460 if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
9461 /* We could have been passed an offset/size pair that belongs
9462 * to a UPL element that has already been committed/aborted.
9463 * If so, return NULL.
9464 */
9465 if (subupl == NULL) {
9466 return NULL;
9467 }
9468 if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
9469 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
9470 if (*upl_size > subupl_state.size) {
9471 *upl_size = subupl_state.size;
9472 }
9473 }
9474 if (*upl_offset >= subupl_state.offset) {
9475 *upl_offset -= subupl_state.offset;
9476 } else if (i) {
9477 panic("Vector UPL offset miscalculation");
9478 }
9479 return subupl;
9480 }
9481 }
9482 } else {
9483 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
9484 }
9485 }
9486 return NULL;
9487 }
9488
9489 void
9490 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
9491 {
9492 *v_upl_submap = NULL;
9493
9494 if (vector_upl_is_valid(upl)) {
9495 vector_upl_t vector_upl = upl->vector_upl;
9496 if (vector_upl) {
9497 *v_upl_submap = vector_upl->submap;
9498 *submap_dst_addr = vector_upl->submap_dst_addr;
9499 } else {
9500 panic("vector_upl_get_submap was passed a non-vectored UPL");
9501 }
9502 } else {
9503 panic("vector_upl_get_submap was passed a null UPL");
9504 }
9505 }
9506
9507 void
9508 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
9509 {
9510 if (vector_upl_is_valid(upl)) {
9511 vector_upl_t vector_upl = upl->vector_upl;
9512 if (vector_upl) {
9513 vector_upl->submap = submap;
9514 vector_upl->submap_dst_addr = submap_dst_addr;
9515 } else {
9516 panic("vector_upl_get_submap was passed a non-vectored UPL");
9517 }
9518 } else {
9519 panic("vector_upl_get_submap was passed a NULL UPL");
9520 }
9521 }
9522
9523 void
9524 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
9525 {
9526 if (vector_upl_is_valid(upl)) {
9527 uint32_t i = 0;
9528 vector_upl_t vector_upl = upl->vector_upl;
9529
9530 if (vector_upl) {
9531 for (i = 0; i < vector_upl->num_upls; i++) {
9532 if (vector_upl->upls[i].elem == subupl) {
9533 break;
9534 }
9535 }
9536
9537 if (i == vector_upl->num_upls) {
9538 panic("setting sub-upl iostate when none exists");
9539 }
9540
9541 vector_upl->upls[i].iostate.offset = offset;
9542 if (size < PAGE_SIZE) {
9543 size = PAGE_SIZE;
9544 }
9545 vector_upl->upls[i].iostate.size = size;
9546 } else {
9547 panic("vector_upl_set_iostate was passed a non-vectored UPL");
9548 }
9549 } else {
9550 panic("vector_upl_set_iostate was passed a NULL UPL");
9551 }
9552 }
9553
9554 void
9555 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
9556 {
9557 if (vector_upl_is_valid(upl)) {
9558 uint32_t i = 0;
9559 vector_upl_t vector_upl = upl->vector_upl;
9560
9561 if (vector_upl) {
9562 for (i = 0; i < vector_upl->num_upls; i++) {
9563 if (vector_upl->upls[i].elem == subupl) {
9564 break;
9565 }
9566 }
9567
9568 if (i == vector_upl->num_upls) {
9569 panic("getting sub-upl iostate when none exists");
9570 }
9571
9572 *offset = vector_upl->upls[i].iostate.offset;
9573 *size = vector_upl->upls[i].iostate.size;
9574 } else {
9575 panic("vector_upl_get_iostate was passed a non-vectored UPL");
9576 }
9577 } else {
9578 panic("vector_upl_get_iostate was passed a NULL UPL");
9579 }
9580 }
9581
9582 void
9583 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
9584 {
9585 if (vector_upl_is_valid(upl)) {
9586 vector_upl_t vector_upl = upl->vector_upl;
9587 if (vector_upl) {
9588 if (index < vector_upl->num_upls) {
9589 *offset = vector_upl->upls[index].iostate.offset;
9590 *size = vector_upl->upls[index].iostate.size;
9591 } else {
9592 *offset = *size = 0;
9593 }
9594 } else {
9595 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
9596 }
9597 } else {
9598 panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
9599 }
9600 }
9601
9602 void *
9603 upl_get_internal_vectorupl(upl_t upl)
9604 {
9605 return upl->vector_upl;
9606 }
9607
9608 upl_page_info_t *
9609 upl_get_internal_vectorupl_pagelist(upl_t upl)
9610 {
9611 return upl->vector_upl->pagelist;
9612 }
9613
9614 upl_page_info_t *
9615 upl_get_internal_page_list(upl_t upl)
9616 {
9617 return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
9618 }
9619
9620 void
9621 upl_clear_dirty(
9622 upl_t upl,
9623 boolean_t value)
9624 {
9625 if (value) {
9626 upl->flags |= UPL_CLEAR_DIRTY;
9627 } else {
9628 upl->flags &= ~UPL_CLEAR_DIRTY;
9629 }
9630 }
9631
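/*
 * upl_set_referenced:
 *	Maintain the UPL's external reference count: TRUE takes a
 *	reference, FALSE drops one (and panics on underflow).
 */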
9632 void
9633 upl_set_referenced(
9634 upl_t upl,
9635 boolean_t value)
9636 {
9637 upl_lock(upl);
9638 if (value) {
9639 upl->ext_ref_count++;
9640 } else {
9641 if (!upl->ext_ref_count) {
9642 panic("upl_set_referenced not %p", upl);
9643 }
9644 upl->ext_ref_count--;
9645 }
9646 upl_unlock(upl);
9647 }
9648
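/*
 * upl_set_map_exclusive / upl_clear_map_exclusive:
 *	Serialize mappers of a UPL: the setter sleeps until no other thread
 *	owns the mapping address, then records the current thread (by ctid)
 *	as the owner; the clearer releases ownership and wakes any waiters.
 */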
9649 void
9650 upl_set_map_exclusive(upl_t upl)
9651 {
9652 upl_lock(upl);
9653 while (upl->map_addr_owner) {
9654 upl->flags |= UPL_MAP_EXCLUSIVE_WAIT;
9655 upl_lock_sleep(upl, &upl->map_addr_owner, ctid_get_thread(upl->map_addr_owner));
9656 }
9657 upl->map_addr_owner = thread_get_ctid(current_thread());
9658 upl_unlock(upl);
9659 }
9660
9661 void
9662 upl_clear_map_exclusive(upl_t upl)
9663 {
9664 assert(upl->map_addr_owner == thread_get_ctid(current_thread()));
9665 upl_lock(upl);
9666 if (upl->flags & UPL_MAP_EXCLUSIVE_WAIT) {
9667 upl->flags &= ~UPL_MAP_EXCLUSIVE_WAIT;
9668 upl_wakeup(&upl->map_addr_owner);
9669 }
9670 upl->map_addr_owner = 0;
9671 upl_unlock(upl);
9672 }
9673
9674 #if CONFIG_IOSCHED
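/*
 * upl_set_blkno:
 *	Record, for each page of the given UPL range, the disk block number
 *	backing it; this reprioritization info is used when expediting I/O.
 */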
9675 void
9676 upl_set_blkno(
9677 upl_t upl,
9678 vm_offset_t upl_offset,
9679 int io_size,
9680 int64_t blkno)
9681 {
9682 int i, j;
9683 if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
9684 return;
9685 }
9686
9687 assert(upl->upl_reprio_info != 0);
9688 for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
9689 UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
9690 }
9691 }
9692 #endif
9693
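/*
 * memoryshot:
 *	Emit a kdebug snapshot of the paging queue counts for the given VM
 *	pressure event, but only when vm_debug_events is enabled.
 */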
9694 void inline
9695 memoryshot(unsigned int event, unsigned int control)
9696 {
9697 if (vm_debug_events) {
9698 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
9699 vm_page_active_count, vm_page_inactive_count,
9700 vm_page_free_count, vm_page_speculative_count,
9701 vm_page_throttled_count);
9702 } else {
9703 (void) event;
9704 (void) control;
9705 }
9706 }
9707
9708 #ifdef MACH_BSD
9709
9710 boolean_t
9711 upl_device_page(upl_page_info_t *upl)
9712 {
9713 return UPL_DEVICE_PAGE(upl);
9714 }
9715 boolean_t
9716 upl_page_present(upl_page_info_t *upl, int index)
9717 {
9718 return UPL_PAGE_PRESENT(upl, index);
9719 }
9720 boolean_t
9721 upl_speculative_page(upl_page_info_t *upl, int index)
9722 {
9723 return UPL_SPECULATIVE_PAGE(upl, index);
9724 }
9725 boolean_t
9726 upl_dirty_page(upl_page_info_t *upl, int index)
9727 {
9728 return UPL_DIRTY_PAGE(upl, index);
9729 }
9730 boolean_t
9731 upl_valid_page(upl_page_info_t *upl, int index)
9732 {
9733 return UPL_VALID_PAGE(upl, index);
9734 }
9735 ppnum_t
9736 upl_phys_page(upl_page_info_t *upl, int index)
9737 {
9738 return UPL_PHYS_PAGE(upl, index);
9739 }
9740
9741 void
9742 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
9743 {
9744 upl[index].mark = v;
9745 }
9746
9747 boolean_t
9748 upl_page_get_mark(upl_page_info_t *upl, int index)
9749 {
9750 return upl[index].mark;
9751 }
9752
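/*
 * vm_countdirtypages:
 *	Debug helper: walk the inactive, throttled and anonymous queues,
 *	then the active queue, counting dirty, free_when_done and precious
 *	pages, and print both sets of totals.
 */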
9753 void
9754 vm_countdirtypages(void)
9755 {
9756 vm_page_t m;
9757 int dpages;
9758 int pgopages;
9759 int precpages;
9760
9761
9762 dpages = 0;
9763 pgopages = 0;
9764 precpages = 0;
9765
9766 vm_page_lock_queues();
9767 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
9768 do {
9769 if (m == (vm_page_t)0) {
9770 break;
9771 }
9772
9773 if (m->vmp_dirty) {
9774 dpages++;
9775 }
9776 if (m->vmp_free_when_done) {
9777 pgopages++;
9778 }
9779 if (m->vmp_precious) {
9780 precpages++;
9781 }
9782
9783 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9784 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9785 if (m == (vm_page_t)0) {
9786 break;
9787 }
9788 } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
9789 vm_page_unlock_queues();
9790
9791 vm_page_lock_queues();
9792 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
9793 do {
9794 if (m == (vm_page_t)0) {
9795 break;
9796 }
9797
9798 dpages++;
9799 assert(m->vmp_dirty);
9800 assert(!m->vmp_free_when_done);
9801 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9802 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9803 if (m == (vm_page_t)0) {
9804 break;
9805 }
9806 } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
9807 vm_page_unlock_queues();
9808
9809 vm_page_lock_queues();
9810 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
9811 do {
9812 if (m == (vm_page_t)0) {
9813 break;
9814 }
9815
9816 if (m->vmp_dirty) {
9817 dpages++;
9818 }
9819 if (m->vmp_free_when_done) {
9820 pgopages++;
9821 }
9822 if (m->vmp_precious) {
9823 precpages++;
9824 }
9825
9826 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9827 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9828 if (m == (vm_page_t)0) {
9829 break;
9830 }
9831 } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
9832 vm_page_unlock_queues();
9833
9834 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
9835
9836 dpages = 0;
9837 pgopages = 0;
9838 precpages = 0;
9839
9840 vm_page_lock_queues();
9841 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
9842
9843 do {
9844 if (m == (vm_page_t)0) {
9845 break;
9846 }
9847 if (m->vmp_dirty) {
9848 dpages++;
9849 }
9850 if (m->vmp_free_when_done) {
9851 pgopages++;
9852 }
9853 if (m->vmp_precious) {
9854 precpages++;
9855 }
9856
9857 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9858 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9859 if (m == (vm_page_t)0) {
9860 break;
9861 }
9862 } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
9863 vm_page_unlock_queues();
9864
9865 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
9866 }
9867 #endif /* MACH_BSD */
9868
9869
9870 #if CONFIG_IOSCHED
9871 int
9872 upl_get_cached_tier(upl_t upl)
9873 {
9874 assert(upl);
9875 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
9876 return upl->upl_priority;
9877 }
9878 return -1;
9879 }
9880 #endif /* CONFIG_IOSCHED */
9881
9882
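/*
 * upl_callout_iodone:
 *	Invoke the I/O completion callback registered on this UPL via
 *	upl_set_iodone(), if any, passing along the recorded error.
 */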
9883 void
9884 upl_callout_iodone(upl_t upl)
9885 {
9886 struct upl_io_completion *upl_ctx = upl->upl_iodone;
9887
9888 if (upl_ctx) {
9889 void (*iodone_func)(void *, int) = upl_ctx->io_done;
9890
9891 assert(upl_ctx->io_done);
9892
9893 (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
9894 }
9895 }
9896
9897 void
9898 upl_set_iodone(upl_t upl, void *upl_iodone)
9899 {
9900 upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
9901 }
9902
9903 void
9904 upl_set_iodone_error(upl_t upl, int error)
9905 {
9906 struct upl_io_completion *upl_ctx = upl->upl_iodone;
9907
9908 if (upl_ctx) {
9909 upl_ctx->io_error = error;
9910 }
9911 }
9912
9913
9914 ppnum_t
9915 upl_get_highest_page(
9916 upl_t upl)
9917 {
9918 return upl->highest_page;
9919 }
9920
9921 upl_size_t
9922 upl_get_size(
9923 upl_t upl)
9924 {
9925 return upl_adjusted_size(upl, PAGE_MASK);
9926 }
9927
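/*
 * upl_adjusted_size / upl_adjusted_offset:
 *	A UPL's u_offset/u_size may describe a range that is not aligned to
 *	"pgmask"; these return the size and start of that range rounded out
 *	to page boundaries, while upl_get_data_offset() returns how far the
 *	real data starts into the first page.
 */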
9928 upl_size_t
9929 upl_adjusted_size(
9930 upl_t upl,
9931 vm_map_offset_t pgmask)
9932 {
9933 vm_object_offset_t start_offset, end_offset;
9934
9935 start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
9936 end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
9937
9938 return (upl_size_t)(end_offset - start_offset);
9939 }
9940
9941 vm_object_offset_t
9942 upl_adjusted_offset(
9943 upl_t upl,
9944 vm_map_offset_t pgmask)
9945 {
9946 return trunc_page_mask_64(upl->u_offset, pgmask);
9947 }
9948
9949 vm_object_offset_t
9950 upl_get_data_offset(
9951 upl_t upl)
9952 {
9953 return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
9954 }
9955
9956 upl_t
9957 upl_associated_upl(upl_t upl)
9958 {
9959 return upl->associated_upl;
9960 }
9961
9962 void
9963 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
9964 {
9965 upl->associated_upl = associated_upl;
9966 }
9967
9968 struct vnode *
9969 upl_lookup_vnode(upl_t upl)
9970 {
9971 if (!upl->map_object->internal) {
9972 return vnode_pager_lookup_vnode(upl->map_object->pager);
9973 } else {
9974 return NULL;
9975 }
9976 }
9977
9978 boolean_t
9979 upl_has_wired_pages(upl_t upl)
9980 {
9981 return (upl->flags & UPL_HAS_WIRED) ? TRUE : FALSE;
9982 }
9983
9984 #if UPL_DEBUG
9985 kern_return_t
9986 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
9987 {
9988 upl->ubc_alias1 = alias1;
9989 upl->ubc_alias2 = alias2;
9990 return KERN_SUCCESS;
9991 }
9992 int
9993 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
9994 {
9995 if (al) {
9996 *al = upl->ubc_alias1;
9997 }
9998 if (al2) {
9999 *al2 = upl->ubc_alias2;
10000 }
10001 return KERN_SUCCESS;
10002 }
10003 #endif /* UPL_DEBUG */
10004
10005 #if VM_PRESSURE_EVENTS
10006 /*
10007 * Upward trajectory.
10008 */
10009
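/*
 * These predicates decide when the VM pressure level should move up or
 * down between normal, warning and critical. Without an active
 * compressor they compare the memorystatus available-page count against
 * its shortage thresholds; with the compressor active they look at how
 * much non-compressed memory remains.
 */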
10010 boolean_t
10011 VM_PRESSURE_NORMAL_TO_WARNING(void)
10012 {
10013 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10014 /* Available pages below our threshold */
10015 uint32_t available_pages = memorystatus_get_available_page_count();
10016 if (available_pages < memorystatus_get_soft_memlimit_page_shortage_threshold()) {
10017 #if CONFIG_FREEZE
10018 /* No frozen processes to kill */
10019 if (memorystatus_frozen_count == 0) {
10020 /* Not enough suspended processes available. */
10021 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
10022 return TRUE;
10023 }
10024 }
10025 #else /* CONFIG_FREEZE */
10026 return TRUE;
10027 #endif /* CONFIG_FREEZE */
10028 }
10029 return FALSE;
10030 } else {
10031 return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
10032 }
10033 }
10034
10035 boolean_t
10036 VM_PRESSURE_WARNING_TO_CRITICAL(void)
10037 {
10038 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10039 /* Available pages below our threshold */
10040 uint32_t available_pages = memorystatus_get_available_page_count();
10041 return available_pages < memorystatus_get_critical_page_shortage_threshold();
10042 } else {
10043 return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10044 }
10045 }
10046
10047 /*
10048 * Downward trajectory.
10049 */
10050 boolean_t
10051 VM_PRESSURE_WARNING_TO_NORMAL(void)
10052 {
10053 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10054 /* Available pages above our threshold */
10055 uint32_t available_pages = memorystatus_get_available_page_count();
10056 uint32_t target_threshold = (((115 * memorystatus_get_soft_memlimit_page_shortage_threshold()) / 100));
10057 return available_pages > target_threshold;
10058 } else {
10059 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
10060 }
10061 }
10062
10063 boolean_t
10064 VM_PRESSURE_CRITICAL_TO_WARNING(void)
10065 {
10066 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10067 uint32_t available_pages = memorystatus_get_available_page_count();
10068 uint32_t target_threshold = (((115 * memorystatus_get_critical_page_shortage_threshold()) / 100));
10069 return available_pages > target_threshold;
10070 } else {
10071 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10072 }
10073 }
10074 #endif /* VM_PRESSURE_EVENTS */
10075
10076 #if DEVELOPMENT || DEBUG
10077 bool compressor_running_perf_test;
10078 uint64_t compressor_perf_test_pages_processed;
10079
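/*
 * move_pages_to_queue:
 *	Benchmark helper: walk [start_addr, start_addr + buffer_size) in
 *	"map" and move every resident anonymous page onto "queue" so the
 *	compressor perf test can feed them to the compressor directly.
 *	Only top-level, unwired, internal objects with no shadow qualify;
 *	anything else fails with KERN_INVALID_ARGUMENT.
 */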
10080 static kern_return_t
10081 move_pages_to_queue(
10082 vm_map_t map,
10083 user_addr_t start_addr,
10084 size_t buffer_size,
10085 vm_page_queue_head_t *queue,
10086 size_t *pages_moved)
10087 {
10088 kern_return_t err = KERN_SUCCESS;
10089 vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
10090 boolean_t addr_in_map = FALSE;
10091 user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
10092 vm_object_t curr_object = VM_OBJECT_NULL;
10093 *pages_moved = 0;
10094
10095
10096 if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
10097 /*
10098 * We don't currently support benchmarking maps with a different page size
10099 * than the kernel.
10100 */
10101 return KERN_INVALID_ARGUMENT;
10102 }
10103
10104 if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
10105 return KERN_INVALID_ARGUMENT;
10106 }
10107
10108 vm_map_lock_read(map);
10109 curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
10110 end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
10111
10112
10113 while (curr_addr < end_addr) {
10114 addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
10115 if (!addr_in_map) {
10116 err = KERN_INVALID_ARGUMENT;
10117 break;
10118 }
10119 curr_object = VME_OBJECT(curr_entry);
10120 if (curr_object) {
10121 vm_object_lock(curr_object);
10122 /* We really only want anonymous memory that's in the top level map and object here. */
10123 if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
10124 curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
10125 err = KERN_INVALID_ARGUMENT;
10126 vm_object_unlock(curr_object);
10127 break;
10128 }
10129 vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
10130 vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
10131 (curr_entry->vme_start + VME_OFFSET(curr_entry));
10132 vm_map_offset_t curr_offset = start_offset;
10133 vm_page_t curr_page;
10134 while (curr_offset < end_offset) {
10135 curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
10136 if (curr_page != VM_PAGE_NULL) {
10137 vm_page_lock_queues();
10138 if (curr_page->vmp_laundry) {
10139 vm_pageout_steal_laundry(curr_page, TRUE);
10140 }
10141 /*
10142 * we've already factored out pages in the laundry which
10143 * means this page can't be on the pageout queue so it's
10144 * safe to do the vm_page_queues_remove
10145 */
10146 bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
10147 vm_page_queues_remove(curr_page, TRUE);
10148 if (donate) {
10149 /*
10150 * The compressor needs to see this bit to know
10151 * where this page needs to land. Also if stolen,
10152 * this bit helps put the page back in the right
10153 * special queue where it belongs.
10154 */
10155 curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
10156 }
10157 // Clear the referenced bit so we ensure this gets paged out
10158 curr_page->vmp_reference = false;
10159 if (curr_page->vmp_pmapped) {
10160 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
10161 VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
10162 }
10163 vm_page_queue_enter(queue, curr_page, vmp_pageq);
10164 vm_page_unlock_queues();
10165 *pages_moved += 1;
10166 }
10167 curr_offset += PAGE_SIZE_64;
10168 curr_addr += PAGE_SIZE_64;
10169 }
10170 vm_object_unlock(curr_object);
10171 }
10172 }
10173 vm_map_unlock_read(map);
10174 return err;
10175 }
10176
10177 /*
10178 * Local queue for processing benchmark pages.
10179 * Can't be allocated on the stack because the pointer has to
10180 * be packable.
10181 */
10182 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
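/*
 * run_compressor_perf_test:
 *	Compressor micro-benchmark (DEVELOPMENT/DEBUG only): pull the
 *	caller's buffer pages onto a private queue, hand them to the
 *	compressor thread, and report how long compression took, how many
 *	bytes were submitted, and how much the compressor pool grew.
 */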
10183 kern_return_t
10184 run_compressor_perf_test(
10185 user_addr_t buf,
10186 size_t buffer_size,
10187 uint64_t *time,
10188 uint64_t *bytes_compressed,
10189 uint64_t *compressor_growth)
10190 {
10191 kern_return_t err = KERN_SUCCESS;
10192 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10193 return KERN_NOT_SUPPORTED;
10194 }
10195 if (current_task() == kernel_task) {
10196 return KERN_INVALID_ARGUMENT;
10197 }
10198 vm_page_lock_queues();
10199 if (compressor_running_perf_test) {
10200 /* Only run one instance of the benchmark at a time. */
10201 vm_page_unlock_queues();
10202 return KERN_RESOURCE_SHORTAGE;
10203 }
10204 vm_page_unlock_queues();
10205 size_t page_count = 0;
10206 vm_map_t map;
10207 vm_page_t p, next;
10208 uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
10209 uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
10210 *bytes_compressed = *compressor_growth = 0;
10211
10212 vm_page_queue_init(&compressor_perf_test_queue);
10213 map = current_task()->map;
10214 err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
10215 if (err != KERN_SUCCESS) {
10216 goto out;
10217 }
10218
10219 vm_page_lock_queues();
10220 compressor_running_perf_test = true;
10221 compressor_perf_test_pages_processed = 0;
10222 /*
10223 * At this point the compressor threads should only process the benchmark queue
10224 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
10225 * to determine how many compressed bytes we ended up using.
10226 */
10227 compressed_bytes_start = os_atomic_load(&c_segment_compressed_bytes, relaxed);
10228 vm_page_unlock_queues();
10229
10230 page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);
10231
10232 vm_page_lock_queues();
10233 compressor_perf_test_start = mach_absolute_time();
10234
10235 // Wake up the compressor thread(s)
10236 sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
10237 pgo_iothread_internal_state[0].pgo_iothread);
10238
10239 /*
10240 * Depending on when this test is run we could overshoot or be right on the mark
10241 * with our page_count. So the comparison is of the _less than_ variety.
10242 */
10243 while (compressor_perf_test_pages_processed < page_count) {
10244 assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
10245 vm_page_unlock_queues();
10246 thread_block(THREAD_CONTINUE_NULL);
10247 vm_page_lock_queues();
10248 }
10249 compressor_perf_test_end = mach_absolute_time();
10250 compressed_bytes_end = os_atomic_load(&c_segment_compressed_bytes, relaxed);
10251 vm_page_unlock_queues();
10252
10253
10254 out:
10255 /*
10256 * If we errored out above, then we could still have some pages
10257 * on the local queue. Make sure to put them back on the active queue before
10258 * returning so they're not orphaned.
10259 */
10260 vm_page_lock_queues();
10261 absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
10262 p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
10263 while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
10264 next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);
10265
10266 vm_page_enqueue_active(p, FALSE);
10267 p = next;
10268 }
10269
10270 compressor_running_perf_test = false;
10271 vm_page_unlock_queues();
10272 if (err == KERN_SUCCESS) {
10273 *bytes_compressed = page_count * PAGE_SIZE_64;
10274 *compressor_growth = compressed_bytes_end - compressed_bytes_start;
10275 }
10276
10277 /*
10278 * pageout_scan will consider waking the compactor swapper
10279 * before it blocks. Do the same thing here before we return
10280 * to ensure that back to back benchmark runs can't overly fragment the
10281 * compressor pool.
10282 */
10283 vm_consider_waking_compactor_swapper();
10284 return err;
10285 }
10286 #endif /* DEVELOPMENT || DEBUG */
10287