1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include "mach/kern_return.h"
67 #include <stdint.h>
68 #include <ptrauth.h>
69
70 #include <debug.h>
71
72 #include <mach/mach_types.h>
73 #include <mach/memory_object.h>
74 #include <mach/mach_host_server.h>
75 #include <mach/upl.h>
76 #include <mach/vm_map.h>
77 #include <mach/vm_param.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/sdt.h>
80
81 #include <kern/kern_types.h>
82 #include <kern/counter.h>
83 #include <kern/host_statistics.h>
84 #include <kern/machine.h>
85 #include <kern/misc_protos.h>
86 #include <kern/sched.h>
87 #include <kern/thread.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/policy_internal.h>
91 #include <kern/thread_group.h>
92
93 #include <os/log.h>
94
95 #include <sys/kdebug_triage.h>
96
97 #include <machine/vm_tuning.h>
98 #include <machine/commpage.h>
99
100 #include <vm/pmap.h>
101 #include <vm/vm_compressor_pager_internal.h>
102 #include <vm/vm_fault_internal.h>
103 #include <vm/vm_map_internal.h>
104 #include <vm/vm_object_internal.h>
105 #include <vm/vm_page_internal.h>
106 #include <vm/vm_pageout_internal.h>
107 #include <vm/vm_protos_internal.h> /* must be last */
108 #include <vm/memory_object.h>
109 #include <vm/vm_purgeable_internal.h>
110 #include <vm/vm_shared_region.h>
111 #include <vm/vm_compressor_internal.h>
112 #include <vm/vm_kern_xnu.h>
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_ubc.h>
115
116 #include <san/kasan.h>
117 #include <sys/kern_memorystatus_xnu.h>
118
119 #if CONFIG_PHANTOM_CACHE
120 #include <vm/vm_phantom_cache_internal.h>
121 #endif
122
123 #if UPL_DEBUG
124 #include <libkern/OSDebug.h>
125 #endif
126
127 extern int cs_debug;
128
129 #if CONFIG_MBUF_MCACHE
130 extern void mbuf_drain(boolean_t);
131 #endif /* CONFIG_MBUF_MCACHE */
132
133 #if CONFIG_FREEZE
134 extern unsigned int memorystatus_frozen_count;
135 extern unsigned int memorystatus_suspended_count;
136 #endif /* CONFIG_FREEZE */
137 extern vm_pressure_level_t memorystatus_vm_pressure_level;
138
139 extern lck_mtx_t memorystatus_jetsam_broadcast_lock;
140 extern uint32_t memorystatus_jetsam_fg_band_waiters;
141 extern uint32_t memorystatus_jetsam_bg_band_waiters;
142
143 void vm_pressure_response(void);
144 extern void consider_vm_pressure_events(void);
145
146 #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
147
148 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
149 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
150 #if CONFIG_VPS_DYNAMIC_PRIO
151 TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
152 #else
153 const bool vps_dynamic_priority_enabled = false;
154 #endif
155 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
156
157 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
158 #if !XNU_TARGET_OS_OSX
159 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
160 #else /* !XNU_TARGET_OS_OSX */
161 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
162 #endif /* !XNU_TARGET_OS_OSX */
163 #endif
164
165 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
166 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
167 #endif
168
169 #ifndef VM_PAGE_LAUNDRY_MAX
170 #define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */
#endif /* VM_PAGE_LAUNDRY_MAX */
172
173 #ifndef VM_PAGEOUT_BURST_WAIT
174 #define VM_PAGEOUT_BURST_WAIT 1 /* milliseconds */
175 #endif /* VM_PAGEOUT_BURST_WAIT */
176
177 #ifndef VM_PAGEOUT_EMPTY_WAIT
178 #define VM_PAGEOUT_EMPTY_WAIT 50 /* milliseconds */
179 #endif /* VM_PAGEOUT_EMPTY_WAIT */
180
181 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
182 #define VM_PAGEOUT_DEADLOCK_WAIT 100 /* milliseconds */
183 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
184
185 #ifndef VM_PAGEOUT_IDLE_WAIT
186 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
187 #endif /* VM_PAGEOUT_IDLE_WAIT */
188
189 #ifndef VM_PAGEOUT_SWAP_WAIT
190 #define VM_PAGEOUT_SWAP_WAIT 10 /* milliseconds */
191 #endif /* VM_PAGEOUT_SWAP_WAIT */
192
193 /*
194 * vm_page_max_speculative_age_q should be less than or equal to
195 * VM_PAGE_RESERVED_SPECULATIVE_AGE_Q which is number of allocated
196 * vm_page_queue_speculative entries.
197 */
198
199 TUNABLE_DEV_WRITEABLE(unsigned int, vm_page_max_speculative_age_q, "vm_page_max_speculative_age_q", VM_PAGE_DEFAULT_MAX_SPECULATIVE_AGE_Q);
200 #ifndef VM_PAGE_SPECULATIVE_TARGET
201 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
202 #endif /* VM_PAGE_SPECULATIVE_TARGET */
203
204
205 /*
206 * To obtain a reasonable LRU approximation, the inactive queue
207 * needs to be large enough to give pages on it a chance to be
208 * referenced a second time. This macro defines the fraction
209 * of active+inactive pages that should be inactive.
210 * The pageout daemon uses it to update vm_page_inactive_target.
211 *
212 * If vm_page_free_count falls below vm_page_free_target and
213 * vm_page_inactive_count is below vm_page_inactive_target,
214 * then the pageout daemon starts running.
215 */
216
217 #ifndef VM_PAGE_INACTIVE_TARGET
218 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
219 #endif /* VM_PAGE_INACTIVE_TARGET */
220
221 /*
222 * Once the pageout daemon starts running, it keeps going
223 * until vm_page_free_count meets or exceeds vm_page_free_target.
224 */
225
226 #ifndef VM_PAGE_FREE_TARGET
227 #if !XNU_TARGET_OS_OSX
228 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
229 #else /* !XNU_TARGET_OS_OSX */
230 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
231 #endif /* !XNU_TARGET_OS_OSX */
232 #endif /* VM_PAGE_FREE_TARGET */
233
234
235 /*
236 * The pageout daemon always starts running once vm_page_free_count
237 * falls below vm_page_free_min.
238 */
239
240 #ifndef VM_PAGE_FREE_MIN
241 #if !XNU_TARGET_OS_OSX
242 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
243 #else /* !XNU_TARGET_OS_OSX */
244 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
245 #endif /* !XNU_TARGET_OS_OSX */
246 #endif /* VM_PAGE_FREE_MIN */
247
248 #if !XNU_TARGET_OS_OSX
249 #define VM_PAGE_FREE_RESERVED_LIMIT 100
250 #define VM_PAGE_FREE_MIN_LIMIT 1500
251 #define VM_PAGE_FREE_TARGET_LIMIT 2000
252 #else /* !XNU_TARGET_OS_OSX */
253 #define VM_PAGE_FREE_RESERVED_LIMIT 1700
254 #define VM_PAGE_FREE_MIN_LIMIT 3500
255 #define VM_PAGE_FREE_TARGET_LIMIT 4000
256 #endif /* !XNU_TARGET_OS_OSX */
257
258 /*
259 * When vm_page_free_count falls below vm_page_free_reserved,
260 * only vm-privileged threads can allocate pages. vm-privilege
261 * allows the pageout daemon and default pager (and any other
262 * associated threads needed for default pageout) to continue
263 * operation by dipping into the reserved pool of pages.
264 */
265
266 #ifndef VM_PAGE_FREE_RESERVED
267 #define VM_PAGE_FREE_RESERVED(n) \
268 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
269 #endif /* VM_PAGE_FREE_RESERVED */
270
271 /*
272 * When we dequeue pages from the inactive list, they are
273 * reactivated (ie, put back on the active queue) if referenced.
274 * However, it is possible to starve the free list if other
275 * processors are referencing pages faster than we can turn off
276 * the referenced bit. So we limit the number of reactivations
277 * we will make per call of vm_pageout_scan().
278 */
279 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
280
281 #ifndef VM_PAGE_REACTIVATE_LIMIT
282 #if !XNU_TARGET_OS_OSX
283 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
284 #else /* !XNU_TARGET_OS_OSX */
285 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
286 #endif /* !XNU_TARGET_OS_OSX */
287 #endif /* VM_PAGE_REACTIVATE_LIMIT */
288 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
289
290 int vm_pageout_protect_realtime = true;
291
292 extern boolean_t hibernate_cleaning_in_progress;
293
294 struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
295 struct pgo_iothread_state pgo_iothread_external_state;
296
297 #if VM_PRESSURE_EVENTS
298 void vm_pressure_thread(void);
299
300 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
301 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
302
303 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
304 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
305 #endif
306
307 static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
308 static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
309 static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);
310
311 extern void vm_pageout_continue(void);
312 extern void vm_pageout_scan(void);
313
314 boolean_t vm_pageout_running = FALSE;
315
316 uint32_t vm_page_upl_tainted = 0;
317 uint32_t vm_page_iopl_tainted = 0;
318
319 #if XNU_TARGET_OS_OSX
320 static boolean_t vm_pageout_waiter = FALSE;
321 #endif /* XNU_TARGET_OS_OSX */
322
323
324 #if DEVELOPMENT || DEBUG
325 struct vm_pageout_debug vm_pageout_debug;
326 #endif
327 struct vm_pageout_vminfo vm_pageout_vminfo;
328 struct vm_pageout_state vm_pageout_state;
329 struct vm_config vm_config;
330
331 struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
332 struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
333 #if DEVELOPMENT || DEBUG
334 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
335 #endif /* DEVELOPMENT || DEBUG */
336
337 int vm_upl_wait_for_pages = 0;
338 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
339
340 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
341
342 int vm_debug_events = 0;
343
344 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
345
346 #if CONFIG_MEMORYSTATUS
347 extern void memorystatus_kill_on_vps_starvation(void);
348
349 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
350 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
351
352 #endif
353
354 #if __AMP__
355
356
357 /*
358 * Bind compressor threads to e-cores unless there are multiple non-e clusters
359 */
360 #if (MAX_CPU_CLUSTERS > 2)
361 #define VM_COMPRESSOR_EBOUND_DEFAULT false
362 #elif defined(XNU_TARGET_OS_XR)
363 #define VM_COMPRESSOR_EBOUND_DEFAULT false
364 #else
365 #define VM_COMPRESSOR_EBOUND_DEFAULT true
366 #endif
367
368 TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
369 int vm_pgo_pbound = 0;
370 extern void thread_bind_cluster_type(thread_t, char, bool);
371
372 #endif /* __AMP__ */
373
374
375 /*
376 * Routine: vm_pageout_object_terminate
377 * Purpose:
378 * Destroy the pageout_object, and perform all of the
379 * required cleanup actions.
380 *
381 * In/Out conditions:
382 * The object must be locked, and will be returned locked.
383 */
void
vm_pageout_object_terminate(
	vm_object_t object)
{
	vm_object_t shadow_object;

	/*
	 * Deal with the deallocation (last reference) of a pageout object
	 * (used for cleaning-in-place) by dropping the paging references/
	 * freeing pages in the original object.
	 */

	assert(object->pageout);
	shadow_object = object->shadow;
	vm_object_lock(shadow_object);

	while (!vm_page_queue_empty(&object->memq)) {
		vm_page_t p, m;
		vm_object_offset_t offset;

		/*
		 * "p" is the private/fictitious page in the pageout object;
		 * free it and then look up "m", the real page it shadowed in
		 * the original (shadow) object.
		 */
		p = (vm_page_t) vm_page_queue_first(&object->memq);

		assert(p->vmp_private);
		assert(p->vmp_free_when_done);
		p->vmp_free_when_done = FALSE;
		assert(!p->vmp_cleaning);
		assert(!p->vmp_laundry);

		offset = p->vmp_offset;
		VM_PAGE_FREE(p);
		p = VM_PAGE_NULL;

		m = vm_page_lookup(shadow_object,
		    offset + object->vo_shadow_offset);

		if (m == VM_PAGE_NULL) {
			continue;
		}

		assert((m->vmp_dirty) || (m->vmp_precious) ||
		    (m->vmp_busy && m->vmp_cleaning));

		/*
		 * Handle the trusted pager throttle.
		 * Also decrement the burst throttle (if external).
		 */
		vm_page_lock_queues();
		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
			vm_pageout_throttle_up(m);
		}

		/*
		 * Handle the "target" page(s). These pages are to be freed if
		 * successfully cleaned. Target pages are always busy, and are
		 * wired exactly once. The initial target pages are not mapped,
		 * (so cannot be referenced or modified) but converted target
		 * pages may have been modified between the selection as an
		 * adjacent page and conversion to a target.
		 */
		if (m->vmp_free_when_done) {
			assert(m->vmp_busy);
			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
			assert(m->vmp_wire_count == 1);
			m->vmp_cleaning = FALSE;
			m->vmp_free_when_done = FALSE;
			/*
			 * Revoke all access to the page. Since the object is
			 * locked, and the page is busy, this prevents the page
			 * from being dirtied after the pmap_disconnect() call
			 * returns.
			 *
			 * Since the page is left "dirty" but "not modified", we
			 * can detect whether the page was redirtied during
			 * pageout by checking the modify state.
			 */
			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			} else {
				m->vmp_dirty = FALSE;
			}

			if (m->vmp_dirty) {
				/* page was redirtied during pageout: keep it */
				vm_page_unwire(m, TRUE);        /* reactivates */
				counter_inc(&vm_statistics_reactivations);
				vm_page_wakeup_done(object, m);
			} else {
				/* clean pageout completed: release the page */
				vm_page_free(m);  /* clears busy, etc. */
			}
			vm_page_unlock_queues();
			continue;
		}
		/*
		 * Handle the "adjacent" pages. These pages were cleaned in
		 * place, and should be left alone.
		 * If prep_pin_count is nonzero, then someone is using the
		 * page, so make it active.
		 */
		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
			if (m->vmp_reference) {
				vm_page_activate(m);
			} else {
				vm_page_deactivate(m);
			}
		}
		if (m->vmp_overwriting) {
			/*
			 * the (COPY_OUT_FROM == FALSE) request_page_list case
			 */
			if (m->vmp_busy) {
				/*
				 * We do not re-set m->vmp_dirty !
				 * The page was busy so no extraneous activity
				 * could have occurred. COPY_INTO is a read into the
				 * new pages. CLEAN_IN_PLACE does actually write
				 * out the pages but handling outside of this code
				 * will take care of resetting dirty. We clear the
				 * modify however for the Programmed I/O case.
				 */
				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

				m->vmp_busy = FALSE;
				m->vmp_absent = FALSE;
			} else {
				/*
				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
				 * Occurs when the original page was wired
				 * at the time of the list request
				 */
				assert(VM_PAGE_WIRED(m));
				vm_page_unwire(m, TRUE);        /* reactivates */
			}
			m->vmp_overwriting = FALSE;
		} else {
			m->vmp_dirty = FALSE;
		}
		m->vmp_cleaning = FALSE;

		/*
		 * Wakeup any thread waiting for the page to be un-cleaning.
		 */
		vm_page_wakeup(object, m);
		vm_page_unlock_queues();
	}
	/*
	 * Account for the paging reference taken in vm_paging_object_allocate.
	 */
	vm_object_activity_end(shadow_object);
	vm_object_unlock(shadow_object);

	assert(os_ref_get_count_raw(&object->ref_count) == 0);
	assert(object->paging_in_progress == 0);
	assert(object->activity_in_progress == 0);
	assert(object->resident_page_count == 0);
	return;
}
539
540 /*
541 * Routine: vm_pageclean_setup
542 *
543 * Purpose: setup a page to be cleaned (made non-dirty), but not
544 * necessarily flushed from the VM page cache.
545 * This is accomplished by cleaning in place.
546 *
547 * The page must not be busy, and new_object
548 * must be locked.
549 *
550 */
static void
vm_pageclean_setup(
	vm_page_t m,
	vm_page_t new_m,
	vm_object_t new_object,
	vm_object_offset_t new_offset)
{
	assert(!m->vmp_busy);
#if 0
	assert(!m->vmp_cleaning);
#endif

	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

	/*
	 * Mark original page as cleaning in place.
	 */
	m->vmp_cleaning = TRUE;
	SET_PAGE_DIRTY(m, FALSE);
	m->vmp_precious = FALSE;

	/*
	 * Convert the fictitious page to a private shadow of
	 * the real page: after this, new_m aliases m's physical frame.
	 */
	assert(new_m->vmp_fictitious);
	assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
	new_m->vmp_fictitious = FALSE;
	new_m->vmp_private = TRUE;
	new_m->vmp_free_when_done = TRUE;
	VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));

	/* wire the shadow page (under the page-queues lock) */
	vm_page_lockspin_queues();
	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
	vm_page_unlock_queues();

	/* insert the wired shadow into new_object and make it usable */
	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
	assert(!new_m->vmp_wanted);
	new_m->vmp_busy = FALSE;
}
591
592 /*
593 * Routine: vm_pageout_initialize_page
594 * Purpose:
595 * Causes the specified page to be initialized in
596 * the appropriate memory object. This routine is used to push
597 * pages into a copy-object when they are modified in the
598 * permanent object.
599 *
600 * The page is moved to a temporary object and paged out.
601 *
602 * In/out conditions:
603 * The page in question must not be on any pageout queues.
604 * The object to which it belongs must be locked.
605 * The page must be busy, but not hold a paging reference.
606 *
607 * Implementation:
608 * Move this page to a completely new object.
609 */
void
vm_pageout_initialize_page(
	vm_page_t m)
{
	vm_object_t object;
	vm_object_offset_t paging_offset;
	memory_object_t pager;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	object = VM_PAGE_OBJECT(m);

	assert(m->vmp_busy);
	assert(object->internal);

	/*
	 * Verify that we really want to clean this page
	 */
	assert(!m->vmp_absent);
	assert(m->vmp_dirty);

	/*
	 * Create a paging reference to let us play with the object.
	 */
	paging_offset = m->vmp_offset + object->paging_offset;

	if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
		panic("reservation without pageout?"); /* alan */

		/* NOTREACHED after panic; kept to document the recovery path */
		VM_PAGE_FREE(m);
		vm_object_unlock(object);

		return;
	}

	/*
	 * If there's no pager, then we can't clean the page. This should
	 * never happen since this should be a copy object and therefore not
	 * an external object, so the pager should always be there.
	 */

	pager = object->pager;

	if (pager == MEMORY_OBJECT_NULL) {
		panic("missing pager for copy object");

		/* NOTREACHED after panic */
		VM_PAGE_FREE(m);
		return;
	}

	/*
	 * set the page for future call to vm_fault_list_request
	 */
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
	SET_PAGE_DIRTY(m, FALSE);

	/*
	 * keep the object from collapsing or terminating
	 */
	vm_object_paging_begin(object);
	vm_object_unlock(object);

	/*
	 * Write the data to its pager.
	 * Note that the data is passed by naming the new object,
	 * not a virtual address; the pager interface has been
	 * manipulated to use the "internal memory" data type.
	 * [The object reference from its allocation is donated
	 * to the eventual recipient.]
	 */
	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);

	vm_object_lock(object);
	vm_object_paging_end(object);
}
685
686
687 /*
688 * vm_pageout_cluster:
689 *
690 * Given a page, queue it to the appropriate I/O thread,
691 * which will page it out and attempt to clean adjacent pages
692 * in the same operation.
693 *
694 * The object and queues must be locked. We will take a
695 * paging reference to prevent deallocation or collapse when we
696 * release the object lock back at the call site. The I/O thread
697 * is responsible for consuming this reference
698 *
699 * The page must not be on any pageout queue.
700 */
#if DEVELOPMENT || DEBUG
/* Compressor-thread statistics, kept only on development/debug builds. */
vmct_stats_t vmct_stats;

/* number of compressor I/O threads currently active */
int32_t vmct_active = 0;
/*
 * Timestamps bracketing a compressor "epoch".
 * NOTE(review): exact epoch semantics are defined where these are
 * written (not visible in this file section) — confirm before relying.
 */
uint64_t vm_compressor_epoch_start = 0;
uint64_t vm_compressor_epoch_stop = 0;

/* per-thread lifecycle state for each compressor I/O thread */
typedef enum vmct_state_t {
	VMCT_IDLE,
	VMCT_AWAKENED,
	VMCT_ACTIVE,
} vmct_state_t;
vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
#endif
715
716
717
/*
 * Enqueue page "m" on pageout queue "q" and wake the matching I/O thread.
 * Caller holds the page-queues lock and the object's exclusive lock; the
 * page must be dirty or precious, unwired, and not already queued.
 */
static void
vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
{
	vm_object_t object = VM_PAGE_OBJECT(m);

	VM_PAGE_CHECK(m);
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	/*
	 * Make sure it's OK to page this out.
	 */
	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
	assert(!m->vmp_cleaning && !m->vmp_laundry);
	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);

	/*
	 * protect the object from collapse or termination
	 */
	vm_object_activity_begin(object);


	/*
	 * pgo_laundry count is tied to the laundry bit
	 */
	m->vmp_laundry = TRUE;
	q->pgo_laundry++;

	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);

	if (object->internal == TRUE) {
		/* internal pages go to the compressor; mark busy for the I/O thread */
		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
		m->vmp_busy = TRUE;
#if DEVELOPMENT || DEBUG
		/*
		 * The benchmark queue will be woken up independently by the benchmark
		 * itself.
		 */
		if (q != &vm_pageout_queue_benchmark) {
#else /* DEVELOPMENT || DEBUG */
		if (true) {
#endif /* DEVELOPMENT || DEBUG */
			/*
			 * Wake up the first compressor thread. It will wake subsequent
			 * threads if necessary.
			 */
			sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
			    pgo_iothread_internal_state[0].pgo_iothread);
		}
	} else {
		/* external (file-backed) pages are handled by the single external thread */
		sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
	}
	VM_PAGE_CHECK(m);
}
773
774 void
775 vm_pageout_cluster(vm_page_t m)
776 {
777 struct vm_pageout_queue *q;
778 vm_object_t object = VM_PAGE_OBJECT(m);
779 if (object->internal) {
780 q = &vm_pageout_queue_internal;
781 } else {
782 q = &vm_pageout_queue_external;
783 }
784 vm_pageout_cluster_to_queue(m, q);
785 }
786
787
788 /*
789 * A page is back from laundry or we are stealing it back from
790 * the laundering state. See if there are some pages waiting to
791 * go to laundry and if we can let some of them go now.
792 *
793 * Object and page queues must be locked.
794 */
void
vm_pageout_throttle_up(
	vm_page_t m)
{
	struct vm_pageout_queue *q;
	vm_object_t m_object;

	m_object = VM_PAGE_OBJECT(m);

	assert(m_object != VM_OBJECT_NULL);
	assert(!is_kernel_object(m_object));

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(m_object);

	/* select the queue the page was (or would have been) laundered on */
	if (m_object->internal == TRUE) {
		q = &vm_pageout_queue_internal;
	} else {
		q = &vm_pageout_queue_external;
	}

	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
		/* steal the page back off the pending pageout queue */
		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
		m->vmp_q_state = VM_PAGE_NOT_ON_Q;

		VM_PAGE_ZERO_PAGEQ_ENTRY(m);

		/* drop the activity reference taken when the page was queued */
		vm_object_activity_end(m_object);

		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
	}
	if (m->vmp_laundry == TRUE) {
		/* pgo_laundry count is tied to the laundry bit */
		m->vmp_laundry = FALSE;
		q->pgo_laundry--;

		/* wake a thread throttled on this queue's laundry load */
		if (q->pgo_throttled == TRUE) {
			q->pgo_throttled = FALSE;
			thread_wakeup((event_t) &q->pgo_laundry);
		}
		/* a drainer waits on (&pgo_laundry + 1) until the queue empties */
		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
			q->pgo_draining = FALSE;
			thread_wakeup((event_t) (&q->pgo_laundry + 1));
		}
		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
	}
}
841
842
/*
 * Batched form of vm_pageout_throttle_up(): credit "batch_cnt" completed
 * laundry pages against queue "q" in one step and deliver the same
 * throttle/drain wakeups.  Caller holds the page-queues lock.
 */
static void
vm_pageout_throttle_up_batch(
	struct vm_pageout_queue *q,
	int batch_cnt)
{
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);

	q->pgo_laundry -= batch_cnt;

	/* wake a thread throttled on this queue's laundry load */
	if (q->pgo_throttled == TRUE) {
		q->pgo_throttled = FALSE;
		thread_wakeup((event_t) &q->pgo_laundry);
	}
	/* a drainer waits on (&pgo_laundry + 1) until the queue empties */
	if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
		q->pgo_draining = FALSE;
		thread_wakeup((event_t) (&q->pgo_laundry + 1));
	}
}
863
864
865
866 /*
867 * VM memory pressure monitoring.
868 *
869 * vm_pageout_scan() keeps track of the number of pages it considers and
870 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
871 *
872 * compute_memory_pressure() is called every second from compute_averages()
873 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
 * of reclaimed pages in a new vm_pageout_stat[] bucket.
875 *
876 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
877 * The caller provides the number of seconds ("nsecs") worth of statistics
878 * it wants, up to 30 seconds.
879 * It computes the number of pages reclaimed in the past "nsecs" seconds and
880 * also returns the number of pages the system still needs to reclaim at this
881 * moment in time.
882 */
/*
 * Ring buffer of per-interval pageout statistics.  One bucket accumulates
 * counters for the current sampling interval; record_memory_pressure()
 * advances "now" to the next (zeroed) bucket.  DEVELOPMENT/DEBUG builds
 * keep 30 seconds of history, release builds 1 second (8 buckets/second —
 * see the units_of_monitor scaling in mach_vm_pressure_monitor()).
 *
 * Fully parenthesize the size macro so it expands safely inside any
 * arithmetic expression (CERT PRE02-C); the previous "(N * 8) + 1" form
 * would mis-associate under multiplication.
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif
struct vm_pageout_stat {
	/* queue population snapshots */
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	/* page-churn counters for the interval */
	unsigned int pages_grabbed;
	unsigned int pages_freed;

	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	/* vm_pageout_scan() work accounting */
	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;

	unsigned int vm_page_realtime_count;
	unsigned int forcereclaimed_sharedcache;
	unsigned int forcereclaimed_realtime;
	unsigned int protected_sharedcache;
	unsigned int protected_realtime;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];

/* index of the bucket currently accumulating statistics */
unsigned int vm_pageout_stat_now = 0;

/* ring-buffer neighbors of bucket (i), with wraparound */
#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)

#if VM_PAGE_BUCKETS_CHECK
int vm_page_buckets_check_interval = 80; /* in eighths of a second */
#endif /* VM_PAGE_BUCKETS_CHECK */
957
958
959 void
960 record_memory_pressure(void);
961 void
962 record_memory_pressure(void)
963 {
964 unsigned int vm_pageout_next;
965
966 #if VM_PAGE_BUCKETS_CHECK
967 /* check the consistency of VM page buckets at regular interval */
968 static int counter = 0;
969 if ((++counter % vm_page_buckets_check_interval) == 0) {
970 vm_page_buckets_check();
971 }
972 #endif /* VM_PAGE_BUCKETS_CHECK */
973
974 vm_pageout_state.vm_memory_pressure =
975 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
976 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
977 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
978 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
979
980 commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
981
982 /* move "now" forward */
983 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
984
985 bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
986
987 vm_pageout_stat_now = vm_pageout_next;
988 }
989
990
991 /*
992 * IMPORTANT
993 * mach_vm_ctl_page_free_wanted() is called indirectly, via
994 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
995 * it must be safe in the restricted stackshot context. Locks and/or
996 * blocking are not allowable.
997 */
998 unsigned int
999 mach_vm_ctl_page_free_wanted(void)
1000 {
1001 unsigned int page_free_target, page_free_count, page_free_wanted;
1002
1003 page_free_target = vm_page_free_target;
1004 page_free_count = vm_page_free_count;
1005 if (page_free_target > page_free_count) {
1006 page_free_wanted = page_free_target - page_free_count;
1007 } else {
1008 page_free_wanted = 0;
1009 }
1010
1011 return page_free_wanted;
1012 }
1013
1014
1015 /*
1016 * IMPORTANT:
1017 * mach_vm_pressure_monitor() is called when taking a stackshot, with
1018 * wait_for_pressure FALSE, so that code path must remain safe in the
1019 * restricted stackshot context. No blocking or locks are allowable.
1020 * on that code path.
1021 */
1022
/*
 * Report memory pressure to a caller: optionally block until pressure
 * exists, then return how many pages the system wants reclaimed and how
 * many pages were reclaimed over the last "nsecs_monitored" seconds.
 * The non-waiting path takes no locks (stackshot-safe, see above).
 */
kern_return_t
mach_vm_pressure_monitor(
	boolean_t wait_for_pressure,
	unsigned int nsecs_monitored,
	unsigned int *pages_reclaimed_p,
	unsigned int *pages_wanted_p)
{
	wait_result_t wr;
	unsigned int vm_pageout_then, vm_pageout_now;
	unsigned int pages_reclaimed;
	unsigned int units_of_monitor;

	/*
	 * Convert the monitored window into stats-ring slots.
	 * NOTE(review): assumes 8 vm_pageout_stats samples per second --
	 * confirm against the update_vm_info() sampling cadence.
	 */
	units_of_monitor = 8 * nsecs_monitored;
	/*
	 * We don't take the vm_page_queue_lock here because we don't want
	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
	 * thread when it's trying to reclaim memory. We don't need fully
	 * accurate monitoring anyway...
	 */

	if (wait_for_pressure) {
		/* wait until there's memory pressure */
		while (vm_page_free_count >= vm_page_free_target) {
			wr = assert_wait((event_t) &vm_page_free_wanted,
			    THREAD_INTERRUPTIBLE);
			if (wr == THREAD_WAITING) {
				wr = thread_block(THREAD_CONTINUE_NULL);
			}
			if (wr == THREAD_INTERRUPTED) {
				return KERN_ABORTED;
			}
			if (wr == THREAD_AWAKENED) {
				/*
				 * The memory pressure might have already
				 * been relieved but let's not block again
				 * and let's report that there was memory
				 * pressure at some point.
				 */
				break;
			}
		}
	}

	/* provide the number of pages the system wants to reclaim */
	if (pages_wanted_p != NULL) {
		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
	}

	if (pages_reclaimed_p == NULL) {
		return KERN_SUCCESS;
	}

	/* provide number of pages reclaimed in the last "nsecs_monitored" */
	vm_pageout_now = vm_pageout_stat_now;
	pages_reclaimed = 0;
	/*
	 * Walk backwards through the stats ring starting at the slot
	 * before "now", accumulating freed-page counts, until we either
	 * wrap back around to "now" or exhaust the requested window.
	 * Unlocked by design: slightly stale data is acceptable here.
	 */
	for (vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
	    vm_pageout_then != vm_pageout_now &&
	    units_of_monitor-- != 0;
	    vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
	}
	*pages_reclaimed_p = pages_reclaimed;

	return KERN_SUCCESS;
}
1093
1094
1095
1096 #if DEVELOPMENT || DEBUG
1097
1098 static void
1099 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1100
1101 /*
1102 * condition variable used to make sure there is
1103 * only a single sweep going on at a time
1104 */
1105 bool vm_pageout_disconnect_all_pages_active = false;
1106
1107 void
1108 vm_pageout_disconnect_all_pages()
1109 {
1110 vm_page_lock_queues();
1111
1112 if (vm_pageout_disconnect_all_pages_active) {
1113 vm_page_unlock_queues();
1114 return;
1115 }
1116 vm_pageout_disconnect_all_pages_active = true;
1117
1118 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled,
1119 vm_page_throttled_count);
1120 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous,
1121 vm_page_anonymous_count);
1122 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive,
1123 (vm_page_inactive_count - vm_page_anonymous_count));
1124 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active,
1125 vm_page_active_count);
1126 #ifdef CONFIG_SECLUDED_MEMORY
1127 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded,
1128 vm_page_secluded_count);
1129 #endif /* CONFIG_SECLUDED_MEMORY */
1130 vm_page_unlock_queues();
1131
1132 vm_pageout_disconnect_all_pages_active = false;
1133 }
1134
/* NB: assumes the page_queues lock is held on entry, returns with page queue lock held */
void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t m;
	vm_object_t t_object = NULL;    /* last object whose trylock failed */
	vm_object_t l_object = NULL;    /* object we currently hold locked */
	vm_object_t m_object = NULL;    /* object of the candidate page */
	int delayed_unlock = 0;         /* iterations since queues lock last yielded */
	int try_failed_count = 0;       /* consecutive trylock failures on t_object */
	int disconnected_count = 0;     /* pages actually pmap-disconnected */
	int paused_count = 0;           /* times we mutex_pause()d for an object lock */
	int object_locked_count = 0;    /* objects successfully trylocked */

	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
	    DBG_FUNC_START),
	    q, qcount);

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		if (m_object == VM_OBJECT_NULL) {
			/*
			 * Bumped into a free page. This should only happen on the
			 * secluded queue
			 */
#if CONFIG_SECLUDED_MEMORY
			assert(q == &vm_page_queue_secluded);
#endif /* CONFIG_SECLUDED_MEMORY */
			goto reenter_pg_on_q;
		}

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				/* a new object: reset the retry counter */
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've alread got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					/* give up on this page after ~20 attempts */
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				paused_count++;

				t_object = m_object;
				continue;
			}
			object_locked_count++;

			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry ||
		    m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) ||
		    m->vmp_free_when_done) {
			/*
			 * page is busy/transient in some way, skip it and
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			/* remove every pmap mapping of this page */
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

			disconnected_count++;
		}
reenter_pg_on_q:
		/* re-queue the page so the sweep keeps advancing */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);

		qcount--;
		try_failed_count = 0;

		if (delayed_unlock++ > 128) {
			/* periodically yield the queues lock to reduce contention */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}

	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
	    DBG_FUNC_END),
	    q, disconnected_count, object_locked_count, paused_count);
}
1252
1253 extern const char *proc_best_name(struct proc* proc);
1254
1255 int
1256 vm_toggle_task_selfdonate_pages(task_t task)
1257 {
1258 int state = 0;
1259 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1260 printf("VM Donation mode is OFF on the system\n");
1261 return state;
1262 }
1263 if (task != kernel_task) {
1264 task_lock(task);
1265 if (!task->donates_own_pages) {
1266 printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1267 task->donates_own_pages = true;
1268 state = 1;
1269 } else if (task->donates_own_pages) {
1270 printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1271 task->donates_own_pages = false;
1272 state = 0;
1273 }
1274 task_unlock(task);
1275 }
1276 return state;
1277 }
1278 #endif /* DEVELOPMENT || DEBUG */
1279
1280 void
1281 vm_task_set_selfdonate_pages(task_t task, bool donate)
1282 {
1283 assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
1284 assert(task != kernel_task);
1285
1286 task_lock(task);
1287 task->donates_own_pages = donate;
1288 task_unlock(task);
1289 }
1290
1291
1292
1293 static size_t
1294 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1295
1296 /*
1297 * condition variable used to make sure there is
1298 * only a single sweep going on at a time
1299 */
1300 boolean_t vm_pageout_anonymous_pages_active = FALSE;
1301
1302
1303 kern_return_t
1304 vm_pageout_anonymous_pages()
1305 {
1306 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1307 size_t throttled_pages_moved, anonymous_pages_moved, active_pages_moved;
1308 vm_page_lock_queues();
1309
1310 if (vm_pageout_anonymous_pages_active == TRUE) {
1311 vm_page_unlock_queues();
1312 return KERN_RESOURCE_SHORTAGE;
1313 }
1314 vm_pageout_anonymous_pages_active = TRUE;
1315 vm_page_unlock_queues();
1316
1317 throttled_pages_moved = vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1318 anonymous_pages_moved = vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1319 active_pages_moved = vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1320
1321 os_log(OS_LOG_DEFAULT,
1322 "%s: throttled pages moved: %zu, anonymous pages moved: %zu, active pages moved: %zu",
1323 __func__, throttled_pages_moved, anonymous_pages_moved, active_pages_moved);
1324
1325 if (VM_CONFIG_SWAP_IS_PRESENT) {
1326 vm_consider_swapping();
1327 }
1328
1329 vm_page_lock_queues();
1330 vm_pageout_anonymous_pages_active = FALSE;
1331 vm_page_unlock_queues();
1332 return KERN_SUCCESS;
1333 } else {
1334 return KERN_NOT_SUPPORTED;
1335 }
1336 }
1337
1338
/*
 * Walk up to "qcount" pages of queue "q", pushing every dirty/precious
 * internal page to the pageout queue "iq" (the compressor's internal
 * queue, or the benchmark queue when perf_test is set) and freeing
 * clean ones. Returns the number of pages handed to the pageout queue.
 *
 * Takes and releases the page queues lock internally; may transiently
 * drop it (object trylock misses, throttling, pager creation), so the
 * queue can change underneath us -- the code re-validates after each drop.
 */
size_t
vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
{
	vm_page_t m;
	vm_object_t t_object = NULL;    /* last object whose trylock failed */
	vm_object_t l_object = NULL;    /* object we currently hold locked */
	vm_object_t m_object = NULL;    /* object of the candidate page */
	int delayed_unlock = 0;
	int try_failed_count = 0;
	int refmod_state;
	int pmap_options;
	struct vm_pageout_queue *iq;
	ppnum_t phys_page;
	size_t pages_moved = 0;


	iq = &vm_pageout_queue_internal;

	vm_page_lock_queues();

#if DEVELOPMENT || DEBUG
	if (perf_test) {
		iq = &vm_pageout_queue_benchmark;
		// ensure the benchmark queue isn't throttled
		iq->pgo_maxlaundry = (unsigned int) qcount;
	}
#endif /* DEVELOPMENT || DEBUG */

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		if (VM_PAGE_Q_THROTTLED(iq)) {
			/*
			 * target pageout queue is full: drop our locks and
			 * sleep until it drains, then re-evaluate from the top
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			iq->pgo_draining = TRUE;

			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
			vm_page_unlock_queues();

			thread_block(THREAD_CONTINUE_NULL);

			vm_page_lock_queues();
			delayed_unlock = 0;
			continue;
		}
		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			if (!m_object->internal) {
				/* only internal (anonymous) pages are eligible */
				goto reenter_pg_on_q;
			}

			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've alread got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					/* give up on this page after ~20 attempts */
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				t_object = m_object;
				continue;
			}
			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(m);

		/* pull reference/modify state up from the pmap layer */
		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(phys_page);

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}
		if (m->vmp_reference == TRUE) {
			/* recently used: clear the reference and give it another pass */
			m->vmp_reference = FALSE;
			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			/*
			 * disconnect all mappings; let the pmap layer decide
			 * whether the page heads to the compressor based on
			 * its (possibly just-discovered) modified state
			 */
			if (m->vmp_dirty || m->vmp_precious) {
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		if (!m->vmp_dirty && !m->vmp_precious) {
			/* clean and expendable: just free it */
			vm_page_unlock_queues();
			VM_PAGE_FREE(m);
			vm_page_lock_queues();
			delayed_unlock = 0;

			goto next_pg;
		}
		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
			/* make sure the object has a compressor pager before queueing */
			if (!m_object->pager_initialized) {
				vm_page_unlock_queues();

				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);

				if (!m_object->pager_initialized) {
					vm_object_compressor_pager_create(m_object);
				}

				vm_page_lock_queues();
				delayed_unlock = 0;
			}
			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
				/*
				 * We dropped the page queues lock above, so
				 * "m" might no longer be on this queue...
				 */
				if (m != (vm_page_t) vm_page_queue_first(q)) {
					continue;
				}
				goto reenter_pg_on_q;
			}
			/*
			 * vm_object_compressor_pager_create will drop the object lock
			 * which means 'm' may no longer be valid to use
			 */
			continue;
		}

		if (!perf_test) {
			/*
			 * we've already factored out pages in the laundry which
			 * means this page can't be on the pageout queue so it's
			 * safe to do the vm_page_queues_remove
			 */
			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			vm_page_queues_remove(m, TRUE);
			if (donate) {
				/*
				 * The compressor needs to see this bit to know
				 * where this page needs to land. Also if stolen,
				 * this bit helps put the page back in the right
				 * special queue where it belongs.
				 */
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
			}
		} else {
			vm_page_queue_remove(q, m, vmp_pageq);
		}

		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		/* hand the page to the target pageout queue */
		vm_pageout_cluster_to_queue(m, iq);

		pages_moved++;
		goto next_pg;

reenter_pg_on_q:
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);
next_pg:
		qcount--;
		try_failed_count = 0;

		if (delayed_unlock++ > 128) {
			/* periodically yield the queues lock to reduce contention */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();
	return pages_moved;
}
1558
1559
1560
1561 /*
1562 * function in BSD to apply I/O throttle to the pageout thread
1563 */
1564 extern void vm_pageout_io_throttle(void);
1565
/*
 * VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj):
 * fold a "reusable" page (or a whole "all_reusable" object) back into
 * regular accounting when the page turns out to still be in use --
 * see the interior comment for the policy rationale.
 */
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \
	MACRO_BEGIN \
	/* \
	 * If a "reusable" page somehow made it back into \
	 * the active queue, it's been re-used and is not \
	 * quite re-usable. \
	 * If the VM object was "all_reusable", consider it \
	 * as "all re-used" instead of converting it to \
	 * "partially re-used", which could be expensive. \
	 */ \
	assert(VM_PAGE_OBJECT((m)) == (obj)); \
	if ((m)->vmp_reusable || \
	    (obj)->all_reusable) { \
	        vm_object_reuse_pages((obj), \
	            (m)->vmp_offset, \
	            (m)->vmp_offset + PAGE_SIZE_64, \
	            FALSE); \
	} \
	MACRO_END
1585
1586
1587 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
1588 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024
1589
1590 #define FCS_IDLE 0
1591 #define FCS_DELAYED 1
1592 #define FCS_DEADLOCK_DETECTED 2
1593
/*
 * Throttle/deadlock-detection state; "state" cycles through the FCS_*
 * values defined above. NOTE(review): presumably consumed by
 * vm_pageout_scan()'s flow-control logic -- not visible in this chunk.
 */
struct flow_control {
	int state;              /* FCS_IDLE, FCS_DELAYED or FCS_DEADLOCK_DETECTED */
	mach_timespec_t ts;     /* time reference used while delayed */
};
1598
1599
1600 uint64_t vm_pageout_rejected_bq_internal = 0;
1601 uint64_t vm_pageout_rejected_bq_external = 0;
1602 uint64_t vm_pageout_skipped_bq_internal = 0;
1603 uint64_t vm_pageout_skipped_bq_external = 0;
1604
1605 #define ANONS_GRABBED_LIMIT 2
1606
1607
1608 #if 0
1609 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1610 #endif
1611 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1612
1613 #define VM_PAGEOUT_PB_NO_ACTION 0
1614 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1615 #define VM_PAGEOUT_PB_THREAD_YIELD 2
1616
1617
#if 0
/*
 * NOTE: dead code -- compiled out together with its forward declaration
 * above. Kept for reference only.
 */
static void
vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
{
	if (*local_freeq) {
		vm_page_unlock_queues();

		VM_DEBUG_CONSTANT_EVENT(
			vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_START,
			vm_page_free_count, 0, 0, 1);

		vm_page_free_list(*local_freeq, TRUE);

		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_END,
		    vm_page_free_count, *local_freed, 0, 1);

		*local_freeq = NULL;
		*local_freed = 0;

		vm_page_lock_queues();
	} else {
		lck_mtx_yield(&vm_page_queue_lock);
	}
	*delayed_unlock = 1;
}
#endif
1644
1645
1646 static void
1647 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1648 vm_page_t *local_freeq, int *local_freed, int action)
1649 {
1650 vm_page_unlock_queues();
1651
1652 if (*object != NULL) {
1653 vm_object_unlock(*object);
1654 *object = NULL;
1655 }
1656 if (*local_freeq) {
1657 vm_page_free_list(*local_freeq, TRUE);
1658
1659 *local_freeq = NULL;
1660 *local_freed = 0;
1661 }
1662 *delayed_unlock = 1;
1663
1664 switch (action) {
1665 case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1666 vm_consider_waking_compactor_swapper();
1667 break;
1668 case VM_PAGEOUT_PB_THREAD_YIELD:
1669 thread_yield_internal(1);
1670 break;
1671 case VM_PAGEOUT_PB_NO_ACTION:
1672 default:
1673 break;
1674 }
1675 vm_page_lock_queues();
1676 }
1677
1678
1679 static struct vm_pageout_vminfo last;
1680
1681 uint64_t last_vm_page_pages_grabbed = 0;
1682
1683 extern uint32_t c_segment_pages_compressed;
1684
1685 extern uint64_t shared_region_pager_reclaimed;
1686 extern struct memory_object_pager_ops shared_region_pager_ops;
1687
/*
 * Snapshot global VM counters into the current vm_pageout_stats ring
 * slot, convert the monotonically-increasing vm_pageout_vminfo counters
 * into per-interval deltas (against the previous values cached in
 * "last"), emit the DBG_VM_INFO* tracepoints, then advance the stats
 * ring via record_memory_pressure().
 */
void
update_vm_info(void)
{
	unsigned long tmp;
	uint64_t tmp64;

	/* absolute page counts at this instant */
	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;

	/* per-interval deltas: running counter minus its cached previous value */
	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
	last.vm_pageout_considered_page = tmp;

	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
	last.vm_pageout_compressions = tmp64;

	tmp = vm_pageout_vminfo.vm_compressor_failed;
	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
	last.vm_compressor_failed = tmp;

	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
	last.vm_compressor_pages_grabbed = tmp64;

	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
	last.vm_phantom_cache_found_ghost = tmp;

	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
	last.vm_phantom_cache_added_ghost = tmp;

	tmp64 = counter_load(&vm_page_grab_count);
	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
	last_vm_page_pages_grabbed = tmp64;

	tmp = vm_pageout_vminfo.vm_page_pages_freed;
	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
	last.vm_page_pages_freed = tmp;

	/* these deltas are only computed when the scan considered at least one page */
	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
		last.vm_pageout_pages_evicted = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
		last.vm_pageout_pages_purged = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
		last.vm_pageout_freed_speculative = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
		last.vm_pageout_freed_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
		last.vm_pageout_inactive_referenced = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
		last.vm_pageout_scan_inactive_throttled_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
		last.vm_pageout_inactive_dirty_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
		last.vm_pageout_freed_cleaned = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
		last.vm_pageout_inactive_nolock = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
		last.vm_pageout_scan_inactive_throttled_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
		last.vm_pageout_skipped_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
		last.vm_pageout_skipped_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
		last.vm_pageout_reactivation_limit_exceeded = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
		last.vm_pageout_inactive_force_reclaim = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
		last.vm_pageout_freed_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
		last.vm_pageout_considered_bq_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
		last.vm_pageout_considered_bq_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
		last.vm_pageout_filecache_min_reactivated = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
		last.vm_pageout_inactive_dirty_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
		last.vm_pageout_forcereclaimed_sharedcache = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
		last.vm_pageout_forcereclaimed_realtime = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
		vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
		last.vm_pageout_protected_sharedcache = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
		vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
		last.vm_pageout_protected_realtime = tmp;
	}

	/* emit the snapshot via kdebug tracepoints */
	KDBG((VMDBG_CODE(DBG_VM_INFO1)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count);

	KDBG((VMDBG_CODE(DBG_VM_INFO2)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count);

	KDBG((VMDBG_CODE(DBG_VM_INFO3)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count);

	/* the detailed tracepoints are only emitted when something happened */
	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
		KDBG((VMDBG_CODE(DBG_VM_INFO4)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].considered,
		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced);

		KDBG((VMDBG_CODE(DBG_VM_INFO5)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock);

		KDBG((VMDBG_CODE(DBG_VM_INFO6)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
		    vm_pageout_stats[vm_pageout_stat_now].skipped_external);

		KDBG((VMDBG_CODE(DBG_VM_INFO7)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
		    vm_pageout_stats[vm_pageout_stat_now].freed_internal);

		KDBG((VMDBG_CODE(DBG_VM_INFO8)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal);

		KDBG((VMDBG_CODE(DBG_VM_INFO10)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
		    vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
		    vm_pageout_stats[vm_pageout_stat_now].protected_realtime);
	}
	KDBG((VMDBG_CODE(DBG_VM_INFO9)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added);

	/* publish pressure and rotate the stats ring to the next slot */
	record_memory_pressure();
}
1899
1900 extern boolean_t hibernation_vmqueues_inspection;
1901
1902 /*
1903 * Return values for functions called by vm_pageout_scan
1904 * that control its flow.
1905 *
1906 * PROCEED -- vm_pageout_scan will keep making forward progress.
1907 * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1908 * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1909 */
1910
1911 #define VM_PAGEOUT_SCAN_PROCEED (0)
1912 #define VM_PAGEOUT_SCAN_DONE_RETURN (1)
1913 #define VM_PAGEOUT_SCAN_NEXT_ITERATION (2)
1914
1915 /*
1916 * This function is called only from vm_pageout_scan and
1917 * it moves overflow secluded pages (one-at-a-time) to the
1918 * batched 'local' free Q or active Q.
1919 */
static void
vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
{
#if CONFIG_SECLUDED_MEMORY
	/*
	 * Deal with secluded_q overflow.
	 */
	if (vm_page_secluded_count > vm_page_secluded_target) {
		vm_page_t secluded_page;

		/*
		 * SECLUDED_AGING_BEFORE_ACTIVE:
		 * Excess secluded pages go to the active queue and
		 * will later go to the inactive queue.
		 */
		assert((vm_page_secluded_count_free +
		    vm_page_secluded_count_inuse) ==
		    vm_page_secluded_count);
		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);

		vm_page_queues_remove(secluded_page, FALSE);
		assert(!secluded_page->vmp_fictitious);
		assert(!VM_PAGE_WIRED(secluded_page));

		/* vmp_object == 0: page has no owning object, i.e. it's free */
		if (secluded_page->vmp_object == 0) {
			/* transfer to free queue */
			assert(secluded_page->vmp_busy);
			secluded_page->vmp_snext = *local_freeq;
			*local_freeq = secluded_page;
			*local_freed += 1;
		} else {
			/* transfer to head of active queue */
			vm_page_enqueue_active(secluded_page, FALSE);
			secluded_page = VM_PAGE_NULL;
		}
	}
#else /* CONFIG_SECLUDED_MEMORY */

#pragma unused(local_freeq)
#pragma unused(local_freed)

	return;

#endif /* CONFIG_SECLUDED_MEMORY */
}
1966
1967 /*
1968 * This function is called only from vm_pageout_scan and
1969 * it initializes the loop targets for vm_pageout_scan().
1970 */
1971 static void
1972 vps_init_page_targets(void)
1973 {
1974 /*
1975 * LD TODO: Other page targets should be calculated here too.
1976 */
1977 vm_page_anonymous_min = vm_page_inactive_target / 20;
1978
1979 if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1980 vm_pageout_state.vm_page_speculative_percentage = 50;
1981 } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1982 vm_pageout_state.vm_page_speculative_percentage = 1;
1983 }
1984
1985 vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1986 vm_page_inactive_count);
1987 }
1988
1989 /*
1990 * This function is called only from vm_pageout_scan and
1991 * it purges a single VM object at-a-time and will either
1992 * make vm_pageout_scan() restart the loop or keeping moving forward.
1993 */
1994 static int
1995 vps_purge_object()
1996 {
1997 int force_purge;
1998
1999 assert(available_for_purge >= 0);
2000 force_purge = 0; /* no force-purging */
2001
2002 #if VM_PRESSURE_EVENTS
2003 vm_pressure_level_t pressure_level;
2004
2005 pressure_level = memorystatus_vm_pressure_level;
2006
2007 if (pressure_level > kVMPressureNormal) {
2008 if (pressure_level >= kVMPressureCritical) {
2009 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
2010 } else if (pressure_level >= kVMPressureUrgent) {
2011 force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
2012 } else if (pressure_level >= kVMPressureWarning) {
2013 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
2014 }
2015 }
2016 #endif /* VM_PRESSURE_EVENTS */
2017
2018 if (available_for_purge || force_purge) {
2019 memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
2020
2021 VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
2022 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
2023 VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
2024 VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
2025 memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2026
2027 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2028 }
2029 VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2030 memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2031 }
2032
2033 return VM_PAGEOUT_SCAN_PROCEED;
2034 }
2035
2036 /*
2037 * This function is called only from vm_pageout_scan and
2038 * it will try to age the next speculative Q if the oldest
2039 * one is empty.
2040 */
static int
vps_age_speculative_queue(boolean_t force_speculative_aging)
{
#define DELAY_SPECULATIVE_AGE  1000

	/*
	 * try to pull pages from the aging bins...
	 * see vm_page_internal.h for an explanation of how
	 * this mechanism works
	 */
	boolean_t                       can_steal = FALSE;
	int                             num_scanned_queues;
	static int                      delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/
	mach_timespec_t                 ts;
	struct vm_speculative_age_q     *aq;
	struct vm_speculative_age_q     *sq;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	aq = &vm_page_queue_speculative[speculative_steal_index];

	/*
	 * advance speculative_steal_index (with wraparound) until we find a
	 * non-empty aging bin, or we've looked at every bin once.
	 */
	num_scanned_queues = 0;
	while (vm_page_queue_empty(&aq->age_q) &&
	    num_scanned_queues++ != vm_page_max_speculative_age_q) {
		speculative_steal_index++;

		if (speculative_steal_index > vm_page_max_speculative_age_q) {
			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
		}

		aq = &vm_page_queue_speculative[speculative_steal_index];
	}

	if (num_scanned_queues == vm_page_max_speculative_age_q + 1) {
		/*
		 * XXX We've scanned all the speculative
		 * queues but still haven't found one
		 * that is not empty, even though
		 * vm_page_speculative_count is not 0.
		 */
		if (!vm_page_queue_empty(&sq->age_q)) {
			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
		}
#if DEVELOPMENT || DEBUG
		panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
#endif
		/* readjust... */
		/* on RELEASE builds, repair the stale count and move on */
		vm_page_speculative_count = 0;
		/* ... and continue */
		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
	}

	/*
	 * decide whether the selected bin is ripe for stealing: either we're
	 * over the speculative target (or forced), or the bin has fully aged.
	 */
	if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
		can_steal = TRUE;
	} else {
		if (!delay_speculative_age) {
			mach_timespec_t ts_fully_aged;

			/*
			 * compute the absolute time at which this bin is
			 * considered fully aged: its birth timestamp plus
			 * (max_age_q * q_age_ms) converted to sec/nsec.
			 */
			ts_fully_aged.tv_sec = (vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
			ts_fully_aged.tv_nsec = ((vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
			    * 1000 * NSEC_PER_USEC;

			ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);

			clock_sec_t sec;
			clock_nsec_t nsec;
			clock_get_system_nanotime(&sec, &nsec);
			ts.tv_sec = (unsigned int) sec;
			ts.tv_nsec = nsec;

			if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
				can_steal = TRUE;
			} else {
				/* not aged yet: start the re-check backoff counter */
				delay_speculative_age++;
			}
		} else {
			/*
			 * backoff in progress: skip the clock comparison for
			 * DELAY_SPECULATIVE_AGE passes through the main loop,
			 * then reset so the next pass re-checks the clock.
			 */
			delay_speculative_age++;
			if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
				delay_speculative_age = 0;
			}
		}
	}
	if (can_steal == TRUE) {
		/* move the whole bin onto the aged (reclaimable) queue */
		vm_page_speculate_ageit(aq);
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
2129
2130 /*
2131 * This function is called only from vm_pageout_scan and
2132 * it evicts a single VM object from the cache.
2133 */
static int inline
vps_object_cache_evict(vm_object_t *object_to_unlock)
{
	/*
	 * throttle counter: after a failed eviction attempt, skip the next
	 * 1000 calls before trying the object cache again (persists across
	 * calls; only touched from vm_pageout_scan's thread).
	 */
	static int cache_evict_throttle = 0;
	struct vm_speculative_age_q *sq;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/* only evict when the aged speculative queue has nothing to offer */
	if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
		int     pages_evicted;

		/* drop the caller's object lock before calling into the cache */
		if (*object_to_unlock != NULL) {
			vm_object_unlock(*object_to_unlock);
			*object_to_unlock = NULL;
		}
		/* NOTE(review): 0x13001ec is a raw kdebug tracepoint code -- presumably
		 * a VM eviction trace event; confirm against the kdebug code tables */
		KDBG(0x13001ec | DBG_FUNC_START);

		/* try to evict up to 100 pages, scanning up to 10 objects */
		pages_evicted = vm_object_cache_evict(100, 10);

		KDBG(0x13001ec | DBG_FUNC_END, pages_evicted);

		if (pages_evicted) {
			vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;

			VM_DEBUG_EVENT(vm_pageout_cache_evict, DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
			    vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
			memoryshot(DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);

			/*
			 * we just freed up to 100 pages,
			 * so go back to the top of the main loop
			 * and re-evaulate the memory situation
			 */
			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
		} else {
			/* nothing evictable: back off for 1000 calls */
			cache_evict_throttle = 1000;
		}
	}
	if (cache_evict_throttle) {
		cache_evict_throttle--;
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
2178
2179
2180 /*
2181 * This function is called only from vm_pageout_scan and
2182 * it calculates the filecache min. that needs to be maintained
2183 * as we start to steal pages.
2184 */
2185 static void
2186 vps_calculate_filecache_min(void)
2187 {
2188 int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2189
2190 #if CONFIG_JETSAM
2191 /*
2192 * don't let the filecache_min fall below 15% of available memory
2193 * on systems with an active compressor that isn't nearing its
2194 * limits w/r to accepting new data
2195 *
2196 * on systems w/o the compressor/swapper, the filecache is always
2197 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2198 * since most (if not all) of the anonymous pages are in the
2199 * throttled queue (which isn't counted as available) which
2200 * effectively disables this filter
2201 */
2202 if (vm_compressor_low_on_space() || divisor == 0) {
2203 vm_pageout_state.vm_page_filecache_min = 0;
2204 } else {
2205 vm_pageout_state.vm_page_filecache_min =
2206 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2207 }
2208 #else
2209 if (vm_compressor_out_of_space() || divisor == 0) {
2210 vm_pageout_state.vm_page_filecache_min = 0;
2211 } else {
2212 /*
2213 * don't let the filecache_min fall below the specified critical level
2214 */
2215 vm_pageout_state.vm_page_filecache_min =
2216 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2217 }
2218 #endif
2219 if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2220 vm_pageout_state.vm_page_filecache_min = 0;
2221 }
2222 }
2223
2224 /*
2225 * This function is called only from vm_pageout_scan and
2226 * it updates the flow control time to detect if VM pageoutscan
2227 * isn't making progress.
2228 */
2229 static void
2230 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2231 {
2232 mach_timespec_t ts;
2233 clock_sec_t sec;
2234 clock_nsec_t nsec;
2235
2236 ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2237 ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2238 clock_get_system_nanotime(&sec, &nsec);
2239 flow_control->ts.tv_sec = (unsigned int) sec;
2240 flow_control->ts.tv_nsec = nsec;
2241 ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2242
2243 flow_control->state = FCS_DELAYED;
2244
2245 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2246 }
2247
2248 /*
2249 * This function is called only from vm_pageout_scan and
2250 * it is the flow control logic of VM pageout scan which
2251 * controls if it should block and for how long.
2252 * Any blocking of vm_pageout_scan happens ONLY in this function.
2253 */
static int
vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
    vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
{
	boolean_t       exceeded_burst_throttle = FALSE;
	unsigned int    msecs = 0;
	uint32_t        inactive_external_count;
	mach_timespec_t ts;
	struct vm_pageout_queue *iq;
	struct vm_pageout_queue *eq;  /* NOTE(review): assigned below but not otherwise used here */
	struct vm_speculative_age_q *sq;

	iq = &vm_pageout_queue_internal;
	eq = &vm_pageout_queue_external;
	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/*
	 * Sometimes we have to pause:
	 *	1) No inactive pages - nothing to do.
	 *	2) Loop control - no acceptable pages found on the inactive queue
	 *         within the last vm_pageout_burst_inactive_throttle iterations
	 *	3) Flow control - default pageout queue is full
	 */
	if (vm_page_queue_empty(&vm_page_queue_inactive) &&
	    vm_page_queue_empty(&vm_page_queue_anonymous) &&
	    vm_page_queue_empty(&vm_page_queue_cleaned) &&
	    vm_page_queue_empty(&sq->age_q)) {
		/* case 1: every reclaim source is empty */
		VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
		msecs = vm_pageout_state.vm_pageout_empty_wait;
	} else if (inactive_burst_count >=
	    MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
	    (vm_page_inactive_count +
	    vm_page_speculative_count))) {
		/* case 2: too many consecutive unusable inactive pages */
		VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
		msecs = vm_pageout_state.vm_pageout_burst_wait;

		exceeded_burst_throttle = TRUE;
	} else if (VM_PAGE_Q_THROTTLED(iq) &&
	    VM_DYNAMIC_PAGING_ENABLED()) {
		/* case 3: internal (compressor) pageout queue is full */
		clock_sec_t sec;
		clock_nsec_t nsec;

		switch (flow_control->state) {
		case FCS_IDLE:
			if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
			    vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
				/*
				 * since the compressor is running independently of vm_pageout_scan
				 * let's not wait for it just yet... as long as we have a healthy supply
				 * of filecache pages to work with, let's keep stealing those.
				 */
				inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;

				if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
				    (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
					/* force victim selection away from anonymous pages */
					*anons_grabbed = ANONS_GRABBED_LIMIT;
					VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
					return VM_PAGEOUT_SCAN_PROCEED;
				}
			}

			/* start the deadlock detection timer: FCS_IDLE -> FCS_DELAYED */
			vps_flow_control_reset_deadlock_timer(flow_control);
			msecs = vm_pageout_state.vm_pageout_deadlock_wait;

			break;

		case FCS_DELAYED:
			clock_get_system_nanotime(&sec, &nsec);
			ts.tv_sec = (unsigned int) sec;
			ts.tv_nsec = nsec;

			if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
				/*
				 * the pageout thread for the default pager is potentially
				 * deadlocked since the
				 * default pager queue has been throttled for more than the
				 * allowable time... we need to move some clean pages or dirty
				 * pages belonging to the external pagers if they aren't throttled
				 * vm_page_free_wanted represents the number of threads currently
				 * blocked waiting for pages... we'll move one page for each of
				 * these plus a fixed amount to break the logjam... once we're done
				 * moving this number of pages, we'll re-enter the FSC_DELAYED state
				 * with a new timeout target since we have no way of knowing
				 * whether we've broken the deadlock except through observation
				 * of the queue associated with the default pager... we need to
				 * stop moving pages and allow the system to run to see what
				 * state it settles into.
				 */

				*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
				    vm_page_free_wanted + vm_page_free_wanted_privileged;
				VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
				flow_control->state = FCS_DEADLOCK_DETECTED;
				/* kick the garbage-collection thread to help relieve pressure */
				thread_wakeup(VM_PAGEOUT_GC_EVENT);
				return VM_PAGEOUT_SCAN_PROCEED;
			}
			/*
			 * just resniff instead of trying
			 * to compute a new delay time... we're going to be
			 * awakened immediately upon a laundry completion,
			 * so we won't wait any longer than necessary
			 */
			msecs = vm_pageout_state.vm_pageout_idle_wait;
			break;

		case FCS_DEADLOCK_DETECTED:
			/* keep moving pages until the deadlock-relief quota is spent */
			if (*vm_pageout_deadlock_target) {
				return VM_PAGEOUT_SCAN_PROCEED;
			}

			/* quota exhausted: re-arm the timer and go back to FCS_DELAYED */
			vps_flow_control_reset_deadlock_timer(flow_control);
			msecs = vm_pageout_state.vm_pageout_deadlock_wait;

			break;
		}
	} else {
		/*
		 * No need to pause...
		 */
		return VM_PAGEOUT_SCAN_PROCEED;
	}

	vm_pageout_scan_wants_object = VM_OBJECT_NULL;

	/* flush the batched local free list and drop the object lock before blocking */
	vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
	    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);

	if (vm_page_free_count >= vm_page_free_target) {
		/*
		 * we're here because
		 *  1) someone else freed up some pages while we had
		 *     the queues unlocked above
		 * and we've hit one of the 3 conditions that
		 * cause us to pause the pageout scan thread
		 *
		 * since we already have enough free pages,
		 * let's avoid stalling and return normally
		 *
		 * before we return, make sure the pageout I/O threads
		 * are running throttled in case there are still requests
		 * in the laundry... since we have enough free pages
		 * we don't need the laundry to be cleaned in a timely
		 * fashion... so let's avoid interfering with foreground
		 * activity
		 *
		 * we don't want to hold vm_page_queue_free_lock when
		 * calling vm_pageout_adjust_eq_iothrottle (since it
		 * may cause other locks to be taken), we do the intitial
		 * check outside of the lock.  Once we take the lock,
		 * we recheck the condition since it may have changed.
		 * if it has, no problem, we will make the threads
		 * non-throttled before actually blocking
		 */
		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
	}
	vm_free_page_lock();

	if (vm_page_free_count >= vm_page_free_target &&
	    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
		/*
		 * NOTE(review): returns with the free-page lock still held;
		 * the DONE_RETURN path in vm_pageout_scan presumably drops
		 * it -- confirm against the caller.
		 */
		return VM_PAGEOUT_SCAN_DONE_RETURN;
	}
	vm_free_page_unlock();

	if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
		/*
		 * we're most likely about to block due to one of
		 * the 3 conditions that cause vm_pageout_scan to
		 * not be able to make forward progress w/r
		 * to providing new pages to the free queue,
		 * so unthrottle the I/O threads in case we
		 * have laundry to be cleaned... it needs
		 * to be completed ASAP.
		 *
		 * even if we don't block, we want the io threads
		 * running unthrottled since the sum of free +
		 * clean pages is still under our free target
		 */
		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
	}
	if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
		/*
		 * if we get here we're below our free target and
		 * we're stalling due to a full laundry queue or
		 * we don't have any inactive pages other then
		 * those in the clean queue...
		 * however, we have pages on the clean queue that
		 * can be moved to the free queue, so let's not
		 * stall the pageout scan
		 */
		flow_control->state = FCS_IDLE;
		return VM_PAGEOUT_SCAN_PROCEED;
	}
	if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
		/* the internal queue drained while we were preparing: no need to wait */
		flow_control->state = FCS_IDLE;
		return VM_PAGEOUT_SCAN_PROCEED;
	}

	VM_CHECK_MEMORYSTATUS;

	if (flow_control->state != FCS_IDLE) {
		VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
	}

	/*
	 * block waiting for laundry completion (or the msecs timeout chosen
	 * above); pgo_throttled tells the I/O thread to wake us on completion.
	 */
	iq->pgo_throttled = TRUE;
	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);

	vm_page_unlock_queues();

	assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);

	VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
	memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);

	thread_block(THREAD_CONTINUE_NULL);

	VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
	memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);

	vm_page_lock_queues();

	iq->pgo_throttled = FALSE;

	/* recompute loop targets: the world may have changed while we slept */
	vps_init_page_targets();

	return VM_PAGEOUT_SCAN_NEXT_ITERATION;
}
2482
2483 extern boolean_t vm_darkwake_mode;
2484 /*
2485 * This function is called only from vm_pageout_scan and
2486 * it will find and return the most appropriate page to be
2487 * reclaimed.
2488 */
static int
vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
    boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
{
	vm_page_t                       m = NULL;
	vm_object_t                     m_object = VM_OBJECT_NULL;
	uint32_t                        inactive_external_count;
	struct vm_speculative_age_q     *sq;
	struct vm_pageout_queue         *iq;
	int                             retval = VM_PAGEOUT_SCAN_PROCEED;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
	iq = &vm_pageout_queue_internal;

	*is_page_from_bg_q = FALSE;

	m = NULL;
	m_object = VM_OBJECT_NULL;

	if (VM_DYNAMIC_PAGING_ENABLED()) {
		assert(vm_page_throttled_count == 0);
		assert(vm_page_queue_empty(&vm_page_queue_throttled));
	}

	/*
	 * Try for a clean-queue inactive page.
	 * These are pages that vm_pageout_scan tried to steal earlier, but
	 * were dirty and had to be cleaned.  Pick them up now that they are clean.
	 */
	if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);

		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);

		goto found_page;
	}

	/*
	 * The next most eligible pages are ones we paged in speculatively,
	 * but which have not yet been touched and have been aged out.
	 */
	if (!vm_page_queue_empty(&sq->age_q)) {
		m = (vm_page_t) vm_page_queue_first(&sq->age_q);

		assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);

		/* when anonymous pages are forced, skip a dirty speculative page */
		if (!m->vmp_dirty || force_anonymous == FALSE) {
			goto found_page;
		} else {
			m = NULL;
		}
	}

#if !CONFIG_JETSAM
	/* next: ripe pages explicitly donated for reclaim (non-jetsam only) */
	if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
		if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
			assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			goto found_page;
		}
	}
#endif /* !CONFIG_JETSAM */

	/* next: pages owned by background (low-priority) tasks, when over target */
	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
		vm_object_t     bg_m_object = NULL;

		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);

		bg_m_object = VM_PAGE_OBJECT(m);

		if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
			/*
			 * This page is on the background queue
			 * but not on a pageable queue OR is busy during
			 * darkwake mode when the target is artificially lowered.
			 * If it is busy during darkwake mode, and we don't skip it,
			 * we will just swing back around and try again with the same
			 * queue and might hit the same page or its neighbor in a
			 * similar state. Both of these are transient states and will
			 * get resolved, but, at this point let's ignore this page.
			 */
			if (vm_darkwake_mode && m->vmp_busy) {
				if (bg_m_object->internal) {
					vm_pageout_skipped_bq_internal++;
				} else {
					vm_pageout_skipped_bq_external++;
				}
			}
		} else if (force_anonymous == FALSE || bg_m_object->internal) {
			/*
			 * don't take an internal page when the compressor path
			 * is throttled/out of space or free memory is critical.
			 */
			if (bg_m_object->internal &&
			    (VM_PAGE_Q_THROTTLED(iq) ||
			    vm_compressor_out_of_space() == TRUE ||
			    vm_page_free_count < (vm_page_free_reserved / 4))) {
				vm_pageout_skipped_bq_internal++;
			} else {
				*is_page_from_bg_q = TRUE;

				if (bg_m_object->internal) {
					vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
				} else {
					vm_pageout_vminfo.vm_pageout_considered_bq_external++;
				}
				goto found_page;
			}
		}
	}

	inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;

	/*
	 * decide whether the next victim should be anonymous (internal) or
	 * file-backed (external), based on filecache protection thresholds.
	 */
	if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
	    (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
		*grab_anonymous = TRUE;
		*anons_grabbed = 0;

		if (VM_CONFIG_SWAP_IS_ACTIVE) {
			vm_pageout_vminfo.vm_pageout_skipped_external++;
		} else {
			if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
				/*
				 * No swap and we are in dangerously low levels of free memory.
				 * If we keep going ahead with anonymous pages, we are going to run into a situation
				 * where the compressor will be stuck waiting for free pages (if it isn't already).
				 *
				 * So, pick a file backed page...
				 */
				*grab_anonymous = FALSE;
				*anons_grabbed = ANONS_GRABBED_LIMIT;
				vm_pageout_vminfo.vm_pageout_skipped_internal++;
			}
		}
		goto want_anonymous;
	}
	*grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);

#if CONFIG_JETSAM
	/* If the file-backed pool has accumulated
	 * significantly more pages than the jetsam
	 * threshold, prefer to reclaim those
	 * inline to minimise compute overhead of reclaiming
	 * anonymous pages.
	 * This calculation does not account for the CPU local
	 * external page queues, as those are expected to be
	 * much smaller relative to the global pools.
	 */

	struct vm_pageout_queue *eq = &vm_pageout_queue_external;

	if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
		if (vm_page_pageable_external_count >
		    vm_pageout_state.vm_page_filecache_min) {
			if ((vm_page_pageable_external_count *
			    vm_pageout_memorystatus_fb_factor_dr) >
			    (memorystatus_get_critical_page_shortage_threshold() *
			    vm_pageout_memorystatus_fb_factor_nr)) {
				*grab_anonymous = FALSE;

				VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
			}
		}
		if (*grab_anonymous) {
			VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
		}
	}
#endif /* CONFIG_JETSAM */

want_anonymous:
	/* external path: take the head of the inactive (file-backed) queue */
	if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
		if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);

			assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
			*anons_grabbed = 0;

			if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
				if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
					/* 99 out of every 100 times, reactivate instead of stealing */
					if ((++(*reactivated_this_call) % 100)) {
						vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;

						vm_page_activate(m);
						counter_inc(&vm_statistics_reactivations);
#if DEVELOPMENT || DEBUG
						/*
						 * NOTE(review): m_object is still VM_OBJECT_NULL here;
						 * this branch is only reachable if *is_page_from_bg_q
						 * were TRUE, which cannot happen on this path (bg-queue
						 * pages jump to found_page above) -- dead but unsafe
						 * looking code; confirm upstream.
						 */
						if (*is_page_from_bg_q == TRUE) {
							if (m_object->internal) {
								vm_pageout_rejected_bq_internal++;
							} else {
								vm_pageout_rejected_bq_external++;
							}
						}
#endif /* DEVELOPMENT || DEBUG */
						vm_pageout_state.vm_pageout_inactive_used++;

						m = NULL;
						retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;

						goto found_page;
					}

					/*
					 * steal 1 of the file backed pages even if
					 * we are under the limit that has been set
					 * for a healthy filecache
					 */
				}
			}
			goto found_page;
		}
	}
	/* anonymous path: take the head of the anonymous (internal) queue */
	if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);

		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
		*anons_grabbed += 1;

		goto found_page;
	}

	/* nothing eligible anywhere: report no victim */
	m = NULL;

found_page:
	*victim_page = m;

	return retval;
}
2712
2713 /*
2714 * This function is called only from vm_pageout_scan and
2715 * it will put a page back on the active/inactive queue
2716 * if we can't reclaim it for some reason.
2717 */
2718 static void
2719 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2720 {
2721 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2722 vm_page_enqueue_inactive(m, FALSE);
2723 } else {
2724 vm_page_activate(m);
2725 }
2726
2727 #if DEVELOPMENT || DEBUG
2728 vm_object_t m_object = VM_PAGE_OBJECT(m);
2729
2730 if (page_from_bg_q == TRUE) {
2731 if (m_object->internal) {
2732 vm_pageout_rejected_bq_internal++;
2733 } else {
2734 vm_pageout_rejected_bq_external++;
2735 }
2736 }
2737 #endif /* DEVELOPMENT || DEBUG */
2738 }
2739
2740 /*
2741 * This function is called only from vm_pageout_scan and
2742 * it will try to grab the victim page's VM object (m_object)
2743 * which differs from the previous victim page's object (object).
2744 */
static int
vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
{
	struct vm_speculative_age_q *sq;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/*
	 * the object associated with candidate page is
	 * different from the one we were just working
	 * with... dump the lock if we still own it
	 */
	if (*object != NULL) {
		vm_object_unlock(*object);
		*object = NULL;
	}
	/*
	 * Try to lock object; since we've alread got the
	 * page queues lock, we can only 'try' for this one.
	 * if the 'try' fails, we need to do a mutex_pause
	 * to allow the owner of the object lock a chance to
	 * run... otherwise, we're likely to trip over this
	 * object in the same state as we work our way through
	 * the queue... clumps of pages associated with the same
	 * object are fairly typical on the inactive and active queues
	 */
	if (!vm_object_lock_try_scan(m_object)) {
		/* lock collision: requeue the page and tell the caller to retry */
		vm_page_t m_want = NULL;

		vm_pageout_vminfo.vm_pageout_inactive_nolock++;

		if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
			VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
		}

		/* drop the hardware reference bit so the page doesn't look "used" later */
		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));

		m->vmp_reference = FALSE;

		if (!m_object->object_is_shared_cache) {
			/*
			 * don't apply this optimization if this is the shared cache
			 * object, it's too easy to get rid of very hot and important
			 * pages...
			 * m->vmp_object must be stable since we hold the page queues lock...
			 * we can update the scan_collisions field sans the object lock
			 * since it is a separate field and this is the only spot that does
			 * a read-modify-write operation and it is never executed concurrently...
			 * we can asynchronously set this field to 0 when creating a UPL, so it
			 * is possible for the value to be a bit non-determistic, but that's ok
			 * since it's only used as a hint
			 */
			m_object->scan_collisions = 1;
		}
		/*
		 * predict the next victim page (same priority order as
		 * vps_choose_victim_page) so we can pre-announce interest
		 * in its object below.
		 */
		if (page_from_bg_q) {
			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
		} else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
		} else if (!vm_page_queue_empty(&sq->age_q)) {
			m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
		} else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
		    !vm_page_queue_empty(&vm_page_queue_inactive)) {
			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
		} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
		}

		/*
		 * this is the next object we're going to be interested in
		 * try to make sure its available after the mutex_pause
		 * returns control
		 */
		if (m_want) {
			vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
		}

		vps_requeue_page(m, page_prev_q_state, page_from_bg_q);

		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
	} else {
		/* got the lock: hand ownership of m_object to the caller */
		*object = m_object;
		vm_pageout_scan_wants_object = VM_OBJECT_NULL;
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
2831
2832 /*
2833 * This function is called only from vm_pageout_scan and
2834 * it notices that pageout scan may be rendered ineffective
2835 * due to a FS deadlock and will jetsam a process if possible.
2836 * If jetsam isn't supported, it'll move the page to the active
2837 * queue to try and get some different pages pushed onwards so
2838 * we can try to get out of this scenario.
2839 */
2840 static void
2841 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2842 boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2843 {
2844 struct vm_pageout_queue *eq;
2845 vm_object_t cur_object = VM_OBJECT_NULL;
2846
2847 cur_object = *object;
2848
2849 eq = &vm_pageout_queue_external;
2850
2851 if (cur_object->internal == FALSE) {
2852 /*
2853 * we need to break up the following potential deadlock case...
2854 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2855 * b) The thread doing the writing is waiting for pages while holding the truncate lock
2856 * c) Most of the pages in the inactive queue belong to this file.
2857 *
2858 * we are potentially in this deadlock because...
2859 * a) the external pageout queue is throttled
2860 * b) we're done with the active queue and moved on to the inactive queue
2861 * c) we've got a dirty external page
2862 *
2863 * since we don't know the reason for the external pageout queue being throttled we
2864 * must suspect that we are deadlocked, so move the current page onto the active queue
2865 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2866 *
2867 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2868 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2869 * pool the next time we select a victim page... if we can make enough new free pages,
2870 * the deadlock will break, the external pageout queue will empty and it will no longer
2871 * be throttled
2872 *
2873 * if we have jetsam configured, keep a count of the pages reactivated this way so
2874 * that we can try to find clean pages in the active/inactive queues before
2875 * deciding to jetsam a process
2876 */
2877 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2878
2879 vm_page_check_pageable_safe(m);
2880 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2881 vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2882 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2883 vm_page_active_count++;
2884 vm_page_pageable_external_count++;
2885
2886 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2887
2888 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2889
2890 #pragma unused(force_anonymous)
2891
2892 *vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2893
2894 if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2895 *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2896 /*
2897 * Possible deadlock scenario so request jetsam action
2898 */
2899 memorystatus_kill_on_vps_starvation();
2900 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, DBG_VM_PAGEOUT_JETSAM, DBG_FUNC_NONE,
2901 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2902 }
2903 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2904
2905 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2906
2907 *force_anonymous = TRUE;
2908 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2909 } else {
2910 vm_page_activate(m);
2911 counter_inc(&vm_statistics_reactivations);
2912
2913 #if DEVELOPMENT || DEBUG
2914 if (is_page_from_bg_q == TRUE) {
2915 if (cur_object->internal) {
2916 vm_pageout_rejected_bq_internal++;
2917 } else {
2918 vm_pageout_rejected_bq_external++;
2919 }
2920 }
2921 #endif /* DEVELOPMENT || DEBUG */
2922
2923 vm_pageout_state.vm_pageout_inactive_used++;
2924 }
2925 }
2926
2927
2928 void
2929 vm_page_balance_inactive(int max_to_move)
2930 {
2931 vm_page_t m;
2932
2933 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2934
2935 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2936 /*
2937 * It is likely that the hibernation code path is
2938 * dealing with these very queues as we are about
2939 * to move pages around in/from them and completely
2940 * change the linkage of the pages.
2941 *
2942 * And so we skip the rebalancing of these queues.
2943 */
2944 return;
2945 }
2946 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2947 vm_page_inactive_count +
2948 vm_page_speculative_count);
2949
2950 while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2951 VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2952
2953 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2954
2955 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2956 assert(!m->vmp_laundry);
2957 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
2958 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2959
2960 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2961
2962 /*
2963 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2964 *
2965 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2966 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2967 * new reference happens. If no futher references happen on the page after that remote TLB flushes
2968 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2969 * by pageout_scan, which is just fine since the last reference would have happened quite far
2970 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2971 * have happened before we moved the page
2972 */
2973 if (m->vmp_pmapped == TRUE) {
2974 /*
2975 * We might be holding the page queue lock as a
2976 * spin lock and clearing the "referenced" bit could
2977 * take a while if there are lots of mappings of
2978 * that page, so make sure we acquire the lock as
2979 * as mutex to avoid a spinlock timeout.
2980 */
2981 vm_page_lockconvert_queues();
2982 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2983 }
2984
2985 /*
2986 * The page might be absent or busy,
2987 * but vm_page_deactivate can handle that.
2988 * FALSE indicates that we don't want a H/W clear reference
2989 */
2990 vm_page_deactivate_internal(m, FALSE);
2991 }
2992 }
2993
2994 /*
2995 * vm_pageout_scan does the dirty work for the pageout daemon.
2996 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2997 * held and vm_page_free_wanted == 0.
2998 */
void
vm_pageout_scan(void)
{
	unsigned int loop_count = 0;
	unsigned int inactive_burst_count = 0;
	unsigned int reactivated_this_call;
	unsigned int reactivate_limit;
	vm_page_t local_freeq = NULL;          /* private list of reclaimed pages, batched for vm_page_free_list */
	int local_freed = 0;                   /* length of local_freeq */
	int delayed_unlock;
	int delayed_unlock_limit = 0;          /* iterations between page-queue lock release points */
	int refmod_state = 0;                  /* -1 == pmap ref/mod state not yet sampled for current page */
	int vm_pageout_deadlock_target = 0;
	struct vm_pageout_queue *iq;           /* internal (compressor) pageout queue */
	struct vm_pageout_queue *eq;           /* external (file pager) pageout queue */
	struct vm_speculative_age_q *sq;       /* aged speculative queue */
	struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
	boolean_t inactive_throttled = FALSE;
	vm_object_t object = NULL;             /* object currently locked by this thread (or NULL) */
	uint32_t inactive_reclaim_run;
	boolean_t grab_anonymous = FALSE;
	boolean_t force_anonymous = FALSE;
	boolean_t force_speculative_aging = FALSE;
	int anons_grabbed = 0;
	int page_prev_q_state = 0;             /* queue the current victim was taken from */
	boolean_t page_from_bg_q = FALSE;
	uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
	vm_object_t m_object = VM_OBJECT_NULL; /* object backing the current victim page */
	int retval = 0;
	boolean_t lock_yield_check = FALSE;    /* set at loop bottom to consider yielding the queues lock */


	VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_START,
	    vm_pageout_vminfo.vm_pageout_freed_speculative,
	    vm_pageout_state.vm_pageout_inactive_clean,
	    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
	    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);

	flow_control.state = FCS_IDLE;
	iq = &vm_pageout_queue_internal;
	eq = &vm_pageout_queue_external;
	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/* Ask the pmap layer to return any pages it no longer needs. */
	pmap_release_pages_fast();

	vm_page_lock_queues();

	delayed_unlock = 1;

	/*
	 * Calculate the max number of referenced pages on the inactive
	 * queue that we will reactivate.
	 */
	reactivated_this_call = 0;
	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
	    vm_page_inactive_count);
	inactive_reclaim_run = 0;

	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;

	/*
	 * We must limit the rate at which we send pages to the pagers
	 * so that we don't tie up too many pages in the I/O queues.
	 * We implement a throttling mechanism using the laundry count
	 * to limit the number of pages outstanding to the default
	 * and external pagers. We can bypass the throttles and look
	 * for clean pages if the pageout queues don't drain in a timely
	 * fashion since this may indicate that the pageout paths are
	 * stalled waiting for memory, which only we can provide.
	 */

	vps_init_page_targets();
	assert(object == NULL);
	assert(delayed_unlock != 0);

	/*
	 * Main scan loop: one iteration per victim-page attempt.  Exits only
	 * via 'return_from_scan' below, once the free target is met and no
	 * one is waiting for a page.
	 */
	for (;;) {
		vm_page_t m;

		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);

		if (lock_yield_check) {
			lock_yield_check = FALSE;

			if (delayed_unlock++ > delayed_unlock_limit) {
				/* time to drop the locks, flush local_freeq, etc. */
				vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
				    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
			} else if (vm_pageout_scan_wants_object) {
				/* give the holder of the object we want a chance to release it */
				vm_page_unlock_queues();
				mutex_pause(0);
				vm_page_lock_queues();
			} else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
				VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
			}
		}

		if (vm_upl_wait_for_pages < 0) {
			vm_upl_wait_for_pages = 0;
		}

		/* hold the lock longer when UPL creators are waiting on pages */
		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;

		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
		}

		vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);

		assert(delayed_unlock);

		/*
		 * maintain our balance
		 */
		vm_page_balance_inactive(1);


		/**********************************************************************
		* above this point we're playing with the active and secluded queues
		* below this point we're playing with the throttling mechanisms
		* and the inactive queue
		**********************************************************************/

		if (vm_page_free_count + local_freed >= vm_page_free_target) {
			vm_pageout_scan_wants_object = VM_OBJECT_NULL;

			vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
			    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
			/*
			 * make sure the pageout I/O threads are running
			 * throttled in case there are still requests
			 * in the laundry... since we have met our targets
			 * we don't need the laundry to be cleaned in a timely
			 * fashion... so let's avoid interfering with foreground
			 * activity
			 */
			vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);

			vm_free_page_lock();

			if ((vm_page_free_count >= vm_page_free_target) &&
			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
				/*
				 * done - we have met our target *and*
				 * there is no one waiting for a page.
				 *
				 * NOTE: this is the only way out of the scan loop;
				 * we return with both the free-page lock and the
				 * page queues lock held (see the comment above
				 * this function).
				 */
return_from_scan:
				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);

				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
				    vm_pageout_state.vm_pageout_inactive,
				    vm_pageout_state.vm_pageout_inactive_used, 0, 0);
				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_END,
				    vm_pageout_vminfo.vm_pageout_freed_speculative,
				    vm_pageout_state.vm_pageout_inactive_clean,
				    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
				    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);

				return;
			}
			vm_free_page_unlock();
		}

		/*
		 * Before anything, we check if we have any ripe volatile
		 * objects around. If so, try to purge the first object.
		 * If the purge fails, fall through to reclaim a page instead.
		 * If the purge succeeds, go back to the top and reevaluate
		 * the new memory situation.
		 */
		retval = vps_purge_object();

		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
			/*
			 * Success
			 */
			if (object != NULL) {
				vm_object_unlock(object);
				object = NULL;
			}

			lock_yield_check = FALSE;
			continue;
		}

		/*
		 * If our 'aged' queue is empty and we have some speculative pages
		 * in the other queues, let's go through and see if we need to age
		 * them.
		 *
		 * If we succeeded in aging a speculative Q or just that everything
		 * looks normal w.r.t queue age and queue counts, we keep going onward.
		 *
		 * If, for some reason, we seem to have a mismatch between the spec.
		 * page count and the page queues, we reset those variables and
		 * restart the loop (LD TODO: Track this better?).
		 */
		if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
			retval = vps_age_speculative_queue(force_speculative_aging);

			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
				lock_yield_check = FALSE;
				continue;
			}
		}
		force_speculative_aging = FALSE;

		/*
		 * Check to see if we need to evict objects from the cache.
		 *
		 * Note: 'object' here doesn't have anything to do with
		 * the eviction part. We just need to make sure we have dropped
		 * any object lock we might be holding if we need to go down
		 * into the eviction logic.
		 */
		retval = vps_object_cache_evict(&object);

		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
			lock_yield_check = FALSE;
			continue;
		}


		/*
		 * Calculate our filecache_min that will affect the loop
		 * going forward.
		 */
		vps_calculate_filecache_min();

		/*
		 * LD TODO: Use a structure to hold all state variables for a single
		 * vm_pageout_scan iteration and pass that structure to this function instead.
		 */
		retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
		    &delayed_unlock, &local_freeq, &local_freed,
		    &vm_pageout_deadlock_target, inactive_burst_count);

		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
			if (loop_count >= vm_page_inactive_count) {
				loop_count = 0;
			}

			inactive_burst_count = 0;

			assert(object == NULL);
			assert(delayed_unlock != 0);

			lock_yield_check = FALSE;
			continue;
		} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
			goto return_from_scan;
		}

		flow_control.state = FCS_IDLE;

		/*
		 * clamp the forced-reactivation budget to the current pageable
		 * population (counts may have shrunk since it was last set)
		 */
		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
		    vm_pageout_inactive_external_forced_reactivate_limit);
		loop_count++;
		inactive_burst_count++;
		vm_pageout_state.vm_pageout_inactive++;

		/*
		 * Choose a victim.
		 */

		m = NULL;
		retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);

		if (m == NULL) {
			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
				inactive_burst_count = 0;

				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
				}

				lock_yield_check = TRUE;
				continue;
			}

			/*
			 * if we've gotten here, we have no victim page.
			 * check to see if we've not finished balancing the queues
			 * or we have a page on the aged speculative queue that we
			 * skipped due to force_anonymous == TRUE.. or we have
			 * speculative pages that we can prematurely age... if
			 * one of these cases we'll keep going, else panic
			 */
			force_anonymous = FALSE;
			VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);

			if (!vm_page_queue_empty(&sq->age_q)) {
				lock_yield_check = TRUE;
				continue;
			}

			if (vm_page_speculative_count) {
				force_speculative_aging = TRUE;
				lock_yield_check = TRUE;
				continue;
			}
			panic("vm_pageout: no victim");

			/* NOTREACHED */
		}

		assert(VM_PAGE_PAGEABLE(m));
		m_object = VM_PAGE_OBJECT(m);
		force_anonymous = FALSE;

		page_prev_q_state = m->vmp_q_state;
		/*
		 * we just found this page on one of our queues...
		 * it can't also be on the pageout queue, so safe
		 * to call vm_page_queues_remove
		 */
		bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
		vm_page_queues_remove(m, TRUE);
		if (donate) {
			/*
			 * The compressor needs to see this bit to know
			 * where this page needs to land. Also if stolen,
			 * this bit helps put the page back in the right
			 * special queue where it belongs.
			 */
			m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
		}

		assert(!m->vmp_laundry);
		assert(!m->vmp_private);
		assert(!m->vmp_fictitious);
		assert(!is_kernel_object(m_object));
		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);

		vm_pageout_vminfo.vm_pageout_considered_page++;

		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != object) {
			boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);

			/*
			 * vps_switch_object() will always drop the 'object' lock first
			 * and then try to acquire the 'm_object' lock. So 'object' has to point to
			 * either 'm_object' or NULL.
			 */
			retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);

			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
				lock_yield_check = TRUE;
				continue;
			}
		}
		assert(m_object == object);
		assert(VM_PAGE_OBJECT(m) == m_object);

		if (m->vmp_busy) {
			/*
			 * Somebody is already playing with this page.
			 * Put it back on the appropriate queue
			 *
			 */
			VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);

			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
			}

			vps_requeue_page(m, page_prev_q_state, page_from_bg_q);

			lock_yield_check = TRUE;
			continue;
		}

		/*
		 * if (m->vmp_cleaning && !m->vmp_free_when_done)
		 * If already cleaning this page in place
		 * just leave if off the paging queues.
		 * We can leave the page mapped, and upl_commit_range
		 * will put it on the clean queue.
		 *
		 * if (m->vmp_free_when_done && !m->vmp_cleaning)
		 * an msync INVALIDATE is in progress...
		 * this page has been marked for destruction
		 * after it has been cleaned,
		 * but not yet gathered into a UPL
		 * where 'cleaning' will be set...
		 * just leave it off the paging queues
		 *
		 * if (m->vmp_free_when_done && m->vmp_cleaning)
		 * an msync INVALIDATE is in progress
		 * and the UPL has already gathered this page...
		 * just leave it off the paging queues
		 */
		if (m->vmp_free_when_done || m->vmp_cleaning) {
			lock_yield_check = TRUE;
			continue;
		}


		/*
		 * If it's absent, in error or the object is no longer alive,
		 * we can reclaim the page... in the no longer alive case,
		 * there are 2 states the page can be in that preclude us
		 * from reclaiming it - busy or cleaning - that we've already
		 * dealt with
		 */
		if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
		    (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
			if (m->vmp_absent) {
				VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
			} else if (!object->alive ||
			    (!object->internal &&
			    object->pager == MEMORY_OBJECT_NULL)) {
				VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
			} else {
				VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
			}
			/*
			 * reclaim_page: also reached (via goto) for empty-purgable
			 * pages and for clean, non-precious pages further below.
			 */
reclaim_page:
			if (vm_pageout_deadlock_target) {
				VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
				vm_pageout_deadlock_target--;
			}

			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);

			if (object->internal) {
				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
			} else {
				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
			}
			assert(!m->vmp_cleaning);
			assert(!m->vmp_laundry);

			if (!object->internal &&
			    object->pager != NULL &&
			    object->pager->mo_pager_ops == &shared_region_pager_ops) {
				shared_region_pager_reclaimed++;
			}

			m->vmp_busy = TRUE;

			/*
			 * remove page from object here since we're already
			 * behind the object lock... defer the rest of the work
			 * we'd normally do in vm_page_free_prepare_object
			 * until 'vm_page_free_list' is called
			 */
			if (m->vmp_tabled) {
				vm_page_remove(m, TRUE);
			}

			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
			m->vmp_snext = local_freeq;
			local_freeq = m;
			local_freed++;

			if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
				vm_pageout_vminfo.vm_pageout_freed_speculative++;
			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
				vm_pageout_vminfo.vm_pageout_freed_cleaned++;
			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
				vm_pageout_vminfo.vm_pageout_freed_internal++;
			} else {
				vm_pageout_vminfo.vm_pageout_freed_external++;
			}

			inactive_burst_count = 0;

			lock_yield_check = TRUE;
			continue;
		}
		if (object->vo_copy == VM_OBJECT_NULL) {
			/*
			 * No one else can have any interest in this page.
			 * If this is an empty purgable object, the page can be
			 * reclaimed even if dirty.
			 * If the page belongs to a volatile purgable object, we
			 * reactivate it if the compressor isn't active.
			 */
			if (object->purgable == VM_PURGABLE_EMPTY) {
				if (m->vmp_pmapped == TRUE) {
					/* unmap the page */
					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
					if (refmod_state & VM_MEM_MODIFIED) {
						SET_PAGE_DIRTY(m, FALSE);
					}
				}
				if (m->vmp_dirty || m->vmp_precious) {
					/* we saved the cost of cleaning this page ! */
					vm_page_purged_count++;
				}
				goto reclaim_page;
			}

			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
				/*
				 * With the VM compressor, the cost of
				 * reclaiming a page is much lower (no I/O),
				 * so if we find a "volatile" page, it's better
				 * to let it get compressed rather than letting
				 * it occupy a full page until it gets purged.
				 * So no need to check for "volatile" here.
				 */
			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
				/*
				 * Avoid cleaning a "volatile" page which might
				 * be purged soon.
				 */

				/* if it's wired, we can't put it on our queue */
				assert(!VM_PAGE_WIRED(m));

				/* just stick it back on! */
				reactivated_this_call++;

				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
				}

				goto reactivate_page;
			}
		} /* vo_copy NULL */
		/*
		 * If it's being used, reactivate.
		 * (Fictitious pages are either busy or absent.)
		 * First, update the reference and dirty bits
		 * to make sure the page is unreferenced.
		 */
		refmod_state = -1;

		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		if (m->vmp_reference || m->vmp_dirty) {
			/* deal with a rogue "reusable" page */
			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
		}

		if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
			vm_pageout_state.vm_page_xpmapped_min = 0;
		} else {
			vm_pageout_state.vm_page_xpmapped_min = (vm_page_pageable_external_count * 10) /
			    vm_pageout_state.vm_page_xpmapped_min_divisor;
		}

		if (!m->vmp_no_cache &&
		    page_from_bg_q == FALSE &&
		    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
		    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
			/*
			 * The page we pulled off the inactive list has
			 * been referenced. It is possible for other
			 * processors to be touching pages faster than we
			 * can clear the referenced bit and traverse the
			 * inactive queue, so we limit the number of
			 * reactivations.
			 */
			if (++reactivated_this_call >= reactivate_limit &&
			    !object->object_is_shared_cache &&
			    !((m->vmp_realtime ||
			    object->for_realtime) &&
			    vm_pageout_protect_realtime)) {
				/* over budget: fall through and steal the page anyway */
				vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
				/* too long without stealing anything: force-reclaim this page */
				vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
				if (object->object_is_shared_cache) {
					vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
				} else if (m->vmp_realtime ||
				    object->for_realtime) {
					vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
				}
			} else {
				uint32_t isinuse;

				if (reactivated_this_call >= reactivate_limit) {
					/* over budget, but page is protected (shared cache / realtime) */
					if (object->object_is_shared_cache) {
						vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
					} else if ((m->vmp_realtime ||
					    object->for_realtime) &&
					    vm_pageout_protect_realtime) {
						vm_pageout_vminfo.vm_pageout_protected_realtime++;
					}
				}
				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
				}

				vm_pageout_vminfo.vm_pageout_inactive_referenced++;
reactivate_page:
				if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
				    vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
					/*
					 * no explicit mappings of this object exist
					 * and it's not open via the filesystem
					 */
					vm_page_deactivate(m);
					VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
				} else {
					/*
					 * The page was/is being used, so put back on active list.
					 */
					vm_page_activate(m);
					counter_inc(&vm_statistics_reactivations);
					inactive_burst_count = 0;
				}
#if DEVELOPMENT || DEBUG
				if (page_from_bg_q == TRUE) {
					if (m_object->internal) {
						vm_pageout_rejected_bq_internal++;
					} else {
						vm_pageout_rejected_bq_external++;
					}
				}
#endif /* DEVELOPMENT || DEBUG */

				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
				}
				vm_pageout_state.vm_pageout_inactive_used++;

				lock_yield_check = TRUE;
				continue;
			}
			/*
			 * Make sure we call pmap_get_refmod() if it
			 * wasn't already called just above, to update
			 * the dirty bit.
			 */
			if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
				refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
				if (refmod_state & VM_MEM_MODIFIED) {
					SET_PAGE_DIRTY(m, FALSE);
				}
			}
		}

		/*
		 * we've got a candidate page to steal...
		 *
		 * m->vmp_dirty is up to date courtesy of the
		 * preceding check for m->vmp_reference... if
		 * we get here, then m->vmp_reference had to be
		 * FALSE (or possibly "reactivate_limit" was
		 * exceeded), but in either case we called
		 * pmap_get_refmod() and updated both
		 * m->vmp_reference and m->vmp_dirty
		 *
		 * if it's dirty or precious we need to
		 * see if the target queue is throttled
		 * if it is, we need to skip over it by moving it back
		 * to the end of the inactive queue
		 */

		inactive_throttled = FALSE;

		if (m->vmp_dirty || m->vmp_precious) {
			if (object->internal) {
				if (VM_PAGE_Q_THROTTLED(iq)) {
					inactive_throttled = TRUE;
				}
			} else if (VM_PAGE_Q_THROTTLED(eq)) {
				inactive_throttled = TRUE;
			}
		}
		/*
		 * throttle_inactive: also reached (via goto) from the
		 * post-disconnect throttle re-check further below.
		 */
throttle_inactive:
		if (!VM_DYNAMIC_PAGING_ENABLED() &&
		    object->internal && m->vmp_dirty &&
		    (object->purgable == VM_PURGABLE_DENY ||
		    object->purgable == VM_PURGABLE_NONVOLATILE ||
		    object->purgable == VM_PURGABLE_VOLATILE)) {
			vm_page_check_pageable_safe(m);
			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
			vm_page_throttled_count++;

			VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);

			inactive_burst_count = 0;

			lock_yield_check = TRUE;
			continue;
		}
		if (inactive_throttled == TRUE) {
			vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
			    &force_anonymous, page_from_bg_q);

			inactive_burst_count = 0;

			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
			}

			lock_yield_check = TRUE;
			continue;
		}

		/*
		 * we've got a page that we can steal...
		 * eliminate all mappings and make sure
		 * we have the up-to-date modified state
		 *
		 * if we need to do a pmap_disconnect then we
		 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
		 * provides the true state atomically... the
		 * page was still mapped up to the pmap_disconnect
		 * and may have been dirtied at the last microsecond
		 *
		 * Note that if 'pmapped' is FALSE then the page is not
		 * and has not been in any map, so there is no point calling
		 * pmap_disconnect(). m->vmp_dirty could have been set in anticipation
		 * of likely usage of the page.
		 */
		if (m->vmp_pmapped == TRUE) {
			int pmap_options;

			/*
			 * Don't count this page as going into the compressor
			 * if any of these are true:
			 * 1) compressed pager isn't enabled
			 * 2) Freezer enabled device with compressed pager
			 * backend (exclusive use) i.e. most of the VM system
			 * (including vm_pageout_scan) has no knowledge of
			 * the compressor
			 * 3) This page belongs to a file and hence will not be
			 * sent into the compressor
			 */
			if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
			    object->internal == FALSE) {
				pmap_options = 0;
			} else if (m->vmp_dirty || m->vmp_precious) {
				/*
				 * VM knows that this page is dirty (or
				 * precious) and needs to be compressed
				 * rather than freed.
				 * Tell the pmap layer to count this page
				 * as "compressed".
				 */
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				/*
				 * VM does not know if the page needs to
				 * be preserved but the pmap layer might tell
				 * us if any mapping has "modified" it.
				 * Let the pmap layer count this page
				 * as compressed if and only if it has been
				 * modified.
				 */
				pmap_options =
				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
			    pmap_options,
			    NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		/*
		 * reset our count of pages that have been reclaimed
		 * since the last page was 'stolen'
		 */
		inactive_reclaim_run = 0;

		/*
		 * If it's clean and not precious, we can free the page.
		 */
		if (!m->vmp_dirty && !m->vmp_precious) {
			vm_pageout_state.vm_pageout_inactive_clean++;

			/*
			 * OK, at this point we have found a page we are going to free.
			 */
#if CONFIG_PHANTOM_CACHE
			if (!object->internal) {
				vm_phantom_cache_add_ghost(m);
			}
#endif
			goto reclaim_page;
		}

		/*
		 * The page may have been dirtied since the last check
		 * for a throttled target queue (which may have been skipped
		 * if the page was clean then). With the dirty page
		 * disconnected here, we can make one final check.
		 */
		if (object->internal) {
			if (VM_PAGE_Q_THROTTLED(iq)) {
				inactive_throttled = TRUE;
			}
		} else if (VM_PAGE_Q_THROTTLED(eq)) {
			inactive_throttled = TRUE;
		}

		if (inactive_throttled == TRUE) {
			goto throttle_inactive;
		}
#if !CONFIG_JETSAM
		memorystatus_update_available_page_count(AVAILABLE_NON_COMPRESSED_MEMORY);
#endif /* !CONFIG_JETSAM */

		if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
			VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
		}

		if (object->internal) {
			vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
		} else {
			vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
		}

		/*
		 * internal pages will go to the compressor...
		 * external pages will go to the appropriate pager to be cleaned
		 * and upon completion will end up on 'vm_page_queue_cleaned' which
		 * is a preferred queue to steal from
		 */
		vm_pageout_cluster(m);
		inactive_burst_count = 0;

		/*
		 * back to top of pageout scan loop
		 */
	}
}
3839
3840
3841 void
3842 vm_page_free_reserve(
3843 int pages)
3844 {
3845 int free_after_reserve;
3846
3847 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3848 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3849 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3850 } else {
3851 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3852 }
3853 } else {
3854 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3855 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3856 } else {
3857 vm_page_free_reserved += pages;
3858 }
3859 }
3860 free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3861
3862 vm_page_free_min = vm_page_free_reserved +
3863 VM_PAGE_FREE_MIN(free_after_reserve);
3864
3865 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3866 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3867 }
3868
3869 vm_page_free_target = vm_page_free_reserved +
3870 VM_PAGE_FREE_TARGET(free_after_reserve);
3871
3872 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3873 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3874 }
3875
3876 if (vm_page_free_target < vm_page_free_min + 5) {
3877 vm_page_free_target = vm_page_free_min + 5;
3878 }
3879
3880 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3881 }
3882
3883 /*
3884 * vm_pageout is the high level pageout daemon.
3885 */
3886
/*
 * Continuation body of the pageout daemon: mark itself running, run one
 * pass of vm_pageout_scan(), publish that it has gone idle, then block
 * with this function as the continuation so the cycle restarts on wakeup.
 */
void
vm_pageout_continue(void)
{
	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);

	/* advertise that the daemon is active before scanning */
	vm_free_page_lock();
	vm_pageout_running = TRUE;
	vm_free_page_unlock();

	vm_pageout_scan();
	/*
	 * we hold both the vm_page_queue_free_lock
	 * and the vm_page_queues_lock at this point
	 */
	assert(vm_page_free_wanted == 0);
	assert(vm_page_free_wanted_privileged == 0);
	/* arm the wait BEFORE dropping the locks so no wakeup can be lost */
	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);

	vm_pageout_running = FALSE;
#if XNU_TARGET_OS_OSX
	/* wake any vm_pageout_wait() caller blocked on the daemon going idle */
	if (vm_pageout_waiter) {
		vm_pageout_waiter = FALSE;
		thread_wakeup((event_t)&vm_pageout_waiter);
	}
#endif /* XNU_TARGET_OS_OSX */

	vm_free_page_unlock();
	vm_page_unlock_queues();

	/* sleep until pages are wanted; resume at the top of this function */
	thread_block((thread_continue_t)vm_pageout_continue);
	/*NOTREACHED*/
}
3920
3921 #if XNU_TARGET_OS_OSX
3922 kern_return_t
3923 vm_pageout_wait(uint64_t deadline)
3924 {
3925 kern_return_t kr;
3926
3927 vm_free_page_lock();
3928 for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
3929 vm_pageout_waiter = TRUE;
3930 if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3931 &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3932 (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3933 kr = KERN_OPERATION_TIMED_OUT;
3934 }
3935 }
3936 vm_free_page_unlock();
3937
3938 return kr;
3939 }
3940 #endif /* XNU_TARGET_OS_OSX */
3941
/*
 * Continuation body of the external pageout I/O thread: drains the
 * external pageout queue, handing each page to its pager via
 * memory_object_data_return(), then sleeps until signalled again.
 * Never returns.
 */
OS_NORETURN
static void
vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
{
	vm_page_t m = NULL;
	vm_object_t object;
	vm_object_offset_t offset;
	memory_object_t pager;
	struct vm_pageout_queue *q = ethr->q;

	/* On systems with a compressor, the external IO thread clears its
	 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
	 * creation)
	 */
	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
		current_thread()->options &= ~TH_OPT_VMPRIV;
	}

	sched_cond_ack(&(ethr->pgo_wakeup));

	while (true) {
		vm_page_lockspin_queues();

		while (!vm_page_queue_empty(&q->pgo_pending)) {
			q->pgo_busy = TRUE;
			vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);

			assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
			VM_PAGE_CHECK(m);
			/*
			 * grab a snapshot of the object and offset this
			 * page is tabled in so that we can relookup this
			 * page after we've taken the object lock - these
			 * fields are stable while we hold the page queues lock
			 * but as soon as we drop it, there is nothing to keep
			 * this page in this object... we hold an activity_in_progress
			 * on this object which will keep it from terminating
			 */
			object = VM_PAGE_OBJECT(m);
			offset = m->vmp_offset;

			m->vmp_q_state = VM_PAGE_NOT_ON_Q;
			VM_PAGE_ZERO_PAGEQ_ENTRY(m);

			vm_page_unlock_queues();

			vm_object_lock(object);

			/* relookup: the page may have changed while the queues lock was dropped */
			m = vm_page_lookup(object, offset);

			if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
			    !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
				/*
				 * it's either the same page that someone else has
				 * started cleaning (or it's finished cleaning or
				 * been put back on the pageout queue), or
				 * the page has been freed or we have found a
				 * new page at this offset... in all of these cases
				 * we merely need to release the activity_in_progress
				 * we took when we put the page on the pageout queue
				 */
				vm_object_activity_end(object);
				vm_object_unlock(object);

				vm_page_lockspin_queues();
				continue;
			}
			pager = object->pager;

			if (pager == MEMORY_OBJECT_NULL) {
				/*
				 * This pager has been destroyed by either
				 * memory_object_destroy or vm_object_destroy, and
				 * so there is nowhere for the page to go.
				 */
				if (m->vmp_free_when_done) {
					/*
					 * Just free the page... VM_PAGE_FREE takes
					 * care of cleaning up all the state...
					 * including doing the vm_pageout_throttle_up
					 */
					VM_PAGE_FREE(m);
				} else {
					vm_page_lockspin_queues();

					/* undo the laundry accounting and keep the page resident */
					vm_pageout_throttle_up(m);
					vm_page_activate(m);

					vm_page_unlock_queues();

					/*
					 * And we are done with it.
					 */
				}
				vm_object_activity_end(object);
				vm_object_unlock(object);

				vm_page_lockspin_queues();
				continue;
			}
#if 0
			/*
			 * we don't hold the page queue lock
			 * so this check isn't safe to make
			 */
			VM_PAGE_CHECK(m);
#endif
			/*
			 * give back the activity_in_progress reference we
			 * took when we queued up this page and replace it
			 * it with a paging_in_progress reference that will
			 * also hold the paging offset from changing and
			 * prevent the object from terminating
			 */
			vm_object_activity_end(object);
			vm_object_paging_begin(object);
			vm_object_unlock(object);

			/*
			 * Send the data to the pager.
			 * any pageout clustering happens there
			 */
			memory_object_data_return(pager,
			    m->vmp_offset + object->paging_offset,
			    PAGE_SIZE,
			    NULL,
			    NULL,
			    FALSE,
			    FALSE,
			    0);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/* rate-limit external pageouts before taking the next page */
			vm_pageout_io_throttle();

			vm_page_lockspin_queues();
		}
		q->pgo_busy = FALSE;

		vm_page_unlock_queues();
		/* sleep until more pages are queued; resume at the top of this function */
		sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
	}
	/*NOTREACHED*/
}
4088
4089 uint32_t vm_compressor_time_thread; /* Set via sysctl 'vm.compressor_timing_enabled' to record time accrued by this thread. */
4090
4091 #if DEVELOPMENT || DEBUG
4092 static void
4093 vm_pageout_record_thread_time(int cqid, int ncomps)
4094 {
4095 if (__improbable(vm_compressor_time_thread)) {
4096 vmct_stats.vmct_runtimes[cqid] = thread_get_runtime_self();
4097 vmct_stats.vmct_pages[cqid] += ncomps;
4098 vmct_stats.vmct_iterations[cqid]++;
4099 if (ncomps > vmct_stats.vmct_maxpages[cqid]) {
4100 vmct_stats.vmct_maxpages[cqid] = ncomps;
4101 }
4102 if (ncomps < vmct_stats.vmct_minpages[cqid]) {
4103 vmct_stats.vmct_minpages[cqid] = ncomps;
4104 }
4105 }
4106 }
4107 #endif
4108
4109 static void *
4110 vm_pageout_select_filling_chead(struct pgo_iothread_state *cq, vm_page_t m)
4111 {
4112 /*
4113 * Technically we need the pageq locks to manipulate the vmp_on_specialq field.
4114 * However, this page has been removed from all queues and is only
4115 * known to this compressor thread dealing with this local queue.
4116 *
4117 * TODO: Add a second localq that is the early localq and
4118 * put special pages like this one on that queue in the block above
4119 * under the pageq lock to avoid this 'works but not clean' logic.
4120 */
4121 void *donate_queue_head;
4122 #if XNU_TARGET_OS_OSX /* tag:DONATE */
4123 donate_queue_head = &cq->current_early_swapout_chead;
4124 #else /* XNU_TARGET_OS_OSX */
4125 donate_queue_head = &cq->current_late_swapout_chead;
4126 #endif /* XNU_TARGET_OS_OSX */
4127 if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4128 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4129 return donate_queue_head;
4130 } else {
4131 return &cq->current_regular_swapout_chead;
4132 }
4133 }
4134
4135 #define MAX_FREE_BATCH 32
4136
/*
 * Continuation body of a compressor ("internal pageout") I/O thread:
 * repeatedly pulls batches of pages off its pageout queue, compresses
 * each via vm_pageout_compress_page(), frees the originals in bulk,
 * and sleeps until signalled again.  Never returns.
 */
OS_NORETURN
static void
vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
{
	struct vm_pageout_queue *q;
	vm_page_t m = NULL;
	boolean_t pgo_draining;
	vm_page_t local_q;              /* private batch taken off the queue */
	int local_cnt;
	vm_page_t local_freeq = NULL;   /* compressed pages awaiting bulk free */
	int local_freed = 0;
	int local_batch_size;
#if DEVELOPMENT || DEBUG
	int ncomps = 0;
	boolean_t marked_active = FALSE;
	int num_pages_processed = 0;
#endif
	void *chead = NULL;             /* compressor chead the page fills */

	KDBG_FILTERED(0xe040000c | DBG_FUNC_END);

	sched_cond_ack(&(cq->pgo_wakeup));

	q = cq->q;

	while (true) { /* this top loop is for the compressor_running_perf_test running at full speed without blocking */
#if DEVELOPMENT || DEBUG
		bool benchmark_accounting = false;
		/* If we're running the compressor perf test, only process the benchmark pages.
		 * We'll get back to our regular queue once the benchmark is done */
		if (compressor_running_perf_test) {
			q = cq->benchmark_q;
			if (!vm_page_queue_empty(&q->pgo_pending)) {
				benchmark_accounting = true;
			} else {
				q = cq->q;
				benchmark_accounting = false;
			}
		}
#endif /* DEVELOPMENT || DEBUG */

#if __AMP__
		if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
			/* E-core bound with multiple threads: take 1/8 of maxlaundry, at least 16 */
			local_batch_size = (q->pgo_maxlaundry >> 3);
			local_batch_size = MAX(local_batch_size, 16);
		} else {
			local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
		}
#else
		local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
#endif

#if RECORD_THE_COMPRESSED_DATA
		if (q->pgo_laundry) {
			c_compressed_record_init();
		}
#endif
		while (true) { /* this loop is for working though all the pages in the pending queue */
			int pages_left_on_q = 0;

			local_cnt = 0;
			local_q = NULL;

			KDBG_FILTERED(0xe0400014 | DBG_FUNC_START);

			vm_page_lock_queues();
#if DEVELOPMENT || DEBUG
			if (marked_active == FALSE) {
				vmct_active++;
				vmct_state[cq->id] = VMCT_ACTIVE;
				marked_active = TRUE;
				/* the first thread to go active opens a new compressor epoch */
				if (vmct_active == 1) {
					vm_compressor_epoch_start = mach_absolute_time();
				}
			}
#endif
			KDBG_FILTERED(0xe0400014 | DBG_FUNC_END);

			KDBG_FILTERED(0xe0400018 | DBG_FUNC_START, q->pgo_laundry);

			/* empty the entire content of the thread input q to local_q, but not more than local_batch_size pages */
			while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
				vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
				assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
				VM_PAGE_CHECK(m);

				m->vmp_q_state = VM_PAGE_NOT_ON_Q;
				VM_PAGE_ZERO_PAGEQ_ENTRY(m);
				m->vmp_laundry = FALSE;

				/* push onto the singly-linked private batch via vmp_snext */
				m->vmp_snext = local_q;
				local_q = m;
				local_cnt++;
			}
			/* queue empty: leave the inner loop with the queues lock held */
			if (local_q == NULL) {
				break;
			}

			q->pgo_busy = TRUE;

			if ((pgo_draining = q->pgo_draining) == FALSE) {
				/* credit the laundry now; when draining, defer until the batch is done */
				vm_pageout_throttle_up_batch(q, local_cnt);
				pages_left_on_q = q->pgo_laundry;
			} else {
				pages_left_on_q = q->pgo_laundry - local_cnt;
			}

			vm_page_unlock_queues();

#if !RECORD_THE_COMPRESSED_DATA
			/* if we have lots to compress, wake up the other thread to help.
			 * disabled when recording data since record data is not protected with a mutex so this may cause races */
			if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
				// wake up the next compressor thread
				sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
				    pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
			}
#endif
			KDBG_FILTERED(0xe0400018 | DBG_FUNC_END, q->pgo_laundry);

			while (local_q) {
				KDBG_FILTERED(0xe0400024 | DBG_FUNC_START, local_cnt);

				/* pop the next page off the private batch */
				m = local_q;
				local_q = m->vmp_snext;
				m->vmp_snext = NULL;


				chead = vm_pageout_select_filling_chead(cq, m);

				if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
#if DEVELOPMENT || DEBUG
					ncomps++;
#endif
					KDBG_FILTERED(0xe0400024 | DBG_FUNC_END, local_cnt);

					/* chain the now-compressed page for batched freeing */
					m->vmp_snext = local_freeq;
					local_freeq = m;
					local_freed++;

					/* if we gathered enough free pages, free them now */
					if (local_freed >= MAX_FREE_BATCH) {
						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

						vm_page_free_list(local_freeq, TRUE);

						local_freeq = NULL;
						local_freed = 0;
					}
				}
#if DEVELOPMENT || DEBUG
				num_pages_processed++;
#endif /* DEVELOPMENT || DEBUG */
#if !CONFIG_JETSAM /* Maybe: if there's no JETSAM, be more proactive in waking up anybody that needs free pages */
				while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
					kern_return_t wait_result;
					int need_wakeup = 0;

					/* first try to replenish the free list from our own batch */
					if (local_freeq) {
						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

						vm_page_free_list(local_freeq, TRUE);
						local_freeq = NULL;
						local_freed = 0;

						continue;
					}
					vm_free_page_lock_spin();

					/* re-check under the lock before committing to a wait */
					if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
						if (vm_page_free_wanted_privileged++ == 0) {
							need_wakeup = 1;
						}
						wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);

						vm_free_page_unlock();

						/* nudge the pageout daemon to produce free pages */
						if (need_wakeup) {
							thread_wakeup((event_t)&vm_page_free_wanted);
						}

						if (wait_result == THREAD_WAITING) {
							thread_block(THREAD_CONTINUE_NULL);
						}
					} else {
						vm_free_page_unlock();
					}
				}
#endif
			} /* while (local_q) */
			/* free any leftovers in the freeq */
			if (local_freeq) {
				OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

				vm_page_free_list(local_freeq, TRUE);
				local_freeq = NULL;
				local_freed = 0;
			}
			/* deferred laundry credit for the draining case (see above) */
			if (pgo_draining == TRUE) {
				vm_page_lockspin_queues();
				vm_pageout_throttle_up_batch(q, local_cnt);
				vm_page_unlock_queues();
			}
		}
		KDBG_FILTERED(0xe040000c | DBG_FUNC_START);

		/*
		 * queue lock is held and our q is empty
		 */
		q->pgo_busy = FALSE;
#if DEVELOPMENT || DEBUG
		if (marked_active == TRUE) {
			vmct_active--;
			vmct_state[cq->id] = VMCT_IDLE;

			/* last thread to go idle closes the compressor epoch */
			if (vmct_active == 0) {
				vm_compressor_epoch_stop = mach_absolute_time();
				assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
				    "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
				    vm_compressor_epoch_start, vm_compressor_epoch_stop);
				/* This interval includes intervals where one or more
				 * compressor threads were pre-empted
				 */
				vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
			}
		}
		if (compressor_running_perf_test && benchmark_accounting) {
			/*
			 * We could turn ON compressor_running_perf_test while still processing
			 * regular non-benchmark pages. We shouldn't count them here else we
			 * could overshoot. We might also still be populating that benchmark Q
			 * and be under pressure. So we will go back to the regular queues. And
			 * benchmark accounting will be off for that case too.
			 */
			compressor_perf_test_pages_processed += num_pages_processed;
			thread_wakeup(&compressor_perf_test_pages_processed);
		}
#endif
		vm_page_unlock_queues();
#if DEVELOPMENT || DEBUG
		vm_pageout_record_thread_time(cq->id, ncomps);
#endif

		KDBG_FILTERED(0xe0400018 | DBG_FUNC_END);
#if DEVELOPMENT || DEBUG
		if (compressor_running_perf_test && benchmark_accounting) {
			/*
			 * We've been exclusively compressing pages from the benchmark queue,
			 * do 1 pass over the internal queue before blocking.
			 */
			continue;
		}
#endif

		/* sleep until more pages are queued; resume at the top of this function */
		sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
	}
	/*NOTREACHED*/
}
4395
/*
 * Compress one page into the compressor pool via its object's pager,
 * creating the compressor pager first if the object doesn't have one.
 * On success the page is removed from its object; on failure it is
 * reactivated.  In both cases the caller-provided activity_in_progress
 * reference on the object is released before returning.
 * Resolves the pager and maintains stats in the pager and in the vm_object.
 */
kern_return_t
vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
{
	vm_object_t object;
	memory_object_t pager;
	int compressed_count_delta;
	kern_return_t retval;

	object = VM_PAGE_OBJECT(m);

	assert(!m->vmp_free_when_done);
	assert(!m->vmp_laundry);

	pager = object->pager;

	if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
		KDBG_FILTERED(0xe0400010 | DBG_FUNC_START, object, pager);

		vm_object_lock(object);

		/*
		 * If there is no memory object for the page, create
		 * one and hand it to the compression pager.
		 */

		if (!object->pager_initialized) {
			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
		}
		if (!object->pager_initialized) {
			vm_object_compressor_pager_create(object);
		}

		pager = object->pager;

		if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
			/*
			 * Still no pager for the object,
			 * or the pager has been destroyed.
			 * Reactivate the page.
			 *
			 * Should only happen if there is no
			 * compression pager
			 */
			vm_page_wakeup_done(object, m);

			vm_page_lockspin_queues();
			vm_page_activate(m);
			VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
			vm_page_unlock_queues();

			/*
			 * And we are done with it.
			 */
			vm_object_activity_end(object);
			vm_object_unlock(object);

			return KERN_FAILURE;
		}
		vm_object_unlock(object);

		KDBG_FILTERED(0xe0400010 | DBG_FUNC_END, object, pager);
	}
	assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
	assert(object->activity_in_progress > 0);

#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
	if (m->vmp_unmodified_ro == true) {
		os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed);
	}
#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */

	vm_compressor_options_t flags = 0;

#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
	if (m->vmp_unmodified_ro) {
		flags |= C_PAGE_UNMODIFIED;
	}
#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */


	/* object lock is NOT held across the actual compression */
	retval = vm_compressor_pager_put(
		pager,
		m->vmp_offset + object->paging_offset,
		VM_PAGE_GET_PHYS_PAGE(m),
		current_chead,
		scratch_buf,
		&compressed_count_delta,
		flags);

	vm_object_lock(object);

	assert(object->activity_in_progress > 0);
	assert(VM_PAGE_OBJECT(m) == object);
	assert( !VM_PAGE_WIRED(m));

	/* fold the pager's delta into the object's compressed-page count */
	vm_compressor_pager_count(pager,
	    compressed_count_delta,
	    FALSE, /* shared_lock */
	    object);

	if (retval == KERN_SUCCESS) {
		/*
		 * If the object is purgeable, its owner's
		 * purgeable ledgers will be updated in
		 * vm_page_remove() but the page still
		 * contributes to the owner's memory footprint,
		 * so account for it as such.
		 */
		if (m->vmp_tabled) {
			vm_page_remove(m, TRUE);
		}
		if ((object->purgable != VM_PURGABLE_DENY ||
		    object->vo_ledger_tag) &&
		    object->vo_owner != NULL) {
			/* one more compressed purgeable/tagged page */
			vm_object_owner_compressed_update(object,
			    compressed_count_delta);
		}
		counter_inc(&vm_statistics_compressions);
	} else {
		/* compression failed: keep the page resident and active */
		vm_page_wakeup_done(object, m);

		vm_page_lockspin_queues();

		vm_page_activate(m);
		vm_pageout_vminfo.vm_compressor_failed++;

		vm_page_unlock_queues();
	}
	vm_object_activity_end(object);
	vm_object_unlock(object);

	return retval;
}
4531
4532
4533 static void
4534 vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4535 {
4536 uint32_t policy;
4537
4538 if (hibernate_cleaning_in_progress == TRUE) {
4539 req_lowpriority = FALSE;
4540 }
4541
4542 if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4543 vm_page_unlock_queues();
4544
4545 if (req_lowpriority == TRUE) {
4546 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4547 DTRACE_VM(laundrythrottle);
4548 } else {
4549 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4550 DTRACE_VM(laundryunthrottle);
4551 }
4552 proc_set_thread_policy(ethr->pgo_iothread,
4553 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4554
4555 vm_page_lock_queues();
4556 ethr->q->pgo_lowpriority = req_lowpriority;
4557 }
4558 }
4559
/*
 * Startup entry for the external pageout I/O thread: performs one-time
 * thread configuration, then enters the continuation loop.  Never returns.
 */
OS_NORETURN
static void
vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
{
	thread_t self = current_thread();

	/* allow this thread to dip into the VM-privileged reserve */
	self->options |= TH_OPT_VMPRIV;

	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);

	/* start out I/O-throttled; vm_pageout_adjust_eq_iothrottle() may change this */
	proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
	    TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);

	vm_page_lock_queues();

	vm_pageout_queue_external.pgo_lowpriority = TRUE;
	vm_pageout_queue_external.pgo_inited = TRUE;

	vm_page_unlock_queues();

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	vm_pageout_iothread_external_continue(ethr, 0);
	/*NOTREACHED*/
}
4587
4588
/*
 * Startup entry for a compressor I/O thread: performs one-time thread
 * configuration (privilege, queue init, binding, naming), then enters
 * the continuation loop.  Never returns.
 */
OS_NORETURN
static void
vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
{
	thread_t self = current_thread();

	/* allow this thread to dip into the VM-privileged reserve */
	self->options |= TH_OPT_VMPRIV;

	vm_page_lock_queues();

	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
	vm_pageout_queue_internal.pgo_inited = TRUE;

#if DEVELOPMENT || DEBUG
	/* the benchmark queue mirrors the internal queue's settings */
	vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
	vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
	vm_pageout_queue_benchmark.pgo_busy = FALSE;
#endif /* DEVELOPMENT || DEBUG */

	vm_page_unlock_queues();

	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
		thread_vm_bind_group_add();
	}

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
	if (vm_compressor_ebound) {
		/*
		 * Use the soft bound option for vm_compressor to allow it to run on
		 * P-cores if E-cluster is unavailable.
		 */
		thread_bind_cluster_type(self, 'E', true);
	}
#endif /* __AMP__ */

	thread_set_thread_name(current_thread(), "VM_compressor");
#if DEVELOPMENT || DEBUG
	/* start the min-pages stat at the maximum so the first batch sets it */
	vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
#endif
	vm_pageout_iothread_internal_continue(cthr, 0);

	/*NOTREACHED*/
}
4636
4637 kern_return_t
4638 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4639 {
4640 if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4641 return KERN_SUCCESS;
4642 } else {
4643 return KERN_FAILURE; /* Already set */
4644 }
4645 }
4646
4647 extern boolean_t memorystatus_manual_testing_on;
4648 extern unsigned int memorystatus_level;
4649
4650
4651 #if VM_PRESSURE_EVENTS
4652
4653 boolean_t vm_pressure_events_enabled = FALSE;
4654
4655 extern uint64_t next_warning_notification_sent_at_ts;
4656 extern uint64_t next_critical_notification_sent_at_ts;
4657
4658 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */
4659
4660 /*
4661 * The last time there was change in pressure level OR we forced a check
4662 * because the system is stuck in a non-normal pressure level.
4663 */
4664 uint64_t vm_pressure_last_level_transition_abs = 0;
4665
4666 /*
4667 * This is how the long the system waits 'stuck' in an unchanged non-normal pressure
4668 * level before resending out notifications for that level again.
4669 */
4670 int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4671
/*
 * Recompute the system memory-pressure level from the available page
 * count and, on a level change — or when the system has been stuck at a
 * non-normal level longer than vm_pressure_level_transition_threshold
 * minutes — wake the pressure notification thread and level-change
 * waiters.
 */
void
vm_pressure_response(void)
{
	vm_pressure_level_t old_level = kVMPressureNormal;
	int new_level = -1;
	unsigned int total_pages;
	uint64_t available_memory = 0;
	uint64_t curr_ts, abs_time_since_level_transition, time_in_ns;
	bool force_check = false;
	int time_in_mins;


	if (vm_pressure_events_enabled == FALSE) {
		return;
	}

	available_memory = (uint64_t) memorystatus_get_available_page_count();

	total_pages = (unsigned int) atop_64(max_mem);
#if CONFIG_SECLUDED_MEMORY
	/* secluded pages are excluded from the percentage computation */
	total_pages -= vm_page_secluded_count;
#endif /* CONFIG_SECLUDED_MEMORY */
	/* percentage of memory still available, exported via memorystatus_level */
	memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);

	if (memorystatus_manual_testing_on) {
		/* manual testing drives the pressure level directly */
		return;
	}

	curr_ts = mach_absolute_time();
	abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;

	absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
	time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
	/* stuck at this level too long? force a re-notification */
	force_check = (time_in_mins >= vm_pressure_level_transition_threshold);

	old_level = memorystatus_vm_pressure_level;

	switch (memorystatus_vm_pressure_level) {
	case kVMPressureNormal:
	{
		if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			new_level = kVMPressureCritical;
		} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
			new_level = kVMPressureWarning;
		}
		break;
	}

	case kVMPressureWarning:
	case kVMPressureUrgent:
	{
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			new_level = kVMPressureNormal;
		} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			new_level = kVMPressureCritical;
		} else if (force_check) {
			/* re-assert the current level and re-arm the warning timestamp */
			new_level = kVMPressureWarning;
			next_warning_notification_sent_at_ts = curr_ts;
		}
		break;
	}

	case kVMPressureCritical:
	{
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			new_level = kVMPressureNormal;
		} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
			new_level = kVMPressureWarning;
		} else if (force_check) {
			/* re-assert the current level and re-arm the critical timestamp */
			new_level = kVMPressureCritical;
			next_critical_notification_sent_at_ts = curr_ts;
		}
		break;
	}

	default:
		return;
	}

	if (new_level != -1 || force_check) {
		if (new_level != -1) {
			memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;

			if (new_level != (int) old_level) {
				VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
				    new_level, old_level, 0, 0);
			}
		} else {
			VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
			    new_level, old_level, force_check, 0);
		}

		if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
			/*
			 * We don't want to schedule a wakeup while hibernation is in progress
			 * because that could collide with checks for non-monotonicity in the scheduler.
			 * We do however do all the updates to memorystatus_vm_pressure_level because
			 * we _might_ want to use that for decisions regarding which pages or how
			 * many pages we want to dump in hibernation.
			 */
			return;
		}

		if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
			if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
				thread_wakeup(&vm_pressure_thread);
			}

			if (old_level != memorystatus_vm_pressure_level) {
				thread_wakeup(&vm_pageout_state.vm_pressure_changed);
			}
			vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
		}
	}
}
4787 #endif /* VM_PRESSURE_EVENTS */
4788
4789
4790 /**
4791 * Called by a kernel thread to ask if a number of pages may be wired.
4792 */
4793 kern_return_t
4794 mach_vm_wire_level_monitor(int64_t requested_pages)
4795 {
4796 if (requested_pages <= 0) {
4797 return KERN_INVALID_ARGUMENT;
4798 }
4799
4800 const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4801 /**
4802 * Available pages can be negative in the case where more system memory is
4803 * wired than the threshold, so we must use a signed integer.
4804 */
4805 const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4806
4807 if (requested_pages > available_pages) {
4808 return KERN_RESOURCE_SHORTAGE;
4809 }
4810 return KERN_SUCCESS;
4811 }
4812
4813 /*
4814 * Function called by a kernel thread to either get the current pressure level or
4815 * wait until memory pressure changes from a given level.
4816 */
4817 kern_return_t
4818 mach_vm_pressure_level_monitor(boolean_t wait_for_pressure, unsigned int *pressure_level)
4819 {
4820 #if !VM_PRESSURE_EVENTS
4821 (void)wait_for_pressure;
4822 (void)pressure_level;
4823 return KERN_NOT_SUPPORTED;
4824 #else /* VM_PRESSURE_EVENTS */
4825
4826 uint32_t *waiters = NULL;
4827 wait_result_t wr = 0;
4828 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4829
4830 if (pressure_level == NULL) {
4831 return KERN_INVALID_ARGUMENT;
4832 }
4833 if (!wait_for_pressure && (*pressure_level == kVMPressureBackgroundJetsam ||
4834 *pressure_level == kVMPressureForegroundJetsam)) {
4835 return KERN_INVALID_ARGUMENT;
4836 }
4837
4838 if (wait_for_pressure) {
4839 switch (*pressure_level) {
4840 case kVMPressureForegroundJetsam:
4841 case kVMPressureBackgroundJetsam:
4842
4843 if (*pressure_level == kVMPressureForegroundJetsam) {
4844 waiters = &memorystatus_jetsam_fg_band_waiters;
4845 } else {
4846 /* kVMPressureBackgroundJetsam */
4847 waiters = &memorystatus_jetsam_bg_band_waiters;
4848 }
4849
4850 lck_mtx_lock(&memorystatus_jetsam_broadcast_lock);
4851 wr = assert_wait((event_t)waiters, THREAD_INTERRUPTIBLE);
4852 if (wr == THREAD_WAITING) {
4853 *waiters += 1;
4854 lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4855 wr = thread_block(THREAD_CONTINUE_NULL);
4856 } else {
4857 lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4858 }
4859
4860 if (wr != THREAD_AWAKENED) {
4861 return KERN_ABORTED;
4862 }
4863
4864 return KERN_SUCCESS;
4865 case kVMPressureNormal:
4866 case kVMPressureWarning:
4867 case kVMPressureUrgent:
4868 case kVMPressureCritical:
4869 while (old_level == *pressure_level) {
4870 wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4871 THREAD_INTERRUPTIBLE);
4872 if (wr == THREAD_WAITING) {
4873 wr = thread_block(THREAD_CONTINUE_NULL);
4874 }
4875 if (wr == THREAD_INTERRUPTED) {
4876 return KERN_ABORTED;
4877 }
4878
4879 if (wr == THREAD_AWAKENED) {
4880 old_level = memorystatus_vm_pressure_level;
4881 }
4882 }
4883 break;
4884 default:
4885 return KERN_INVALID_ARGUMENT;
4886 }
4887 }
4888
4889 *pressure_level = old_level;
4890 return KERN_SUCCESS;
4891 #endif /* VM_PRESSURE_EVENTS */
4892 }
4893
4894 #if VM_PRESSURE_EVENTS
/*
 * Continuation-style body of the dedicated VM pressure thread.
 *
 * The first invocation only performs one-time setup (thread group, name)
 * and then parks on &vm_pressure_thread; each subsequent invocation (via
 * the thread_block continuation) processes pending pressure events before
 * parking again. vm_pressure_thread_running brackets the event processing
 * so vm_pressure_response() can avoid redundant wakeups.
 */
void
vm_pressure_thread(void)
{
	static boolean_t thread_initialized = FALSE;

	if (thread_initialized == TRUE) {
		vm_pageout_state.vm_pressure_thread_running = TRUE;
		consider_vm_pressure_events();
		vm_pageout_state.vm_pressure_thread_running = FALSE;
	}

	/*
	 * NOTE(review): this setup code sits on the continuation path and so
	 * re-runs on every wakeup, not just the first — presumably
	 * thread_group_vm_add()/thread_set_thread_name() tolerate repetition;
	 * confirm before relying on single-shot semantics.
	 */
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	thread_set_thread_name(current_thread(), "VM_pressure");
	thread_initialized = TRUE;
	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
	thread_block((thread_continue_t)vm_pressure_thread);
}
4915 #endif /* VM_PRESSURE_EVENTS */
4916
4917
4918 /*
4919 * called once per-second via "compute_averages"
4920 */
4921 void
4922 compute_pageout_gc_throttle(__unused void *arg)
4923 {
4924 if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4925 vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4926
4927 thread_wakeup(VM_PAGEOUT_GC_EVENT);
4928 }
4929 }
4930
4931 /*
4932 * vm_pageout_garbage_collect can also be called when the zone allocator needs
4933 * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4934 * jetsams. We need to check if the zone map size is above its jetsam limit to
4935 * decide if this was indeed the case.
4936 *
4937 * We need to do this on a different thread because of the following reasons:
4938 *
4939 * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4940 * itself causing the system to hang. We perform synchronous jetsams if we're
4941 * leaking in the VM map entries zone, so the leaking process could be doing a
4942 * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4943 * jetsam itself. We also need the vm_map lock on the process termination path,
4944 * which would now lead the dying process to deadlock against itself.
4945 *
4946 * 2. The jetsam path might need to allocate zone memory itself. We could try
4947 * using the non-blocking variant of zalloc for this path, but we can still
4948 * end up trying to do a kmem_alloc when the zone maps are almost full.
4949 */
__dead2
void
vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
{
	/*
	 * step is VM_PAGEOUT_GC_INIT exactly once (thread creation);
	 * every later wakeup re-enters with VM_PAGEOUT_GC_COLLECT.
	 */
	assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);

	if (step == VM_PAGEOUT_GC_INIT) {
		/* first time being called is not about GC */
#if CONFIG_THREAD_GROUPS
		thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */
	} else if (zone_map_nearing_exhaustion()) {
		/*
		 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
		 *
		 * Bail out after calling zone_gc (which triggers the
		 * zone-map-exhaustion jetsams). If we fall through, the subsequent
		 * operations that clear out a bunch of caches might allocate zone
		 * memory themselves (for eg. vm_map operations would need VM map
		 * entries). Since the zone map is almost full at this point, we
		 * could end up with a panic. We just need to quickly jetsam a
		 * process and exit here.
		 *
		 * It could so happen that we were woken up to relieve memory
		 * pressure and the zone map also happened to be near its limit at
		 * the time, in which case we'll skip out early. But that should be
		 * ok; if memory pressure persists, the thread will simply be woken
		 * up again.
		 */
		zone_gc(ZONE_GC_JETSAM);
	} else {
		/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
		boolean_t buf_large_zfree = FALSE;
		boolean_t first_try = TRUE;

		stack_collect();

		consider_machine_collect();
#if CONFIG_MBUF_MCACHE
		mbuf_drain(FALSE);
#endif /* CONFIG_MBUF_MCACHE */

		/*
		 * Keep trimming as long as the buffer cache keeps giving back
		 * large zone elements and we are still short of free pages.
		 */
		do {
			if (consider_buffer_cache_collect != NULL) {
				buf_large_zfree = (*consider_buffer_cache_collect)(0);
			}
			if (first_try == TRUE || buf_large_zfree == TRUE) {
				/*
				 * zone_gc should be last, because the other operations
				 * might return memory to zones.
				 */
				zone_gc(ZONE_GC_TRIM);
			}
			first_try = FALSE;
		} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);

		consider_machine_adjust();
	}

	/* park until the next wakeup; re-enter this function as the continuation */
	assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);

	thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
	__builtin_unreachable();
}
5014
5015
5016 #if VM_PAGE_BUCKETS_CHECK
5017 #if VM_PAGE_FAKE_BUCKETS
5018 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
5019 #endif /* VM_PAGE_FAKE_BUCKETS */
5020 #endif /* VM_PAGE_BUCKETS_CHECK */
5021
5022
5023
5024 void
5025 vm_set_restrictions(unsigned int num_cpus)
5026 {
5027 int vm_restricted_to_single_processor = 0;
5028
5029 if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
5030 kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
5031 vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
5032 } else {
5033 assert(num_cpus > 0);
5034
5035 if (num_cpus <= 3) {
5036 /*
5037 * on systems with a limited number of CPUS, bind the
5038 * 4 major threads that can free memory and that tend to use
5039 * a fair bit of CPU under pressured conditions to a single processor.
5040 * This insures that these threads don't hog all of the available CPUs
5041 * (important for camera launch), while allowing them to run independently
5042 * w/r to locks... the 4 threads are
5043 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
5044 * vm_compressor_swap_trigger_thread (minor and major compactions),
5045 * memorystatus_thread (jetsams).
5046 *
5047 * the first time the thread is run, it is responsible for checking the
5048 * state of vm_restricted_to_single_processor, and if TRUE it calls
5049 * thread_bind_master... someday this should be replaced with a group
5050 * scheduling mechanism and KPI.
5051 */
5052 vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5053 } else {
5054 vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5055 }
5056 }
5057 }
5058
5059 /*
5060 * Set up vm_config based on the vm_compressor_mode.
5061 * Must run BEFORE the pageout thread starts up.
5062 */
/*
 * Translate the global vm_compressor_mode into the vm_config feature flags
 * (compressor/swap present vs. active). Deprecated modes are remapped (via
 * explicit fallthrough) to their modern equivalents; unknown modes leave
 * vm_config all-FALSE after logging.
 */
__startup_func
void
vm_config_init(void)
{
	bzero(&vm_config, sizeof(vm_config));

	switch (vm_compressor_mode) {
	case VM_PAGER_DEFAULT:
		printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
		OS_FALLTHROUGH;

	case VM_PAGER_COMPRESSOR_WITH_SWAP:
		vm_config.compressor_is_present = TRUE;
		vm_config.swap_is_present = TRUE;
		vm_config.compressor_is_active = TRUE;
		vm_config.swap_is_active = TRUE;
		break;

	case VM_PAGER_COMPRESSOR_NO_SWAP:
		/* swap machinery exists but is not actively used */
		vm_config.compressor_is_present = TRUE;
		vm_config.swap_is_present = TRUE;
		vm_config.compressor_is_active = TRUE;
		break;

	case VM_PAGER_FREEZER_DEFAULT:
		printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
		OS_FALLTHROUGH;

	case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
		vm_config.compressor_is_present = TRUE;
		vm_config.swap_is_present = TRUE;
		break;

	case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
		/* swap is reserved for the freezer path only */
		vm_config.compressor_is_present = TRUE;
		vm_config.swap_is_present = TRUE;
		vm_config.compressor_is_active = TRUE;
		vm_config.freezer_swap_is_active = TRUE;
		break;

	case VM_PAGER_NOT_CONFIGURED:
		break;

	default:
		printf("unknown compressor mode - %x\n", vm_compressor_mode);
		break;
	}
}
5111
/*
 * Create (but do not start) the garbage-collect thread during early boot.
 * The thread is launched later from vm_pageout(); its first invocation runs
 * vm_pageout_garbage_collect with VM_PAGEOUT_GC_INIT.
 */
__startup_func
static void
vm_pageout_create_gc_thread(void)
{
	thread_t thread;

	if (kernel_thread_create(vm_pageout_garbage_collect,
	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
		panic("vm_pageout_garbage_collect: create failed");
	}
	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
	/* pin a stack so the GC thread can always run, even under memory pressure */
	if (thread->reserved_stack == 0) {
		assert(thread->kernel_stack);
		thread->reserved_stack = thread->kernel_stack;
	}

	/* thread is started in vm_pageout() */
	vm_pageout_gc_thread = thread;
}
STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5132
/*
 * Entry point of the main pageout-scan thread.
 *
 * Performs one-time setup — thread privileges/priority/binding, default
 * paging parameters, pageout queue initialization — then starts the helper
 * threads (external iothread, GC thread, pressure thread) and finally falls
 * into vm_pageout_continue(), which never returns.
 */
void
vm_pageout(void)
{
	thread_t self = current_thread();
	thread_t thread;
	kern_return_t result;
	spl_t s;

	/*
	 * Set thread privileges.
	 */
	s = splsched();

#if CONFIG_VPS_DYNAMIC_PRIO
	if (vps_dynamic_priority_enabled) {
		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
		thread_set_eager_preempt(self);
	} else {
		sched_set_kernel_thread_priority(self, BASEPRI_VM);
	}
#else /* CONFIG_VPS_DYNAMIC_PRIO */
	sched_set_kernel_thread_priority(self, BASEPRI_VM);
#endif /* CONFIG_VPS_DYNAMIC_PRIO */

	thread_lock(self);
	self->options |= TH_OPT_VMPRIV;
	thread_unlock(self);

	/* keep a stack pinned so pageout can always make progress */
	if (!self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
	    !vps_dynamic_priority_enabled) {
		thread_vm_bind_group_add();
	}


#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
	if (vm_pgo_pbound) {
		/*
		 * Use the soft bound option for vm pageout to allow it to run on
		 * E-cores if P-cluster is unavailable.
		 */
		thread_bind_cluster_type(self, 'P', true);
	}
#endif /* __AMP__ */

	PE_parse_boot_argn("vmpgo_protect_realtime",
	    &vm_pageout_protect_realtime,
	    sizeof(vm_pageout_protect_realtime));
	splx(s);

	thread_set_thread_name(current_thread(), "VM_pageout_scan");

	/*
	 * Initialize some paging parameters.
	 */

	vm_pageout_state.vm_pressure_thread_running = FALSE;
	vm_pageout_state.vm_pressure_changed = FALSE;
	vm_pageout_state.memorystatus_purge_on_warning = 2;
	vm_pageout_state.memorystatus_purge_on_urgent = 5;
	vm_pageout_state.memorystatus_purge_on_critical = 8;
	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
	vm_pageout_state.vm_page_speculative_percentage = 5;
	vm_pageout_state.vm_page_speculative_target = 0;

	vm_pageout_state.vm_pageout_swap_wait = 0;
	vm_pageout_state.vm_pageout_idle_wait = 0;
	vm_pageout_state.vm_pageout_empty_wait = 0;
	vm_pageout_state.vm_pageout_burst_wait = 0;
	vm_pageout_state.vm_pageout_deadlock_wait = 0;
	vm_pageout_state.vm_pageout_deadlock_relief = 0;
	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;

	vm_pageout_state.vm_pageout_inactive = 0;
	vm_pageout_state.vm_pageout_inactive_used = 0;
	vm_pageout_state.vm_pageout_inactive_clean = 0;

	vm_pageout_state.vm_memory_pressure = 0;
	vm_pageout_state.vm_page_filecache_min = 0;
#if CONFIG_JETSAM
	vm_pageout_state.vm_page_filecache_min_divisor = 70;
	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
#else
	vm_pageout_state.vm_page_filecache_min_divisor = 27;
	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
#endif
	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;

	vm_pageout_state.vm_pageout_considered_page_last = 0;

	/* fill in defaults for any wait parameter not set above / by boot-args */
	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
	}

	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
	}

	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
	}

	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
	}

	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
	}

	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
	}

	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
	}
	/*
	 * even if we've already called vm_page_free_reserve
	 * call it again here to insure that the targets are
	 * accurately calculated (it uses vm_page_free_count_init)
	 * calling it with an arg of 0 will not change the reserve
	 * but will re-calculate free_min and free_target
	 */
	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
	} else {
		vm_page_free_reserve(0);
	}

	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));

	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;

	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);

#if DEVELOPMENT || DEBUG
	bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
	vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
#endif /* DEVELOPMENT || DEBUG */


	/* internal pageout thread started when default pager registered first time */
	/* external pageout and garbage collection threads started here */
	struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
	ethr->id = 0;
	ethr->q = &vm_pageout_queue_external;
	/*
	 * The swapout cheads are never used in external_state; they are only
	 * used in internal_state, for the compressor.
	 */
	ethr->current_early_swapout_chead = NULL;
	ethr->current_regular_swapout_chead = NULL;
	ethr->current_late_swapout_chead = NULL;
	ethr->scratch_buf = NULL;
#if DEVELOPMENT || DEBUG
	ethr->benchmark_q = NULL;
#endif /* DEVELOPMENT || DEBUG */
	sched_cond_init(&(ethr->pgo_wakeup));

	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
	    (void *)ethr, BASEPRI_VM,
	    &(ethr->pgo_iothread));
	if (result != KERN_SUCCESS) {
		panic("vm_pageout: Unable to create external thread (%d)\n", result);
	}
	thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");

	/* launch the GC thread created earlier by vm_pageout_create_gc_thread() */
	thread_mtx_lock(vm_pageout_gc_thread );
	thread_start(vm_pageout_gc_thread );
	thread_mtx_unlock(vm_pageout_gc_thread);

#if VM_PRESSURE_EVENTS
	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
	    BASEPRI_DEFAULT,
	    &thread);

	if (result != KERN_SUCCESS) {
		panic("vm_pressure_thread: create failed");
	}

	thread_deallocate(thread);
#endif

	vm_object_reaper_init();


	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
		vm_compressor_init();
	}

#if VM_PRESSURE_EVENTS
	vm_pressure_events_enabled = TRUE;
#endif /* VM_PRESSURE_EVENTS */

#if CONFIG_PHANTOM_CACHE
	vm_phantom_cache_init();
#endif
#if VM_PAGE_BUCKETS_CHECK
#if VM_PAGE_FAKE_BUCKETS
	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
	    (uint64_t) vm_page_fake_buckets_start,
	    (uint64_t) vm_page_fake_buckets_end);
	pmap_protect(kernel_pmap,
	    vm_page_fake_buckets_start,
	    vm_page_fake_buckets_end,
	    VM_PROT_READ);
	// *(char *) vm_page_fake_buckets_start = 'x';  /* panic! */
#endif /* VM_PAGE_FAKE_BUCKETS */
#endif /* VM_PAGE_BUCKETS_CHECK */

#if VM_OBJECT_TRACKING
	vm_object_tracking_init();
#endif /* VM_OBJECT_TRACKING */

#if __arm64__
	// vm_tests();
#endif /* __arm64__ */

	vm_pageout_continue();

	/*
	 * Unreached code!
	 *
	 * The vm_pageout_continue() call above never returns, so the code below is never
	 * executed. We take advantage of this to declare several DTrace VM related probe
	 * points that our kernel doesn't have an analog for. These are probe points that
	 * exist in Solaris and are in the DTrace documentation, so people may have written
	 * scripts that use them. Declaring the probe points here means their scripts will
	 * compile and execute which we want for portability of the scripts, but since this
	 * section of code is never reached, the probe points will simply never fire. Yes,
	 * this is basically a hack. The problem is the DTrace probe points were chosen with
	 * Solaris specific VM events in mind, not portability to different VM implementations.
	 */

	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
	/*NOTREACHED*/
}
5384
5385
5386
/*
 * Start the internal (compressor) pageout iothreads.
 *
 * Picks a compressor thread count from the platform/CPU count (optionally
 * overridden by the "vmcomp_threads" boot-arg, clamped to a sane range),
 * sizes the internal laundry queue accordingly, allocates one scratch
 * buffer per thread out of a single permanent kmem allocation, and spawns
 * the vm_pageout_iothread_internal threads.
 */
kern_return_t
vm_pageout_internal_start(void)
{
	kern_return_t result = KERN_SUCCESS;
	host_basic_info_data_t hinfo;
	vm_offset_t buf, bufsize;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
#define BSD_HOST 1
	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);

	assert(hinfo.max_cpus > 0);

#if !XNU_TARGET_OS_OSX
	vm_pageout_state.vm_compressor_thread_count = 1;
#else /* !XNU_TARGET_OS_OSX */
	if (hinfo.max_cpus > 4) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	} else {
		vm_pageout_state.vm_compressor_thread_count = 1;
	}
#endif /* !XNU_TARGET_OS_OSX */
#if __AMP__
	if (vm_compressor_ebound) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	}
#endif
	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
	    sizeof(vm_pageout_state.vm_compressor_thread_count));

	/* did we get from the bootargs an unreasonable number? */
	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
	}
	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
		vm_pageout_state.vm_compressor_thread_count = 1;
	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
	}

	/* scale the laundry limit with the number of compressor threads */
	vm_pageout_queue_internal.pgo_maxlaundry =
	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;

	PE_parse_boot_argn("vmpgoi_maxlaundry",
	    &vm_pageout_queue_internal.pgo_maxlaundry,
	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));

#if DEVELOPMENT || DEBUG
	// Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
#endif /* DEVELOPMENT || DEBUG */

	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;

	/* one contiguous permanent allocation, carved into per-thread scratch buffers */
	kmem_alloc(kernel_map, &buf,
	    bufsize * vm_pageout_state.vm_compressor_thread_count,
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_COMPRESSOR);

	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
		struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
		iq->id = i;
		iq->q = &vm_pageout_queue_internal;
		iq->current_early_swapout_chead = NULL;
		iq->current_regular_swapout_chead = NULL;
		iq->current_late_swapout_chead = NULL;
		iq->scratch_buf = (char *)(buf + i * bufsize);
#if DEVELOPMENT || DEBUG
		iq->benchmark_q = &vm_pageout_queue_benchmark;
#endif /* DEVELOPMENT || DEBUG */
		sched_cond_init(&(iq->pgo_wakeup));
		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
		    (void *)iq, BASEPRI_VM,
		    &(iq->pgo_iothread));

		if (result != KERN_SUCCESS) {
			panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
		}
	}
	return result;
}
5470
5471 #if CONFIG_IOSCHED
5472 /*
5473 * To support I/O Expedite for compressed files we mark the upls with special flags.
5474 * The way decmpfs works is that we create a big upl which marks all the pages needed to
5475 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5476 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5477 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5478 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5479 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
 * by the req upl lock (the reverse link doesn't need synchronization, since we never inspect this link
5481 * unless the real I/O upl is being destroyed).
5482 */
5483
5484
/*
 * Link a real-I/O UPL (`upl`) to its originating decmpfs request UPL
 * (`src_upl`) so I/O expedites can be propagated.
 *
 * The forward link (src_upl->decmp_io_upl) is protected by src_upl's lock;
 * a reference is taken on src_upl for the lifetime of the link (dropped in
 * upl_destroy). If a live real-I/O UPL is already linked, this new one is
 * ignored — see the comment below.
 */
static void
upl_set_decmp_info(upl_t upl, upl_t src_upl)
{
	assert((src_upl->flags & UPL_DECMP_REQ) != 0);

	upl_lock(src_upl);
	if (src_upl->decmp_io_upl) {
		/*
		 * If there is already an alive real I/O UPL, ignore this new UPL.
		 * This case should rarely happen and even if it does, it just means
		 * that we might issue a spurious expedite which the driver is expected
		 * to handle.
		 */
		upl_unlock(src_upl);
		return;
	}
	src_upl->decmp_io_upl = (void *)upl;
	src_upl->ref_count++;

	upl->flags |= UPL_DECMP_REAL_IO;
	upl->decmp_io_upl = (void *)src_upl;
	upl_unlock(src_upl);
}
5508 #endif /* CONFIG_IOSCHED */
5509
/* runtime flag mirroring the UPL_DEBUG build setting (forces UPL tracking) */
#if UPL_DEBUG
int upl_debug_enabled = 1;
#else
int upl_debug_enabled = 0;
#endif
5515
/*
 * Allocate and initialize a UPL for `size` bytes (page aligned).
 *
 * `type` selects structural options (internal page-info array, lite bitmap,
 * I/O tracking, expedite support); `flags` seeds upl->flags. The upl struct
 * and its trailing page-info array come from a single kalloc_type allocation
 * that upl_destroy must free with the matching element count. Returns the
 * new UPL with ref_count == 1.
 */
static upl_t
upl_create(int type, int flags, upl_size_t size)
{
	uint32_t pages = (uint32_t)atop(round_page_32(size));
	upl_t upl;

	assert(page_aligned(size));

	/*
	 * FIXME: this code assumes the allocation always succeeds,
	 * however `pages` can be up to MAX_UPL_SIZE.
	 *
	 * The allocation size is above 32k (resp. 128k)
	 * on 16k pages (resp. 4k), which kalloc might fail
	 * to allocate.
	 */
	upl = kalloc_type(struct upl, struct upl_page_info,
	    (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
	if (type & UPL_CREATE_INTERNAL) {
		flags |= UPL_INTERNAL;
	}

	if (type & UPL_CREATE_LITE) {
		flags |= UPL_LITE;
		if (pages) {
			/* one bit per page; tracks which pages the lite UPL covers */
			upl->lite_list = bitmap_alloc(pages);
		}
	}

	upl->flags = flags;
	upl->ref_count = 1;
	upl_lock_init(upl);
#if CONFIG_IOSCHED
	if (type & UPL_CREATE_IO_TRACKING) {
		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
	}

	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
		/* Only support expedite on internal UPLs */
		thread_t curthread = current_thread();
		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
		    Z_WAITOK | Z_ZERO);
		upl->flags |= UPL_EXPEDITE_SUPPORTED;
		/* if this thread is servicing a decmpfs request, link to its req UPL */
		if (curthread->decmp_upl != NULL) {
			upl_set_decmp_info(upl, curthread->decmp_upl);
		}
	}
#endif
#if CONFIG_IOSCHED || UPL_DEBUG
	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
		upl->upl_creator = current_thread();
		upl->flags |= UPL_TRACKED_BY_OBJECT;
	}
#endif

#if UPL_DEBUG
	upl->upl_create_btref = btref_get(__builtin_frame_address(0), 0);
#endif /* UPL_DEBUG */

	return upl;
}
5577
/*
 * Tear down a UPL whose last reference has been dropped (called from
 * upl_deallocate). Unlinks any decmpfs request UPL, removes the UPL from
 * its object's tracking queue, releases the shadow object reference, and
 * frees the lite bitmap, reprio info, and the upl/page-info allocation
 * (the kfree_type element count must mirror upl_create's kalloc_type).
 */
static void
upl_destroy(upl_t upl)
{
	uint32_t pages;

//	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);

	/* external references must be dropped before the UPL can die */
	if (upl->ext_ref_count) {
		panic("upl(%p) ext_ref_count", upl);
	}

#if CONFIG_IOSCHED
	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
		/* break the link to the decmpfs request UPL and drop the ref taken in upl_set_decmp_info */
		upl_t src_upl;
		src_upl = upl->decmp_io_upl;
		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
		upl_lock(src_upl);
		src_upl->decmp_io_upl = NULL;
		upl_unlock(src_upl);
		upl_deallocate(src_upl);
	}
#endif /* CONFIG_IOSCHED */

#if CONFIG_IOSCHED || UPL_DEBUG
	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
	    !(upl->flags & UPL_VECTOR)) {
		vm_object_t object;

		if (upl->flags & UPL_SHADOWED) {
			object = upl->map_object->shadow;
		} else {
			object = upl->map_object;
		}

		vm_object_lock(object);
		queue_remove(&object->uplq, upl, upl_t, uplq);
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
		vm_object_unlock(object);
	}
#endif
	/*
	 * drop a reference on the map_object whether or
	 * not a pageout object is inserted
	 */
	if (upl->flags & UPL_SHADOWED) {
		vm_object_deallocate(upl->map_object);
	}

	if (upl->flags & UPL_DEVICE_MEMORY) {
		pages = 1;
	} else {
		pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
	}

	upl_lock_destroy(upl);

#if CONFIG_IOSCHED
	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
	}
#endif

#if UPL_DEBUG
	for (int i = 0; i < upl->upl_commit_index; i++) {
		btref_put(upl->upl_commit_records[i].c_btref);
	}
	btref_put(upl->upl_create_btref);
#endif /* UPL_DEBUG */

	if ((upl->flags & UPL_LITE) && pages) {
		bitmap_free(upl->lite_list, pages);
	}
	kfree_type(struct upl, struct upl_page_info,
	    (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
}
5654
/*
 * Drop one reference on a UPL; on the last reference, run the iodone
 * callout (if any) and destroy the UPL. The ref_count decrement happens
 * under the UPL lock; the lock is released before upl_destroy since the
 * lock itself is torn down there.
 */
void
upl_deallocate(upl_t upl)
{
	upl_lock(upl);

	if (--upl->ref_count == 0) {
		if (vector_upl_is_valid(upl)) {
			vector_upl_deallocate(upl);
		}
		upl_unlock(upl);

		if (upl->upl_iodone) {
			upl_callout_iodone(upl);
		}

		upl_destroy(upl);
	} else {
		upl_unlock(upl);
	}
}
5675
5676 #if CONFIG_IOSCHED
/*
 * Mark a tracked UPL as a decmpfs request UPL and remember it on the
 * creating thread, so real-I/O UPLs created by this thread can be linked
 * back to it (see upl_set_decmp_info). Only tracked UPLs record a creator,
 * hence the UPL_TRACKED_BY_OBJECT guard.
 */
void
upl_mark_decmp(upl_t upl)
{
	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
		upl->flags |= UPL_DECMP_REQ;
		upl->upl_creator->decmp_upl = (void *)upl;
	}
}
5685
/*
 * Clear the creating thread's decmpfs request-UPL pointer once the request
 * is complete. Note the UPL_DECMP_REQ flag itself is left set on the UPL.
 */
void
upl_unmark_decmp(upl_t upl)
{
	if (upl && (upl->flags & UPL_DECMP_REQ)) {
		upl->upl_creator->decmp_upl = NULL;
	}
}
5693
5694 #endif /* CONFIG_IOSCHED */
5695
/* a pageout queue is "backing up" once laundry reaches 80% of its maximum */
#define VM_PAGE_Q_BACKING_UP(q) \
	((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))

boolean_t must_throttle_writes(void);

/*
 * Returns TRUE when writers should be throttled: the external pageout
 * queue is backing up AND pageable external (file-backed) pages make up
 * more than 60% of available non-compressed memory.
 */
boolean_t
must_throttle_writes()
{
	if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
	    vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
		return TRUE;
	}

	return FALSE;
}
5711
/* count of Z_NOWAIT allocation misses in vm_page_delayed_work_get_ctx */
int vm_page_delayed_work_ctx_needed = 0;
KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);

/*
 * Boot-time setup: raise the dw_ctx_zone reserve so delayed-work contexts
 * are (almost) always available to the non-blocking allocation path.
 */
__startup_func
static void
vm_page_delayed_work_init_ctx(void)
{
	uint16_t min_delayed_work_ctx_allocated = 16;

	/*
	 * try really hard to always keep NCPU elements around in the zone
	 * in order for the UPL code to almost always get an element.
	 */
	if (min_delayed_work_ctx_allocated < zpercpu_count()) {
		min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
	}

	zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
}
STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5732
5733 struct vm_page_delayed_work*
5734 vm_page_delayed_work_get_ctx(void)
5735 {
5736 struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5737
5738 dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5739
5740 if (__probable(dw_ctx)) {
5741 dw_ctx->delayed_owner = current_thread();
5742 } else {
5743 vm_page_delayed_work_ctx_needed++;
5744 }
5745 return dw_ctx ? dw_ctx->dwp : NULL;
5746 }
5747
5748 void
5749 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5750 {
5751 struct vm_page_delayed_work_ctx *ldw_ctx;
5752
5753 ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5754 ldw_ctx->delayed_owner = NULL;
5755
5756 zfree(dw_ctx_zone, ldw_ctx);
5757 }
5758
5759 /*
5760 * Routine: vm_object_upl_request
5761 * Purpose:
5762 * Cause the population of a portion of a vm_object.
5763 * Depending on the nature of the request, the pages
 *		returned may contain valid data or be uninitialized.
5765 * A page list structure, listing the physical pages
5766 * will be returned upon request.
5767 * This function is called by the file system or any other
5768 * supplier of backing store to a pager.
5769 * IMPORTANT NOTE: The caller must still respect the relationship
5770 * between the vm_object and its backing memory object. The
5771 * caller MUST NOT substitute changes in the backing file
5772 * without first doing a memory_object_lock_request on the
 *		target range unless it is known that the pages are not
5774 * shared with another entity at the pager level.
5775 * Copy_in_to:
5776 * if a page list structure is present
5777 * return the mapped physical pages, where a
5778 * page is not present, return a non-initialized
5779 * one. If the no_sync bit is turned on, don't
5780 * call the pager unlock to synchronize with other
5781 * possible copies of the page. Leave pages busy
5782 * in the original object, if a page list structure
5783 * was specified. When a commit of the page list
5784 * pages is done, the dirty bit will be set for each one.
5785 * Copy_out_from:
5786 * If a page list structure is present, return
5787 * all mapped pages. Where a page does not exist
5788 * map a zero filled one. Leave pages busy in
5789 * the original object. If a page list structure
5790 * is not specified, this call is a no-op.
5791 *
5792 * Note: access of default pager objects has a rather interesting
5793 * twist. The caller of this routine, presumably the file system
5794 * page cache handling code, will never actually make a request
5795 * against a default pager backed object. Only the default
5796 * pager will make requests on backing store related vm_objects
5797 * In this way the default pager can maintain the relationship
5798 * between backing store files (abstract memory objects) and
5799 * the vm_objects (cache objects), they support.
5800 *
5801 */
5802
__private_extern__ kern_return_t
vm_object_upl_request(
	vm_object_t             object,
	vm_object_offset_t      offset,
	upl_size_t              size,
	upl_t                   *upl_ptr,
	upl_page_info_array_t   user_page_list,
	unsigned int            *page_list_count,
	upl_control_flags_t     cntrl_flags,
	vm_tag_t                tag)
{
	vm_page_t               dst_page = VM_PAGE_NULL;
	vm_object_offset_t      dst_offset;
	upl_size_t              xfer_size;
	unsigned int            size_in_pages;
	boolean_t               dirty;
	boolean_t               hw_dirty;
	upl_t                   upl = NULL;
	unsigned int            entry;
	vm_page_t               alias_page = NULL;
	int                     refmod_state = 0;
	vm_object_t             last_copy_object;
	uint32_t                last_copy_version;
	struct vm_page_delayed_work     dw_array;
	struct vm_page_delayed_work     *dwp, *dwp_start;
	bool                    dwp_finish_ctx = TRUE;
	int                     dw_count;
	int                     dw_limit;
	int                     io_tracking_flag = 0;
	int                     grab_options;
	int                     page_grab_count = 0;
	ppnum_t                 phys_page;
	pmap_flush_context      pmap_flush_context_storage;
	boolean_t               pmap_flushes_delayed = FALSE;
#if DEVELOPMENT || DEBUG
	task_t                  task = current_task();
#endif /* DEVELOPMENT || DEBUG */

	dwp_start = dwp = NULL;

	if (cntrl_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		return KERN_INVALID_VALUE;
	}
	if ((!object->internal) && (object->paging_offset != 0)) {
		panic("vm_object_upl_request: external object with non-zero paging offset");
	}
	if (object->phys_contiguous) {
		panic("vm_object_upl_request: contiguous object specified");
	}

	assertf(page_aligned(offset) && page_aligned(size),
	    "offset 0x%llx size 0x%x",
	    offset, size);

	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);

	/*
	 * Set up a delayed-work batch for deferred page-queue operations.
	 * If we can't get a context from the zone (Z_NOWAIT may fail),
	 * fall back to a single on-stack entry with a batch limit of 1;
	 * in that case there is no context to hand back at the end.
	 */
	dw_count = 0;
	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
	dwp_start = vm_page_delayed_work_get_ctx();
	if (dwp_start == NULL) {
		dwp_start = &dw_array;
		dw_limit = 1;
		dwp_finish_ctx = FALSE;
	}

	dwp = dwp_start;

	/* clip the request to the maximum UPL size */
	if (size > MAX_UPL_SIZE_BYTES) {
		size = MAX_UPL_SIZE_BYTES;
	}

	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	}

#if CONFIG_IOSCHED || UPL_DEBUG
	if (object->io_tracking || upl_debug_enabled) {
		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
	}
#endif
#if CONFIG_IOSCHED
	if (object->io_tracking) {
		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
	}
#endif

	/*
	 * Create the UPL itself; INTERNAL means the page-info array lives
	 * inside the UPL, LITE means pages are tracked via a bitmap rather
	 * than a shadow object.
	 */
	if (cntrl_flags & UPL_SET_INTERNAL) {
		if (cntrl_flags & UPL_SET_LITE) {
			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
		} else {
			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
		}
		user_page_list = size ? upl->page_list : NULL;
	} else {
		if (cntrl_flags & UPL_SET_LITE) {
			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
		} else {
			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
		}
	}
	*upl_ptr = upl;

	if (user_page_list) {
		user_page_list[0].device = FALSE;
	}

	if (cntrl_flags & UPL_SET_LITE) {
		upl->map_object = object;
	} else {
		/* non-LITE: build a shadow object to hold the UPL's pages */
		upl->map_object = vm_object_allocate(size);
		vm_object_lock(upl->map_object);
		/*
		 * No need to lock the new object: nobody else knows
		 * about it yet, so it's all ours so far.
		 */
		upl->map_object->shadow = object;
		VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
		VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
		upl->map_object->vo_shadow_offset = offset;
		upl->map_object->wimg_bits = object->wimg_bits;
		assertf(page_aligned(upl->map_object->vo_shadow_offset),
		    "object %p shadow_offset 0x%llx",
		    upl->map_object, upl->map_object->vo_shadow_offset);
		vm_object_unlock(upl->map_object);

		alias_page = vm_page_grab_fictitious(TRUE);

		upl->flags |= UPL_SHADOWED;
	}
	if (cntrl_flags & UPL_FOR_PAGEOUT) {
		upl->flags |= UPL_PAGEOUT;
	}

	vm_object_lock(object);
	vm_object_activity_begin(object);

	grab_options = 0;
#if CONFIG_SECLUDED_MEMORY
	if (object->can_grab_secluded) {
		grab_options |= VM_PAGE_GRAB_SECLUDED;
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * we can lock in the paging_offset once paging_in_progress is set
	 */
	upl->u_size = size;
	upl->u_offset = offset + object->paging_offset;

#if CONFIG_IOSCHED || UPL_DEBUG
	if (object->io_tracking || upl_debug_enabled) {
		vm_object_activity_begin(object);
		queue_enter(&object->uplq, upl, upl_t, uplq);
	}
#endif
	if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
		/*
		 * Honor copy-on-write obligations
		 *
		 * The caller is gathering these pages and
		 * might modify their contents. We need to
		 * make sure that the copy object has its own
		 * private copies of these pages before we let
		 * the caller modify them.
		 */
		vm_object_update(object,
		    offset,
		    size,
		    NULL,
		    NULL,
		    FALSE,              /* should_return */
		    MEMORY_OBJECT_COPY_SYNC,
		    VM_PROT_NO_CHANGE);

		VM_PAGEOUT_DEBUG(upl_cow, 1);
		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
	}
	/*
	 * remember which copy object we synchronized with
	 */
	last_copy_object = object->vo_copy;
	last_copy_version = object->vo_copy_version;
	entry = 0;

	xfer_size = size;
	dst_offset = offset;
	size_in_pages = size / PAGE_SIZE;

	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
		object->scan_collisions = 0;
	}

	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
		/*
		 * Writes are being throttled: stall this writer for a
		 * per-page delay (shorter on SSD) before populating the UPL.
		 */
		boolean_t       isSSD = FALSE;

#if !XNU_TARGET_OS_OSX
		isSSD = TRUE;
#else /* !XNU_TARGET_OS_OSX */
		vnode_pager_get_isSSD(object->pager, &isSSD);
#endif /* !XNU_TARGET_OS_OSX */
		vm_object_unlock(object);

		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);

		if (isSSD == TRUE) {
			delay(1000 * size_in_pages);
		} else {
			delay(5000 * size_in_pages);
		}
		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);

		vm_object_lock(object);
	}

	/*
	 * Main loop: walk the request one page at a time, populating the
	 * UPL.  The UPL_COPYOUT_FROM branch handles pageout-style requests
	 * (returning existing pages); the other branch handles pagein-style
	 * requests (allocating absent pages as needed).
	 */
	while (xfer_size) {
		dwp->dw_mask = 0;

		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
			/* need a fresh fictitious page; must drop the object lock to grab one */
			vm_object_unlock(object);
			alias_page = vm_page_grab_fictitious(TRUE);
			vm_object_lock(object);
		}
		if (cntrl_flags & UPL_COPYOUT_FROM) {
			upl->flags |= UPL_PAGE_SYNC_DONE;

			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
			    dst_page->vmp_fictitious ||
			    dst_page->vmp_absent ||
			    VMP_ERROR_GET(dst_page) ||
			    dst_page->vmp_cleaning ||
			    (VM_PAGE_WIRED(dst_page))) {
				/* page missing or not eligible for pageout; leave the slot empty */
				if (user_page_list) {
					user_page_list[entry].phys_addr = 0;
				}

				goto try_next_page;
			}
			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

			/*
			 * grab this up front...
			 * a high percentage of the time we're going to
			 * need the hardware modification state a bit later
			 * anyway... so we can eliminate an extra call into
			 * the pmap layer by grabbing it here and recording it
			 */
			if (dst_page->vmp_pmapped) {
				refmod_state = pmap_get_refmod(phys_page);
			} else {
				refmod_state = 0;
			}

			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
				/*
				 * page is on inactive list and referenced...
				 * reactivate it now... this gets it out of the
				 * way of vm_pageout_scan which would have to
				 * reactivate it upon tripping over it
				 */
				dwp->dw_mask |= DW_vm_page_activate;
			}
			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
				/*
				 * we're only asking for DIRTY pages to be returned
				 */
				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
					/*
					 * if we were the page stolen by vm_pageout_scan to be
					 * cleaned (as opposed to a buddy being clustered in
					 * or this request is not being driven by a PAGEOUT cluster
					 * then we only need to check for the page being dirty or
					 * precious to decide whether to return it
					 */
					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
						goto check_busy;
					}
					goto dont_return;
				}
				/*
				 * this is a request for a PAGEOUT cluster and this page
				 * is merely along for the ride as a 'buddy'... not only
				 * does it have to be dirty to be returned, but it also
				 * can't have been referenced recently...
				 */
				if ((hibernate_cleaning_in_progress == TRUE ||
				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
					goto check_busy;
				}
dont_return:
				/*
				 * if we reach here, we're not to return
				 * the page... go on to the next one
				 */
				if (dst_page->vmp_laundry == TRUE) {
					/*
					 * if we get here, the page is not 'cleaning' (filtered out above).
					 * since it has been referenced, remove it from the laundry
					 * so we don't pay the cost of an I/O to clean a page
					 * we're just going to take back
					 */
					vm_page_lockspin_queues();

					vm_pageout_steal_laundry(dst_page, TRUE);
					vm_page_activate(dst_page);

					vm_page_unlock_queues();
				}
				if (user_page_list) {
					user_page_list[entry].phys_addr = 0;
				}

				goto try_next_page;
			}
check_busy:
			if (dst_page->vmp_busy) {
				if (cntrl_flags & UPL_NOBLOCK) {
					/* caller won't wait; skip the busy page */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}
					dwp->dw_mask = 0;

					goto try_next_page;
				}
				/*
				 * someone else is playing with the
				 * page. We will have to wait.
				 */
				vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);

				/* retry the same offset after being woken */
				continue;
			}
			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
				vm_page_lockspin_queues();

				/* re-check under the queues lock: state may have changed while unlocked */
				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
					/*
					 * we've buddied up a page for a clustered pageout
					 * that has already been moved to the pageout
					 * queue by pageout_scan... we need to remove
					 * it from the queue and drop the laundry count
					 * on that queue
					 */
					vm_pageout_throttle_up(dst_page);
				}
				vm_page_unlock_queues();
			}
			hw_dirty = refmod_state & VM_MEM_MODIFIED;
			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;

			if (phys_page > upl->highest_page) {
				upl->highest_page = phys_page;
			}

			assert(!pmap_is_noencrypt(phys_page));

			if (cntrl_flags & UPL_SET_LITE) {
				unsigned int    pg_num;

				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
				bitmap_set(upl->lite_list, pg_num);

				if (hw_dirty) {
					/*
					 * batch the TLB flushes for the modify-bit clears;
					 * a single pmap_flush is issued after the loop
					 */
					if (pmap_flushes_delayed == FALSE) {
						pmap_flush_context_init(&pmap_flush_context_storage);
						pmap_flushes_delayed = TRUE;
					}
					pmap_clear_refmod_options(phys_page,
					    VM_MEM_MODIFIED,
					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
					    &pmap_flush_context_storage);
				}

				/*
				 * Mark original page as cleaning
				 * in place.
				 */
				dst_page->vmp_cleaning = TRUE;
				dst_page->vmp_precious = FALSE;
			} else {
				/*
				 * use pageclean setup, it is more
				 * convenient even for the pageout
				 * cases here
				 */
				vm_object_lock(upl->map_object);
				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
				vm_object_unlock(upl->map_object);

				alias_page->vmp_absent = FALSE;
				alias_page = NULL;
			}
			if (dirty) {
				SET_PAGE_DIRTY(dst_page, FALSE);
			} else {
				dst_page->vmp_dirty = FALSE;
			}

			if (!dirty) {
				dst_page->vmp_precious = TRUE;
			}

			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
				if (!VM_PAGE_WIRED(dst_page)) {
					dst_page->vmp_free_when_done = TRUE;
				}
			}
		} else {
			if ((cntrl_flags & UPL_WILL_MODIFY) &&
			    (object->vo_copy != last_copy_object ||
			    object->vo_copy_version != last_copy_version)) {
				/*
				 * Honor copy-on-write obligations
				 *
				 * The copy object has changed since we
				 * last synchronized for copy-on-write.
				 * Another copy object might have been
				 * inserted while we released the object's
				 * lock.  Since someone could have seen the
				 * original contents of the remaining pages
				 * through that new object, we have to
				 * synchronize with it again for the remaining
				 * pages only.  The previous pages are "busy"
				 * so they can not be seen through the new
				 * mapping.  The new mapping will see our
				 * upcoming changes for those previous pages,
				 * but that's OK since they couldn't see what
				 * was there before.  It's just a race anyway
				 * and there's no guarantee of consistency or
				 * atomicity.  We just don't want new mappings
				 * to see both the *before* and *after* pages.
				 */
				if (object->vo_copy != VM_OBJECT_NULL) {
					vm_object_update(
						object,
						dst_offset,/* current offset */
						xfer_size, /* remaining size */
						NULL,
						NULL,
						FALSE,       /* should_return */
						MEMORY_OBJECT_COPY_SYNC,
						VM_PROT_NO_CHANGE);

					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
				}
				/*
				 * remember the copy object we synced with
				 */
				last_copy_object = object->vo_copy;
				last_copy_version = object->vo_copy_version;
			}
			dst_page = vm_page_lookup(object, dst_offset);

			if (dst_page != VM_PAGE_NULL) {
				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
					/*
					 * skip over pages already present in the cache
					 */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}

					goto try_next_page;
				}
				if (dst_page->vmp_fictitious) {
					panic("need corner case for fictitious page");
				}

				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
					/*
					 * someone else is playing with the
					 * page. We will have to wait.
					 */
					vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);

					continue;
				}
				if (dst_page->vmp_laundry) {
					vm_pageout_steal_laundry(dst_page, FALSE);
				}
			} else {
				if (object->private) {
					/*
					 * This is a nasty wrinkle for users
					 * of upl who encounter device or
					 * private memory however, it is
					 * unavoidable, only a fault can
					 * resolve the actual backing
					 * physical page by asking the
					 * backing device.
					 */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}

					goto try_next_page;
				}
				if (object->scan_collisions) {
					/*
					 * the pageout_scan thread is trying to steal
					 * pages from this object, but has run into our
					 * lock... grab 2 pages from the head of the object...
					 * the first is freed on behalf of pageout_scan, the
					 * 2nd is for our own use... we use vm_object_page_grab
					 * in both cases to avoid taking pages from the free
					 * list since we are under memory pressure and our
					 * lock on this object is getting in the way of
					 * relieving it
					 */
					dst_page = vm_object_page_grab(object);

					if (dst_page != VM_PAGE_NULL) {
						vm_page_release(dst_page,
						    FALSE);
					}

					dst_page = vm_object_page_grab(object);
				}
				if (dst_page == VM_PAGE_NULL) {
					/*
					 * need to allocate a page
					 */
					dst_page = vm_page_grab_options(grab_options);
					if (dst_page != VM_PAGE_NULL) {
						page_grab_count++;
					}
				}
				if (dst_page == VM_PAGE_NULL) {
					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
						/*
						 * we don't want to stall waiting for pages to come onto the free list
						 * while we're already holding absent pages in this UPL
						 * the caller will deal with the empty slots
						 */
						if (user_page_list) {
							user_page_list[entry].phys_addr = 0;
						}

						goto try_next_page;
					}
					/*
					 * no pages available... wait
					 * then try again for the same
					 * offset...
					 */
					vm_object_unlock(object);

					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

					VM_PAGE_WAIT();
					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);

					vm_object_lock(object);

					continue;
				}
				vm_page_insert(dst_page, object, dst_offset);

				dst_page->vmp_absent = TRUE;
				dst_page->vmp_busy = FALSE;

				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
					/*
					 * if UPL_RET_ONLY_ABSENT was specified,
					 * then we're definitely setting up a
					 * upl for a clustered read/pagein
					 * operation... mark the pages as clustered
					 * so upl_commit_range can put them on the
					 * speculative list
					 */
					dst_page->vmp_clustered = TRUE;

					if (!(cntrl_flags & UPL_FILE_IO)) {
						counter_inc(&vm_statistics_pageins);
					}
				}
			}
			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

			dst_page->vmp_overwriting = TRUE;

			if (dst_page->vmp_pmapped) {
				if (!(cntrl_flags & UPL_FILE_IO)) {
					/*
					 * eliminate all mappings from the
					 * original object and its progeny
					 */
					refmod_state = pmap_disconnect(phys_page);
				} else {
					refmod_state = pmap_get_refmod(phys_page);
				}
			} else {
				refmod_state = 0;
			}

			hw_dirty = refmod_state & VM_MEM_MODIFIED;
			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;

			if (cntrl_flags & UPL_SET_LITE) {
				unsigned int    pg_num;

				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
				bitmap_set(upl->lite_list, pg_num);

				if (hw_dirty) {
					pmap_clear_modify(phys_page);
				}

				/*
				 * Mark original page as cleaning
				 * in place.
				 */
				dst_page->vmp_cleaning = TRUE;
				dst_page->vmp_precious = FALSE;
			} else {
				/*
				 * use pageclean setup, it is more
				 * convenient even for the pageout
				 * cases here
				 */
				vm_object_lock(upl->map_object);
				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
				vm_object_unlock(upl->map_object);

				alias_page->vmp_absent = FALSE;
				alias_page = NULL;
			}

			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
				upl->flags &= ~UPL_CLEAR_DIRTY;
				upl->flags |= UPL_SET_DIRTY;
				dirty = TRUE;
				/*
				 * Page belonging to a code-signed object is about to
				 * be written. Mark it tainted and disconnect it from
				 * all pmaps so processes have to fault it back in and
				 * deal with the tainted bit.
				 */
				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
					vm_page_upl_tainted++;
					if (dst_page->vmp_pmapped) {
						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
						if (refmod_state & VM_MEM_REFERENCED) {
							dst_page->vmp_reference = TRUE;
						}
					}
				}
			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
				/*
				 * clean in place for read implies
				 * that a write will be done on all
				 * the pages that are dirty before
				 * a upl commit is done. The caller
				 * is obligated to preserve the
				 * contents of all pages marked dirty
				 */
				upl->flags |= UPL_CLEAR_DIRTY;
			}
			dst_page->vmp_dirty = dirty;

			if (!dirty) {
				dst_page->vmp_precious = TRUE;
			}

			if (!VM_PAGE_WIRED(dst_page)) {
				/*
				 * deny access to the target page while
				 * it is being worked on
				 */
				dst_page->vmp_busy = TRUE;
			} else {
				dwp->dw_mask |= DW_vm_page_wire;
			}

			/*
			 * We might be about to satisfy a fault which has been
			 * requested. So no need for the "restart" bit.
			 */
			dst_page->vmp_restart = FALSE;
			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
				/*
				 * expect the page to be used
				 */
				dwp->dw_mask |= DW_set_reference;
			}
			if (cntrl_flags & UPL_PRECIOUS) {
				if (object->internal) {
					SET_PAGE_DIRTY(dst_page, FALSE);
					dst_page->vmp_precious = FALSE;
				} else {
					dst_page->vmp_precious = TRUE;
				}
			} else {
				dst_page->vmp_precious = FALSE;
			}
		}
		if (dst_page->vmp_busy) {
			upl->flags |= UPL_HAS_BUSY;
		}

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}
		assert(!pmap_is_noencrypt(phys_page));
		if (user_page_list) {
			/* publish this page's state into the caller-visible page list */
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].free_when_done  = dst_page->vmp_free_when_done;
			user_page_list[entry].absent    = dst_page->vmp_absent;
			user_page_list[entry].dirty     = dst_page->vmp_dirty;
			user_page_list[entry].precious  = dst_page->vmp_precious;
			user_page_list[entry].device    = FALSE;
			user_page_list[entry].needed    = FALSE;
			if (dst_page->vmp_clustered == TRUE) {
				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
			} else {
				user_page_list[entry].speculative = FALSE;
			}
			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
			user_page_list[entry].mark      = FALSE;
		}
		/*
		 * if UPL_RET_ONLY_ABSENT is set, then
		 * we are working with a fresh page and we've
		 * just set the clustered flag on it to
		 * indicate that it was dragged in as part of a
		 * speculative cluster... so leave it alone
		 */
		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
			/*
			 * someone is explicitly grabbing this page...
			 * update clustered and speculative state
			 *
			 */
			if (dst_page->vmp_clustered) {
				VM_PAGE_CONSUME_CLUSTERED(dst_page);
			}
		}
try_next_page:
		if (dwp->dw_mask) {
			if (dwp->dw_mask & DW_vm_page_activate) {
				counter_inc(&vm_statistics_reactivations);
			}

			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);

			if (dw_count >= dw_limit) {
				/* batch is full; apply the queued page-queue operations now */
				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);

				dwp = dwp_start;
				dw_count = 0;
			}
		}
		entry++;
		dst_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;
	}
	if (dw_count) {
		/* flush any remaining delayed work */
		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}

	if (alias_page != NULL) {
		/* grabbed a fictitious page we never consumed */
		VM_PAGE_FREE(alias_page);
	}
	if (pmap_flushes_delayed == TRUE) {
		/* issue the single batched TLB flush for all modify-bit clears */
		pmap_flush(&pmap_flush_context_storage);
	}

	if (page_list_count != NULL) {
		if (upl->flags & UPL_INTERNAL) {
			*page_list_count = 0;
		} else if (*page_list_count > entry) {
			*page_list_count = entry;
		}
	}
#if UPL_DEBUG
	upl->upl_state = 1;
#endif
	vm_object_unlock(object);

	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (dwp_start && dwp_finish_ctx) {
		/* hand the delayed-work context back to its zone (not the on-stack fallback) */
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return KERN_SUCCESS;
}
6615
6616 /*
6617 * Routine: vm_object_super_upl_request
6618 * Purpose:
6619 * Cause the population of a portion of a vm_object
6620 * in much the same way as memory_object_upl_request.
6621 * Depending on the nature of the request, the pages
 * returned may contain valid data or be uninitialized.
6623 * However, the region may be expanded up to the super
6624 * cluster size provided.
6625 */
6626
6627 __private_extern__ kern_return_t
6628 vm_object_super_upl_request(
6629 vm_object_t object,
6630 vm_object_offset_t offset,
6631 upl_size_t size,
6632 upl_size_t super_cluster,
6633 upl_t *upl,
6634 upl_page_info_t *user_page_list,
6635 unsigned int *page_list_count,
6636 upl_control_flags_t cntrl_flags,
6637 vm_tag_t tag)
6638 {
6639 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
6640 return KERN_FAILURE;
6641 }
6642
6643 assert(object->paging_in_progress);
6644 offset = offset - object->paging_offset;
6645
6646 if (super_cluster > size) {
6647 vm_object_offset_t base_offset;
6648 upl_size_t super_size;
6649 vm_object_size_t super_size_64;
6650
6651 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6652 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
6653 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6654 super_size = (upl_size_t) super_size_64;
6655 assert(super_size == super_size_64);
6656
6657 if (offset > (base_offset + super_size)) {
6658 panic("vm_object_super_upl_request: Missed target pageout"
6659 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6660 offset, base_offset, super_size, super_cluster,
6661 size, object->paging_offset);
6662 }
6663 /*
6664 * apparently there is a case where the vm requests a
6665 * page to be written out who's offset is beyond the
6666 * object size
6667 */
6668 if ((offset + size) > (base_offset + super_size)) {
6669 super_size_64 = (offset + size) - base_offset;
6670 super_size = (upl_size_t) super_size_64;
6671 assert(super_size == super_size_64);
6672 }
6673
6674 offset = base_offset;
6675 size = super_size;
6676 }
6677 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6678 }
6679
/*
 * Counter for UPLs created against executable mappings; incremented
 * outside this view (presumably in vm_map_create_upl's executable path
 * — TODO confirm).
 */
int cs_executable_create_upl = 0;
/* process-identification helpers from the BSD side, used for logging */
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
6683
6684 kern_return_t
6685 vm_map_create_upl(
6686 vm_map_t map,
6687 vm_map_address_t offset,
6688 upl_size_t *upl_size,
6689 upl_t *upl,
6690 upl_page_info_array_t page_list,
6691 unsigned int *count,
6692 upl_control_flags_t *flags,
6693 vm_tag_t tag)
6694 {
6695 vm_map_entry_t entry;
6696 upl_control_flags_t caller_flags;
6697 int force_data_sync;
6698 int sync_cow_data;
6699 vm_object_t local_object;
6700 vm_map_offset_t local_offset;
6701 vm_map_offset_t local_start;
6702 kern_return_t ret;
6703 vm_map_address_t original_offset;
6704 vm_map_size_t original_size, adjusted_size;
6705 vm_map_offset_t local_entry_start;
6706 vm_object_offset_t local_entry_offset;
6707 vm_object_offset_t offset_in_mapped_page;
6708 boolean_t release_map = FALSE;
6709
6710
6711 start_with_map:
6712
6713 original_offset = offset;
6714 original_size = *upl_size;
6715 adjusted_size = original_size;
6716
6717 caller_flags = *flags;
6718
6719 if (caller_flags & ~UPL_VALID_FLAGS) {
6720 /*
6721 * For forward compatibility's sake,
6722 * reject any unknown flag.
6723 */
6724 ret = KERN_INVALID_VALUE;
6725 goto done;
6726 }
6727 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6728 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6729
6730 if (upl == NULL) {
6731 ret = KERN_INVALID_ARGUMENT;
6732 goto done;
6733 }
6734
6735 REDISCOVER_ENTRY:
6736 vm_map_lock_read(map);
6737
6738 if (!vm_map_lookup_entry(map, offset, &entry)) {
6739 vm_map_unlock_read(map);
6740 ret = KERN_FAILURE;
6741 goto done;
6742 }
6743
6744 local_entry_start = entry->vme_start;
6745 local_entry_offset = VME_OFFSET(entry);
6746
6747 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6748 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6749 }
6750
6751 if (entry->vme_end - original_offset < adjusted_size) {
6752 adjusted_size = entry->vme_end - original_offset;
6753 assert(adjusted_size > 0);
6754 *upl_size = (upl_size_t) adjusted_size;
6755 assert(*upl_size == adjusted_size);
6756 }
6757
6758 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6759 *flags = 0;
6760
6761 if (!entry->is_sub_map &&
6762 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6763 if (VME_OBJECT(entry)->private) {
6764 *flags = UPL_DEV_MEMORY;
6765 }
6766
6767 if (VME_OBJECT(entry)->phys_contiguous) {
6768 *flags |= UPL_PHYS_CONTIG;
6769 }
6770 }
6771 vm_map_unlock_read(map);
6772 ret = KERN_SUCCESS;
6773 goto done;
6774 }
6775
6776 offset_in_mapped_page = 0;
6777 if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6778 offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6779 *upl_size = (upl_size_t)
6780 (vm_map_round_page(original_offset + adjusted_size,
6781 VM_MAP_PAGE_MASK(map))
6782 - offset);
6783
6784 offset_in_mapped_page = original_offset - offset;
6785 assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6786
6787 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6788 }
6789
6790 if (!entry->is_sub_map) {
6791 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6792 !VME_OBJECT(entry)->phys_contiguous) {
6793 if (*upl_size > MAX_UPL_SIZE_BYTES) {
6794 *upl_size = MAX_UPL_SIZE_BYTES;
6795 }
6796 }
6797
6798 /*
6799 * Create an object if necessary.
6800 */
6801 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6802 if (vm_map_lock_read_to_write(map)) {
6803 goto REDISCOVER_ENTRY;
6804 }
6805
6806 VME_OBJECT_SET(entry,
6807 vm_object_allocate((vm_size_t)
6808 vm_object_round_page((entry->vme_end - entry->vme_start))),
6809 false, 0);
6810 VME_OFFSET_SET(entry, 0);
6811 assert(entry->use_pmap);
6812
6813 vm_map_lock_write_to_read(map);
6814 }
6815
6816 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6817 !(entry->protection & VM_PROT_WRITE)) {
6818 vm_map_unlock_read(map);
6819 ret = KERN_PROTECTION_FAILURE;
6820 goto done;
6821 }
6822 }
6823
6824 #if !XNU_TARGET_OS_OSX
6825 if (map->pmap != kernel_pmap &&
6826 (caller_flags & UPL_COPYOUT_FROM) &&
6827 (entry->protection & VM_PROT_EXECUTE) &&
6828 !(entry->protection & VM_PROT_WRITE)) {
6829 vm_offset_t kaddr;
6830 vm_size_t ksize;
6831
6832 /*
6833 * We're about to create a read-only UPL backed by
6834 * memory from an executable mapping.
6835 * Wiring the pages would result in the pages being copied
6836 * (due to the "MAP_PRIVATE" mapping) and no longer
6837 * code-signed, so no longer eligible for execution.
6838 * Instead, let's copy the data into a kernel buffer and
6839 * create the UPL from this kernel buffer.
6840 * The kernel buffer is then freed, leaving the UPL holding
6841 * the last reference on the VM object, so the memory will
6842 * be released when the UPL is committed.
6843 */
6844
6845 vm_map_unlock_read(map);
6846 entry = VM_MAP_ENTRY_NULL;
6847 /* allocate kernel buffer */
6848 ksize = round_page(*upl_size);
6849 kaddr = 0;
6850 ret = kmem_alloc(kernel_map, &kaddr, ksize,
6851 KMA_PAGEABLE | KMA_DATA, tag);
6852 if (ret == KERN_SUCCESS) {
6853 /* copyin the user data */
6854 ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6855 }
6856 if (ret == KERN_SUCCESS) {
6857 if (ksize > *upl_size) {
6858 /* zero out the extra space in kernel buffer */
6859 memset((void *)(kaddr + *upl_size),
6860 0,
6861 ksize - *upl_size);
6862 }
6863 /* create the UPL from the kernel buffer */
6864 vm_object_offset_t offset_in_object;
6865 vm_object_offset_t offset_in_object_page;
6866
6867 offset_in_object = offset - local_entry_start + local_entry_offset;
6868 offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6869 assert(offset_in_object_page < PAGE_SIZE);
6870 assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6871 *upl_size -= offset_in_object_page + offset_in_mapped_page;
6872 ret = vm_map_create_upl(kernel_map,
6873 (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6874 upl_size, upl, page_list, count, flags, tag);
6875 }
6876 if (kaddr != 0) {
6877 /* free the kernel buffer */
6878 kmem_free(kernel_map, kaddr, ksize);
6879 kaddr = 0;
6880 ksize = 0;
6881 }
6882 #if DEVELOPMENT || DEBUG
6883 DTRACE_VM4(create_upl_from_executable,
6884 vm_map_t, map,
6885 vm_map_address_t, offset,
6886 upl_size_t, *upl_size,
6887 kern_return_t, ret);
6888 #endif /* DEVELOPMENT || DEBUG */
6889 goto done;
6890 }
6891 #endif /* !XNU_TARGET_OS_OSX */
6892
6893 if (!entry->is_sub_map) {
6894 local_object = VME_OBJECT(entry);
6895 assert(local_object != VM_OBJECT_NULL);
6896 }
6897
6898 if (!entry->is_sub_map &&
6899 !entry->needs_copy &&
6900 *upl_size != 0 &&
6901 local_object->vo_size > *upl_size && /* partial UPL */
6902 entry->wired_count == 0 && /* No COW for entries that are wired */
6903 (map->pmap != kernel_pmap) && /* alias checks */
6904 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6905 ||
6906 ( /* case 2 */
6907 local_object->internal &&
6908 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6909 os_ref_get_count_raw(&local_object->ref_count) > 1))) {
6910 vm_prot_t prot;
6911
6912 /*
6913 * Case 1:
6914 * Set up the targeted range for copy-on-write to avoid
6915 * applying true_share/copy_delay to the entire object.
6916 *
6917 * Case 2:
6918 * This map entry covers only part of an internal
6919 * object. There could be other map entries covering
6920 * other areas of this object and some of these map
6921 * entries could be marked as "needs_copy", which
6922 * assumes that the object is COPY_SYMMETRIC.
6923 * To avoid marking this object as COPY_DELAY and
6924 * "true_share", let's shadow it and mark the new
6925 * (smaller) object as "true_share" and COPY_DELAY.
6926 */
6927
6928 if (vm_map_lock_read_to_write(map)) {
6929 goto REDISCOVER_ENTRY;
6930 }
6931 vm_map_lock_assert_exclusive(map);
6932 assert(VME_OBJECT(entry) == local_object);
6933
6934 vm_map_clip_start(map,
6935 entry,
6936 vm_map_trunc_page(offset,
6937 VM_MAP_PAGE_MASK(map)));
6938 vm_map_clip_end(map,
6939 entry,
6940 vm_map_round_page(offset + *upl_size,
6941 VM_MAP_PAGE_MASK(map)));
6942 if ((entry->vme_end - offset) < *upl_size) {
6943 *upl_size = (upl_size_t) (entry->vme_end - offset);
6944 assert(*upl_size == entry->vme_end - offset);
6945 }
6946
6947 prot = entry->protection & ~VM_PROT_WRITE;
6948 if (override_nx(map, VME_ALIAS(entry)) && prot) {
6949 prot |= VM_PROT_EXECUTE;
6950 }
6951 vm_object_pmap_protect(local_object,
6952 VME_OFFSET(entry),
6953 entry->vme_end - entry->vme_start,
6954 ((entry->is_shared ||
6955 map->mapped_in_other_pmaps)
6956 ? PMAP_NULL
6957 : map->pmap),
6958 VM_MAP_PAGE_SIZE(map),
6959 entry->vme_start,
6960 prot);
6961
6962 assert(entry->wired_count == 0);
6963
6964 /*
6965 * Lock the VM object and re-check its status: if it's mapped
6966 * in another address space, we could still be racing with
6967 * another thread holding that other VM map exclusively.
6968 */
6969 vm_object_lock(local_object);
6970 if (local_object->true_share) {
6971 /* object is already in proper state: no COW needed */
6972 assert(local_object->copy_strategy !=
6973 MEMORY_OBJECT_COPY_SYMMETRIC);
6974 } else {
6975 /* not true_share: ask for copy-on-write below */
6976 assert(local_object->copy_strategy ==
6977 MEMORY_OBJECT_COPY_SYMMETRIC);
6978 entry->needs_copy = TRUE;
6979 }
6980 vm_object_unlock(local_object);
6981
6982 vm_map_lock_write_to_read(map);
6983 }
6984
6985 if (entry->needs_copy) {
6986 /*
6987 * Honor copy-on-write for COPY_SYMMETRIC
6988 * strategy.
6989 */
6990 vm_map_t local_map;
6991 vm_object_t object;
6992 vm_object_offset_t new_offset;
6993 vm_prot_t prot;
6994 boolean_t wired;
6995 vm_map_version_t version;
6996 vm_map_t real_map;
6997 vm_prot_t fault_type;
6998
6999 local_map = map;
7000
7001 if (caller_flags & UPL_COPYOUT_FROM) {
7002 fault_type = VM_PROT_READ | VM_PROT_COPY;
7003 vm_counters.create_upl_extra_cow++;
7004 vm_counters.create_upl_extra_cow_pages +=
7005 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
7006 } else {
7007 fault_type = VM_PROT_WRITE;
7008 }
7009 if (vm_map_lookup_and_lock_object(&local_map,
7010 offset, fault_type,
7011 OBJECT_LOCK_EXCLUSIVE,
7012 &version, &object,
7013 &new_offset, &prot, &wired,
7014 NULL,
7015 &real_map, NULL) != KERN_SUCCESS) {
7016 if (fault_type == VM_PROT_WRITE) {
7017 vm_counters.create_upl_lookup_failure_write++;
7018 } else {
7019 vm_counters.create_upl_lookup_failure_copy++;
7020 }
7021 vm_map_unlock_read(local_map);
7022 ret = KERN_FAILURE;
7023 goto done;
7024 }
7025 if (real_map != local_map) {
7026 vm_map_unlock(real_map);
7027 }
7028 vm_map_unlock_read(local_map);
7029
7030 vm_object_unlock(object);
7031
7032 goto REDISCOVER_ENTRY;
7033 }
7034
7035 if (entry->is_sub_map) {
7036 vm_map_t submap;
7037
7038 submap = VME_SUBMAP(entry);
7039 local_start = entry->vme_start;
7040 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7041
7042 vm_map_reference(submap);
7043 vm_map_unlock_read(map);
7044
7045 DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
7046 offset += offset_in_mapped_page;
7047 *upl_size -= offset_in_mapped_page;
7048
7049 if (release_map) {
7050 vm_map_deallocate(map);
7051 }
7052 map = submap;
7053 release_map = TRUE;
7054 offset = local_offset + (offset - local_start);
7055 goto start_with_map;
7056 }
7057
7058 if (sync_cow_data &&
7059 (VME_OBJECT(entry)->shadow ||
7060 VME_OBJECT(entry)->vo_copy)) {
7061 local_object = VME_OBJECT(entry);
7062 local_start = entry->vme_start;
7063 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7064
7065 vm_object_reference(local_object);
7066 vm_map_unlock_read(map);
7067
7068 if (local_object->shadow && local_object->vo_copy) {
7069 vm_object_lock_request(local_object->shadow,
7070 ((vm_object_offset_t)
7071 ((offset - local_start) +
7072 local_offset) +
7073 local_object->vo_shadow_offset),
7074 *upl_size, FALSE,
7075 MEMORY_OBJECT_DATA_SYNC,
7076 VM_PROT_NO_CHANGE);
7077 }
7078 sync_cow_data = FALSE;
7079 vm_object_deallocate(local_object);
7080
7081 goto REDISCOVER_ENTRY;
7082 }
7083 if (force_data_sync) {
7084 local_object = VME_OBJECT(entry);
7085 local_start = entry->vme_start;
7086 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7087
7088 vm_object_reference(local_object);
7089 vm_map_unlock_read(map);
7090
7091 vm_object_lock_request(local_object,
7092 ((vm_object_offset_t)
7093 ((offset - local_start) +
7094 local_offset)),
7095 (vm_object_size_t)*upl_size,
7096 FALSE,
7097 MEMORY_OBJECT_DATA_SYNC,
7098 VM_PROT_NO_CHANGE);
7099
7100 force_data_sync = FALSE;
7101 vm_object_deallocate(local_object);
7102
7103 goto REDISCOVER_ENTRY;
7104 }
7105 if (VME_OBJECT(entry)->private) {
7106 *flags = UPL_DEV_MEMORY;
7107 } else {
7108 *flags = 0;
7109 }
7110
7111 if (VME_OBJECT(entry)->phys_contiguous) {
7112 *flags |= UPL_PHYS_CONTIG;
7113 }
7114
7115 local_object = VME_OBJECT(entry);
7116 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7117 local_start = entry->vme_start;
7118
7119 /*
7120 * Wiring will copy the pages to the shadow object.
7121 * The shadow object will not be code-signed so
7122 * attempting to execute code from these copied pages
7123 * would trigger a code-signing violation.
7124 */
7125 if (entry->protection & VM_PROT_EXECUTE) {
7126 #if MACH_ASSERT
7127 printf("pid %d[%s] create_upl out of executable range from "
7128 "0x%llx to 0x%llx: side effects may include "
7129 "code-signing violations later on\n",
7130 proc_selfpid(),
7131 (get_bsdtask_info(current_task())
7132 ? proc_name_address(get_bsdtask_info(current_task()))
7133 : "?"),
7134 (uint64_t) entry->vme_start,
7135 (uint64_t) entry->vme_end);
7136 #endif /* MACH_ASSERT */
7137 DTRACE_VM2(cs_executable_create_upl,
7138 uint64_t, (uint64_t)entry->vme_start,
7139 uint64_t, (uint64_t)entry->vme_end);
7140 cs_executable_create_upl++;
7141 }
7142
7143 vm_object_lock(local_object);
7144
7145 /*
7146 * Ensure that this object is "true_share" and "copy_delay" now,
7147 * while we're still holding the VM map lock. After we unlock the map,
7148 * anything could happen to that mapping, including some copy-on-write
7149 * activity. We need to make sure that the IOPL will point at the
7150 * same memory as the mapping.
7151 */
7152 if (local_object->true_share) {
7153 assert(local_object->copy_strategy !=
7154 MEMORY_OBJECT_COPY_SYMMETRIC);
7155 } else if (!is_kernel_object(local_object) &&
7156 local_object != compressor_object &&
7157 !local_object->phys_contiguous) {
7158 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7159 if (!local_object->true_share &&
7160 vm_object_tracking_btlog) {
7161 btlog_record(vm_object_tracking_btlog, local_object,
7162 VM_OBJECT_TRACKING_OP_TRUESHARE,
7163 btref_get(__builtin_frame_address(0), 0));
7164 }
7165 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7166 VM_OBJECT_SET_TRUE_SHARE(local_object, TRUE);
7167 if (local_object->copy_strategy ==
7168 MEMORY_OBJECT_COPY_SYMMETRIC) {
7169 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7170 }
7171 }
7172
7173 vm_object_reference_locked(local_object);
7174 vm_object_unlock(local_object);
7175
7176 vm_map_unlock_read(map);
7177
7178 offset += offset_in_mapped_page;
7179 assert(*upl_size > offset_in_mapped_page);
7180 *upl_size -= offset_in_mapped_page;
7181
7182 ret = vm_object_iopl_request(local_object,
7183 ((vm_object_offset_t)
7184 ((offset - local_start) + local_offset)),
7185 *upl_size,
7186 upl,
7187 page_list,
7188 count,
7189 caller_flags,
7190 tag);
7191 vm_object_deallocate(local_object);
7192
7193 done:
7194 if (release_map) {
7195 vm_map_deallocate(map);
7196 }
7197
7198 return ret;
7199 }
7200
7201 /*
7202 * Internal routine to enter a UPL into a VM map.
7203 *
7204 * JMM - This should just be doable through the standard
7205 * vm_map_enter() API.
7206 */
7207 kern_return_t
7208 vm_map_enter_upl_range(
7209 vm_map_t map,
7210 upl_t upl,
7211 vm_object_offset_t offset_to_map,
7212 vm_size_t size_to_map,
7213 vm_prot_t prot_to_map,
7214 vm_map_offset_t *dst_addr)
7215 {
7216 vm_map_size_t size;
7217 vm_object_offset_t offset;
7218 vm_map_offset_t addr;
7219 vm_page_t m;
7220 kern_return_t kr;
7221 int isVectorUPL = 0, curr_upl = 0;
7222 upl_t vector_upl = NULL;
7223 mach_vm_offset_t vector_upl_dst_addr = 0;
7224 vm_map_t vector_upl_submap = NULL;
7225 upl_offset_t subupl_offset = 0;
7226 upl_size_t subupl_size = 0;
7227
7228 if (upl == UPL_NULL) {
7229 return KERN_INVALID_ARGUMENT;
7230 }
7231
7232 DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%lx (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7233 assert(map == kernel_map);
7234
7235 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7236 int mapped = 0, valid_upls = 0;
7237 vector_upl = upl;
7238
7239 upl_lock(vector_upl);
7240 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7241 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7242 if (upl == NULL) {
7243 continue;
7244 }
7245 valid_upls++;
7246 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7247 mapped++;
7248 }
7249 }
7250
7251 if (mapped) {
7252 if (mapped != valid_upls) {
7253 panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7254 } else {
7255 upl_unlock(vector_upl);
7256 return KERN_FAILURE;
7257 }
7258 }
7259
7260 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7261 panic("TODO4K: vector UPL not implemented");
7262 }
7263
7264 vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7265 vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7266 VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7267 VM_KERN_MEMORY_NONE).kmr_submap;
7268 map = vector_upl_submap;
7269 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7270 curr_upl = 0;
7271 } else {
7272 upl_lock(upl);
7273 }
7274
7275 process_upl_to_enter:
7276 if (isVectorUPL) {
7277 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7278 *dst_addr = vector_upl_dst_addr;
7279 upl_unlock(vector_upl);
7280 return KERN_SUCCESS;
7281 }
7282 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7283 if (upl == NULL) {
7284 goto process_upl_to_enter;
7285 }
7286
7287 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7288 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7289 } else {
7290 /*
7291 * check to see if already mapped
7292 */
7293 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7294 upl_unlock(upl);
7295 return KERN_FAILURE;
7296 }
7297 }
7298
7299 if ((!(upl->flags & UPL_SHADOWED)) &&
7300 ((upl->flags & UPL_HAS_BUSY) ||
7301 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7302 vm_object_t object;
7303 vm_page_t alias_page;
7304 vm_object_offset_t new_offset;
7305 unsigned int pg_num;
7306
7307 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7308 object = upl->map_object;
7309 upl->map_object = vm_object_allocate(vm_object_round_page(size));
7310
7311 vm_object_lock(upl->map_object);
7312
7313 upl->map_object->shadow = object;
7314 VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
7315 VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
7316 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7317 upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7318 assertf(page_aligned(upl->map_object->vo_shadow_offset),
7319 "object %p shadow_offset 0x%llx",
7320 upl->map_object,
7321 (uint64_t)upl->map_object->vo_shadow_offset);
7322 upl->map_object->wimg_bits = object->wimg_bits;
7323 offset = upl->map_object->vo_shadow_offset;
7324 new_offset = 0;
7325
7326 upl->flags |= UPL_SHADOWED;
7327
7328 while (size) {
7329 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7330 assert(pg_num == new_offset / PAGE_SIZE);
7331
7332 if (bitmap_test(upl->lite_list, pg_num)) {
7333 alias_page = vm_page_grab_fictitious(TRUE);
7334
7335 vm_object_lock(object);
7336
7337 m = vm_page_lookup(object, offset);
7338 if (m == VM_PAGE_NULL) {
7339 panic("vm_upl_map: page missing");
7340 }
7341
7342 /*
7343 * Convert the fictitious page to a private
7344 * shadow of the real page.
7345 */
7346 assert(alias_page->vmp_fictitious);
7347 alias_page->vmp_fictitious = FALSE;
7348 alias_page->vmp_private = TRUE;
7349 alias_page->vmp_free_when_done = TRUE;
7350 /*
7351 * since m is a page in the upl it must
7352 * already be wired or BUSY, so it's
7353 * safe to assign the underlying physical
7354 * page to the alias
7355 */
7356 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7357
7358 vm_object_unlock(object);
7359
7360 vm_page_lockspin_queues();
7361 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7362 vm_page_unlock_queues();
7363
7364 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7365
7366 assert(!alias_page->vmp_wanted);
7367 alias_page->vmp_busy = FALSE;
7368 alias_page->vmp_absent = FALSE;
7369 }
7370 size -= PAGE_SIZE;
7371 offset += PAGE_SIZE_64;
7372 new_offset += PAGE_SIZE_64;
7373 }
7374 vm_object_unlock(upl->map_object);
7375 }
7376 if (upl->flags & UPL_SHADOWED) {
7377 if (isVectorUPL) {
7378 offset = 0;
7379 } else {
7380 offset = offset_to_map;
7381 }
7382 } else {
7383 offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7384 if (!isVectorUPL) {
7385 offset += offset_to_map;
7386 }
7387 }
7388
7389 if (isVectorUPL) {
7390 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7391 } else {
7392 size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7393 }
7394
7395 vm_object_reference(upl->map_object);
7396
7397 if (!isVectorUPL) {
7398 *dst_addr = 0;
7399 /*
7400 * NEED A UPL_MAP ALIAS
7401 */
7402 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7403 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7404 upl->map_object, offset, FALSE,
7405 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7406
7407 if (kr != KERN_SUCCESS) {
7408 vm_object_deallocate(upl->map_object);
7409 upl_unlock(upl);
7410 return kr;
7411 }
7412 } else {
7413 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7414 VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7415 upl->map_object, offset, FALSE,
7416 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7417 if (kr) {
7418 panic("vm_map_enter failed for a Vector UPL");
7419 }
7420 }
7421 upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7422 /* this will have to be an increment rather than */
7423 /* an assignment. */
7424 vm_object_lock(upl->map_object);
7425
7426 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7427 m = vm_page_lookup(upl->map_object, offset);
7428
7429 if (m) {
7430 m->vmp_pmapped = TRUE;
7431
7432 /*
7433 * CODE SIGNING ENFORCEMENT: page has been wpmapped,
7434 * but only in kernel space. If this was on a user map,
7435 * we'd have to set the wpmapped bit.
7436 */
7437 /* m->vmp_wpmapped = TRUE; */
7438 assert(map->pmap == kernel_pmap);
7439
7440 kr = pmap_enter_check(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE);
7441
7442 assert(kr == KERN_SUCCESS);
7443 #if KASAN
7444 kasan_notify_address(addr, PAGE_SIZE_64);
7445 #endif
7446 }
7447 offset += PAGE_SIZE_64;
7448 }
7449 vm_object_unlock(upl->map_object);
7450
7451 /*
7452 * hold a reference for the mapping
7453 */
7454 upl->ref_count++;
7455 upl->flags |= UPL_PAGE_LIST_MAPPED;
7456 upl->kaddr = (vm_offset_t) *dst_addr;
7457 assert(upl->kaddr == *dst_addr);
7458
7459 if (isVectorUPL) {
7460 goto process_upl_to_enter;
7461 }
7462
7463 if (!isVectorUPL) {
7464 vm_map_offset_t addr_adjustment;
7465
7466 addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7467 if (addr_adjustment) {
7468 assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7469 DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7470 *dst_addr += addr_adjustment;
7471 }
7472 }
7473
7474 upl_unlock(upl);
7475
7476 return KERN_SUCCESS;
7477 }
7478
7479 kern_return_t
7480 vm_map_enter_upl(
7481 vm_map_t map,
7482 upl_t upl,
7483 vm_map_offset_t *dst_addr)
7484 {
7485 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7486 return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7487 }
7488
7489 /*
7490 * Internal routine to remove a UPL mapping from a VM map.
7491 *
7492 * XXX - This should just be doable through a standard
7493 * vm_map_remove() operation. Otherwise, implicit clean-up
7494 * of the target map won't be able to correctly remove
7495 * these (and release the reference on the UPL). Having
7496 * to do this means we can't map these into user-space
7497 * maps yet.
7498 */
/*
 * Remove a UPL's kernel mapping, undoing vm_map_enter_upl_range():
 * clears UPL_PAGE_LIST_MAPPED, drops the mapping's reference on the
 * UPL, and removes the VA range (or, for a vector UPL, tears down the
 * whole submap at once).  Returns KERN_FAILURE if the UPL (or a vector
 * UPL's sub-UPLs) is not currently mapped.
 * The offset/size parameters are currently unused: the whole mapped
 * range (upl->u_mapped_size) is always removed.
 */
kern_return_t
vm_map_remove_upl_range(
	vm_map_t        map,
	upl_t           upl,
	__unused vm_object_offset_t offset_to_unmap,
	__unused vm_size_t      size_to_unmap)
{
	vm_address_t    addr;
	upl_size_t      size;
	int             isVectorUPL = 0, curr_upl = 0;
	upl_t           vector_upl = NULL;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		int unmapped = 0, valid_upls = 0;
		vector_upl = upl;
		upl_lock(vector_upl);
		/*
		 * A vector UPL is unmapped all-or-nothing: count the valid
		 * sub-UPLs and how many of them are NOT currently mapped.
		 */
		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
			upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
			if (upl == NULL) {
				continue;
			}
			valid_upls++;
			if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
				unmapped++;
			}
		}

		if (unmapped) {
			if (unmapped != valid_upls) {
				/* a partially-mapped vector UPL should be impossible */
				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
			} else {
				/* nothing is mapped: nothing to remove */
				upl_unlock(vector_upl);
				return KERN_FAILURE;
			}
		}
		curr_upl = 0;
	} else {
		upl_lock(upl);
	}

process_upl_to_remove:
	if (isVectorUPL) {
		if (curr_upl == vector_upl_max_upls(vector_upl)) {
			/*
			 * All sub-UPLs processed: destroying the submap
			 * removes every individual sub-UPL mapping in one go.
			 */
			vm_map_t v_upl_submap;
			vm_offset_t v_upl_submap_dst_addr;
			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);

			kmem_free_guard(map, v_upl_submap_dst_addr,
			    vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
			vm_map_deallocate(v_upl_submap);
			upl_unlock(vector_upl);
			return KERN_SUCCESS;
		}

		upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
		if (upl == NULL) {
			goto process_upl_to_remove;
		}
	}

	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
		/* snapshot the mapping before clearing the UPL's record of it */
		addr = upl->kaddr;
		size = upl->u_mapped_size;

		assert(upl->ref_count > 1);
		upl->ref_count--;               /* removing mapping ref */

		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
		upl->kaddr = (vm_offset_t) 0;
		upl->u_mapped_size = 0;

		if (isVectorUPL) {
			/*
			 * If it's a Vectored UPL, we'll be removing the entire
			 * submap anyways, so no need to remove individual UPL
			 * element mappings from within the submap
			 */
			goto process_upl_to_remove;
		}

		/* drop the UPL lock before taking the map lock in vm_map_remove */
		upl_unlock(upl);

		vm_map_remove(map,
		    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
		    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
		return KERN_SUCCESS;
	}
	upl_unlock(upl);

	return KERN_FAILURE;
}
7594
7595 kern_return_t
7596 vm_map_remove_upl(
7597 vm_map_t map,
7598 upl_t upl)
7599 {
7600 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7601 return vm_map_remove_upl_range(map, upl, 0, upl_size);
7602 }
7603
/*
 * Mark the absent (busy) pages of an I/O-wired UPL as now containing
 * valid data: clear their absent/busy state, mark them dirty, wire
 * them, and update the object's and the global wired-page accounting.
 * Only plain (non-vector, non-shadowed, non-internal) UPL_IO_WIRE UPLs
 * are supported; anything else panics.
 */
void
iopl_valid_data(
	upl_t    upl,
	vm_tag_t tag)
{
	vm_object_t     object;
	vm_offset_t     offset;
	vm_page_t       m, nxt_page = VM_PAGE_NULL;
	upl_size_t      size;
	int             wired_count = 0;

	/* sanity-check that this is a plain I/O-wired UPL */
	if (upl == NULL) {
		panic("iopl_valid_data: NULL upl");
	}
	if (vector_upl_is_valid(upl)) {
		panic("iopl_valid_data: vector upl");
	}
	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
		panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
	}

	object = upl->map_object;

	if (is_kernel_object(object) || object == compressor_object) {
		panic("iopl_valid_data: object == kernel or compressor");
	}

	if (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) {
		panic("iopl_valid_data: object %p purgable %d",
		    object, object->purgable);
	}

	size = upl_adjusted_size(upl, PAGE_MASK);

	vm_object_lock(object);
	VM_OBJECT_WIRED_PAGE_UPDATE_START(object);

	bool whole_object;

	/*
	 * If the UPL covers every resident page of the object, walk the
	 * object's page list directly; otherwise look pages up by offset.
	 */
	if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
		nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
		whole_object = true;
	} else {
		offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
		whole_object = false;
	}

	while (size) {
		if (whole_object) {
			if (nxt_page != VM_PAGE_NULL) {
				m = nxt_page;
				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
			}
		} else {
			m = vm_page_lookup(object, offset);
			offset += PAGE_SIZE;

			if (m == VM_PAGE_NULL) {
				panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
			}
		}
		/* only busy+absent pages need transitioning to valid */
		if (m->vmp_busy) {
			if (!m->vmp_absent) {
				panic("iopl_valid_data: busy page w/o absent");
			}

			if (m->vmp_pageq.next || m->vmp_pageq.prev) {
				panic("iopl_valid_data: busy+absent page on page queue");
			}
			if (m->vmp_reusable) {
				panic("iopl_valid_data: %p is reusable", m);
			}

			/* page now holds valid (I/O-filled) data: mark it dirty and wire it */
			m->vmp_absent = FALSE;
			m->vmp_dirty = TRUE;
			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
			assert(m->vmp_wire_count == 0);
			m->vmp_wire_count++;
			assert(m->vmp_wire_count);
			if (m->vmp_wire_count == 1) {
				m->vmp_q_state = VM_PAGE_IS_WIRED;
				wired_count++;
			} else {
				panic("iopl_valid_data: %p already wired", m);
			}

			/* clear busy and wake any waiters */
			vm_page_wakeup_done(object, m);
		}
		size -= PAGE_SIZE;
	}
	if (wired_count) {
		VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
		assert(object->resident_page_count >= object->wired_page_count);

		/* no need to adjust purgeable accounting for this object: */
		assert(object->purgable != VM_PURGABLE_VOLATILE);
		assert(object->purgable != VM_PURGABLE_EMPTY);

		/* fold the newly wired pages into the global count */
		vm_page_lockspin_queues();
		vm_page_wire_count += wired_count;
		vm_page_unlock_queues();
	}
	VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
	vm_object_unlock(object);
}
7710
7711
7712 void
7713 vm_object_set_pmap_cache_attr(
7714 vm_object_t object,
7715 upl_page_info_array_t user_page_list,
7716 unsigned int num_pages,
7717 boolean_t batch_pmap_op)
7718 {
7719 unsigned int cache_attr = 0;
7720
7721 cache_attr = object->wimg_bits & VM_WIMG_MASK;
7722 assert(user_page_list);
7723 if (cache_attr != VM_WIMG_USE_DEFAULT) {
7724 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
7725 }
7726 }
7727
7728
/*
 * Fast path for vm_object_iopl_request(): attempt to wire every
 * resident page of the object in a single pass over its page list,
 * filling in the UPL's lite list and (optionally) the caller's
 * page-info array.  Returns FALSE (without undoing partial work on
 * already-visited pages' wire counts — caller is expected to handle
 * the fallback) as soon as any page is in a state that can't be
 * wired immediately (busy, absent, error, cleaning, ...).
 * Caller must hold the object lock exclusively; the object must be
 * fully resident with no pager, copy, or shadow.
 */
static bool
vm_object_iopl_wire_full(
	vm_object_t             object,
	upl_t                   upl,
	upl_page_info_array_t   user_page_list,
	upl_control_flags_t     cntrl_flags,
	vm_tag_t                tag)
{
	vm_page_t       dst_page;
	unsigned int    entry;
	int             page_count;
	int             delayed_unlock = 0;
	boolean_t       retval = TRUE;
	ppnum_t         phys_page;

	vm_object_lock_assert_exclusive(object);
	assert(object->purgable != VM_PURGABLE_VOLATILE);
	assert(object->purgable != VM_PURGABLE_EMPTY);
	assert(object->pager == NULL);
	assert(object->vo_copy == NULL);
	assert(object->shadow == NULL);

	page_count = object->resident_page_count;
	dst_page = (vm_page_t)vm_page_queue_first(&object->memq);

	vm_page_lock_queues();

	while (page_count--) {
		/* bail out to the slow path on any page not immediately wireable */
		if (dst_page->vmp_busy ||
		    dst_page->vmp_fictitious ||
		    dst_page->vmp_absent ||
		    VMP_ERROR_GET(dst_page) ||
		    dst_page->vmp_cleaning ||
		    dst_page->vmp_restart ||
		    dst_page->vmp_laundry) {
			retval = FALSE;
			goto done;
		}
		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
			retval = FALSE;
			goto done;
		}
		dst_page->vmp_reference = TRUE;

		vm_page_wire(dst_page, tag, FALSE);

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			/* pages will be written to: mark them dirty up front */
			SET_PAGE_DIRTY(dst_page, FALSE);
		}
		/* the UPL entry index is the page's offset within the object */
		entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
		assert(entry >= 0 && entry < object->resident_page_count);
		bitmap_set(upl->lite_list, entry);

		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		/* track the highest physical page for DMA bounce decisions */
		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].absent = dst_page->vmp_absent;
			user_page_list[entry].dirty = dst_page->vmp_dirty;
			user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
			user_page_list[entry].precious = dst_page->vmp_precious;
			user_page_list[entry].device = FALSE;
			user_page_list[entry].speculative = FALSE;
			user_page_list[entry].cs_validated = FALSE;
			user_page_list[entry].cs_tainted = FALSE;
			user_page_list[entry].cs_nx = FALSE;
			user_page_list[entry].needed = FALSE;
			user_page_list[entry].mark = FALSE;
		}
		/* periodically yield the page-queues lock to reduce contention */
		if (delayed_unlock++ > 256) {
			delayed_unlock = 0;
			lck_mtx_yield(&vm_page_queue_lock);

			VM_CHECK_MEMORYSTATUS;
		}
		dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
	}
done:
	vm_page_unlock_queues();

	VM_CHECK_MEMORYSTATUS;

	return retval;
}
7817
7818
7819 static kern_return_t
7820 vm_object_iopl_wire_empty(
7821 vm_object_t object,
7822 upl_t upl,
7823 upl_page_info_array_t user_page_list,
7824 upl_control_flags_t cntrl_flags,
7825 vm_tag_t tag,
7826 vm_object_offset_t *dst_offset,
7827 int page_count,
7828 int *page_grab_count)
7829 {
7830 vm_page_t dst_page;
7831 boolean_t no_zero_fill = FALSE;
7832 int interruptible;
7833 int pages_wired = 0;
7834 int pages_inserted = 0;
7835 int entry = 0;
7836 uint64_t delayed_ledger_update = 0;
7837 kern_return_t ret = KERN_SUCCESS;
7838 int grab_options;
7839 ppnum_t phys_page;
7840
7841 vm_object_lock_assert_exclusive(object);
7842 assert(object->purgable != VM_PURGABLE_VOLATILE);
7843 assert(object->purgable != VM_PURGABLE_EMPTY);
7844 assert(object->pager == NULL);
7845 assert(object->vo_copy == NULL);
7846 assert(object->shadow == NULL);
7847
7848 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
7849 interruptible = THREAD_ABORTSAFE;
7850 } else {
7851 interruptible = THREAD_UNINT;
7852 }
7853
7854 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
7855 no_zero_fill = TRUE;
7856 }
7857
7858 grab_options = 0;
7859 #if CONFIG_SECLUDED_MEMORY
7860 if (object->can_grab_secluded) {
7861 grab_options |= VM_PAGE_GRAB_SECLUDED;
7862 }
7863 #endif /* CONFIG_SECLUDED_MEMORY */
7864
7865 while (page_count--) {
7866 while ((dst_page = vm_page_grab_options(grab_options))
7867 == VM_PAGE_NULL) {
7868 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
7869
7870 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
7871
7872 if (vm_page_wait(interruptible) == FALSE) {
7873 /*
7874 * interrupted case
7875 */
7876 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7877
7878 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
7879
7880 ret = MACH_SEND_INTERRUPTED;
7881 goto done;
7882 }
7883 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7884
7885 VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
7886 }
7887 if (no_zero_fill == FALSE) {
7888 vm_page_zero_fill(dst_page);
7889 } else {
7890 dst_page->vmp_absent = TRUE;
7891 }
7892
7893 dst_page->vmp_reference = TRUE;
7894
7895 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7896 SET_PAGE_DIRTY(dst_page, FALSE);
7897 }
7898 if (dst_page->vmp_absent == FALSE) {
7899 assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
7900 assert(dst_page->vmp_wire_count == 0);
7901 dst_page->vmp_wire_count++;
7902 dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
7903 assert(dst_page->vmp_wire_count);
7904 pages_wired++;
7905 vm_page_wakeup_done(object, dst_page);
7906 }
7907 pages_inserted++;
7908
7909 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
7910
7911 bitmap_set(upl->lite_list, entry);
7912
7913 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7914
7915 if (phys_page > upl->highest_page) {
7916 upl->highest_page = phys_page;
7917 }
7918
7919 if (user_page_list) {
7920 user_page_list[entry].phys_addr = phys_page;
7921 user_page_list[entry].absent = dst_page->vmp_absent;
7922 user_page_list[entry].dirty = dst_page->vmp_dirty;
7923 user_page_list[entry].free_when_done = FALSE;
7924 user_page_list[entry].precious = FALSE;
7925 user_page_list[entry].device = FALSE;
7926 user_page_list[entry].speculative = FALSE;
7927 user_page_list[entry].cs_validated = FALSE;
7928 user_page_list[entry].cs_tainted = FALSE;
7929 user_page_list[entry].cs_nx = FALSE;
7930 user_page_list[entry].needed = FALSE;
7931 user_page_list[entry].mark = FALSE;
7932 }
7933 entry++;
7934 *dst_offset += PAGE_SIZE_64;
7935 }
7936 done:
7937 if (pages_wired) {
7938 vm_page_lockspin_queues();
7939 vm_page_wire_count += pages_wired;
7940 vm_page_unlock_queues();
7941 }
7942 if (pages_inserted) {
7943 if (object->internal) {
7944 OSAddAtomic(pages_inserted, &vm_page_internal_count);
7945 } else {
7946 OSAddAtomic(pages_inserted, &vm_page_external_count);
7947 }
7948 }
7949 if (delayed_ledger_update) {
7950 task_t owner;
7951 int ledger_idx_volatile;
7952 int ledger_idx_nonvolatile;
7953 int ledger_idx_volatile_compressed;
7954 int ledger_idx_nonvolatile_compressed;
7955 int ledger_idx_composite;
7956 int ledger_idx_external_wired;
7957 boolean_t do_footprint;
7958
7959 owner = VM_OBJECT_OWNER(object);
7960 assert(owner);
7961
7962 vm_object_ledger_tag_ledgers(object,
7963 &ledger_idx_volatile,
7964 &ledger_idx_nonvolatile,
7965 &ledger_idx_volatile_compressed,
7966 &ledger_idx_nonvolatile_compressed,
7967 &ledger_idx_composite,
7968 &ledger_idx_external_wired,
7969 &do_footprint);
7970
7971 if (object->internal) {
7972 /* more non-volatile bytes */
7973 ledger_credit(owner->ledger,
7974 ledger_idx_nonvolatile,
7975 delayed_ledger_update);
7976 if (do_footprint) {
7977 /* more footprint */
7978 ledger_credit(owner->ledger,
7979 task_ledgers.phys_footprint,
7980 delayed_ledger_update);
7981 } else if (ledger_idx_composite != -1) {
7982 ledger_credit(owner->ledger,
7983 ledger_idx_composite,
7984 delayed_ledger_update);
7985 }
7986 } else {
7987 /* more external wired bytes */
7988 ledger_credit(owner->ledger,
7989 ledger_idx_external_wired,
7990 delayed_ledger_update);
7991 if (do_footprint) {
7992 /* more footprint */
7993 ledger_credit(owner->ledger,
7994 task_ledgers.phys_footprint,
7995 delayed_ledger_update);
7996 } else if (ledger_idx_composite != -1) {
7997 ledger_credit(owner->ledger,
7998 ledger_idx_composite,
7999 delayed_ledger_update);
8000 }
8001 }
8002 }
8003
8004 assert(page_grab_count);
8005 *page_grab_count = pages_inserted;
8006
8007 return ret;
8008 }
8009
8010
8011
/*
 * vm_object_iopl_request:
 *	Build an I/O UPL ("*upl_ptr") covering the pages of "object" in the
 *	range ["offset", "offset" + "size"), wiring those pages down so the
 *	caller can do I/O directly against their physical addresses.
 *
 *	"cntrl_flags" (UPL_*) select behavior: internal page list creation,
 *	zero-fill suppression, copy-out direction, 32-bit DMA address
 *	substitution, access blocking until commit/abort, interruptibility,
 *	forced coherency, and "no fault" mode.  "tag" is the VM tag charged
 *	for the wired pages.  If "user_page_list" is non-NULL it is filled
 *	with per-page state; "page_list_count", if non-NULL, reports how
 *	many entries are meaningful.
 *
 *	Returns KERN_SUCCESS, or a Mach error code after unwinding any pages
 *	already processed (see "return_err").
 */
kern_return_t
vm_object_iopl_request(
	vm_object_t object,
	vm_object_offset_t offset,
	upl_size_t size,
	upl_t *upl_ptr,
	upl_page_info_array_t user_page_list,
	unsigned int *page_list_count,
	upl_control_flags_t cntrl_flags,
	vm_tag_t tag)
{
	vm_page_t dst_page;
	vm_object_offset_t dst_offset;
	upl_size_t xfer_size;
	upl_t upl = NULL;
	unsigned int entry;
	int no_zero_fill = FALSE;
	unsigned int size_in_pages;
	int page_grab_count = 0;
	u_int32_t psize;
	kern_return_t ret;
	vm_prot_t prot;
	struct vm_object_fault_info fault_info = {};
	struct vm_page_delayed_work dw_array;
	struct vm_page_delayed_work *dwp, *dwp_start;
	bool dwp_finish_ctx = TRUE;
	int dw_count;
	int dw_limit;
	int dw_index;
	boolean_t caller_lookup;
	int io_tracking_flag = 0;
	int interruptible;
	ppnum_t phys_page;

	boolean_t set_cache_attr_needed = FALSE;
	boolean_t free_wired_pages = FALSE;
	boolean_t fast_path_empty_req = FALSE;
	boolean_t fast_path_full_req = FALSE;

#if DEVELOPMENT || DEBUG
	task_t task = current_task();
#endif /* DEVELOPMENT || DEBUG */

	dwp_start = dwp = NULL;

	vm_object_offset_t original_offset = offset;
	upl_size_t original_size = size;

	// DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);

	/*
	 * Expand the request to whole VM page boundaries; report when the
	 * caller's range was not already page-aligned.
	 */
	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
	offset = vm_object_trunc_page(offset);
	if (size != original_size || offset != original_offset) {
		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
	}

	if (cntrl_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		return KERN_INVALID_VALUE;
	}
	if (vm_lopage_needed == FALSE) {
		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
	}

	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
		/* 32-bit DMA support requires an I/O-wired lite UPL */
		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
			return KERN_INVALID_VALUE;
		}

		if (object->phys_contiguous) {
			/* the whole physical range must be DMA-reachable */
			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
				return KERN_INVALID_ADDRESS;
			}

			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
				return KERN_INVALID_ADDRESS;
			}
		}
	}
	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
		no_zero_fill = TRUE;
	}

	if (cntrl_flags & UPL_COPYOUT_FROM) {
		prot = VM_PROT_READ;
	} else {
		prot = VM_PROT_READ | VM_PROT_WRITE;
	}

	if ((!object->internal) && (object->paging_offset != 0)) {
		panic("vm_object_iopl_request: external object with non-zero paging offset");
	}


	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);

#if CONFIG_IOSCHED || UPL_DEBUG
	if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) {
		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
	}
#endif

#if CONFIG_IOSCHED
	if (object->io_tracking) {
		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
		if (!is_kernel_object(object)) {
			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
		}
	}
#endif

	if (object->phys_contiguous) {
		/* device memory: a single-entry UPL suffices */
		psize = PAGE_SIZE;
	} else {
		psize = size;

		/*
		 * set up the delayed-work batching context used to defer
		 * page-queue operations (wire, reference) in the slow path
		 */
		dw_count = 0;
		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
		dwp_start = vm_page_delayed_work_get_ctx();
		if (dwp_start == NULL) {
			/* fall back to a single stack entry if no context available */
			dwp_start = &dw_array;
			dw_limit = 1;
			dwp_finish_ctx = FALSE;
		}

		dwp = dwp_start;
	}

	if (cntrl_flags & UPL_SET_INTERNAL) {
		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
		user_page_list = size ? upl->page_list : NULL;
	} else {
		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
	}
	if (user_page_list) {
		user_page_list[0].device = FALSE;
	}
	*upl_ptr = upl;

	if (cntrl_flags & UPL_NOZEROFILLIO) {
		DTRACE_VM4(upl_nozerofillio,
		    vm_object_t, object,
		    vm_object_offset_t, offset,
		    upl_size_t, size,
		    upl_t, upl);
	}

	upl->map_object = object;
	upl->u_offset = original_offset;
	upl->u_size = original_size;

	size_in_pages = size / PAGE_SIZE;

	if (is_kernel_object(object) &&
	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
		upl->flags |= UPL_KERNEL_OBJECT;
#if UPL_DEBUG
		vm_object_lock(object);
#else
		vm_object_lock_shared(object);
#endif
	} else {
		vm_object_lock(object);
		vm_object_activity_begin(object);
	}
	/*
	 * paging in progress also protects the paging_offset
	 */
	upl->u_offset = original_offset + object->paging_offset;

	if (cntrl_flags & UPL_BLOCK_ACCESS) {
		/*
		 * The user requested that access to the pages in this UPL
		 * be blocked until the UPL is committed or aborted.
		 */
		upl->flags |= UPL_ACCESS_BLOCKED;
	}

#if CONFIG_IOSCHED || UPL_DEBUG
	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
		vm_object_activity_begin(object);
		queue_enter(&object->uplq, upl, upl_t, uplq);
	}
#endif

	if (object->phys_contiguous) {
		if (upl->flags & UPL_ACCESS_BLOCKED) {
			assert(!object->blocked_access);
			object->blocked_access = TRUE;
		}

		vm_object_unlock(object);

		/*
		 * don't need any shadow mappings for this one
		 * since it is already I/O memory
		 */
		upl->flags |= UPL_DEVICE_MEMORY;

		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);

		if (user_page_list) {
			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
			user_page_list[0].device = TRUE;
		}
		if (page_list_count != NULL) {
			if (upl->flags & UPL_INTERNAL) {
				*page_list_count = 0;
			} else {
				*page_list_count = 1;
			}
		}

		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
		if (task != NULL) {
			ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
		}
#endif /* DEVELOPMENT || DEBUG */
		return KERN_SUCCESS;
	}
	if (!is_kernel_object(object) && object != compressor_object) {
		/*
		 * Protect user space from future COW operations
		 */
#if VM_OBJECT_TRACKING_OP_TRUESHARE
		if (!object->true_share &&
		    vm_object_tracking_btlog) {
			btlog_record(vm_object_tracking_btlog, object,
			    VM_OBJECT_TRACKING_OP_TRUESHARE,
			    btref_get(__builtin_frame_address(0), 0));
		}
#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */

		vm_object_lock_assert_exclusive(object);
		VM_OBJECT_SET_TRUE_SHARE(object, TRUE);

		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
	}

	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
	    object->vo_copy != VM_OBJECT_NULL) {
		/*
		 * Honor copy-on-write obligations
		 *
		 * The caller is gathering these pages and
		 * might modify their contents. We need to
		 * make sure that the copy object has its own
		 * private copies of these pages before we let
		 * the caller modify them.
		 *
		 * NOTE: someone else could map the original object
		 * after we've done this copy-on-write here, and they
		 * could then see an inconsistent picture of the memory
		 * while it's being modified via the UPL. To prevent this,
		 * we would have to block access to these pages until the
		 * UPL is released. We could use the UPL_BLOCK_ACCESS
		 * code path for that...
		 */
		vm_object_update(object,
		    offset,
		    size,
		    NULL,
		    NULL,
		    FALSE, /* should_return */
		    MEMORY_OBJECT_COPY_SYNC,
		    VM_PROT_NO_CHANGE);
		VM_PAGEOUT_DEBUG(iopl_cow, 1);
		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
	}
	/*
	 * Fast-path eligibility: the request must cover a whole, simple
	 * object (no copy/shadow/pager, not purgeable-volatile/empty) and
	 * not require 32-bit addresses or access blocking.  "full" means
	 * every page is already resident; "empty" means none are.
	 */
	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
	    object->purgable != VM_PURGABLE_VOLATILE &&
	    object->purgable != VM_PURGABLE_EMPTY &&
	    object->vo_copy == NULL &&
	    size == object->vo_size &&
	    offset == 0 &&
	    object->shadow == NULL &&
	    object->pager == NULL) {
		if (object->resident_page_count == size_in_pages) {
			assert(object != compressor_object);
			assert(!is_kernel_object(object));
			fast_path_full_req = TRUE;
		} else if (object->resident_page_count == 0) {
			assert(object != compressor_object);
			assert(!is_kernel_object(object));
			fast_path_empty_req = TRUE;
			set_cache_attr_needed = TRUE;
		}
	}

	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
		interruptible = THREAD_ABORTSAFE;
	} else {
		interruptible = THREAD_UNINT;
	}

	entry = 0;

	xfer_size = size;
	dst_offset = offset;

	if (fast_path_full_req) {
		if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
			goto finish;
		}
		/*
		 * we couldn't complete the processing of this request on the fast path
		 * so fall through to the slow path and finish up
		 */
	} else if (fast_path_empty_req) {
		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
			ret = KERN_MEMORY_ERROR;
			goto return_err;
		}
		ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
		    cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);

		if (ret) {
			/* pages grabbed so far must be freed, not just unwired */
			free_wired_pages = TRUE;
			goto return_err;
		}
		goto finish;
	}

	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.lo_offset = offset;
	fault_info.hi_offset = offset + xfer_size;
	fault_info.mark_zf_absent = TRUE;
	fault_info.interruptible = interruptible;
	fault_info.batch_pmap_op = TRUE;

	/*
	 * slow path: look up (or fault in) and wire each page in the range
	 */
	while (xfer_size) {
		vm_fault_return_t result;

		dwp->dw_mask = 0;

		if (fast_path_full_req) {
			/*
			 * if we get here, it means that we ran into a page
			 * state we couldn't handle in the fast path and
			 * bailed out to the slow path... since the order
			 * we look at pages is different between the 2 paths,
			 * the following check is needed to determine whether
			 * this page was already processed in the fast path
			 */
			if (bitmap_test(upl->lite_list, entry)) {
				goto skip_page;
			}
		}
		dst_page = vm_page_lookup(object, dst_offset);

		if (dst_page == VM_PAGE_NULL ||
		    dst_page->vmp_busy ||
		    VMP_ERROR_GET(dst_page) ||
		    dst_page->vmp_restart ||
		    dst_page->vmp_absent ||
		    dst_page->vmp_fictitious) {
			if (is_kernel_object(object)) {
				panic("vm_object_iopl_request: missing/bad page in kernel object");
			}
			if (object == compressor_object) {
				panic("vm_object_iopl_request: missing/bad page in compressor object");
			}

			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
				ret = KERN_MEMORY_ERROR;
				goto return_err;
			}

			if (dst_page != VM_PAGE_NULL &&
			    dst_page->vmp_busy) {
				/* wait for whoever owns "busy", then retry this page */
				wait_result_t wait_result;
				vm_object_lock_assert_exclusive(object);
				wait_result = vm_page_sleep(object, dst_page,
				    interruptible, LCK_SLEEP_DEFAULT);
				if (wait_result == THREAD_AWAKENED ||
				    wait_result == THREAD_RESTART) {
					continue;
				}
				ret = MACH_SEND_INTERRUPTED;
				goto return_err;
			}

			set_cache_attr_needed = TRUE;

			/*
			 * We just looked up the page and the result remains valid
			 * until the object lock is released, so send it to
			 * vm_fault_page() (as "dst_page"), to avoid having to
			 * look it up again there.
			 */
			caller_lookup = TRUE;

			do {
				vm_page_t top_page;
				kern_return_t error_code;

				fault_info.cluster_size = xfer_size;

				vm_object_paging_begin(object);

				/*
				 * NOTE(review): VM_PROT_WRITE is OR'ed in even for
				 * read-only (UPL_COPYOUT_FROM) requests — presumably
				 * to resolve COW up front; confirm before changing.
				 */
				result = vm_fault_page(object, dst_offset,
				    prot | VM_PROT_WRITE, FALSE,
				    caller_lookup,
				    &prot, &dst_page, &top_page,
				    (int *)0,
				    &error_code, no_zero_fill,
				    &fault_info);

				/* our lookup is no longer valid at this point */
				caller_lookup = FALSE;

				switch (result) {
				case VM_FAULT_SUCCESS:
					page_grab_count++;

					if (!dst_page->vmp_absent) {
						vm_page_wakeup_done(object, dst_page);
					} else {
						/*
						 * we only get back an absent page if we
						 * requested that it not be zero-filled
						 * because we are about to fill it via I/O
						 *
						 * absent pages should be left BUSY
						 * to prevent them from being faulted
						 * into an address space before we've
						 * had a chance to complete the I/O on
						 * them since they may contain info that
						 * shouldn't be seen by the faulting task
						 */
					}
					/*
					 * Release paging references and
					 * top-level placeholder page, if any.
					 */
					if (top_page != VM_PAGE_NULL) {
						vm_object_t local_object;

						local_object = VM_PAGE_OBJECT(top_page);

						/*
						 * comparing 2 packed pointers
						 */
						if (top_page->vmp_object != dst_page->vmp_object) {
							vm_object_lock(local_object);
							VM_PAGE_FREE(top_page);
							vm_object_paging_end(local_object);
							vm_object_unlock(local_object);
						} else {
							VM_PAGE_FREE(top_page);
							vm_object_paging_end(local_object);
						}
					}
					vm_object_paging_end(object);
					break;

				case VM_FAULT_RETRY:
					vm_object_lock(object);
					break;

				case VM_FAULT_MEMORY_SHORTAGE:
					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

					if (vm_page_wait(interruptible)) {
						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);

						VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
						vm_object_lock(object);

						break;
					}
					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
					OS_FALLTHROUGH;

				case VM_FAULT_INTERRUPTED:
					error_code = MACH_SEND_INTERRUPTED;
					OS_FALLTHROUGH;
				case VM_FAULT_MEMORY_ERROR:
memory_error:
					ret = (error_code ? error_code: KERN_MEMORY_ERROR);

					vm_object_lock(object);
					goto return_err;

				case VM_FAULT_SUCCESS_NO_VM_PAGE:
					/* success but no page: fail */
					vm_object_paging_end(object);
					vm_object_unlock(object);
					goto memory_error;

				default:
					panic("vm_object_iopl_request: unexpected error"
					    " 0x%x from vm_fault_page()\n", result);
				}
			} while (result != VM_FAULT_SUCCESS);
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		if (upl->flags & UPL_KERNEL_OBJECT) {
			goto record_phys_addr;
		}

		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
			dst_page->vmp_busy = TRUE;
			goto record_phys_addr;
		}

		if (dst_page->vmp_cleaning) {
			/*
			 * Someone else is cleaning this page in place.
			 * In theory, we should be able to proceed and use this
			 * page but they'll probably end up clearing the "busy"
			 * bit on it in upl_commit_range() but they didn't set
			 * it, so they would clear our "busy" bit and open
			 * us to race conditions.
			 * We'd better wait for the cleaning to complete and
			 * then try again.
			 */
			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
			vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
			continue;
		}
		if (dst_page->vmp_laundry) {
			vm_pageout_steal_laundry(dst_page, FALSE);
		}

		if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
		    phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
			vm_page_t low_page;
			int refmod;

			/*
			 * support devices that can't DMA above 32 bits
			 * by substituting pages from a pool of low address
			 * memory for any pages we find above the 4G mark
			 * can't substitute if the page is already wired because
			 * we don't know whether that physical address has been
			 * handed out to some other 64 bit capable DMA device to use
			 */
			if (VM_PAGE_WIRED(dst_page)) {
				ret = KERN_PROTECTION_FAILURE;
				goto return_err;
			}
			low_page = vm_page_grablo();

			if (low_page == VM_PAGE_NULL) {
				ret = KERN_RESOURCE_SHORTAGE;
				goto return_err;
			}
			/*
			 * from here until the vm_page_replace completes
			 * we mustn't drop the object lock... we don't
			 * want anyone refaulting this page in and using
			 * it after we disconnect it... we want the fault
			 * to find the new page being substituted.
			 */
			if (dst_page->vmp_pmapped) {
				refmod = pmap_disconnect(phys_page);
			} else {
				refmod = 0;
			}

			if (!dst_page->vmp_absent) {
				vm_page_copy(dst_page, low_page);
			}

			low_page->vmp_reference = dst_page->vmp_reference;
			low_page->vmp_dirty = dst_page->vmp_dirty;
			low_page->vmp_absent = dst_page->vmp_absent;

			if (refmod & VM_MEM_REFERENCED) {
				low_page->vmp_reference = TRUE;
			}
			if (refmod & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(low_page, FALSE);
			}

			vm_page_replace(low_page, object, dst_offset);

			dst_page = low_page;
			/*
			 * vm_page_grablo returned the page marked
			 * BUSY... we don't need a PAGE_WAKEUP_DONE
			 * here, because we've never dropped the object lock
			 */
			if (!dst_page->vmp_absent) {
				dst_page->vmp_busy = FALSE;
			}

			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
		}
		if (!dst_page->vmp_busy) {
			dwp->dw_mask |= DW_vm_page_wire;
		}

		if (cntrl_flags & UPL_BLOCK_ACCESS) {
			/*
			 * Mark the page "busy" to block any future page fault
			 * on this page in addition to wiring it.
			 * We'll also remove the mapping
			 * of all these pages before leaving this routine.
			 */
			assert(!dst_page->vmp_fictitious);
			dst_page->vmp_busy = TRUE;
		}
		/*
		 * expect the page to be used
		 * page queues lock must be held to set 'reference'
		 */
		dwp->dw_mask |= DW_set_reference;

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			SET_PAGE_DIRTY(dst_page, TRUE);
			/*
			 * Page belonging to a code-signed object is about to
			 * be written. Mark it tainted and disconnect it from
			 * all pmaps so processes have to fault it back in and
			 * deal with the tainted bit.
			 */
			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
				vm_page_iopl_tainted++;
				if (dst_page->vmp_pmapped) {
					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
					if (refmod & VM_MEM_REFERENCED) {
						dst_page->vmp_reference = TRUE;
					}
				}
			}
		}
		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
			pmap_sync_page_attributes_phys(phys_page);
			dst_page->vmp_written_by_kernel = FALSE;
		}

record_phys_addr:
		if (dst_page->vmp_busy) {
			upl->flags |= UPL_HAS_BUSY;
		}

		bitmap_set(upl->lite_list, entry);

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
			user_page_list[entry].absent = dst_page->vmp_absent;
			user_page_list[entry].dirty = dst_page->vmp_dirty;
			user_page_list[entry].precious = dst_page->vmp_precious;
			user_page_list[entry].device = FALSE;
			user_page_list[entry].needed = FALSE;
			if (dst_page->vmp_clustered == TRUE) {
				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
			} else {
				user_page_list[entry].speculative = FALSE;
			}
			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
			user_page_list[entry].mark = FALSE;
		}
		if (!is_kernel_object(object) && object != compressor_object) {
			/*
			 * someone is explicitly grabbing this page...
			 * update clustered and speculative state
			 *
			 */
			if (dst_page->vmp_clustered) {
				VM_PAGE_CONSUME_CLUSTERED(dst_page);
			}
		}
skip_page:
		entry++;
		dst_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;

		if (dwp->dw_mask) {
			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);

			if (dw_count >= dw_limit) {
				/* batch full: apply the deferred page-queue work now */
				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);

				dwp = dwp_start;
				dw_count = 0;
			}
		}
	}
	assert(entry == size_in_pages);

	if (dw_count) {
		/* flush any remaining deferred page-queue work */
		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}
finish:
	if (user_page_list && set_cache_attr_needed == TRUE) {
		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
	}

	if (page_list_count != NULL) {
		if (upl->flags & UPL_INTERNAL) {
			*page_list_count = 0;
		} else if (*page_list_count > size_in_pages) {
			*page_list_count = size_in_pages;
		}
	}
	vm_object_unlock(object);

	if (cntrl_flags & UPL_BLOCK_ACCESS) {
		/*
		 * We've marked all the pages "busy" so that future
		 * page faults will block.
		 * Now remove the mapping for these pages, so that they
		 * can't be accessed without causing a page fault.
		 */
		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
		    PMAP_NULL,
		    PAGE_SIZE,
		    0, VM_PROT_NONE);
		assert(!object->blocked_access);
		object->blocked_access = TRUE;
	}

	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return KERN_SUCCESS;

return_err:
	/*
	 * Error unwind: walk back over the pages already processed
	 * ([offset, dst_offset)), unwiring or freeing each one.
	 */
	dw_index = 0;

	for (; offset < dst_offset; offset += PAGE_SIZE) {
		boolean_t need_unwire;
		bool need_wakeup;

		dst_page = vm_page_lookup(object, offset);

		if (dst_page == VM_PAGE_NULL) {
			panic("vm_object_iopl_request: Wired page missing.");
		}

		/*
		 * if we've already processed this page in an earlier
		 * dw_do_work, we need to undo the wiring... we will
		 * leave the dirty and reference bits on if they
		 * were set, since we don't have a good way of knowing
		 * what the previous state was and we won't get here
		 * under any normal circumstances... we will always
		 * clear BUSY and wakeup any waiters via vm_page_free
		 * or PAGE_WAKEUP_DONE
		 */
		need_unwire = TRUE;

		need_wakeup = false;
		if (dw_count) {
			if ((dwp_start)[dw_index].dw_m == dst_page) {
				/*
				 * still in the deferred work list
				 * which means we haven't yet called
				 * vm_page_wire on this page
				 */
				need_unwire = FALSE;

				if (dst_page->vmp_busy &&
				    ((dwp_start)[dw_index].dw_mask & DW_clear_busy)) {
					/*
					 * It's our own "busy" bit, so we need to clear it
					 * now and wake up waiters below.
					 */
					dst_page->vmp_busy = false;
					need_wakeup = true;
				}

				dw_index++;
				dw_count--;
			}
		}
		vm_page_lock_queues();

		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
			vm_page_free(dst_page);

			need_unwire = FALSE;
		} else {
			if (need_unwire == TRUE) {
				vm_page_unwire(dst_page, TRUE);
			}
			if (dst_page->vmp_busy) {
				/* not our "busy" or we would have cleared it above */
				assert(!need_wakeup);
			}
			if (need_wakeup) {
				assert(!dst_page->vmp_busy);
				vm_page_wakeup(object, dst_page);
			}
		}
		vm_page_unlock_queues();

		if (need_unwire == TRUE) {
			counter_inc(&vm_statistics_reactivations);
		}
	}
#if UPL_DEBUG
	upl->upl_state = 2;
#endif
	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
	}
	vm_object_unlock(object);
	upl_destroy(upl);

	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}
	return ret;
}
8860
/*
 * upl_transpose:
 *	Exchange the backing VM objects of two UPLs, so that each UPL ends up
 *	describing the pages now held by the other UPL's original object.
 *	Both UPLs must be non-NULL, distinct, non-vector, start at offset 0,
 *	and have the same size (whole-object exchanges only).
 *	Returns KERN_SUCCESS or an error from the argument checks /
 *	vm_object_transpose().
 */
kern_return_t
upl_transpose(
	upl_t upl1,
	upl_t upl2)
{
	kern_return_t retval;
	boolean_t upls_locked;
	vm_object_t object1, object2;

	/* LD: Should mapped UPLs be eligible for a transpose? */
	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
		return KERN_INVALID_ARGUMENT;
	}

	upls_locked = FALSE;

	/*
	 * Since we need to lock both UPLs at the same time,
	 * avoid deadlocks by always taking locks in the same order.
	 */
	if (upl1 < upl2) {
		upl_lock(upl1);
		upl_lock(upl2);
	} else {
		upl_lock(upl2);
		upl_lock(upl1);
	}
	upls_locked = TRUE;     /* the UPLs will need to be unlocked */

	object1 = upl1->map_object;
	object2 = upl2->map_object;

	if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
	    upl1->u_size != upl2->u_size) {
		/*
		 * We deal only with full objects, not subsets.
		 * That's because we exchange the entire backing store info
		 * for the objects: pager, resident pages, etc... We can't do
		 * only part of it.
		 */
		retval = KERN_INVALID_VALUE;
		goto done;
	}

	/*
	 * Transpose the VM objects' backing store.
	 */
	retval = vm_object_transpose(object1, object2,
	    upl_adjusted_size(upl1, PAGE_MASK));

	if (retval == KERN_SUCCESS) {
		/*
		 * Make each UPL point to the correct VM object, i.e. the
		 * object holding the pages that the UPL refers to...
		 */
#if CONFIG_IOSCHED || UPL_DEBUG
		/* move each tracked UPL from its old object's queue to the new one's */
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
			vm_object_lock(object1);
			vm_object_lock(object2);
		}
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_remove(&object1->uplq, upl1, upl_t, uplq);
		}
		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_remove(&object2->uplq, upl2, upl_t, uplq);
		}
#endif
		upl1->map_object = object2;
		upl2->map_object = object1;

#if CONFIG_IOSCHED || UPL_DEBUG
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_enter(&object2->uplq, upl1, upl_t, uplq);
		}
		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_enter(&object1->uplq, upl2, upl_t, uplq);
		}
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
			vm_object_unlock(object2);
			vm_object_unlock(object1);
		}
#endif
	}

done:
	/*
	 * Cleanup.
	 */
	if (upls_locked) {
		upl_unlock(upl1);
		upl_unlock(upl2);
		upls_locked = FALSE;
	}

	return retval;
}
8957
8958 void
8959 upl_range_needed(
8960 upl_t upl,
8961 int index,
8962 int count)
8963 {
8964 int size_in_pages;
8965
8966 if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
8967 return;
8968 }
8969
8970 size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8971
8972 while (count-- && index < size_in_pages) {
8973 upl->page_list[index++].needed = TRUE;
8974 }
8975 }
8976
8977
8978 /*
8979 * Reserve of virtual addresses in the kernel address space.
8980 * We need to map the physical pages in the kernel, so that we
8981 * can call the code-signing or slide routines with a kernel
8982 * virtual address. We keep this pool of pre-allocated kernel
8983 * virtual addresses so that we don't have to scan the kernel's
 * virtual address space each time we need to work with
8985 * a physical page.
8986 */
/* Protects the in-use bitmap and waiter counters below. */
SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
#define VM_PAGING_NUM_PAGES 64
/* Base of the pre-allocated pageable VA window (set once at startup). */
SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
/* One in-use flag per pre-allocated kernel page slot. */
bool vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
int vm_paging_max_index = 0;            /* highest slot index ever handed out */
int vm_paging_page_waiter = 0;          /* threads currently waiting for a slot */
int vm_paging_page_waiter_total = 0;    /* cumulative wait events (stats) */

/* Statistics: fast-path vs slow-path (vm_map_enter) mappings. */
unsigned long vm_paging_no_kernel_page = 0;
unsigned long vm_paging_objects_mapped = 0;
unsigned long vm_paging_pages_mapped = 0;
unsigned long vm_paging_objects_mapped_slow = 0;
unsigned long vm_paging_pages_mapped_slow = 0;
9000
__startup_func
static void
vm_paging_map_init(void)
{
	/*
	 * Reserve the pageable kernel VA window used by
	 * vm_paging_map_object()'s fast path.  KMA_NOFAIL: boot-time
	 * allocation must succeed; KMA_PERMANENT: never freed.
	 */
	kmem_alloc(kernel_map, &vm_paging_base_address,
	    ptoa(VM_PAGING_NUM_PAGES),
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
	    VM_KERN_MEMORY_NONE);
}
STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
9011
9012 /*
9013 * vm_paging_map_object:
9014 * Maps part of a VM object's pages in the kernel
9015 * virtual address space, using the pre-allocated
9016 * kernel virtual addresses, if possible.
9017 * Context:
9018 * The VM object is locked. This lock will get
9019 * dropped and re-acquired though, so the caller
9020 * must make sure the VM object is kept alive
9021 * (by holding a VM map that has a reference
9022 * on it, for example, or taking an extra reference).
9023 * The page should also be kept busy to prevent
9024 * it from being reclaimed.
9025 */
9026 kern_return_t
9027 vm_paging_map_object(
9028 vm_page_t page,
9029 vm_object_t object,
9030 vm_object_offset_t offset,
9031 vm_prot_t protection,
9032 boolean_t can_unlock_object,
9033 vm_map_size_t *size, /* IN/OUT */
9034 vm_map_offset_t *address, /* OUT */
9035 boolean_t *need_unmap) /* OUT */
9036 {
9037 kern_return_t kr;
9038 vm_map_offset_t page_map_offset;
9039 vm_map_size_t map_size;
9040 vm_object_offset_t object_offset;
9041 int i;
9042
9043 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
9044 /* use permanent 1-to-1 kernel mapping of physical memory ? */
9045 *address = (vm_map_offset_t)
9046 phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
9047 *need_unmap = FALSE;
9048 return KERN_SUCCESS;
9049
9050 assert(page->vmp_busy);
9051 /*
9052 * Use one of the pre-allocated kernel virtual addresses
9053 * and just enter the VM page in the kernel address space
9054 * at that virtual address.
9055 */
9056 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9057
9058 /*
9059 * Try and find an available kernel virtual address
9060 * from our pre-allocated pool.
9061 */
9062 page_map_offset = 0;
9063 for (;;) {
9064 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
9065 if (vm_paging_page_inuse[i] == FALSE) {
9066 page_map_offset =
9067 vm_paging_base_address +
9068 (i * PAGE_SIZE);
9069 break;
9070 }
9071 }
9072 if (page_map_offset != 0) {
9073 /* found a space to map our page ! */
9074 break;
9075 }
9076
9077 if (can_unlock_object) {
9078 /*
9079 * If we can afford to unlock the VM object,
9080 * let's take the slow path now...
9081 */
9082 break;
9083 }
9084 /*
9085 * We can't afford to unlock the VM object, so
9086 * let's wait for a space to become available...
9087 */
9088 vm_paging_page_waiter_total++;
9089 vm_paging_page_waiter++;
9090 kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
9091 if (kr == THREAD_WAITING) {
9092 simple_unlock(&vm_paging_lock);
9093 kr = thread_block(THREAD_CONTINUE_NULL);
9094 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9095 }
9096 vm_paging_page_waiter--;
9097 /* ... and try again */
9098 }
9099
9100 if (page_map_offset != 0) {
9101 /*
9102 * We found a kernel virtual address;
9103 * map the physical page to that virtual address.
9104 */
9105 if (i > vm_paging_max_index) {
9106 vm_paging_max_index = i;
9107 }
9108 vm_paging_page_inuse[i] = TRUE;
9109 simple_unlock(&vm_paging_lock);
9110
9111 page->vmp_pmapped = TRUE;
9112
9113 /*
9114 * Keep the VM object locked over the PMAP_ENTER
9115 * and the actual use of the page by the kernel,
9116 * or this pmap mapping might get undone by a
9117 * vm_object_pmap_protect() call...
9118 */
9119 kr = pmap_enter_check(kernel_pmap,
9120 page_map_offset,
9121 page,
9122 protection,
9123 VM_PROT_NONE,
9124 0,
9125 TRUE);
9126 assert(kr == KERN_SUCCESS);
9127 vm_paging_objects_mapped++;
9128 vm_paging_pages_mapped++;
9129 *address = page_map_offset;
9130 *need_unmap = TRUE;
9131
9132 #if KASAN
9133 kasan_notify_address(page_map_offset, PAGE_SIZE);
9134 #endif
9135
9136 /* all done and mapped, ready to use ! */
9137 return KERN_SUCCESS;
9138 }
9139
9140 /*
9141 * We ran out of pre-allocated kernel virtual
9142 * addresses. Just map the page in the kernel
9143 * the slow and regular way.
9144 */
9145 vm_paging_no_kernel_page++;
9146 simple_unlock(&vm_paging_lock);
9147 }
9148
9149 if (!can_unlock_object) {
9150 *address = 0;
9151 *size = 0;
9152 *need_unmap = FALSE;
9153 return KERN_NOT_SUPPORTED;
9154 }
9155
9156 object_offset = vm_object_trunc_page(offset);
9157 map_size = vm_map_round_page(*size,
9158 VM_MAP_PAGE_MASK(kernel_map));
9159
9160 /*
9161 * Try and map the required range of the object
9162 * in the kernel_map. Given that allocation is
9163 * for pageable memory, it shouldn't contain
9164 * pointers and is mapped into the data range.
9165 */
9166
9167 vm_object_reference_locked(object); /* for the map entry */
9168 vm_object_unlock(object);
9169
9170 kr = vm_map_enter(kernel_map,
9171 address,
9172 map_size,
9173 0,
9174 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
9175 object,
9176 object_offset,
9177 FALSE,
9178 protection,
9179 VM_PROT_ALL,
9180 VM_INHERIT_NONE);
9181 if (kr != KERN_SUCCESS) {
9182 *address = 0;
9183 *size = 0;
9184 *need_unmap = FALSE;
9185 vm_object_deallocate(object); /* for the map entry */
9186 vm_object_lock(object);
9187 return kr;
9188 }
9189
9190 *size = map_size;
9191
9192 /*
9193 * Enter the mapped pages in the page table now.
9194 */
9195 vm_object_lock(object);
9196 /*
9197 * VM object must be kept locked from before PMAP_ENTER()
9198 * until after the kernel is done accessing the page(s).
9199 * Otherwise, the pmap mappings in the kernel could be
9200 * undone by a call to vm_object_pmap_protect().
9201 */
9202
9203 for (page_map_offset = 0;
9204 map_size != 0;
9205 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
9206 page = vm_page_lookup(object, offset + page_map_offset);
9207 if (page == VM_PAGE_NULL) {
9208 printf("vm_paging_map_object: no page !?");
9209 vm_object_unlock(object);
9210 vm_map_remove(kernel_map, *address, *size);
9211 *address = 0;
9212 *size = 0;
9213 *need_unmap = FALSE;
9214 vm_object_lock(object);
9215 return KERN_MEMORY_ERROR;
9216 }
9217 page->vmp_pmapped = TRUE;
9218
9219 kr = pmap_enter_check(kernel_pmap,
9220 *address + page_map_offset,
9221 page,
9222 protection,
9223 VM_PROT_NONE,
9224 0,
9225 TRUE);
9226 assert(kr == KERN_SUCCESS);
9227 #if KASAN
9228 kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
9229 #endif
9230 }
9231
9232 vm_paging_objects_mapped_slow++;
9233 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
9234
9235 *need_unmap = TRUE;
9236
9237 return KERN_SUCCESS;
9238 }
9239
9240 /*
9241 * vm_paging_unmap_object:
9242 * Unmaps part of a VM object's pages from the kernel
9243 * virtual address space.
9244 * Context:
9245 * The VM object is locked. This lock will get
9246 * dropped and re-acquired though.
9247 */
9248 void
9249 vm_paging_unmap_object(
9250 vm_object_t object,
9251 vm_map_offset_t start,
9252 vm_map_offset_t end)
9253 {
9254 int i;
9255
9256 if ((vm_paging_base_address == 0) ||
9257 (start < vm_paging_base_address) ||
9258 (end > (vm_paging_base_address
9259 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
9260 /*
9261 * We didn't use our pre-allocated pool of
9262 * kernel virtual address. Deallocate the
9263 * virtual memory.
9264 */
9265 if (object != VM_OBJECT_NULL) {
9266 vm_object_unlock(object);
9267 }
9268 vm_map_remove(kernel_map, start, end);
9269 if (object != VM_OBJECT_NULL) {
9270 vm_object_lock(object);
9271 }
9272 } else {
9273 /*
9274 * We used a kernel virtual address from our
9275 * pre-allocated pool. Put it back in the pool
9276 * for next time.
9277 */
9278 assert(end - start == PAGE_SIZE);
9279 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
9280 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
9281
9282 /* undo the pmap mapping */
9283 pmap_remove(kernel_pmap, start, end);
9284
9285 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9286 vm_paging_page_inuse[i] = FALSE;
9287 if (vm_paging_page_waiter) {
9288 thread_wakeup(&vm_paging_page_waiter);
9289 }
9290 simple_unlock(&vm_paging_lock);
9291 }
9292 }
9293
9294
9295 /*
9296 * page->vmp_object must be locked
9297 */
9298 void
9299 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
9300 {
9301 if (!queues_locked) {
9302 vm_page_lockspin_queues();
9303 }
9304
9305 page->vmp_free_when_done = FALSE;
9306 /*
9307 * need to drop the laundry count...
9308 * we may also need to remove it
9309 * from the I/O paging queue...
9310 * vm_pageout_throttle_up handles both cases
9311 *
9312 * the laundry and pageout_queue flags are cleared...
9313 */
9314 vm_pageout_throttle_up(page);
9315
9316 if (!queues_locked) {
9317 vm_page_unlock_queues();
9318 }
9319 }
9320
9321 #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
9322
9323 upl_t
9324 vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
9325 {
9326 int i = 0;
9327 upl_t upl;
9328
9329 assert(max_upls > 0);
9330 if (max_upls == 0) {
9331 return NULL;
9332 }
9333
9334 if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
9335 max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
9336 }
9337 vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
9338
9339 upl = upl_create(0, UPL_VECTOR, 0);
9340 upl->vector_upl = vector_upl;
9341 upl->u_offset = upl_offset;
9342 vector_upl->size = 0;
9343 vector_upl->offset = upl_offset;
9344 vector_upl->invalid_upls = 0;
9345 vector_upl->num_upls = 0;
9346 vector_upl->pagelist = NULL;
9347 vector_upl->max_upls = max_upls;
9348
9349 for (i = 0; i < max_upls; i++) {
9350 vector_upl->upls[i].iostate.size = 0;
9351 vector_upl->upls[i].iostate.offset = 0;
9352 }
9353 return upl;
9354 }
9355
9356 upl_size_t
9357 vector_upl_get_size(const upl_t upl)
9358 {
9359 if (!vector_upl_is_valid(upl)) {
9360 return upl_get_size(upl);
9361 } else {
9362 return round_page_32(upl->vector_upl->size);
9363 }
9364 }
9365
9366 uint32_t
9367 vector_upl_max_upls(const upl_t upl)
9368 {
9369 if (!vector_upl_is_valid(upl)) {
9370 return 0;
9371 }
9372 return ((vector_upl_t)(upl->vector_upl))->max_upls;
9373 }
9374
9375 void
9376 vector_upl_deallocate(upl_t upl)
9377 {
9378 vector_upl_t vector_upl = upl->vector_upl;
9379
9380 assert(vector_upl_is_valid(upl));
9381
9382 if (vector_upl->invalid_upls != vector_upl->num_upls) {
9383 panic("Deallocating non-empty Vectored UPL");
9384 }
9385 uint32_t max_upls = vector_upl->max_upls;
9386 kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
9387 kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
9388 upl->vector_upl = NULL;
9389 }
9390
9391 boolean_t
9392 vector_upl_is_valid(upl_t upl)
9393 {
9394 return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
9395 }
9396
9397 boolean_t
9398 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
9399 {
9400 if (vector_upl_is_valid(upl)) {
9401 vector_upl_t vector_upl = upl->vector_upl;
9402
9403 if (vector_upl) {
9404 if (subupl) {
9405 if (io_size) {
9406 if (io_size < PAGE_SIZE) {
9407 io_size = PAGE_SIZE;
9408 }
9409 subupl->vector_upl = (void*)vector_upl;
9410 vector_upl->upls[vector_upl->num_upls++].elem = subupl;
9411 vector_upl->size += io_size;
9412 upl->u_size += io_size;
9413 } else {
9414 uint32_t i = 0, invalid_upls = 0;
9415 for (i = 0; i < vector_upl->num_upls; i++) {
9416 if (vector_upl->upls[i].elem == subupl) {
9417 break;
9418 }
9419 }
9420 if (i == vector_upl->num_upls) {
9421 panic("Trying to remove sub-upl when none exists");
9422 }
9423
9424 vector_upl->upls[i].elem = NULL;
9425 invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
9426 relaxed);
9427 if (invalid_upls == vector_upl->num_upls) {
9428 return TRUE;
9429 } else {
9430 return FALSE;
9431 }
9432 }
9433 } else {
9434 panic("vector_upl_set_subupl was passed a NULL upl element");
9435 }
9436 } else {
9437 panic("vector_upl_set_subupl was passed a non-vectored upl");
9438 }
9439 } else {
9440 panic("vector_upl_set_subupl was passed a NULL upl");
9441 }
9442
9443 return FALSE;
9444 }
9445
9446 void
9447 vector_upl_set_pagelist(upl_t upl)
9448 {
9449 if (vector_upl_is_valid(upl)) {
9450 uint32_t i = 0;
9451 vector_upl_t vector_upl = upl->vector_upl;
9452
9453 if (vector_upl) {
9454 vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
9455
9456 vector_upl->pagelist = kalloc_type(struct upl_page_info,
9457 atop(vector_upl->size), Z_WAITOK);
9458
9459 for (i = 0; i < vector_upl->num_upls; i++) {
9460 cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
9461 bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
9462 pagelist_size += cur_upl_pagelist_size;
9463 if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
9464 upl->highest_page = vector_upl->upls[i].elem->highest_page;
9465 }
9466 }
9467 assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
9468 } else {
9469 panic("vector_upl_set_pagelist was passed a non-vectored upl");
9470 }
9471 } else {
9472 panic("vector_upl_set_pagelist was passed a NULL upl");
9473 }
9474 }
9475
9476 upl_t
9477 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
9478 {
9479 if (vector_upl_is_valid(upl)) {
9480 vector_upl_t vector_upl = upl->vector_upl;
9481 if (vector_upl) {
9482 if (index < vector_upl->num_upls) {
9483 return vector_upl->upls[index].elem;
9484 }
9485 } else {
9486 panic("vector_upl_subupl_byindex was passed a non-vectored upl");
9487 }
9488 }
9489 return NULL;
9490 }
9491
/*
 * Map an (offset, size) range within a vectored UPL to the sub-UPL
 * containing it.  On success, *upl_offset is rewritten to be relative
 * to the sub-UPL and *upl_size is clipped to fit inside it.
 * Returns NULL if the UPL is not a valid vector, or if the matching
 * slot has already been committed/aborted.
 */
upl_t
vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
{
	if (vector_upl_is_valid(upl)) {
		uint32_t i = 0;
		vector_upl_t vector_upl = upl->vector_upl;

		if (vector_upl) {
			upl_t subupl = NULL;
			vector_upl_iostates_t subupl_state;

			/* scan iostates in order; first one whose range covers *upl_offset wins */
			for (i = 0; i < vector_upl->num_upls; i++) {
				subupl = vector_upl->upls[i].elem;
				subupl_state = vector_upl->upls[i].iostate;
				if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
					/* We could have been passed an offset/size pair that belongs
					 * to an UPL element that has already been committed/aborted.
					 * If so, return NULL.
					 */
					if (subupl == NULL) {
						return NULL;
					}
					/* clip the requested size to this sub-UPL's iostate window */
					if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
						if (*upl_size > subupl_state.size) {
							*upl_size = subupl_state.size;
						}
					}
					/* rebase the offset; only the first slot may start past it */
					if (*upl_offset >= subupl_state.offset) {
						*upl_offset -= subupl_state.offset;
					} else if (i) {
						panic("Vector UPL offset miscalculation");
					}
					return subupl;
				}
			}
		} else {
			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
		}
	}
	return NULL;
}
9534
9535 void
9536 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
9537 {
9538 *v_upl_submap = NULL;
9539
9540 if (vector_upl_is_valid(upl)) {
9541 vector_upl_t vector_upl = upl->vector_upl;
9542 if (vector_upl) {
9543 *v_upl_submap = vector_upl->submap;
9544 *submap_dst_addr = vector_upl->submap_dst_addr;
9545 } else {
9546 panic("vector_upl_get_submap was passed a non-vectored UPL");
9547 }
9548 } else {
9549 panic("vector_upl_get_submap was passed a null UPL");
9550 }
9551 }
9552
9553 void
9554 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
9555 {
9556 if (vector_upl_is_valid(upl)) {
9557 vector_upl_t vector_upl = upl->vector_upl;
9558 if (vector_upl) {
9559 vector_upl->submap = submap;
9560 vector_upl->submap_dst_addr = submap_dst_addr;
9561 } else {
9562 panic("vector_upl_get_submap was passed a non-vectored UPL");
9563 }
9564 } else {
9565 panic("vector_upl_get_submap was passed a NULL UPL");
9566 }
9567 }
9568
9569 void
9570 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
9571 {
9572 if (vector_upl_is_valid(upl)) {
9573 uint32_t i = 0;
9574 vector_upl_t vector_upl = upl->vector_upl;
9575
9576 if (vector_upl) {
9577 for (i = 0; i < vector_upl->num_upls; i++) {
9578 if (vector_upl->upls[i].elem == subupl) {
9579 break;
9580 }
9581 }
9582
9583 if (i == vector_upl->num_upls) {
9584 panic("setting sub-upl iostate when none exists");
9585 }
9586
9587 vector_upl->upls[i].iostate.offset = offset;
9588 if (size < PAGE_SIZE) {
9589 size = PAGE_SIZE;
9590 }
9591 vector_upl->upls[i].iostate.size = size;
9592 } else {
9593 panic("vector_upl_set_iostate was passed a non-vectored UPL");
9594 }
9595 } else {
9596 panic("vector_upl_set_iostate was passed a NULL UPL");
9597 }
9598 }
9599
9600 void
9601 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
9602 {
9603 if (vector_upl_is_valid(upl)) {
9604 uint32_t i = 0;
9605 vector_upl_t vector_upl = upl->vector_upl;
9606
9607 if (vector_upl) {
9608 for (i = 0; i < vector_upl->num_upls; i++) {
9609 if (vector_upl->upls[i].elem == subupl) {
9610 break;
9611 }
9612 }
9613
9614 if (i == vector_upl->num_upls) {
9615 panic("getting sub-upl iostate when none exists");
9616 }
9617
9618 *offset = vector_upl->upls[i].iostate.offset;
9619 *size = vector_upl->upls[i].iostate.size;
9620 } else {
9621 panic("vector_upl_get_iostate was passed a non-vectored UPL");
9622 }
9623 } else {
9624 panic("vector_upl_get_iostate was passed a NULL UPL");
9625 }
9626 }
9627
9628 void
9629 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
9630 {
9631 if (vector_upl_is_valid(upl)) {
9632 vector_upl_t vector_upl = upl->vector_upl;
9633 if (vector_upl) {
9634 if (index < vector_upl->num_upls) {
9635 *offset = vector_upl->upls[index].iostate.offset;
9636 *size = vector_upl->upls[index].iostate.size;
9637 } else {
9638 *offset = *size = 0;
9639 }
9640 } else {
9641 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
9642 }
9643 } else {
9644 panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
9645 }
9646 }
9647
/* Raw vector descriptor pointer (NULL for non-vectored UPLs). */
void *
upl_get_internal_vectorupl(upl_t upl)
{
	return upl->vector_upl;
}

/* Flat pagelist of a vectored UPL; caller must know upl is vectored. */
upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)
{
	return upl->vector_upl->pagelist;
}

/* Page list for any UPL: vector pagelist if vectored, else its own. */
upl_page_info_t *
upl_get_internal_page_list(upl_t upl)
{
	return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
}
9665
9666 void
9667 upl_clear_dirty(
9668 upl_t upl,
9669 boolean_t value)
9670 {
9671 if (value) {
9672 upl->flags |= UPL_CLEAR_DIRTY;
9673 } else {
9674 upl->flags &= ~UPL_CLEAR_DIRTY;
9675 }
9676 }
9677
9678 void
9679 upl_set_referenced(
9680 upl_t upl,
9681 boolean_t value)
9682 {
9683 upl_lock(upl);
9684 if (value) {
9685 upl->ext_ref_count++;
9686 } else {
9687 if (!upl->ext_ref_count) {
9688 panic("upl_set_referenced not %p", upl);
9689 }
9690 upl->ext_ref_count--;
9691 }
9692 upl_unlock(upl);
9693 }
9694
/*
 * Acquire exclusive map-address ownership of the UPL for the current
 * thread, sleeping (under the UPL lock) until any previous owner clears.
 */
void
upl_set_map_exclusive(upl_t upl)
{
	upl_lock(upl);
	/* loop: the wakeup may race with another claimant */
	while (upl->map_addr_owner) {
		upl->flags |= UPL_MAP_EXCLUSIVE_WAIT;
		upl_lock_sleep(upl, &upl->map_addr_owner, ctid_get_thread(upl->map_addr_owner));
	}
	/* owner is recorded as a compact thread id, not a thread_t */
	upl->map_addr_owner = thread_get_ctid(current_thread());
	upl_unlock(upl);
}
9706
/*
 * Release exclusive map-address ownership (must be held by the calling
 * thread) and wake any thread blocked in upl_set_map_exclusive().
 */
void
upl_clear_map_exclusive(upl_t upl)
{
	assert(upl->map_addr_owner == thread_get_ctid(current_thread()));
	upl_lock(upl);
	if (upl->flags & UPL_MAP_EXCLUSIVE_WAIT) {
		upl->flags &= ~UPL_MAP_EXCLUSIVE_WAIT;
		upl_wakeup(&upl->map_addr_owner);
	}
	upl->map_addr_owner = 0;
	upl_unlock(upl);
}
9719
9720 #if CONFIG_IOSCHED
/*
 * Record the disk block number and I/O size on every page of the UPL
 * covered by [upl_offset, upl_offset + io_size) so the I/O scheduler
 * can reprioritize the pages.  No-op unless expedite is supported.
 */
void
upl_set_blkno(
	upl_t upl,
	vm_offset_t upl_offset,
	int io_size,
	int64_t blkno)
{
	int i, j;
	if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
		return;
	}

	assert(upl->upl_reprio_info != 0);
	/* i: page index within the UPL; j: byte progress through io_size */
	for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
		UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
	}
}
9738 #endif
9739
9740 void inline
9741 memoryshot(unsigned int event, unsigned int control)
9742 {
9743 if (vm_debug_events) {
9744 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
9745 vm_page_active_count, vm_page_inactive_count,
9746 vm_page_free_count, vm_page_speculative_count,
9747 vm_page_throttled_count);
9748 } else {
9749 (void) event;
9750 (void) control;
9751 }
9752 }
9753
9754 #ifdef MACH_BSD
9755
/* Thin BSD-visible accessors over the upl_page_info_t macros. */
boolean_t
upl_device_page(upl_page_info_t *upl)
{
	return UPL_DEVICE_PAGE(upl);
}
boolean_t
upl_page_present(upl_page_info_t *upl, int index)
{
	return UPL_PAGE_PRESENT(upl, index);
}
boolean_t
upl_speculative_page(upl_page_info_t *upl, int index)
{
	return UPL_SPECULATIVE_PAGE(upl, index);
}
boolean_t
upl_dirty_page(upl_page_info_t *upl, int index)
{
	return UPL_DIRTY_PAGE(upl, index);
}
boolean_t
upl_valid_page(upl_page_info_t *upl, int index)
{
	return UPL_VALID_PAGE(upl, index);
}
ppnum_t
upl_phys_page(upl_page_info_t *upl, int index)
{
	return UPL_PHYS_PAGE(upl, index);
}

/* Caller-private mark bit on a page-info entry. */
void
upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
{
	upl[index].mark = v;
}

boolean_t
upl_page_get_mark(upl_page_info_t *upl, int index)
{
	return upl[index].mark;
}
9797 }
9798
/*
 * Debug helper: walk the inactive, throttled, anonymous and active page
 * queues and print counts of dirty, pageout-pending (free_when_done)
 * and precious pages.  Each queue is scanned under the page-queues lock.
 */
void
vm_countdirtypages(void)
{
	vm_page_t m;
	int dpages;    /* dirty pages */
	int pgopages;  /* pages marked free_when_done (pageout in flight) */
	int precpages; /* precious pages */


	dpages = 0;
	pgopages = 0;
	precpages = 0;

	/* pass 1: inactive queue */
	vm_page_lock_queues();
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
	do {
		if (m == (vm_page_t)0) {
			break;
		}

		if (m->vmp_dirty) {
			dpages++;
		}
		if (m->vmp_free_when_done) {
			pgopages++;
		}
		if (m->vmp_precious) {
			precpages++;
		}

		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
		if (m == (vm_page_t)0) {
			break;
		}
	} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
	vm_page_unlock_queues();

	/* pass 2: throttled queue — every page here is dirty by invariant */
	vm_page_lock_queues();
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
	do {
		if (m == (vm_page_t)0) {
			break;
		}

		dpages++;
		assert(m->vmp_dirty);
		assert(!m->vmp_free_when_done);
		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
		if (m == (vm_page_t)0) {
			break;
		}
	} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
	vm_page_unlock_queues();

	/* pass 3: anonymous queue */
	vm_page_lock_queues();
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
	do {
		if (m == (vm_page_t)0) {
			break;
		}

		if (m->vmp_dirty) {
			dpages++;
		}
		if (m->vmp_free_when_done) {
			pgopages++;
		}
		if (m->vmp_precious) {
			precpages++;
		}

		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
		if (m == (vm_page_t)0) {
			break;
		}
	} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
	vm_page_unlock_queues();

	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);

	/* pass 4: active queue, reported separately */
	dpages = 0;
	pgopages = 0;
	precpages = 0;

	vm_page_lock_queues();
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);

	do {
		if (m == (vm_page_t)0) {
			break;
		}
		if (m->vmp_dirty) {
			dpages++;
		}
		if (m->vmp_free_when_done) {
			pgopages++;
		}
		if (m->vmp_precious) {
			precpages++;
		}

		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
		if (m == (vm_page_t)0) {
			break;
		}
	} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
	vm_page_unlock_queues();

	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
}
9913 #endif /* MACH_BSD */
9914
9915
9916 #if CONFIG_IOSCHED
9917 int
9918 upl_get_cached_tier(upl_t upl)
9919 {
9920 assert(upl);
9921 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
9922 return upl->upl_priority;
9923 }
9924 return -1;
9925 }
9926 #endif /* CONFIG_IOSCHED */
9927
9928
9929 void
9930 upl_callout_iodone(upl_t upl)
9931 {
9932 struct upl_io_completion *upl_ctx = upl->upl_iodone;
9933
9934 if (upl_ctx) {
9935 void (*iodone_func)(void *, int) = upl_ctx->io_done;
9936
9937 assert(upl_ctx->io_done);
9938
9939 (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
9940 }
9941 }
9942
/* Attach (or detach, with NULL) an I/O-completion context to the UPL. */
void
upl_set_iodone(upl_t upl, void *upl_iodone)
{
	upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
}
9948
9949 void
9950 upl_set_iodone_error(upl_t upl, int error)
9951 {
9952 struct upl_io_completion *upl_ctx = upl->upl_iodone;
9953
9954 if (upl_ctx) {
9955 upl_ctx->io_error = error;
9956 }
9957 }
9958
9959
/* Highest physical page number covered by this UPL. */
ppnum_t
upl_get_highest_page(
	upl_t upl)
{
	return upl->highest_page;
}

/* UPL size rounded out to the kernel page size. */
upl_size_t
upl_get_size(
	upl_t upl)
{
	return upl_adjusted_size(upl, PAGE_MASK);
}

/*
 * Size of the UPL after aligning its start down and its end up to
 * the given page mask (supports sub-page u_offset/u_size).
 */
upl_size_t
upl_adjusted_size(
	upl_t upl,
	vm_map_offset_t pgmask)
{
	vm_object_offset_t start_offset, end_offset;

	start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
	end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);

	return (upl_size_t)(end_offset - start_offset);
}

/* UPL start offset aligned down to the given page mask. */
vm_object_offset_t
upl_adjusted_offset(
	upl_t upl,
	vm_map_offset_t pgmask)
{
	return trunc_page_mask_64(upl->u_offset, pgmask);
}

/* Sub-page displacement of the UPL's data within its first page. */
vm_object_offset_t
upl_get_data_offset(
	upl_t upl)
{
	return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
}
10001
/* Get/set the companion UPL linked to this one (may be NULL). */
upl_t
upl_associated_upl(upl_t upl)
{
	return upl->associated_upl;
}

void
upl_set_associated_upl(upl_t upl, upl_t associated_upl)
{
	upl->associated_upl = associated_upl;
}
10013
10014 struct vnode *
10015 upl_lookup_vnode(upl_t upl)
10016 {
10017 if (!upl->map_object->internal) {
10018 return vnode_pager_lookup_vnode(upl->map_object->pager);
10019 } else {
10020 return NULL;
10021 }
10022 }
10023
10024 #if UPL_DEBUG
/* Debug: stash two caller-chosen alias words on the UPL. */
kern_return_t
upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
{
	upl->ubc_alias1 = alias1;
	upl->ubc_alias2 = alias2;
	return KERN_SUCCESS;
}
/* Debug: read back the alias words; either out-pointer may be NULL. */
int
upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
{
	if (al) {
		*al = upl->ubc_alias1;
	}
	if (al2) {
		*al2 = upl->ubc_alias2;
	}
	return KERN_SUCCESS;
}
10043 #endif /* UPL_DEBUG */
10044
10045 #if VM_PRESSURE_EVENTS
10046 /*
10047 * Upward trajectory.
10048 */
10049
/*
 * VM pressure state-machine predicates.  Each returns TRUE when the
 * system should transition between the named pressure levels.  Without
 * an active compressor the decision is based on free-page thresholds
 * from memorystatus; with a compressor it is based on non-compressed
 * memory relative to the compactor/swap thresholds.
 */
boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)
{
	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		/* Available pages below our threshold */
		uint32_t available_pages = memorystatus_get_available_page_count();
		if (available_pages < memorystatus_get_soft_memlimit_page_shortage_threshold()) {
#if CONFIG_FREEZE
			/* No frozen processes to kill */
			if (memorystatus_frozen_count == 0) {
				/* Not enough suspended processes available. */
				if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
					return TRUE;
				}
			}
#else /* CONFIG_FREEZE */
			return TRUE;
#endif /* CONFIG_FREEZE */
		}
		return FALSE;
	} else {
		return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
	}
}

boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)
{
	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		/* Available pages below our threshold */
		uint32_t available_pages = memorystatus_get_available_page_count();
		return available_pages < memorystatus_get_critical_page_shortage_threshold();
	} else {
		/* 1.2x the unthrottle threshold provides hysteresis */
		return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
	}
}

/*
 * Downward trajectory.
 */
boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)
{
	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		/* Available pages above our threshold (15% headroom) */
		uint32_t available_pages = memorystatus_get_available_page_count();
		uint32_t target_threshold = (((115 * memorystatus_get_soft_memlimit_page_shortage_threshold()) / 100));
		return available_pages > target_threshold;
	} else {
		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
	}
}

boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)
{
	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		/* 15% headroom above the critical threshold */
		uint32_t available_pages = memorystatus_get_available_page_count();
		uint32_t target_threshold = (((115 * memorystatus_get_critical_page_shortage_threshold()) / 100));
		return available_pages > target_threshold;
	} else {
		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
	}
}
10114 #endif /* VM_PRESSURE_EVENTS */
10115
10116 #if DEVELOPMENT || DEBUG
bool compressor_running_perf_test;             /* TRUE while a compressor benchmark is in flight */
uint64_t compressor_perf_test_pages_processed; /* pages handled by the current/last benchmark */
10119
10120 static kern_return_t
10121 move_pages_to_queue(
10122 vm_map_t map,
10123 user_addr_t start_addr,
10124 size_t buffer_size,
10125 vm_page_queue_head_t *queue,
10126 size_t *pages_moved)
10127 {
10128 kern_return_t err = KERN_SUCCESS;
10129 vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
10130 boolean_t addr_in_map = FALSE;
10131 user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
10132 vm_object_t curr_object = VM_OBJECT_NULL;
10133 *pages_moved = 0;
10134
10135
10136 if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
10137 /*
10138 * We don't currently support benchmarking maps with a different page size
10139 * than the kernel.
10140 */
10141 return KERN_INVALID_ARGUMENT;
10142 }
10143
10144 if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
10145 return KERN_INVALID_ARGUMENT;
10146 }
10147
10148 vm_map_lock_read(map);
10149 curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
10150 end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
10151
10152
10153 while (curr_addr < end_addr) {
10154 addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
10155 if (!addr_in_map) {
10156 err = KERN_INVALID_ARGUMENT;
10157 break;
10158 }
10159 curr_object = VME_OBJECT(curr_entry);
10160 if (curr_object) {
10161 vm_object_lock(curr_object);
10162 /* We really only want anonymous memory that's in the top level map and object here. */
10163 if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
10164 curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
10165 err = KERN_INVALID_ARGUMENT;
10166 vm_object_unlock(curr_object);
10167 break;
10168 }
10169 vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
10170 vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
10171 (curr_entry->vme_start + VME_OFFSET(curr_entry));
10172 vm_map_offset_t curr_offset = start_offset;
10173 vm_page_t curr_page;
10174 while (curr_offset < end_offset) {
10175 curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
10176 if (curr_page != VM_PAGE_NULL) {
10177 vm_page_lock_queues();
10178 if (curr_page->vmp_laundry) {
10179 vm_pageout_steal_laundry(curr_page, TRUE);
10180 }
10181 /*
10182 * we've already factored out pages in the laundry which
10183 * means this page can't be on the pageout queue so it's
10184 * safe to do the vm_page_queues_remove
10185 */
10186 bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
10187 vm_page_queues_remove(curr_page, TRUE);
10188 if (donate) {
10189 /*
10190 * The compressor needs to see this bit to know
10191 * where this page needs to land. Also if stolen,
10192 * this bit helps put the page back in the right
10193 * special queue where it belongs.
10194 */
10195 curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
10196 }
10197 // Clear the referenced bit so we ensure this gets paged out
10198 curr_page->vmp_reference = false;
10199 if (curr_page->vmp_pmapped) {
10200 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
10201 VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
10202 }
10203 vm_page_queue_enter(queue, curr_page, vmp_pageq);
10204 vm_page_unlock_queues();
10205 *pages_moved += 1;
10206 }
10207 curr_offset += PAGE_SIZE_64;
10208 curr_addr += PAGE_SIZE_64;
10209 }
10210 }
10211 vm_object_unlock(curr_object);
10212 }
10213 vm_map_unlock_read(map);
10214 return err;
10215 }
10216
10217 /*
10218 * Local queue for processing benchmark pages.
10219 * Can't be allocated on the stack because the pointer has to
10220 * be packable.
10221 */
10222 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
10223 kern_return_t
10224 run_compressor_perf_test(
10225 user_addr_t buf,
10226 size_t buffer_size,
10227 uint64_t *time,
10228 uint64_t *bytes_compressed,
10229 uint64_t *compressor_growth)
10230 {
10231 kern_return_t err = KERN_SUCCESS;
10232 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10233 return KERN_NOT_SUPPORTED;
10234 }
10235 if (current_task() == kernel_task) {
10236 return KERN_INVALID_ARGUMENT;
10237 }
10238 vm_page_lock_queues();
10239 if (compressor_running_perf_test) {
10240 /* Only run one instance of the benchmark at a time. */
10241 vm_page_unlock_queues();
10242 return KERN_RESOURCE_SHORTAGE;
10243 }
10244 vm_page_unlock_queues();
10245 size_t page_count = 0;
10246 vm_map_t map;
10247 vm_page_t p, next;
10248 uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
10249 uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
10250 *bytes_compressed = *compressor_growth = 0;
10251
10252 vm_page_queue_init(&compressor_perf_test_queue);
10253 map = current_task()->map;
10254 err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
10255 if (err != KERN_SUCCESS) {
10256 goto out;
10257 }
10258
10259 vm_page_lock_queues();
10260 compressor_running_perf_test = true;
10261 compressor_perf_test_pages_processed = 0;
10262 /*
10263 * At this point the compressor threads should only process the benchmark queue
10264 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
10265 * to determine how many compressed bytes we ended up using.
10266 */
10267 compressed_bytes_start = os_atomic_load(&c_segment_compressed_bytes, relaxed);
10268 vm_page_unlock_queues();
10269
10270 page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);
10271
10272 vm_page_lock_queues();
10273 compressor_perf_test_start = mach_absolute_time();
10274
10275 // Wake up the compressor thread(s)
10276 sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
10277 pgo_iothread_internal_state[0].pgo_iothread);
10278
10279 /*
10280 * Depending on when this test is run we could overshoot or be right on the mark
10281 * with our page_count. So the comparison is of the _less than_ variety.
10282 */
10283 while (compressor_perf_test_pages_processed < page_count) {
10284 assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
10285 vm_page_unlock_queues();
10286 thread_block(THREAD_CONTINUE_NULL);
10287 vm_page_lock_queues();
10288 }
10289 compressor_perf_test_end = mach_absolute_time();
10290 compressed_bytes_end = os_atomic_load(&c_segment_compressed_bytes, relaxed);
10291 vm_page_unlock_queues();
10292
10293
10294 out:
10295 /*
10296 * If we errored out above, then we could still have some pages
10297 * on the local queue. Make sure to put them back on the active queue before
10298 * returning so they're not orphaned.
10299 */
10300 vm_page_lock_queues();
10301 absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
10302 p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
10303 while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
10304 next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);
10305
10306 vm_page_enqueue_active(p, FALSE);
10307 p = next;
10308 }
10309
10310 compressor_running_perf_test = false;
10311 vm_page_unlock_queues();
10312 if (err == KERN_SUCCESS) {
10313 *bytes_compressed = page_count * PAGE_SIZE_64;
10314 *compressor_growth = compressed_bytes_end - compressed_bytes_start;
10315 }
10316
10317 /*
10318 * pageout_scan will consider waking the compactor swapper
10319 * before it blocks. Do the same thing here before we return
10320 * to ensure that back to back benchmark runs can't overly fragment the
10321 * compressor pool.
10322 */
10323 vm_consider_waking_compactor_swapper();
10324 return err;
10325 }
10326 #endif /* DEVELOPMENT || DEBUG */
10327