1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67 #include <ptrauth.h>
68
69 #include <debug.h>
70
71 #include <mach/mach_types.h>
72 #include <mach/memory_object.h>
73 #include <mach/mach_host_server.h>
74 #include <mach/upl.h>
75 #include <mach/vm_map.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/sdt.h>
79
80 #include <kern/kern_types.h>
81 #include <kern/counter.h>
82 #include <kern/host_statistics.h>
83 #include <kern/machine.h>
84 #include <kern/misc_protos.h>
85 #include <kern/sched.h>
86 #include <kern/thread.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/policy_internal.h>
90 #include <kern/thread_group.h>
91
92 #include <sys/kdebug_triage.h>
93
94 #include <machine/vm_tuning.h>
95 #include <machine/commpage.h>
96
97 #include <vm/pmap.h>
98 #include <vm/vm_compressor_pager.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map_internal.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_page.h>
103 #include <vm/vm_pageout.h>
104 #include <vm/vm_protos.h> /* must be last */
105 #include <vm/memory_object.h>
106 #include <vm/vm_purgeable_internal.h>
107 #include <vm/vm_shared_region.h>
108 #include <vm/vm_compressor.h>
109
110 #include <san/kasan.h>
111
112 #if CONFIG_PHANTOM_CACHE
113 #include <vm/vm_phantom_cache.h>
114 #endif
115
116 #if UPL_DEBUG
117 #include <libkern/OSDebug.h>
118 #endif
119
120 extern int cs_debug;
121
122 extern void mbuf_drain(boolean_t);
123
124 #if VM_PRESSURE_EVENTS
125 #if CONFIG_JETSAM
126 extern unsigned int memorystatus_available_pages;
127 extern unsigned int memorystatus_available_pages_pressure;
128 extern unsigned int memorystatus_available_pages_critical;
129 #else /* CONFIG_JETSAM */
130 extern uint64_t memorystatus_available_pages;
131 extern uint64_t memorystatus_available_pages_pressure;
132 extern uint64_t memorystatus_available_pages_critical;
133 #endif /* CONFIG_JETSAM */
134
135 extern unsigned int memorystatus_frozen_count;
136 extern unsigned int memorystatus_suspended_count;
137 extern vm_pressure_level_t memorystatus_vm_pressure_level;
138
139 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
140 extern uint32_t memorystatus_jetsam_fg_band_waiters;
141
142 void vm_pressure_response(void);
143 extern void consider_vm_pressure_events(void);
144
145 #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
146 #endif /* VM_PRESSURE_EVENTS */
147
148 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
149 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
150 boolean_t vps_dynamic_priority_enabled = FALSE;
151 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
152
153 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
154 #if !XNU_TARGET_OS_OSX
155 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
156 #else /* !XNU_TARGET_OS_OSX */
157 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
158 #endif /* !XNU_TARGET_OS_OSX */
159 #endif
160
161 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
162 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
163 #endif
164
165 #ifndef VM_PAGE_LAUNDRY_MAX
166 #define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */
167 #endif /* VM_PAGEOUT_LAUNDRY_MAX */
168
169 #ifndef VM_PAGEOUT_BURST_WAIT
170 #define VM_PAGEOUT_BURST_WAIT 1 /* milliseconds */
171 #endif /* VM_PAGEOUT_BURST_WAIT */
172
173 #ifndef VM_PAGEOUT_EMPTY_WAIT
174 #define VM_PAGEOUT_EMPTY_WAIT 50 /* milliseconds */
175 #endif /* VM_PAGEOUT_EMPTY_WAIT */
176
177 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
178 #define VM_PAGEOUT_DEADLOCK_WAIT 100 /* milliseconds */
179 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
180
181 #ifndef VM_PAGEOUT_IDLE_WAIT
182 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
183 #endif /* VM_PAGEOUT_IDLE_WAIT */
184
185 #ifndef VM_PAGEOUT_SWAP_WAIT
186 #define VM_PAGEOUT_SWAP_WAIT 10 /* milliseconds */
187 #endif /* VM_PAGEOUT_SWAP_WAIT */
188
189
190 #ifndef VM_PAGE_SPECULATIVE_TARGET
191 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
192 #endif /* VM_PAGE_SPECULATIVE_TARGET */
193
194
195 /*
196 * To obtain a reasonable LRU approximation, the inactive queue
197 * needs to be large enough to give pages on it a chance to be
198 * referenced a second time. This macro defines the fraction
199 * of active+inactive pages that should be inactive.
200 * The pageout daemon uses it to update vm_page_inactive_target.
201 *
202 * If vm_page_free_count falls below vm_page_free_target and
203 * vm_page_inactive_count is below vm_page_inactive_target,
204 * then the pageout daemon starts running.
205 */
206
207 #ifndef VM_PAGE_INACTIVE_TARGET
208 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
209 #endif /* VM_PAGE_INACTIVE_TARGET */
210
211 /*
212 * Once the pageout daemon starts running, it keeps going
213 * until vm_page_free_count meets or exceeds vm_page_free_target.
214 */
215
216 #ifndef VM_PAGE_FREE_TARGET
217 #if !XNU_TARGET_OS_OSX
218 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
219 #else /* !XNU_TARGET_OS_OSX */
220 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
221 #endif /* !XNU_TARGET_OS_OSX */
222 #endif /* VM_PAGE_FREE_TARGET */
223
224
225 /*
226 * The pageout daemon always starts running once vm_page_free_count
227 * falls below vm_page_free_min.
228 */
229
230 #ifndef VM_PAGE_FREE_MIN
231 #if !XNU_TARGET_OS_OSX
232 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
233 #else /* !XNU_TARGET_OS_OSX */
234 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
235 #endif /* !XNU_TARGET_OS_OSX */
236 #endif /* VM_PAGE_FREE_MIN */
237
238 #if !XNU_TARGET_OS_OSX
239 #define VM_PAGE_FREE_RESERVED_LIMIT 100
240 #define VM_PAGE_FREE_MIN_LIMIT 1500
241 #define VM_PAGE_FREE_TARGET_LIMIT 2000
242 #else /* !XNU_TARGET_OS_OSX */
243 #define VM_PAGE_FREE_RESERVED_LIMIT 1700
244 #define VM_PAGE_FREE_MIN_LIMIT 3500
245 #define VM_PAGE_FREE_TARGET_LIMIT 4000
246 #endif /* !XNU_TARGET_OS_OSX */
247
248 /*
249 * When vm_page_free_count falls below vm_page_free_reserved,
250 * only vm-privileged threads can allocate pages. vm-privilege
251 * allows the pageout daemon and default pager (and any other
252 * associated threads needed for default pageout) to continue
253 * operation by dipping into the reserved pool of pages.
254 */
255
256 #ifndef VM_PAGE_FREE_RESERVED
257 #define VM_PAGE_FREE_RESERVED(n) \
258 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
259 #endif /* VM_PAGE_FREE_RESERVED */
260
261 /*
262 * When we dequeue pages from the inactive list, they are
263 * reactivated (ie, put back on the active queue) if referenced.
264 * However, it is possible to starve the free list if other
265 * processors are referencing pages faster than we can turn off
266 * the referenced bit. So we limit the number of reactivations
267 * we will make per call of vm_pageout_scan().
268 */
269 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
270
271 #ifndef VM_PAGE_REACTIVATE_LIMIT
272 #if !XNU_TARGET_OS_OSX
273 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
274 #else /* !XNU_TARGET_OS_OSX */
275 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
276 #endif /* !XNU_TARGET_OS_OSX */
277 #endif /* VM_PAGE_REACTIVATE_LIMIT */
278 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
279
280 extern boolean_t hibernate_cleaning_in_progress;
281
282 struct cq ciq[MAX_COMPRESSOR_THREAD_COUNT];
283
284 #if VM_PRESSURE_EVENTS
285 void vm_pressure_thread(void);
286
287 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
288 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
289
290 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
291 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
292 #endif
293
294 static void vm_pageout_iothread_external(struct cq *, wait_result_t);
295 static void vm_pageout_iothread_internal(struct cq *, wait_result_t);
296 static void vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *, boolean_t);
297
298 extern void vm_pageout_continue(void);
299 extern void vm_pageout_scan(void);
300
301 boolean_t vm_pageout_running = FALSE;
302
303 uint32_t vm_page_upl_tainted = 0;
304 uint32_t vm_page_iopl_tainted = 0;
305
306 #if XNU_TARGET_OS_OSX
307 static boolean_t vm_pageout_waiter = FALSE;
308 #endif /* XNU_TARGET_OS_OSX */
309
310
311 #if DEVELOPMENT || DEBUG
312 struct vm_pageout_debug vm_pageout_debug;
313 #endif
314 struct vm_pageout_vminfo vm_pageout_vminfo;
315 struct vm_pageout_state vm_pageout_state;
316 struct vm_config vm_config;
317
318 struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
319 struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
320 #if DEVELOPMENT || DEBUG
321 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
322 #endif /* DEVELOPMENT || DEBUG */
323
324 int vm_upl_wait_for_pages = 0;
325 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
326
327 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
328
329 int vm_debug_events = 0;
330
331 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
332
333 #if CONFIG_MEMORYSTATUS
334 extern boolean_t memorystatus_kill_on_VM_page_shortage(void);
335
336 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
337 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
338
339 #endif
340
341 #if __AMP__
342 int vm_compressor_ebound = 1;
343 int vm_pgo_pbound = 0;
344 extern void thread_bind_cluster_type(thread_t, char, bool);
345 #endif /* __AMP__ */
346
347
348 /*
349 * Routine: vm_pageout_object_terminate
350 * Purpose:
351 * Destroy the pageout_object, and perform all of the
352 * required cleanup actions.
353 *
354 * In/Out conditions:
355 * The object must be locked, and will be returned locked.
356 */
void
vm_pageout_object_terminate(
	vm_object_t object)
{
	vm_object_t shadow_object;

	/*
	 * Deal with the deallocation (last reference) of a pageout object
	 * (used for cleaning-in-place) by dropping the paging references/
	 * freeing pages in the original object.
	 */

	assert(object->pageout);
	shadow_object = object->shadow;
	vm_object_lock(shadow_object);

	/* walk every fictitious "pageout" page still resident in the object */
	while (!vm_page_queue_empty(&object->memq)) {
		vm_page_t p, m;
		vm_object_offset_t offset;

		p = (vm_page_t) vm_page_queue_first(&object->memq);

		/* p is the private shadow created by vm_pageclean_setup() */
		assert(p->vmp_private);
		assert(p->vmp_free_when_done);
		p->vmp_free_when_done = FALSE;
		assert(!p->vmp_cleaning);
		assert(!p->vmp_laundry);

		offset = p->vmp_offset;
		VM_PAGE_FREE(p);
		p = VM_PAGE_NULL;

		/* find the real page being cleaned, in the shadow object */
		m = vm_page_lookup(shadow_object,
		    offset + object->vo_shadow_offset);

		if (m == VM_PAGE_NULL) {
			continue;
		}

		assert((m->vmp_dirty) || (m->vmp_precious) ||
		    (m->vmp_busy && m->vmp_cleaning));

		/*
		 * Handle the trusted pager throttle.
		 * Also decrement the burst throttle (if external).
		 */
		vm_page_lock_queues();
		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
			vm_pageout_throttle_up(m);
		}

		/*
		 * Handle the "target" page(s). These pages are to be freed if
		 * successfully cleaned. Target pages are always busy, and are
		 * wired exactly once. The initial target pages are not mapped,
		 * (so cannot be referenced or modified) but converted target
		 * pages may have been modified between the selection as an
		 * adjacent page and conversion to a target.
		 */
		if (m->vmp_free_when_done) {
			assert(m->vmp_busy);
			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
			assert(m->vmp_wire_count == 1);
			m->vmp_cleaning = FALSE;
			m->vmp_free_when_done = FALSE;
			/*
			 * Revoke all access to the page. Since the object is
			 * locked, and the page is busy, this prevents the page
			 * from being dirtied after the pmap_disconnect() call
			 * returns.
			 *
			 * Since the page is left "dirty" but "not modifed", we
			 * can detect whether the page was redirtied during
			 * pageout by checking the modify state.
			 */
			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			} else {
				m->vmp_dirty = FALSE;
			}

			if (m->vmp_dirty) {
				/* page was redirtied during I/O: keep it resident */
				vm_page_unwire(m, TRUE);        /* reactivates */
				counter_inc(&vm_statistics_reactivations);
				PAGE_WAKEUP_DONE(m);
			} else {
				/* clean pageout succeeded: release the page */
				vm_page_free(m);        /* clears busy, etc. */
			}
			vm_page_unlock_queues();
			continue;
		}
		/*
		 * Handle the "adjacent" pages. These pages were cleaned in
		 * place, and should be left alone.
		 * If prep_pin_count is nonzero, then someone is using the
		 * page, so make it active.
		 */
		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
			if (m->vmp_reference) {
				vm_page_activate(m);
			} else {
				vm_page_deactivate(m);
			}
		}
		if (m->vmp_overwriting) {
			/*
			 * the (COPY_OUT_FROM == FALSE) request_page_list case
			 */
			if (m->vmp_busy) {
				/*
				 * We do not re-set m->vmp_dirty !
				 * The page was busy so no extraneous activity
				 * could have occurred. COPY_INTO is a read into the
				 * new pages. CLEAN_IN_PLACE does actually write
				 * out the pages but handling outside of this code
				 * will take care of resetting dirty. We clear the
				 * modify however for the Programmed I/O case.
				 */
				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

				m->vmp_busy = FALSE;
				m->vmp_absent = FALSE;
			} else {
				/*
				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
				 * Occurs when the original page was wired
				 * at the time of the list request
				 */
				assert(VM_PAGE_WIRED(m));
				vm_page_unwire(m, TRUE);        /* reactivates */
			}
			m->vmp_overwriting = FALSE;
		} else {
			m->vmp_dirty = FALSE;
		}
		m->vmp_cleaning = FALSE;

		/*
		 * Wakeup any thread waiting for the page to be un-cleaning.
		 */
		PAGE_WAKEUP(m);
		vm_page_unlock_queues();
	}
	/*
	 * Account for the paging reference taken in vm_paging_object_allocate.
	 */
	vm_object_activity_end(shadow_object);
	vm_object_unlock(shadow_object);

	/* the pageout object itself must now be fully drained */
	assert(object->ref_count == 0);
	assert(object->paging_in_progress == 0);
	assert(object->activity_in_progress == 0);
	assert(object->resident_page_count == 0);
	return;
}
512
513 /*
514 * Routine: vm_pageclean_setup
515 *
516 * Purpose: setup a page to be cleaned (made non-dirty), but not
517 * necessarily flushed from the VM page cache.
518 * This is accomplished by cleaning in place.
519 *
520 * The page must not be busy, and new_object
521 * must be locked.
522 *
523 */
static void
vm_pageclean_setup(
	vm_page_t m,
	vm_page_t new_m,
	vm_object_t new_object,
	vm_object_offset_t new_offset)
{
	assert(!m->vmp_busy);
#if 0
	assert(!m->vmp_cleaning);
#endif

	/*
	 * Flush the hardware "modified" state first, so that any
	 * re-dirtying after this point is observable by the pageout
	 * completion path (see vm_pageout_object_terminate()).
	 */
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

	/*
	 * Mark original page as cleaning in place.
	 */
	m->vmp_cleaning = TRUE;
	SET_PAGE_DIRTY(m, FALSE);
	m->vmp_precious = FALSE;

	/*
	 * Convert the fictitious page to a private shadow of
	 * the real page.
	 */
	assert(new_m->vmp_fictitious);
	assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
	new_m->vmp_fictitious = FALSE;
	new_m->vmp_private = TRUE;
	/* the shadow page carries the obligation to free the real page */
	new_m->vmp_free_when_done = TRUE;
	VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));

	/* wire the shadow so it cannot itself be selected for pageout */
	vm_page_lockspin_queues();
	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
	vm_page_unlock_queues();

	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
	assert(!new_m->vmp_wanted);
	/* un-busy: the shadow page is now fully set up */
	new_m->vmp_busy = FALSE;
}
564
565 /*
566 * Routine: vm_pageout_initialize_page
567 * Purpose:
568 * Causes the specified page to be initialized in
569 * the appropriate memory object. This routine is used to push
570 * pages into a copy-object when they are modified in the
571 * permanent object.
572 *
573 * The page is moved to a temporary object and paged out.
574 *
575 * In/out conditions:
576 * The page in question must not be on any pageout queues.
577 * The object to which it belongs must be locked.
578 * The page must be busy, but not hold a paging reference.
579 *
580 * Implementation:
581 * Move this page to a completely new object.
582 */
void
vm_pageout_initialize_page(
	vm_page_t m)
{
	vm_object_t object;
	vm_object_offset_t paging_offset;
	memory_object_t pager;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	object = VM_PAGE_OBJECT(m);

	/* caller guarantees: page busy, internal object locked (see header) */
	assert(m->vmp_busy);
	assert(object->internal);

	/*
	 * Verify that we really want to clean this page
	 */
	assert(!m->vmp_absent);
	assert(m->vmp_dirty);

	/*
	 * Create a paging reference to let us play with the object.
	 */
	paging_offset = m->vmp_offset + object->paging_offset;

	if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
		panic("reservation without pageout?"); /* alan */

		/* NOTREACHED: panic() does not return; cleanup kept for documentation */
		VM_PAGE_FREE(m);
		vm_object_unlock(object);

		return;
	}

	/*
	 * If there's no pager, then we can't clean the page. This should
	 * never happen since this should be a copy object and therefore not
	 * an external object, so the pager should always be there.
	 */

	pager = object->pager;

	if (pager == MEMORY_OBJECT_NULL) {
		panic("missing pager for copy object");

		/* NOTREACHED */
		VM_PAGE_FREE(m);
		return;
	}

	/*
	 * set the page for future call to vm_fault_list_request
	 */
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
	SET_PAGE_DIRTY(m, FALSE);

	/*
	 * keep the object from collapsing or terminating
	 */
	vm_object_paging_begin(object);
	/* drop the object lock across the (potentially blocking) pager call */
	vm_object_unlock(object);

	/*
	 * Write the data to its pager.
	 * Note that the data is passed by naming the new object,
	 * not a virtual address; the pager interface has been
	 * manipulated to use the "internal memory" data type.
	 * [The object reference from its allocation is donated
	 * to the eventual recipient.]
	 */
	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);

	/* reacquire the lock and drop the paging reference taken above */
	vm_object_lock(object);
	vm_object_paging_end(object);
}
658
659
660 /*
661 * vm_pageout_cluster:
662 *
663 * Given a page, queue it to the appropriate I/O thread,
664 * which will page it out and attempt to clean adjacent pages
665 * in the same operation.
666 *
667 * The object and queues must be locked. We will take a
668 * paging reference to prevent deallocation or collapse when we
669 * release the object lock back at the call site. The I/O thread
670 * is responsible for consuming this reference
671 *
672 * The page must not be on any pageout queue.
673 */
#if DEVELOPMENT || DEBUG
/* aggregate statistics for the compressor I/O threads (debug builds only) */
vmct_stats_t vmct_stats;

/* count of compressor threads currently active */
int32_t vmct_active = 0;
/* NOTE(review): presumably bound the current compressor activity epoch — confirm */
uint64_t vm_compressor_epoch_start = 0;
uint64_t vm_compressor_epoch_stop = 0;

/* per-thread lifecycle state for each compressor thread */
typedef enum vmct_state_t {
	VMCT_IDLE,
	VMCT_AWAKENED,
	VMCT_ACTIVE,
} vmct_state_t;
vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
#endif
688
689
690
/*
 * Queue page "m" on pageout queue "q" and wake the matching I/O thread
 * if it is idle.  See the block comment above ("vm_pageout_cluster:")
 * for the full locking contract: object lock and page-queues lock held,
 * page dirty/precious, not wired, not already on a pageout queue.
 */
static void
vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
{
	event_t wakeup_event;
	vm_object_t object = VM_PAGE_OBJECT(m);

	VM_PAGE_CHECK(m);
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	/*
	 * Make sure it's OK to page this out.
	 */
	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
	assert(!m->vmp_cleaning && !m->vmp_laundry);
	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);

	/*
	 * protect the object from collapse or termination
	 * (the I/O thread is responsible for dropping this activity ref)
	 */
	vm_object_activity_begin(object);

	/*
	 * NOTE(review): the wakeup event is chosen from the global
	 * internal/external queues based on the object type, not derived
	 * from "q" — confirm this is intended for non-default queues
	 * (e.g. the benchmark queue on DEVELOPMENT builds).
	 */
	if (object->internal == TRUE) {
		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

		/* internal pages are handed to the compressor busy */
		m->vmp_busy = TRUE;
		wakeup_event = (event_t) &(vm_pageout_queue_internal.pgo_pending);
	} else {
		wakeup_event = (event_t) &(vm_pageout_queue_external.pgo_pending);
	}

	/*
	 * pgo_laundry count is tied to the laundry bit
	 */
	m->vmp_laundry = TRUE;
	q->pgo_laundry++;

	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);

	/* kick the I/O thread only if it went idle */
	if (q->pgo_idle == TRUE) {
		q->pgo_idle = FALSE;
		thread_wakeup(wakeup_event);
	}
	VM_PAGE_CHECK(m);
}
737
738 void
vm_pageout_cluster(vm_page_t m)739 vm_pageout_cluster(vm_page_t m)
740 {
741 struct vm_pageout_queue *q;
742 vm_object_t object = VM_PAGE_OBJECT(m);
743 if (object->internal) {
744 q = &vm_pageout_queue_internal;
745 } else {
746 q = &vm_pageout_queue_external;
747 }
748 vm_pageout_cluster_to_queue(m, q);
749 }
750
751
752 /*
753 * A page is back from laundry or we are stealing it back from
754 * the laundering state. See if there are some pages waiting to
755 * go to laundry and if we can let some of them go now.
756 *
757 * Object and page queues must be locked.
758 */
void
vm_pageout_throttle_up(
	vm_page_t m)
{
	struct vm_pageout_queue *q;
	vm_object_t m_object;

	m_object = VM_PAGE_OBJECT(m);

	assert(m_object != VM_OBJECT_NULL);
	assert(m_object != kernel_object);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(m_object);

	/* pick the pageout queue matching the page's object type */
	if (m_object->internal == TRUE) {
		q = &vm_pageout_queue_internal;
	} else {
		q = &vm_pageout_queue_external;
	}

	/*
	 * If the page is still waiting on the pageout queue, we are
	 * stealing it back: unlink it and drop the activity reference
	 * taken in vm_pageout_cluster_to_queue().
	 */
	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
		m->vmp_q_state = VM_PAGE_NOT_ON_Q;

		VM_PAGE_ZERO_PAGEQ_ENTRY(m);

		vm_object_activity_end(m_object);

		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
	}
	/*
	 * Credit the laundry slot back and wake anyone throttled on it.
	 */
	if (m->vmp_laundry == TRUE) {
		m->vmp_laundry = FALSE;
		q->pgo_laundry--;

		if (q->pgo_throttled == TRUE) {
			q->pgo_throttled = FALSE;
			thread_wakeup((event_t) &q->pgo_laundry);
		}
		/*
		 * Drain waiters sleep on a distinct event: the laundry
		 * counter's address + 1.
		 */
		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
			q->pgo_draining = FALSE;
			thread_wakeup((event_t) (&q->pgo_laundry + 1));
		}
		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
	}
}
805
806
807 static void
vm_pageout_throttle_up_batch(struct vm_pageout_queue * q,int batch_cnt)808 vm_pageout_throttle_up_batch(
809 struct vm_pageout_queue *q,
810 int batch_cnt)
811 {
812 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
813
814 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
815
816 q->pgo_laundry -= batch_cnt;
817
818 if (q->pgo_throttled == TRUE) {
819 q->pgo_throttled = FALSE;
820 thread_wakeup((event_t) &q->pgo_laundry);
821 }
822 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
823 q->pgo_draining = FALSE;
824 thread_wakeup((event_t) (&q->pgo_laundry + 1));
825 }
826 }
827
828
829
830 /*
831 * VM memory pressure monitoring.
832 *
833 * vm_pageout_scan() keeps track of the number of pages it considers and
834 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
835 *
836 * compute_memory_pressure() is called every second from compute_averages()
837 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
838 * of recalimed pages in a new vm_pageout_stat[] bucket.
839 *
840 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
841 * The caller provides the number of seconds ("nsecs") worth of statistics
842 * it wants, up to 30 seconds.
843 * It computes the number of pages reclaimed in the past "nsecs" seconds and
844 * also returns the number of pages the system still needs to reclaim at this
845 * moment in time.
846 */
/*
 * VM memory pressure monitoring: ring of per-sample statistics buckets.
 *
 * Samples are taken 8 times per second; DEVELOPMENT/DEBUG kernels keep
 * 30 seconds of history, release kernels keep 1 second, plus one bucket
 * that is currently being accumulated.
 *
 * VM_PAGEOUT_STAT_SIZE is fully parenthesized so it expands safely
 * inside larger expressions (the previous definition ended in an
 * unparenthesized "+ 1", a classic macro-expansion hazard).
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif
struct vm_pageout_stat {
	/* queue population snapshot at sample time */
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	/* page grab/free activity during the sample interval */
	unsigned int pages_grabbed;
	unsigned int pages_freed;

	/* compressor activity */
	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	/* pageout-scan work accounting */
	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	/* pages freed, broken out by source (summed for the pressure metric) */
	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	/* phantom-cache activity (see CONFIG_PHANTOM_CACHE) */
	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0}, };     /* static storage: fully zero-initialized */

/* index of the bucket currently accumulating samples */
unsigned int vm_pageout_stat_now = 0;

/* ring arithmetic: previous / next bucket index with wrap-around */
#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
911
912 #if VM_PAGE_BUCKETS_CHECK
913 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
914 #endif /* VM_PAGE_BUCKETS_CHECK */
915
916
917 void
918 record_memory_pressure(void);
919 void
record_memory_pressure(void)920 record_memory_pressure(void)
921 {
922 unsigned int vm_pageout_next;
923
924 #if VM_PAGE_BUCKETS_CHECK
925 /* check the consistency of VM page buckets at regular interval */
926 static int counter = 0;
927 if ((++counter % vm_page_buckets_check_interval) == 0) {
928 vm_page_buckets_check();
929 }
930 #endif /* VM_PAGE_BUCKETS_CHECK */
931
932 vm_pageout_state.vm_memory_pressure =
933 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
934 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
935 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
936 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
937
938 commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
939
940 /* move "now" forward */
941 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
942
943 bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
944
945 vm_pageout_stat_now = vm_pageout_next;
946 }
947
948
949 /*
950 * IMPORTANT
951 * mach_vm_ctl_page_free_wanted() is called indirectly, via
952 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
953 * it must be safe in the restricted stackshot context. Locks and/or
954 * blocking are not allowable.
955 */
956 unsigned int
mach_vm_ctl_page_free_wanted(void)957 mach_vm_ctl_page_free_wanted(void)
958 {
959 unsigned int page_free_target, page_free_count, page_free_wanted;
960
961 page_free_target = vm_page_free_target;
962 page_free_count = vm_page_free_count;
963 if (page_free_target > page_free_count) {
964 page_free_wanted = page_free_target - page_free_count;
965 } else {
966 page_free_wanted = 0;
967 }
968
969 return page_free_wanted;
970 }
971
972
973 /*
974 * IMPORTANT:
975 * mach_vm_pressure_monitor() is called when taking a stackshot, with
976 * wait_for_pressure FALSE, so that code path must remain safe in the
977 * restricted stackshot context. No blocking or locks are allowable.
978 * on that code path.
979 */
980
/*
 * If "wait_for_pressure" is TRUE, block (interruptibly) until there is
 * memory pressure.  Then report, via the optional out-parameters, the
 * number of pages reclaimed over the last "nsecs_monitored" seconds and
 * the number of pages the system still wants to reclaim right now.
 * Returns KERN_ABORTED if the wait was interrupted.
 */
kern_return_t
mach_vm_pressure_monitor(
	boolean_t wait_for_pressure,
	unsigned int nsecs_monitored,
	unsigned int *pages_reclaimed_p,
	unsigned int *pages_wanted_p)
{
	wait_result_t wr;
	unsigned int vm_pageout_then, vm_pageout_now;
	unsigned int pages_reclaimed;
	unsigned int units_of_monitor;

	/* statistics buckets cover 1/8 second each */
	units_of_monitor = 8 * nsecs_monitored;
	/*
	 * We don't take the vm_page_queue_lock here because we don't want
	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
	 * thread when it's trying to reclaim memory. We don't need fully
	 * accurate monitoring anyway...
	 */

	if (wait_for_pressure) {
		/* wait until there's memory pressure */
		while (vm_page_free_count >= vm_page_free_target) {
			wr = assert_wait((event_t) &vm_page_free_wanted,
			    THREAD_INTERRUPTIBLE);
			if (wr == THREAD_WAITING) {
				wr = thread_block(THREAD_CONTINUE_NULL);
			}
			if (wr == THREAD_INTERRUPTED) {
				return KERN_ABORTED;
			}
			if (wr == THREAD_AWAKENED) {
				/*
				 * The memory pressure might have already
				 * been relieved but let's not block again
				 * and let's report that there was memory
				 * pressure at some point.
				 */
				break;
			}
		}
	}

	/* provide the number of pages the system wants to reclaim */
	if (pages_wanted_p != NULL) {
		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
	}

	if (pages_reclaimed_p == NULL) {
		return KERN_SUCCESS;
	}

	/* provide number of pages reclaimed in the last "nsecs_monitored" */
	vm_pageout_now = vm_pageout_stat_now;
	pages_reclaimed = 0;
	/*
	 * Walk backwards through the statistics ring, summing the freed
	 * counters, stopping after units_of_monitor buckets or when the
	 * walk wraps back to the current bucket.
	 */
	for (vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
	    vm_pageout_then != vm_pageout_now &&
	    units_of_monitor-- != 0;
	    vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
	}
	*pages_reclaimed_p = pages_reclaimed;

	return KERN_SUCCESS;
}
1051
1052
1053
1054 #if DEVELOPMENT || DEBUG
1055
/* forward declaration — definition below */
static void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);

/*
 * condition variable used to make sure there is
 * only a single sweep going on at a time
 */
boolean_t vm_pageout_disconnect_all_pages_active = FALSE;
1064
1065
/*
 * Disconnect (pmap-unmap) every page on the throttled, anonymous and
 * active queues (DEVELOPMENT/DEBUG only).  At most one sweep may run at
 * a time: the vm_pageout_disconnect_all_pages_active flag is tested and
 * set under the page queues lock; a concurrent caller returns at once.
 */
void
vm_pageout_disconnect_all_pages()
{
	vm_page_lock_queues();

	if (vm_pageout_disconnect_all_pages_active == TRUE) {
		/* another sweep is already in progress */
		vm_page_unlock_queues();
		return;
	}
	vm_pageout_disconnect_all_pages_active = TRUE;
	vm_page_unlock_queues();

	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);

	/* only the thread that set the flag clears it, so no lock needed here */
	vm_pageout_disconnect_all_pages_active = FALSE;
}
1084
1085
/*
 * Walk up to 'qcount' pages of queue 'q' and pmap-disconnect each one
 * that is safe to touch (object alive; page not busy/cleaning/laundry/
 * absent/errored/free-pending).  Every examined page is rotated to the
 * other end of the queue so the scan terminates.  Emits kdebug start/end
 * tracepoints with disconnect/lock/pause counters.
 *
 * Locking: runs under the page queues lock, taking per-object locks via
 * trylock only; on trylock failure it pauses (dropping the queues lock)
 * and retries, giving up on the page after 20 failed attempts.
 */
void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t m;
	vm_object_t t_object = NULL;    /* last object whose trylock failed */
	vm_object_t l_object = NULL;    /* object we currently hold locked */
	vm_object_t m_object = NULL;    /* object of the candidate page */
	int delayed_unlock = 0;
	int try_failed_count = 0;
	int disconnected_count = 0;
	int paused_count = 0;
	int object_locked_count = 0;

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
	    q, qcount, 0, 0, 0);

	vm_page_lock_queues();

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					/* give up on this page; rotate it and move on */
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				paused_count++;

				t_object = m_object;
				continue;
			}
			object_locked_count++;

			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			/* drop all pmap mappings of this page */
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

			disconnected_count++;
		}
reenter_pg_on_q:
		/* rotate the page to the tail so the scan makes progress */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);

		qcount--;
		try_failed_count = 0;

		/* periodically yield the page queues lock to avoid hogging it */
		if (delayed_unlock++ > 128) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
	    q, disconnected_count, object_locked_count, paused_count, 0);
}
1190
1191 extern char* proc_best_name(struct proc* proc);
1192
1193 int
vm_toggle_task_selfdonate_pages(task_t task)1194 vm_toggle_task_selfdonate_pages(task_t task)
1195 {
1196 int state = 0;
1197 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1198 printf("VM Donation mode is OFF on the system\n");
1199 return state;
1200 }
1201 if (task != kernel_task) {
1202 task_lock(task);
1203 if (!task->donates_own_pages) {
1204 printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1205 task->donates_own_pages = true;
1206 state = 1;
1207 } else if (task->donates_own_pages) {
1208 printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1209 task->donates_own_pages = false;
1210 state = 0;
1211 }
1212 task_unlock(task);
1213 }
1214 return state;
1215 }
1216 #endif /* DEVELOPMENT || DEBUG */
1217
/*
 * Set or clear the "donates own pages" policy bit on a task.
 * Donation must be enabled system-wide and 'task' must not be the
 * kernel task (both asserted).  The bit is updated under the task lock.
 */
void
vm_task_set_selfdonate_pages(task_t task, bool donate)
{
	assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
	assert(task != kernel_task);

	task_lock(task);
	task->donates_own_pages = donate;
	task_unlock(task);
}
1228
1229
1230
/* forward declaration — definition below */
static size_t
vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);

/*
 * condition variable used to make sure there is
 * only a single sweep going on at a time
 */
boolean_t vm_pageout_anonymous_pages_active = FALSE;
1239
1240
1241 void
vm_pageout_anonymous_pages()1242 vm_pageout_anonymous_pages()
1243 {
1244 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1245 vm_page_lock_queues();
1246
1247 if (vm_pageout_anonymous_pages_active == TRUE) {
1248 vm_page_unlock_queues();
1249 return;
1250 }
1251 vm_pageout_anonymous_pages_active = TRUE;
1252 vm_page_unlock_queues();
1253
1254 vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1255 vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1256 vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1257
1258 if (VM_CONFIG_SWAP_IS_PRESENT) {
1259 vm_consider_swapping();
1260 }
1261
1262 vm_page_lock_queues();
1263 vm_pageout_anonymous_pages_active = FALSE;
1264 vm_page_unlock_queues();
1265 }
1266 }
1267
1268
/*
 * Sweep up to 'qcount' pages from queue 'q' into the internal pageout
 * queue (headed for the compressor).  Clean, unreferenced, unpmapped
 * pages are freed outright; referenced or otherwise untouchable pages
 * are rotated back onto 'q'.  With 'perf_test' true (DEVELOPMENT/DEBUG)
 * pages go to the benchmark queue and are only removed from 'q', not
 * from all queues.  Returns the number of pages handed to the pageout
 * queue.
 *
 * Locking: takes the page queues lock and per-object locks (trylock
 * only, with mutex_pause backoff) internally; waits for the internal
 * queue to drain when it is throttled.
 */
size_t
vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
{
	vm_page_t m;
	vm_object_t t_object = NULL;    /* last object whose trylock failed */
	vm_object_t l_object = NULL;    /* object we currently hold locked */
	vm_object_t m_object = NULL;    /* object of the candidate page */
	int delayed_unlock = 0;
	int try_failed_count = 0;
	int refmod_state;
	int pmap_options;
	struct vm_pageout_queue *iq;
	ppnum_t phys_page;
	size_t pages_moved = 0;


	iq = &vm_pageout_queue_internal;

	vm_page_lock_queues();

#if DEVELOPMENT || DEBUG
	if (perf_test) {
		iq = &vm_pageout_queue_benchmark;
	}
#endif /* DEVELOPMENT ||DEBUG */

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		if (VM_PAGE_Q_THROTTLED(iq)) {
			/* target queue is full: sleep until it drains */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			iq->pgo_draining = TRUE;

			/*
			 * NOTE(review): &pgo_laundry + 1 is the event the
			 * drainer is expected to wake; the wakeup site is
			 * not visible in this chunk — confirm it matches.
			 */
			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
			vm_page_unlock_queues();

			thread_block(THREAD_CONTINUE_NULL);

			vm_page_lock_queues();
			delayed_unlock = 0;
			continue;
		}
		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/* only internal (anonymous) objects are eligible */
			if (!m_object->internal) {
				goto reenter_pg_on_q;
			}

			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				t_object = m_object;
				continue;
			}
			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(m);

		/* pull up-to-date ref/mod state from the pmap layer */
		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(phys_page);

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}
		if (m->vmp_reference == TRUE) {
			/* recently referenced: clear the bit and give it another pass */
			m->vmp_reference = FALSE;
			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			if (m->vmp_dirty || m->vmp_precious) {
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		if (!m->vmp_dirty && !m->vmp_precious) {
			/* clean and expendable: free it now */
			vm_page_unlock_queues();
			VM_PAGE_FREE(m);
			vm_page_lock_queues();
			delayed_unlock = 0;

			goto next_pg;
		}
		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
			if (!m_object->pager_initialized) {
				vm_page_unlock_queues();

				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);

				if (!m_object->pager_initialized) {
					vm_object_compressor_pager_create(m_object);
				}

				vm_page_lock_queues();
				delayed_unlock = 0;
			}
			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
				goto reenter_pg_on_q;
			}
			/*
			 * vm_object_compressor_pager_create will drop the object lock
			 * which means 'm' may no longer be valid to use
			 */
			continue;
		}

		if (!perf_test) {
			/*
			 * we've already factored out pages in the laundry which
			 * means this page can't be on the pageout queue so it's
			 * safe to do the vm_page_queues_remove
			 */
			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			vm_page_queues_remove(m, TRUE);
			if (donate) {
				/*
				 * The compressor needs to see this bit to know
				 * where this page needs to land. Also if stolen,
				 * this bit helps put the page back in the right
				 * special queue where it belongs.
				 */
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
			}
		} else {
			vm_page_queue_remove(q, m, vmp_pageq);
		}

		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		/* hand the page to the target pageout queue */
		vm_pageout_cluster_to_queue(m, iq);

		pages_moved++;
		goto next_pg;

reenter_pg_on_q:
		/* rotate the page to the tail so the scan makes progress */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);
next_pg:
		qcount--;
		try_failed_count = 0;

		/* periodically yield the page queues lock to avoid hogging it */
		if (delayed_unlock++ > 128) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();
	return pages_moved;
}
1479
1480
1481
1482 /*
1483 * function in BSD to apply I/O throttle to the pageout thread
1484 */
1485 extern void vm_pageout_io_throttle(void);
1486
/*
 * Undo "reusable" accounting for page (m) of object (obj) when the page
 * turns out to be in use again (see rationale inside the macro).
 */
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \
	MACRO_BEGIN \
	/* \
	 * If a "reusable" page somehow made it back into \
	 * the active queue, it's been re-used and is not \
	 * quite re-usable. \
	 * If the VM object was "all_reusable", consider it \
	 * as "all re-used" instead of converting it to \
	 * "partially re-used", which could be expensive. \
	 */ \
	assert(VM_PAGE_OBJECT((m)) == (obj)); \
	if ((m)->vmp_reusable || \
	    (obj)->all_reusable) { \
	        vm_object_reuse_pages((obj), \
	            (m)->vmp_offset, \
	            (m)->vmp_offset + PAGE_SIZE_64, \
	            FALSE); \
	} \
	MACRO_END
1506
1507
/* process this many pages before considering dropping the page queues lock */
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024

/* flow-control states (struct flow_control below) */
#define FCS_IDLE 0
#define FCS_DELAYED 1
#define FCS_DEADLOCK_DETECTED 2

/* throttling state carried across iterations of the pageout scan */
struct flow_control {
	int state;              /* one of the FCS_* values above */
	mach_timespec_t ts;
};


/* counters for balance-queue pageout decisions — presumably "bq" =
 * balance queue; updated by code outside this chunk */
uint64_t vm_pageout_rejected_bq_internal = 0;
uint64_t vm_pageout_rejected_bq_external = 0;
uint64_t vm_pageout_skipped_bq_internal = 0;
uint64_t vm_pageout_skipped_bq_external = 0;

#define ANONS_GRABBED_LIMIT 2


#if 0
static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
#endif
static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);

/* 'action' argument values for vm_pageout_prepare_to_block() */
#define VM_PAGEOUT_PB_NO_ACTION 0
#define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
#define VM_PAGEOUT_PB_THREAD_YIELD 2
1537
1538
#if 0
/*
 * NOTE: compiled out (#if 0); kept for reference.  Flushes the caller's
 * local free list (dropping and re-taking the page queues lock), or just
 * yields the lock when there is nothing to free, then resets the
 * delayed-unlock counter.
 */
static void
vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
{
	if (*local_freeq) {
		vm_page_unlock_queues();

		VM_DEBUG_CONSTANT_EVENT(
			vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
			vm_page_free_count, 0, 0, 1);

		vm_page_free_list(*local_freeq, TRUE);

		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
		    vm_page_free_count, *local_freed, 0, 1);

		*local_freeq = NULL;
		*local_freed = 0;

		vm_page_lock_queues();
	} else {
		lck_mtx_yield(&vm_page_queue_lock);
	}
	*delayed_unlock = 1;
}
#endif
1565
1566
/*
 * Release everything that must not be held across a block: drop the
 * page queues lock, drop and clear *object if held, flush the local
 * free list, and reset the delayed-unlock counter.  Optionally performs
 * one VM_PAGEOUT_PB_* action (wake compactor/swapper, or yield) before
 * re-taking the page queues lock.  Returns with the page queues lock
 * held, *object == NULL and *local_freeq empty.
 */
static void
vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
    vm_page_t *local_freeq, int *local_freed, int action)
{
	vm_page_unlock_queues();

	if (*object != NULL) {
		vm_object_unlock(*object);
		*object = NULL;
	}
	if (*local_freeq) {
		vm_page_free_list(*local_freeq, TRUE);

		*local_freeq = NULL;
		*local_freed = 0;
	}
	*delayed_unlock = 1;

	switch (action) {
	case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
		vm_consider_waking_compactor_swapper();
		break;
	case VM_PAGEOUT_PB_THREAD_YIELD:
		thread_yield_internal(1);
		break;
	case VM_PAGEOUT_PB_NO_ACTION:
	default:
		break;
	}
	vm_page_lock_queues();
}
1598
1599
/* snapshot of the cumulative counters at the previous update_vm_info() call */
static struct vm_pageout_vminfo last;

/* snapshot of vm_page_grab_count at the previous update_vm_info() call */
uint64_t last_vm_page_pages_grabbed = 0;

extern uint32_t c_segment_pages_compressed;

extern uint64_t shared_region_pager_reclaimed;
extern struct memory_object_pager_ops shared_region_pager_ops;
1608
/*
 * Sample the VM/pageout counters into the current
 * vm_pageout_stats[vm_pageout_stat_now] slot: absolute gauges are copied
 * directly; cumulative counters are stored as per-interval deltas
 * (current value minus the 'last' snapshot, snapshot then updated).
 * Emits the VM_INFO1..VM_INFO9 kdebug tracepoints and finally calls
 * record_memory_pressure().  The detailed delta group is only refreshed
 * when at least one page was considered this interval.
 */
void
update_vm_info(void)
{
	unsigned long tmp;
	uint64_t tmp64;

	/* absolute gauges: copied as-is */
	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;


	/* cumulative counters: record the delta since the last call */
	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
	last.vm_pageout_considered_page = tmp;

	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
	last.vm_pageout_compressions = tmp64;

	tmp = vm_pageout_vminfo.vm_compressor_failed;
	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
	last.vm_compressor_failed = tmp;

	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
	last.vm_compressor_pages_grabbed = tmp64;

	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
	last.vm_phantom_cache_found_ghost = tmp;

	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
	last.vm_phantom_cache_added_ghost = tmp;

	tmp64 = counter_load(&vm_page_grab_count);
	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
	last_vm_page_pages_grabbed = tmp64;

	tmp = vm_pageout_vminfo.vm_page_pages_freed;
	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
	last.vm_page_pages_freed = tmp;


	/* only refresh the detailed deltas when pages were considered */
	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
		last.vm_pageout_pages_evicted = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
		last.vm_pageout_pages_purged = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
		last.vm_pageout_freed_speculative = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
		last.vm_pageout_freed_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
		last.vm_pageout_inactive_referenced = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
		last.vm_pageout_scan_inactive_throttled_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
		last.vm_pageout_inactive_dirty_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
		last.vm_pageout_freed_cleaned = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
		last.vm_pageout_inactive_nolock = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
		last.vm_pageout_scan_inactive_throttled_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
		last.vm_pageout_skipped_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
		last.vm_pageout_skipped_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
		last.vm_pageout_reactivation_limit_exceeded = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
		last.vm_pageout_inactive_force_reclaim = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
		last.vm_pageout_freed_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
		last.vm_pageout_considered_bq_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
		last.vm_pageout_considered_bq_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
		last.vm_pageout_filecache_min_reactivated = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
		last.vm_pageout_inactive_dirty_internal = tmp;
	}

	/* publish the snapshot via kdebug tracepoints */
	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
	    0);

	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
	    0,
	    0);

	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
	    0);

	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].considered,
		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
		    vm_pageout_stats[vm_pageout_stat_now].skipped_external,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
		    vm_pageout_stats[vm_pageout_stat_now].freed_internal,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
		    0);
	}
	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
	    0);

	record_memory_pressure();
}
1809
1810 extern boolean_t hibernation_vmqueues_inspection;
1811
1812 /*
1813 * Return values for functions called by vm_pageout_scan
1814 * that control its flow.
1815 *
1816 * PROCEED -- vm_pageout_scan will keep making forward progress.
1817 * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1818 * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1819 */
1820
1821 #define VM_PAGEOUT_SCAN_PROCEED (0)
1822 #define VM_PAGEOUT_SCAN_DONE_RETURN (1)
1823 #define VM_PAGEOUT_SCAN_NEXT_ITERATION (2)
1824
1825 /*
1826 * This function is called only from vm_pageout_scan and
1827 * it moves overflow secluded pages (one-at-a-time) to the
1828 * batched 'local' free Q or active Q.
1829 */
static void
vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
{
#if CONFIG_SECLUDED_MEMORY
	/*
	 * Deal with secluded_q overflow: move at most one page per call
	 * off the secluded queue, either onto the caller's local free
	 * list (if it has no object) or onto the active queue.
	 */
	if (vm_page_secluded_count > vm_page_secluded_target) {
		vm_page_t secluded_page;

		/*
		 * SECLUDED_AGING_BEFORE_ACTIVE:
		 * Excess secluded pages go to the active queue and
		 * will later go to the inactive queue.
		 */
		assert((vm_page_secluded_count_free +
		    vm_page_secluded_count_inuse) ==
		    vm_page_secluded_count);
		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);

		vm_page_queues_remove(secluded_page, FALSE);
		assert(!secluded_page->vmp_fictitious);
		assert(!VM_PAGE_WIRED(secluded_page));

		if (secluded_page->vmp_object == 0) {
			/* transfer to free queue */
			assert(secluded_page->vmp_busy);
			secluded_page->vmp_snext = *local_freeq;
			*local_freeq = secluded_page;
			*local_freed += 1;
		} else {
			/* transfer to head of active queue */
			vm_page_enqueue_active(secluded_page, FALSE);
			secluded_page = VM_PAGE_NULL;
		}
	}
#else /* CONFIG_SECLUDED_MEMORY */

#pragma unused(local_freeq)
#pragma unused(local_freed)

	return;

#endif /* CONFIG_SECLUDED_MEMORY */
}
1876
1877 /*
1878 * This function is called only from vm_pageout_scan and
1879 * it initializes the loop targets for vm_pageout_scan().
1880 */
static void
vps_init_page_targets(void)
{
	/*
	 * LD TODO: Other page targets should be calculated here too.
	 */
	/* keep a small floor of anonymous pages: 5% of the inactive target */
	vm_page_anonymous_min = vm_page_inactive_target / 20;

	/* clamp the speculative percentage into (0, 50] */
	if (vm_pageout_state.vm_page_speculative_percentage > 50) {
		vm_pageout_state.vm_page_speculative_percentage = 50;
	} else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
		vm_pageout_state.vm_page_speculative_percentage = 1;
	}

	vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
	    vm_page_inactive_count);
}
1898
1899 /*
1900 * This function is called only from vm_pageout_scan and
1901 * it purges a single VM object at-a-time and will either
1902 * make vm_pageout_scan() restart the loop or keeping moving forward.
1903 */
/*
 * Try to purge a single purgeable VM object.  Under memory pressure
 * (VM_PRESSURE_EVENTS) a non-zero force-purge budget is chosen from the
 * memorystatus thresholds matching the current pressure level.
 *
 * Returns VM_PAGEOUT_SCAN_NEXT_ITERATION if an object was purged (the
 * caller restarts its loop), else VM_PAGEOUT_SCAN_PROCEED.
 */
static int
vps_purge_object()
{
	int force_purge;

	assert(available_for_purge >= 0);
	force_purge = 0; /* no force-purging */

#if VM_PRESSURE_EVENTS
	vm_pressure_level_t pressure_level;

	pressure_level = memorystatus_vm_pressure_level;

	if (pressure_level > kVMPressureNormal) {
		/* pick the force-purge budget for the current pressure level */
		if (pressure_level >= kVMPressureCritical) {
			force_purge = vm_pageout_state.memorystatus_purge_on_critical;
		} else if (pressure_level >= kVMPressureUrgent) {
			force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
		} else if (pressure_level >= kVMPressureWarning) {
			force_purge = vm_pageout_state.memorystatus_purge_on_warning;
		}
	}
#endif /* VM_PRESSURE_EVENTS */

	if (available_for_purge || force_purge) {
		memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);

		VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
		if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
			VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
			VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
			memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);

			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
		}
		/* nothing purged this time */
		VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
		memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
1945
1946 /*
1947 * This function is called only from vm_pageout_scan and
1948 * it will try to age the next speculative Q if the oldest
1949 * one is empty.
1950 */
1951 static int
vps_age_speculative_queue(boolean_t force_speculative_aging)1952 vps_age_speculative_queue(boolean_t force_speculative_aging)
1953 {
1954 #define DELAY_SPECULATIVE_AGE 1000
1955
1956 /*
1957 * try to pull pages from the aging bins...
1958 * see vm_page.h for an explanation of how
1959 * this mechanism works
1960 */
1961 boolean_t can_steal = FALSE;
1962 int num_scanned_queues;
1963 static int delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/
1964 mach_timespec_t ts;
1965 struct vm_speculative_age_q *aq;
1966 struct vm_speculative_age_q *sq;
1967
1968 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1969
1970 aq = &vm_page_queue_speculative[speculative_steal_index];
1971
1972 num_scanned_queues = 0;
1973 while (vm_page_queue_empty(&aq->age_q) &&
1974 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1975 speculative_steal_index++;
1976
1977 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1978 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1979 }
1980
1981 aq = &vm_page_queue_speculative[speculative_steal_index];
1982 }
1983
1984 if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
1985 /*
1986 * XXX We've scanned all the speculative
1987 * queues but still haven't found one
1988 * that is not empty, even though
1989 * vm_page_speculative_count is not 0.
1990 */
1991 if (!vm_page_queue_empty(&sq->age_q)) {
1992 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1993 }
1994 #if DEVELOPMENT || DEBUG
1995 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
1996 #endif
1997 /* readjust... */
1998 vm_page_speculative_count = 0;
1999 /* ... and continue */
2000 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2001 }
2002
2003 if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
2004 can_steal = TRUE;
2005 } else {
2006 if (!delay_speculative_age) {
2007 mach_timespec_t ts_fully_aged;
2008
2009 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
2010 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
2011 * 1000 * NSEC_PER_USEC;
2012
2013 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2014
2015 clock_sec_t sec;
2016 clock_nsec_t nsec;
2017 clock_get_system_nanotime(&sec, &nsec);
2018 ts.tv_sec = (unsigned int) sec;
2019 ts.tv_nsec = nsec;
2020
2021 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
2022 can_steal = TRUE;
2023 } else {
2024 delay_speculative_age++;
2025 }
2026 } else {
2027 delay_speculative_age++;
2028 if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
2029 delay_speculative_age = 0;
2030 }
2031 }
2032 }
2033 if (can_steal == TRUE) {
2034 vm_page_speculate_ageit(aq);
2035 }
2036
2037 return VM_PAGEOUT_SCAN_PROCEED;
2038 }
2039
2040 /*
2041 * This function is called only from vm_pageout_scan and
2042 * it evicts a single VM object from the cache.
2043 */
2044 static int inline
vps_object_cache_evict(vm_object_t * object_to_unlock)2045 vps_object_cache_evict(vm_object_t *object_to_unlock)
2046 {
2047 static int cache_evict_throttle = 0;
2048 struct vm_speculative_age_q *sq;
2049
2050 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2051
2052 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2053 int pages_evicted;
2054
2055 if (*object_to_unlock != NULL) {
2056 vm_object_unlock(*object_to_unlock);
2057 *object_to_unlock = NULL;
2058 }
2059 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
2060
2061 pages_evicted = vm_object_cache_evict(100, 10);
2062
2063 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
2064
2065 if (pages_evicted) {
2066 vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2067
2068 VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2069 vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2070 memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2071
2072 /*
2073 * we just freed up to 100 pages,
2074 * so go back to the top of the main loop
2075 * and re-evaulate the memory situation
2076 */
2077 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2078 } else {
2079 cache_evict_throttle = 1000;
2080 }
2081 }
2082 if (cache_evict_throttle) {
2083 cache_evict_throttle--;
2084 }
2085
2086 return VM_PAGEOUT_SCAN_PROCEED;
2087 }
2088
2089
2090 /*
2091 * This function is called only from vm_pageout_scan and
2092 * it calculates the filecache min. that needs to be maintained
2093 * as we start to steal pages.
2094 */
2095 static void
vps_calculate_filecache_min(void)2096 vps_calculate_filecache_min(void)
2097 {
2098 int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2099
2100 #if CONFIG_JETSAM
2101 /*
2102 * don't let the filecache_min fall below 15% of available memory
2103 * on systems with an active compressor that isn't nearing its
2104 * limits w/r to accepting new data
2105 *
2106 * on systems w/o the compressor/swapper, the filecache is always
2107 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2108 * since most (if not all) of the anonymous pages are in the
2109 * throttled queue (which isn't counted as available) which
2110 * effectively disables this filter
2111 */
2112 if (vm_compressor_low_on_space() || divisor == 0) {
2113 vm_pageout_state.vm_page_filecache_min = 0;
2114 } else {
2115 vm_pageout_state.vm_page_filecache_min =
2116 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2117 }
2118 #else
2119 if (vm_compressor_out_of_space() || divisor == 0) {
2120 vm_pageout_state.vm_page_filecache_min = 0;
2121 } else {
2122 /*
2123 * don't let the filecache_min fall below the specified critical level
2124 */
2125 vm_pageout_state.vm_page_filecache_min =
2126 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2127 }
2128 #endif
2129 if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2130 vm_pageout_state.vm_page_filecache_min = 0;
2131 }
2132 }
2133
2134 /*
2135 * This function is called only from vm_pageout_scan and
2136 * it updates the flow control time to detect if VM pageoutscan
2137 * isn't making progress.
2138 */
2139 static void
vps_flow_control_reset_deadlock_timer(struct flow_control * flow_control)2140 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2141 {
2142 mach_timespec_t ts;
2143 clock_sec_t sec;
2144 clock_nsec_t nsec;
2145
2146 ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2147 ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2148 clock_get_system_nanotime(&sec, &nsec);
2149 flow_control->ts.tv_sec = (unsigned int) sec;
2150 flow_control->ts.tv_nsec = nsec;
2151 ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2152
2153 flow_control->state = FCS_DELAYED;
2154
2155 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2156 }
2157
2158 /*
2159 * This function is called only from vm_pageout_scan and
2160 * it is the flow control logic of VM pageout scan which
2161 * controls if it should block and for how long.
2162 * Any blocking of vm_pageout_scan happens ONLY in this function.
2163 */
2164 static int
vps_flow_control(struct flow_control * flow_control,int * anons_grabbed,vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int * vm_pageout_deadlock_target,unsigned int inactive_burst_count)2165 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2166 vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2167 {
2168 boolean_t exceeded_burst_throttle = FALSE;
2169 unsigned int msecs = 0;
2170 uint32_t inactive_external_count;
2171 mach_timespec_t ts;
2172 struct vm_pageout_queue *iq;
2173 struct vm_pageout_queue *eq;
2174 struct vm_speculative_age_q *sq;
2175
2176 iq = &vm_pageout_queue_internal;
2177 eq = &vm_pageout_queue_external;
2178 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2179
2180 /*
2181 * Sometimes we have to pause:
2182 * 1) No inactive pages - nothing to do.
2183 * 2) Loop control - no acceptable pages found on the inactive queue
2184 * within the last vm_pageout_burst_inactive_throttle iterations
2185 * 3) Flow control - default pageout queue is full
2186 */
2187 if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2188 vm_page_queue_empty(&vm_page_queue_anonymous) &&
2189 vm_page_queue_empty(&vm_page_queue_cleaned) &&
2190 vm_page_queue_empty(&sq->age_q)) {
2191 VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2192 msecs = vm_pageout_state.vm_pageout_empty_wait;
2193 } else if (inactive_burst_count >=
2194 MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2195 (vm_page_inactive_count +
2196 vm_page_speculative_count))) {
2197 VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2198 msecs = vm_pageout_state.vm_pageout_burst_wait;
2199
2200 exceeded_burst_throttle = TRUE;
2201 } else if (VM_PAGE_Q_THROTTLED(iq) &&
2202 VM_DYNAMIC_PAGING_ENABLED()) {
2203 clock_sec_t sec;
2204 clock_nsec_t nsec;
2205
2206 switch (flow_control->state) {
2207 case FCS_IDLE:
2208 if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2209 vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2210 /*
2211 * since the compressor is running independently of vm_pageout_scan
2212 * let's not wait for it just yet... as long as we have a healthy supply
2213 * of filecache pages to work with, let's keep stealing those.
2214 */
2215 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2216
2217 if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2218 (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2219 *anons_grabbed = ANONS_GRABBED_LIMIT;
2220 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2221 return VM_PAGEOUT_SCAN_PROCEED;
2222 }
2223 }
2224
2225 vps_flow_control_reset_deadlock_timer(flow_control);
2226 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2227
2228 break;
2229
2230 case FCS_DELAYED:
2231 clock_get_system_nanotime(&sec, &nsec);
2232 ts.tv_sec = (unsigned int) sec;
2233 ts.tv_nsec = nsec;
2234
2235 if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2236 /*
2237 * the pageout thread for the default pager is potentially
2238 * deadlocked since the
2239 * default pager queue has been throttled for more than the
2240 * allowable time... we need to move some clean pages or dirty
2241 * pages belonging to the external pagers if they aren't throttled
2242 * vm_page_free_wanted represents the number of threads currently
2243 * blocked waiting for pages... we'll move one page for each of
2244 * these plus a fixed amount to break the logjam... once we're done
2245 * moving this number of pages, we'll re-enter the FSC_DELAYED state
2246 * with a new timeout target since we have no way of knowing
2247 * whether we've broken the deadlock except through observation
2248 * of the queue associated with the default pager... we need to
2249 * stop moving pages and allow the system to run to see what
2250 * state it settles into.
2251 */
2252
2253 *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2254 vm_page_free_wanted + vm_page_free_wanted_privileged;
2255 VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2256 flow_control->state = FCS_DEADLOCK_DETECTED;
2257 thread_wakeup(VM_PAGEOUT_GC_EVENT);
2258 return VM_PAGEOUT_SCAN_PROCEED;
2259 }
2260 /*
2261 * just resniff instead of trying
2262 * to compute a new delay time... we're going to be
2263 * awakened immediately upon a laundry completion,
2264 * so we won't wait any longer than necessary
2265 */
2266 msecs = vm_pageout_state.vm_pageout_idle_wait;
2267 break;
2268
2269 case FCS_DEADLOCK_DETECTED:
2270 if (*vm_pageout_deadlock_target) {
2271 return VM_PAGEOUT_SCAN_PROCEED;
2272 }
2273
2274 vps_flow_control_reset_deadlock_timer(flow_control);
2275 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2276
2277 break;
2278 }
2279 } else {
2280 /*
2281 * No need to pause...
2282 */
2283 return VM_PAGEOUT_SCAN_PROCEED;
2284 }
2285
2286 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2287
2288 vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2289 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2290
2291 if (vm_page_free_count >= vm_page_free_target) {
2292 /*
2293 * we're here because
2294 * 1) someone else freed up some pages while we had
2295 * the queues unlocked above
2296 * and we've hit one of the 3 conditions that
2297 * cause us to pause the pageout scan thread
2298 *
2299 * since we already have enough free pages,
2300 * let's avoid stalling and return normally
2301 *
2302 * before we return, make sure the pageout I/O threads
2303 * are running throttled in case there are still requests
2304 * in the laundry... since we have enough free pages
2305 * we don't need the laundry to be cleaned in a timely
2306 * fashion... so let's avoid interfering with foreground
2307 * activity
2308 *
2309 * we don't want to hold vm_page_queue_free_lock when
2310 * calling vm_pageout_adjust_eq_iothrottle (since it
2311 * may cause other locks to be taken), we do the intitial
2312 * check outside of the lock. Once we take the lock,
2313 * we recheck the condition since it may have changed.
2314 * if it has, no problem, we will make the threads
2315 * non-throttled before actually blocking
2316 */
2317 vm_pageout_adjust_eq_iothrottle(eq, TRUE);
2318 }
2319 vm_free_page_lock();
2320
2321 if (vm_page_free_count >= vm_page_free_target &&
2322 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2323 return VM_PAGEOUT_SCAN_DONE_RETURN;
2324 }
2325 vm_free_page_unlock();
2326
2327 if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2328 /*
2329 * we're most likely about to block due to one of
2330 * the 3 conditions that cause vm_pageout_scan to
2331 * not be able to make forward progress w/r
2332 * to providing new pages to the free queue,
2333 * so unthrottle the I/O threads in case we
2334 * have laundry to be cleaned... it needs
2335 * to be completed ASAP.
2336 *
2337 * even if we don't block, we want the io threads
2338 * running unthrottled since the sum of free +
2339 * clean pages is still under our free target
2340 */
2341 vm_pageout_adjust_eq_iothrottle(eq, FALSE);
2342 }
2343 if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2344 /*
2345 * if we get here we're below our free target and
2346 * we're stalling due to a full laundry queue or
2347 * we don't have any inactive pages other then
2348 * those in the clean queue...
2349 * however, we have pages on the clean queue that
2350 * can be moved to the free queue, so let's not
2351 * stall the pageout scan
2352 */
2353 flow_control->state = FCS_IDLE;
2354 return VM_PAGEOUT_SCAN_PROCEED;
2355 }
2356 if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2357 flow_control->state = FCS_IDLE;
2358 return VM_PAGEOUT_SCAN_PROCEED;
2359 }
2360
2361 VM_CHECK_MEMORYSTATUS;
2362
2363 if (flow_control->state != FCS_IDLE) {
2364 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2365 }
2366
2367 iq->pgo_throttled = TRUE;
2368 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2369
2370 vm_page_unlock_queues();
2371
2372 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2373
2374 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2375 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2376 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2377
2378 thread_block(THREAD_CONTINUE_NULL);
2379
2380 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2381 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2382 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2383
2384 vm_page_lock_queues();
2385
2386 iq->pgo_throttled = FALSE;
2387
2388 vps_init_page_targets();
2389
2390 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2391 }
2392
2393 extern boolean_t vm_darkwake_mode;
2394 /*
2395 * This function is called only from vm_pageout_scan and
2396 * it will find and return the most appropriate page to be
2397 * reclaimed.
2398 */
2399 static int
vps_choose_victim_page(vm_page_t * victim_page,int * anons_grabbed,boolean_t * grab_anonymous,boolean_t force_anonymous,boolean_t * is_page_from_bg_q,unsigned int * reactivated_this_call)2400 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2401 boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2402 {
2403 vm_page_t m = NULL;
2404 vm_object_t m_object = VM_OBJECT_NULL;
2405 uint32_t inactive_external_count;
2406 struct vm_speculative_age_q *sq;
2407 struct vm_pageout_queue *iq;
2408 int retval = VM_PAGEOUT_SCAN_PROCEED;
2409
2410 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2411 iq = &vm_pageout_queue_internal;
2412
2413 *is_page_from_bg_q = FALSE;
2414
2415 m = NULL;
2416 m_object = VM_OBJECT_NULL;
2417
2418 if (VM_DYNAMIC_PAGING_ENABLED()) {
2419 assert(vm_page_throttled_count == 0);
2420 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2421 }
2422
2423 /*
2424 * Try for a clean-queue inactive page.
2425 * These are pages that vm_pageout_scan tried to steal earlier, but
2426 * were dirty and had to be cleaned. Pick them up now that they are clean.
2427 */
2428 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2429 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2430
2431 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2432
2433 goto found_page;
2434 }
2435
2436 /*
2437 * The next most eligible pages are ones we paged in speculatively,
2438 * but which have not yet been touched and have been aged out.
2439 */
2440 if (!vm_page_queue_empty(&sq->age_q)) {
2441 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2442
2443 assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2444
2445 if (!m->vmp_dirty || force_anonymous == FALSE) {
2446 goto found_page;
2447 } else {
2448 m = NULL;
2449 }
2450 }
2451
2452 #if !CONFIG_JETSAM
2453 if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2454 if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2455 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2456 assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2457 goto found_page;
2458 }
2459 }
2460 #endif /* !CONFIG_JETSAM */
2461
2462 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2463 vm_object_t bg_m_object = NULL;
2464
2465 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2466
2467 bg_m_object = VM_PAGE_OBJECT(m);
2468
2469 if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2470 /*
2471 * This page is on the background queue
2472 * but not on a pageable queue OR is busy during
2473 * darkwake mode when the target is artificially lowered.
2474 * If it is busy during darkwake mode, and we don't skip it,
2475 * we will just swing back around and try again with the same
2476 * queue and might hit the same page or its neighbor in a
2477 * similar state. Both of these are transient states and will
2478 * get resolved, but, at this point let's ignore this page.
2479 */
2480 if (vm_darkwake_mode && m->vmp_busy) {
2481 if (bg_m_object->internal) {
2482 vm_pageout_skipped_bq_internal++;
2483 } else {
2484 vm_pageout_skipped_bq_external++;
2485 }
2486 }
2487 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2488 if (bg_m_object->internal &&
2489 (VM_PAGE_Q_THROTTLED(iq) ||
2490 vm_compressor_out_of_space() == TRUE ||
2491 vm_page_free_count < (vm_page_free_reserved / 4))) {
2492 vm_pageout_skipped_bq_internal++;
2493 } else {
2494 *is_page_from_bg_q = TRUE;
2495
2496 if (bg_m_object->internal) {
2497 vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2498 } else {
2499 vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2500 }
2501 goto found_page;
2502 }
2503 }
2504 }
2505
2506 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2507
2508 if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2509 (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2510 *grab_anonymous = TRUE;
2511 *anons_grabbed = 0;
2512
2513 if (VM_CONFIG_SWAP_IS_ACTIVE) {
2514 vm_pageout_vminfo.vm_pageout_skipped_external++;
2515 } else {
2516 if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2517 /*
2518 * No swap and we are in dangerously low levels of free memory.
2519 * If we keep going ahead with anonymous pages, we are going to run into a situation
2520 * where the compressor will be stuck waiting for free pages (if it isn't already).
2521 *
2522 * So, pick a file backed page...
2523 */
2524 *grab_anonymous = FALSE;
2525 *anons_grabbed = ANONS_GRABBED_LIMIT;
2526 vm_pageout_vminfo.vm_pageout_skipped_internal++;
2527 }
2528 }
2529 goto want_anonymous;
2530 }
2531 *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2532
2533 #if CONFIG_JETSAM
2534 /* If the file-backed pool has accumulated
2535 * significantly more pages than the jetsam
2536 * threshold, prefer to reclaim those
2537 * inline to minimise compute overhead of reclaiming
2538 * anonymous pages.
2539 * This calculation does not account for the CPU local
2540 * external page queues, as those are expected to be
2541 * much smaller relative to the global pools.
2542 */
2543
2544 struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2545
2546 if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2547 if (vm_page_pageable_external_count >
2548 vm_pageout_state.vm_page_filecache_min) {
2549 if ((vm_page_pageable_external_count *
2550 vm_pageout_memorystatus_fb_factor_dr) >
2551 (memorystatus_available_pages_critical *
2552 vm_pageout_memorystatus_fb_factor_nr)) {
2553 *grab_anonymous = FALSE;
2554
2555 VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2556 }
2557 }
2558 if (*grab_anonymous) {
2559 VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2560 }
2561 }
2562 #endif /* CONFIG_JETSAM */
2563
2564 want_anonymous:
2565 if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2566 if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2567 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2568
2569 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2570 *anons_grabbed = 0;
2571
2572 if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2573 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2574 if ((++(*reactivated_this_call) % 100)) {
2575 vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2576
2577 vm_page_activate(m);
2578 counter_inc(&vm_statistics_reactivations);
2579 #if DEVELOPMENT || DEBUG
2580 if (*is_page_from_bg_q == TRUE) {
2581 if (m_object->internal) {
2582 vm_pageout_rejected_bq_internal++;
2583 } else {
2584 vm_pageout_rejected_bq_external++;
2585 }
2586 }
2587 #endif /* DEVELOPMENT || DEBUG */
2588 vm_pageout_state.vm_pageout_inactive_used++;
2589
2590 m = NULL;
2591 retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2592
2593 goto found_page;
2594 }
2595
2596 /*
2597 * steal 1 of the file backed pages even if
2598 * we are under the limit that has been set
2599 * for a healthy filecache
2600 */
2601 }
2602 }
2603 goto found_page;
2604 }
2605 }
2606 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2607 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2608
2609 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2610 *anons_grabbed += 1;
2611
2612 goto found_page;
2613 }
2614
2615 m = NULL;
2616
2617 found_page:
2618 *victim_page = m;
2619
2620 return retval;
2621 }
2622
2623 /*
2624 * This function is called only from vm_pageout_scan and
2625 * it will put a page back on the active/inactive queue
2626 * if we can't reclaim it for some reason.
2627 */
2628 static void
vps_requeue_page(vm_page_t m,int page_prev_q_state,__unused boolean_t page_from_bg_q)2629 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2630 {
2631 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2632 vm_page_enqueue_inactive(m, FALSE);
2633 } else {
2634 vm_page_activate(m);
2635 }
2636
2637 #if DEVELOPMENT || DEBUG
2638 vm_object_t m_object = VM_PAGE_OBJECT(m);
2639
2640 if (page_from_bg_q == TRUE) {
2641 if (m_object->internal) {
2642 vm_pageout_rejected_bq_internal++;
2643 } else {
2644 vm_pageout_rejected_bq_external++;
2645 }
2646 }
2647 #endif /* DEVELOPMENT || DEBUG */
2648 }
2649
2650 /*
2651 * This function is called only from vm_pageout_scan and
2652 * it will try to grab the victim page's VM object (m_object)
2653 * which differs from the previous victim page's object (object).
2654 */
2655 static int
vps_switch_object(vm_page_t m,vm_object_t m_object,vm_object_t * object,int page_prev_q_state,boolean_t avoid_anon_pages,boolean_t page_from_bg_q)2656 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2657 {
2658 struct vm_speculative_age_q *sq;
2659
2660 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2661
2662 /*
2663 * the object associated with candidate page is
2664 * different from the one we were just working
2665 * with... dump the lock if we still own it
2666 */
2667 if (*object != NULL) {
2668 vm_object_unlock(*object);
2669 *object = NULL;
2670 }
2671 /*
2672 * Try to lock object; since we've alread got the
2673 * page queues lock, we can only 'try' for this one.
2674 * if the 'try' fails, we need to do a mutex_pause
2675 * to allow the owner of the object lock a chance to
2676 * run... otherwise, we're likely to trip over this
2677 * object in the same state as we work our way through
2678 * the queue... clumps of pages associated with the same
2679 * object are fairly typical on the inactive and active queues
2680 */
2681 if (!vm_object_lock_try_scan(m_object)) {
2682 vm_page_t m_want = NULL;
2683
2684 vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2685
2686 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2687 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2688 }
2689
2690 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2691
2692 m->vmp_reference = FALSE;
2693
2694 if (!m_object->object_is_shared_cache) {
2695 /*
2696 * don't apply this optimization if this is the shared cache
2697 * object, it's too easy to get rid of very hot and important
2698 * pages...
2699 * m->vmp_object must be stable since we hold the page queues lock...
2700 * we can update the scan_collisions field sans the object lock
2701 * since it is a separate field and this is the only spot that does
2702 * a read-modify-write operation and it is never executed concurrently...
2703 * we can asynchronously set this field to 0 when creating a UPL, so it
2704 * is possible for the value to be a bit non-determistic, but that's ok
2705 * since it's only used as a hint
2706 */
2707 m_object->scan_collisions = 1;
2708 }
2709 if (page_from_bg_q) {
2710 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2711 } else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2712 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2713 } else if (!vm_page_queue_empty(&sq->age_q)) {
2714 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2715 } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2716 !vm_page_queue_empty(&vm_page_queue_inactive)) {
2717 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2718 } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2719 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2720 }
2721
2722 /*
2723 * this is the next object we're going to be interested in
2724 * try to make sure its available after the mutex_pause
2725 * returns control
2726 */
2727 if (m_want) {
2728 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2729 }
2730
2731 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2732
2733 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2734 } else {
2735 *object = m_object;
2736 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2737 }
2738
2739 return VM_PAGEOUT_SCAN_PROCEED;
2740 }
2741
2742 /*
2743 * This function is called only from vm_pageout_scan and
2744 * it notices that pageout scan may be rendered ineffective
2745 * due to a FS deadlock and will jetsam a process if possible.
2746 * If jetsam isn't supported, it'll move the page to the active
2747 * queue to try and get some different pages pushed onwards so
2748 * we can try to get out of this scenario.
2749 */
2750 static void
vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
    int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
{
	/*
	 * Handle a dirty page 'm' whose destination pageout queue is throttled.
	 *
	 * For an external (file-backed) page we force-reactivate it to break a
	 * potential truncate-lock deadlock (see the large comment below).  On
	 * CONFIG_JETSAM configurations, once the reactivation budget
	 * (*vm_pageout_inactive_external_forced_reactivate_limit) is exhausted,
	 * we drop the object lock (setting *object to VM_OBJECT_NULL so the
	 * caller knows) and the page queue lock to request a jetsam kill,
	 * then retake the queue lock and reset *delayed_unlock to 1.  On
	 * non-jetsam configurations we instead set *force_anonymous so the
	 * next victim is taken from the anonymous/cleaned pool.
	 *
	 * For an internal (anonymous) page we simply reactivate it.
	 *
	 * Caller must hold the page queue lock and the lock on *object
	 * (the page's object).
	 */
	struct vm_pageout_queue *eq;
	vm_object_t cur_object = VM_OBJECT_NULL;

	cur_object = *object;

	eq = &vm_pageout_queue_external;

	if (cur_object->internal == FALSE) {
		/*
		 * we need to break up the following potential deadlock case...
		 *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
		 *  b) The thread doing the writing is waiting for pages while holding the truncate lock
		 *  c) Most of the pages in the inactive queue belong to this file.
		 *
		 * we are potentially in this deadlock because...
		 *  a) the external pageout queue is throttled
		 *  b) we're done with the active queue and moved on to the inactive queue
		 *  c) we've got a dirty external page
		 *
		 * since we don't know the reason for the external pageout queue being throttled we
		 * must suspect that we are deadlocked, so move the current page onto the active queue
		 * in an effort to cause a page from the active queue to 'age' to the inactive queue
		 *
		 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
		 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
		 * pool the next time we select a victim page... if we can make enough new free pages,
		 * the deadlock will break, the external pageout queue will empty and it will no longer
		 * be throttled
		 *
		 * if we have jetsam configured, keep a count of the pages reactivated this way so
		 * that we can try to find clean pages in the active/inactive queues before
		 * deciding to jetsam a process
		 */
		vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;

		/* force-reactivate: hand-place the page on the active queue */
		vm_page_check_pageable_safe(m);
		assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
		vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
		m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
		vm_page_active_count++;
		vm_page_pageable_external_count++;

		vm_pageout_adjust_eq_iothrottle(eq, FALSE);

#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM

#pragma unused(force_anonymous)

		*vm_pageout_inactive_external_forced_reactivate_limit -= 1;

		if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
			*vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
			/*
			 * Possible deadlock scenario so request jetsam action
			 */

			assert(cur_object);
			vm_object_unlock(cur_object);

			cur_object = VM_OBJECT_NULL;

			/*
			 * VM pageout scan needs to know we have dropped this lock and so set the
			 * object variable we got passed in to NULL.
			 */
			*object = VM_OBJECT_NULL;

			vm_page_unlock_queues();

			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);

			/* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
			if (memorystatus_kill_on_VM_page_shortage() == TRUE) {
				VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
			}

			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);

			vm_page_lock_queues();
			*delayed_unlock = 1;
		}
#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */

#pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
#pragma unused(delayed_unlock)

		*force_anonymous = TRUE;
#endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
	} else {
		/* internal page: just put it back on the active queue */
		vm_page_activate(m);
		counter_inc(&vm_statistics_reactivations);

#if DEVELOPMENT || DEBUG
		if (is_page_from_bg_q == TRUE) {
			if (cur_object->internal) {
				vm_pageout_rejected_bq_internal++;
			} else {
				vm_pageout_rejected_bq_external++;
			}
		}
#endif /* DEVELOPMENT || DEBUG */

		vm_pageout_state.vm_pageout_inactive_used++;
	}
}
2861
2862
/*
 * vm_page_balance_inactive:
 *
 * Demote up to 'max_to_move' pages from the head of the active queue to
 * the inactive queue, stopping once (inactive + speculative) reaches the
 * freshly recomputed inactive target.  Does nothing while the hibernation
 * code is inspecting or cleaning the page queues.
 *
 * Caller must hold the page queue lock (asserted below).
 */
void
vm_page_balance_inactive(int max_to_move)
{
	vm_page_t m;

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
		/*
		 * It is likely that the hibernation code path is
		 * dealing with these very queues as we are about
		 * to move pages around in/from them and completely
		 * change the linkage of the pages.
		 *
		 * And so we skip the rebalancing of these queues.
		 */
		return;
	}
	/* recompute the target from the current queue populations */
	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
	    vm_page_inactive_count +
	    vm_page_speculative_count);

	while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
		VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);

		/* oldest active page is at the head of the active queue */
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);

		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
		assert(!m->vmp_laundry);
		assert(VM_PAGE_OBJECT(m) != kernel_object);
		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);

		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);

		/*
		 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
		 *
		 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
		 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
		 * new reference happens. If no futher references happen on the page after that remote TLB flushes
		 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
		 * by pageout_scan, which is just fine since the last reference would have happened quite far
		 * in the past (TLB caches don't hang around for very long), and of course could just as easily
		 * have happened before we moved the page
		 */
		if (m->vmp_pmapped == TRUE) {
			/*
			 * We might be holding the page queue lock as a
			 * spin lock and clearing the "referenced" bit could
			 * take a while if there are lots of mappings of
			 * that page, so make sure we acquire the lock as
			 * as mutex to avoid a spinlock timeout.
			 */
			vm_page_lockconvert_queues();
			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
		}

		/*
		 * The page might be absent or busy,
		 * but vm_page_deactivate can handle that.
		 * FALSE indicates that we don't want a H/W clear reference
		 */
		vm_page_deactivate_internal(m, FALSE);
	}
}
2928
2929 /*
2930 * vm_pageout_scan does the dirty work for the pageout daemon.
2931 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2932 * held and vm_page_free_wanted == 0.
2933 */
/*
 * Overall structure: after asking the pmap layer to release any pages it
 * no longer needs, we loop forever selecting victim pages from the
 * inactive/speculative/background queues (via the vps_* helpers) until
 * the free-page target is met with no waiters, at which point we return
 * via 'return_from_scan'.  The page queue lock is held for most of the
 * loop; 'delayed_unlock' / 'lock_yield_check' bound how long we keep it
 * before yielding to other lock waiters.
 */
void
vm_pageout_scan(void)
{
	unsigned int loop_count = 0;
	unsigned int inactive_burst_count = 0;
	unsigned int reactivated_this_call;
	unsigned int reactivate_limit;
	vm_page_t local_freeq = NULL;           /* private list of pages to free in a batch */
	int local_freed = 0;
	int delayed_unlock;
	int delayed_unlock_limit = 0;
	int refmod_state = 0;
	int vm_pageout_deadlock_target = 0;
	struct vm_pageout_queue *iq;            /* internal (compressor) pageout queue */
	struct vm_pageout_queue *eq;            /* external (file pager) pageout queue */
	struct vm_speculative_age_q *sq;
	struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
	boolean_t inactive_throttled = FALSE;
	vm_object_t object = NULL;              /* object currently locked by this loop, if any */
	uint32_t inactive_reclaim_run;
	boolean_t grab_anonymous = FALSE;
	boolean_t force_anonymous = FALSE;
	boolean_t force_speculative_aging = FALSE;
	int anons_grabbed = 0;
	int page_prev_q_state = 0;
	boolean_t page_from_bg_q = FALSE;
	uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
	vm_object_t m_object = VM_OBJECT_NULL;
	int retval = 0;
	boolean_t lock_yield_check = FALSE;


	VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
	    vm_pageout_vminfo.vm_pageout_freed_speculative,
	    vm_pageout_state.vm_pageout_inactive_clean,
	    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
	    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);

	flow_control.state = FCS_IDLE;
	iq = &vm_pageout_queue_internal;
	eq = &vm_pageout_queue_external;
	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/* Ask the pmap layer to return any pages it no longer needs. */
	uint64_t pmap_wired_pages_freed = pmap_release_pages_fast();

	vm_page_lock_queues();

	vm_page_wire_count -= pmap_wired_pages_freed;

	delayed_unlock = 1;

	/*
	 * Calculate the max number of referenced pages on the inactive
	 * queue that we will reactivate.
	 */
	reactivated_this_call = 0;
	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
	    vm_page_inactive_count);
	inactive_reclaim_run = 0;

	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;

	/*
	 * We must limit the rate at which we send pages to the pagers
	 * so that we don't tie up too many pages in the I/O queues.
	 * We implement a throttling mechanism using the laundry count
	 * to limit the number of pages outstanding to the default
	 * and external pagers. We can bypass the throttles and look
	 * for clean pages if the pageout queues don't drain in a timely
	 * fashion since this may indicate that the pageout paths are
	 * stalled waiting for memory, which only we can provide.
	 */

	vps_init_page_targets();
	assert(object == NULL);
	assert(delayed_unlock != 0);

	for (;;) {
		vm_page_t m;

		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);

		if (lock_yield_check) {
			lock_yield_check = FALSE;

			/*
			 * We've held the page queue lock for a while; either
			 * flush our batched work and block, or briefly drop
			 * the lock to let other waiters in.
			 */
			if (delayed_unlock++ > delayed_unlock_limit) {
				vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
				    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
			} else if (vm_pageout_scan_wants_object) {
				vm_page_unlock_queues();
				mutex_pause(0);
				vm_page_lock_queues();
			} else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
				VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
			}
		}

		if (vm_upl_wait_for_pages < 0) {
			vm_upl_wait_for_pages = 0;
		}

		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;

		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
		}

		vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);

		assert(delayed_unlock);

		/*
		 * maintain our balance
		 */
		vm_page_balance_inactive(1);


		/**********************************************************************
		* above this point we're playing with the active and secluded queues
		* below this point we're playing with the throttling mechanisms
		* and the inactive queue
		**********************************************************************/

		if (vm_page_free_count + local_freed >= vm_page_free_target) {
			vm_pageout_scan_wants_object = VM_OBJECT_NULL;

			vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
			    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
			/*
			 * make sure the pageout I/O threads are running
			 * throttled in case there are still requests
			 * in the laundry... since we have met our targets
			 * we don't need the laundry to be cleaned in a timely
			 * fashion... so let's avoid interfering with foreground
			 * activity
			 */
			vm_pageout_adjust_eq_iothrottle(eq, TRUE);

			vm_free_page_lock();

			if ((vm_page_free_count >= vm_page_free_target) &&
			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
				/*
				 * done - we have met our target *and*
				 * there is no one waiting for a page.
				 * NOTE: we return with the free lock and the
				 * page queue lock both held (see the function
				 * header comment above).
				 */
return_from_scan:
				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);

				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
				    vm_pageout_state.vm_pageout_inactive,
				    vm_pageout_state.vm_pageout_inactive_used, 0, 0);
				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
				    vm_pageout_vminfo.vm_pageout_freed_speculative,
				    vm_pageout_state.vm_pageout_inactive_clean,
				    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
				    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);

				return;
			}
			vm_free_page_unlock();
		}

		/*
		 * Before anything, we check if we have any ripe volatile
		 * objects around. If so, try to purge the first object.
		 * If the purge fails, fall through to reclaim a page instead.
		 * If the purge succeeds, go back to the top and reevalute
		 * the new memory situation.
		 */
		retval = vps_purge_object();

		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
			/*
			 * Success
			 */
			if (object != NULL) {
				vm_object_unlock(object);
				object = NULL;
			}

			lock_yield_check = FALSE;
			continue;
		}

		/*
		 * If our 'aged' queue is empty and we have some speculative pages
		 * in the other queues, let's go through and see if we need to age
		 * them.
		 *
		 * If we succeeded in aging a speculative Q or just that everything
		 * looks normal w.r.t queue age and queue counts, we keep going onward.
		 *
		 * If, for some reason, we seem to have a mismatch between the spec.
		 * page count and the page queues, we reset those variables and
		 * restart the loop (LD TODO: Track this better?).
		 */
		if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
			retval = vps_age_speculative_queue(force_speculative_aging);

			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
				lock_yield_check = FALSE;
				continue;
			}
		}
		force_speculative_aging = FALSE;

		/*
		 * Check to see if we need to evict objects from the cache.
		 *
		 * Note: 'object' here doesn't have anything to do with
		 * the eviction part. We just need to make sure we have dropped
		 * any object lock we might be holding if we need to go down
		 * into the eviction logic.
		 */
		retval = vps_object_cache_evict(&object);

		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
			lock_yield_check = FALSE;
			continue;
		}


		/*
		 * Calculate our filecache_min that will affect the loop
		 * going forward.
		 */
		vps_calculate_filecache_min();

		/*
		 * LD TODO: Use a structure to hold all state variables for a single
		 * vm_pageout_scan iteration and pass that structure to this function instead.
		 */
		retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
		    &delayed_unlock, &local_freeq, &local_freed,
		    &vm_pageout_deadlock_target, inactive_burst_count);

		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
			if (loop_count >= vm_page_inactive_count) {
				loop_count = 0;
			}

			inactive_burst_count = 0;

			assert(object == NULL);
			assert(delayed_unlock != 0);

			lock_yield_check = FALSE;
			continue;
		} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
			goto return_from_scan;
		}

		flow_control.state = FCS_IDLE;

		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
		    vm_pageout_inactive_external_forced_reactivate_limit);
		loop_count++;
		inactive_burst_count++;
		vm_pageout_state.vm_pageout_inactive++;

		/*
		 * Choose a victim.
		 */

		m = NULL;
		retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);

		if (m == NULL) {
			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
				inactive_burst_count = 0;

				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
				}

				lock_yield_check = TRUE;
				continue;
			}

			/*
			 * if we've gotten here, we have no victim page.
			 * check to see if we've not finished balancing the queues
			 * or we have a page on the aged speculative queue that we
			 * skipped due to force_anonymous == TRUE.. or we have
			 * speculative pages that we can prematurely age... if
			 * one of these cases we'll keep going, else panic
			 */
			force_anonymous = FALSE;
			VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);

			if (!vm_page_queue_empty(&sq->age_q)) {
				lock_yield_check = TRUE;
				continue;
			}

			if (vm_page_speculative_count) {
				force_speculative_aging = TRUE;
				lock_yield_check = TRUE;
				continue;
			}
			panic("vm_pageout: no victim");

			/* NOTREACHED */
		}

		assert(VM_PAGE_PAGEABLE(m));
		m_object = VM_PAGE_OBJECT(m);
		force_anonymous = FALSE;

		page_prev_q_state = m->vmp_q_state;
		/*
		 * we just found this page on one of our queues...
		 * it can't also be on the pageout queue, so safe
		 * to call vm_page_queues_remove
		 */
		bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
		vm_page_queues_remove(m, TRUE);
		if (donate) {
			/*
			 * The compressor needs to see this bit to know
			 * where this page needs to land. Also if stolen,
			 * this bit helps put the page back in the right
			 * special queue where it belongs.
			 */
			m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
		}

		assert(!m->vmp_laundry);
		assert(!m->vmp_private);
		assert(!m->vmp_fictitious);
		assert(m_object != kernel_object);
		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);

		vm_pageout_vminfo.vm_pageout_considered_page++;

		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != object) {
			boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);

			/*
			 * vps_switch_object() will always drop the 'object' lock first
			 * and then try to acquire the 'm_object' lock. So 'object' has to point to
			 * either 'm_object' or NULL.
			 */
			retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);

			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
				lock_yield_check = TRUE;
				continue;
			}
		}
		assert(m_object == object);
		assert(VM_PAGE_OBJECT(m) == m_object);

		if (m->vmp_busy) {
			/*
			 * Somebody is already playing with this page.
			 * Put it back on the appropriate queue
			 *
			 */
			VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);

			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
			}

			vps_requeue_page(m, page_prev_q_state, page_from_bg_q);

			lock_yield_check = TRUE;
			continue;
		}

		/*
		 * if (m->vmp_cleaning && !m->vmp_free_when_done)
		 * If already cleaning this page in place
		 * just leave if off the paging queues.
		 * We can leave the page mapped, and upl_commit_range
		 * will put it on the clean queue.
		 *
		 * if (m->vmp_free_when_done && !m->vmp_cleaning)
		 * an msync INVALIDATE is in progress...
		 * this page has been marked for destruction
		 * after it has been cleaned,
		 * but not yet gathered into a UPL
		 * where 'cleaning' will be set...
		 * just leave it off the paging queues
		 *
		 * if (m->vmp_free_when_done && m->vmp_clenaing)
		 * an msync INVALIDATE is in progress
		 * and the UPL has already gathered this page...
		 * just leave it off the paging queues
		 */
		if (m->vmp_free_when_done || m->vmp_cleaning) {
			lock_yield_check = TRUE;
			continue;
		}


		/*
		 * If it's absent, in error or the object is no longer alive,
		 * we can reclaim the page... in the no longer alive case,
		 * there are 2 states the page can be in that preclude us
		 * from reclaiming it - busy or cleaning - that we've already
		 * dealt with
		 */
		if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
		    (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
			if (m->vmp_absent) {
				VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
			} else if (!object->alive ||
			    (!object->internal &&
			    object->pager == MEMORY_OBJECT_NULL)) {
				VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
			} else {
				VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
			}
reclaim_page:
			if (vm_pageout_deadlock_target) {
				VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
				vm_pageout_deadlock_target--;
			}

			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);

			if (object->internal) {
				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
			} else {
				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
			}
			assert(!m->vmp_cleaning);
			assert(!m->vmp_laundry);

			if (!object->internal &&
			    object->pager != NULL &&
			    object->pager->mo_pager_ops == &shared_region_pager_ops) {
				shared_region_pager_reclaimed++;
			}

			m->vmp_busy = TRUE;

			/*
			 * remove page from object here since we're already
			 * behind the object lock... defer the rest of the work
			 * we'd normally do in vm_page_free_prepare_object
			 * until 'vm_page_free_list' is called
			 */
			if (m->vmp_tabled) {
				vm_page_remove(m, TRUE);
			}

			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
			m->vmp_snext = local_freeq;
			local_freeq = m;
			local_freed++;

			if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
				vm_pageout_vminfo.vm_pageout_freed_speculative++;
			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
				vm_pageout_vminfo.vm_pageout_freed_cleaned++;
			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
				vm_pageout_vminfo.vm_pageout_freed_internal++;
			} else {
				vm_pageout_vminfo.vm_pageout_freed_external++;
			}

			inactive_burst_count = 0;

			lock_yield_check = TRUE;
			continue;
		}
		if (object->copy == VM_OBJECT_NULL) {
			/*
			 * No one else can have any interest in this page.
			 * If this is an empty purgable object, the page can be
			 * reclaimed even if dirty.
			 * If the page belongs to a volatile purgable object, we
			 * reactivate it if the compressor isn't active.
			 */
			if (object->purgable == VM_PURGABLE_EMPTY) {
				if (m->vmp_pmapped == TRUE) {
					/* unmap the page */
					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
					if (refmod_state & VM_MEM_MODIFIED) {
						SET_PAGE_DIRTY(m, FALSE);
					}
				}
				if (m->vmp_dirty || m->vmp_precious) {
					/* we saved the cost of cleaning this page ! */
					vm_page_purged_count++;
				}
				goto reclaim_page;
			}

			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
				/*
				 * With the VM compressor, the cost of
				 * reclaiming a page is much lower (no I/O),
				 * so if we find a "volatile" page, it's better
				 * to let it get compressed rather than letting
				 * it occupy a full page until it gets purged.
				 * So no need to check for "volatile" here.
				 */
			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
				/*
				 * Avoid cleaning a "volatile" page which might
				 * be purged soon.
				 */

				/* if it's wired, we can't put it on our queue */
				assert(!VM_PAGE_WIRED(m));

				/* just stick it back on! */
				reactivated_this_call++;

				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
				}

				goto reactivate_page;
			}
		}
		/*
		 * If it's being used, reactivate.
		 * (Fictitious pages are either busy or absent.)
		 * First, update the reference and dirty bits
		 * to make sure the page is unreferenced.
		 */
		refmod_state = -1;

		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		if (m->vmp_reference || m->vmp_dirty) {
			/* deal with a rogue "reusable" page */
			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
		}

		if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
			vm_pageout_state.vm_page_xpmapped_min = 0;
		} else {
			vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
		}

		if (!m->vmp_no_cache &&
		    page_from_bg_q == FALSE &&
		    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
		    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
			/*
			 * The page we pulled off the inactive list has
			 * been referenced. It is possible for other
			 * processors to be touching pages faster than we
			 * can clear the referenced bit and traverse the
			 * inactive queue, so we limit the number of
			 * reactivations.
			 */
			if (++reactivated_this_call >= reactivate_limit) {
				vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
				vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
			} else {
				uint32_t isinuse;

				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
				}

				vm_pageout_vminfo.vm_pageout_inactive_referenced++;
reactivate_page:
				if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
				    vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
					/*
					 * no explict mappings of this object exist
					 * and it's not open via the filesystem
					 */
					vm_page_deactivate(m);
					VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
				} else {
					/*
					 * The page was/is being used, so put back on active list.
					 */
					vm_page_activate(m);
					counter_inc(&vm_statistics_reactivations);
					inactive_burst_count = 0;
				}
#if DEVELOPMENT || DEBUG
				if (page_from_bg_q == TRUE) {
					if (m_object->internal) {
						vm_pageout_rejected_bq_internal++;
					} else {
						vm_pageout_rejected_bq_external++;
					}
				}
#endif /* DEVELOPMENT || DEBUG */

				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
				}
				vm_pageout_state.vm_pageout_inactive_used++;

				lock_yield_check = TRUE;
				continue;
			}
			/*
			 * Make sure we call pmap_get_refmod() if it
			 * wasn't already called just above, to update
			 * the dirty bit.
			 */
			if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
				refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
				if (refmod_state & VM_MEM_MODIFIED) {
					SET_PAGE_DIRTY(m, FALSE);
				}
			}
		}

		/*
		 * we've got a candidate page to steal...
		 *
		 * m->vmp_dirty is up to date courtesy of the
		 * preceding check for m->vmp_reference... if
		 * we get here, then m->vmp_reference had to be
		 * FALSE (or possibly "reactivate_limit" was
		 * exceeded), but in either case we called
		 * pmap_get_refmod() and updated both
		 * m->vmp_reference and m->vmp_dirty
		 *
		 * if it's dirty or precious we need to
		 * see if the target queue is throtttled
		 * it if is, we need to skip over it by moving it back
		 * to the end of the inactive queue
		 */

		inactive_throttled = FALSE;

		if (m->vmp_dirty || m->vmp_precious) {
			if (object->internal) {
				if (VM_PAGE_Q_THROTTLED(iq)) {
					inactive_throttled = TRUE;
				}
			} else if (VM_PAGE_Q_THROTTLED(eq)) {
				inactive_throttled = TRUE;
			}
		}
throttle_inactive:
		if (!VM_DYNAMIC_PAGING_ENABLED() &&
		    object->internal && m->vmp_dirty &&
		    (object->purgable == VM_PURGABLE_DENY ||
		    object->purgable == VM_PURGABLE_NONVOLATILE ||
		    object->purgable == VM_PURGABLE_VOLATILE)) {
			/*
			 * no dynamic paging available: dirty internal pages
			 * are parked on the throttled queue instead
			 */
			vm_page_check_pageable_safe(m);
			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
			vm_page_throttled_count++;

			VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);

			inactive_burst_count = 0;

			lock_yield_check = TRUE;
			continue;
		}
		if (inactive_throttled == TRUE) {
			/* may drop the object lock and clear 'object' on jetsam configs */
			vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
			    &delayed_unlock, &force_anonymous, page_from_bg_q);

			inactive_burst_count = 0;

			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
			}

			lock_yield_check = TRUE;
			continue;
		}

		/*
		 * we've got a page that we can steal...
		 * eliminate all mappings and make sure
		 * we have the up-to-date modified state
		 *
		 * if we need to do a pmap_disconnect then we
		 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
		 * provides the true state atomically... the
		 * page was still mapped up to the pmap_disconnect
		 * and may have been dirtied at the last microsecond
		 *
		 * Note that if 'pmapped' is FALSE then the page is not
		 * and has not been in any map, so there is no point calling
		 * pmap_disconnect(). m->vmp_dirty could have been set in anticipation
		 * of likely usage of the page.
		 */
		if (m->vmp_pmapped == TRUE) {
			int pmap_options;

			/*
			 * Don't count this page as going into the compressor
			 * if any of these are true:
			 * 1) compressed pager isn't enabled
			 * 2) Freezer enabled device with compressed pager
			 *    backend (exclusive use) i.e. most of the VM system
			 *    (including vm_pageout_scan) has no knowledge of
			 *    the compressor
			 * 3) This page belongs to a file and hence will not be
			 *    sent into the compressor
			 */
			if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
			    object->internal == FALSE) {
				pmap_options = 0;
			} else if (m->vmp_dirty || m->vmp_precious) {
				/*
				 * VM knows that this page is dirty (or
				 * precious) and needs to be compressed
				 * rather than freed.
				 * Tell the pmap layer to count this page
				 * as "compressed".
				 */
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				/*
				 * VM does not know if the page needs to
				 * be preserved but the pmap layer might tell
				 * us if any mapping has "modified" it.
				 * Let's the pmap layer to count this page
				 * as compressed if and only if it has been
				 * modified.
				 */
				pmap_options =
				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
			    pmap_options,
			    NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		/*
		 * reset our count of pages that have been reclaimed
		 * since the last page was 'stolen'
		 */
		inactive_reclaim_run = 0;

		/*
		 * If it's clean and not precious, we can free the page.
		 */
		if (!m->vmp_dirty && !m->vmp_precious) {
			vm_pageout_state.vm_pageout_inactive_clean++;

			/*
			 * OK, at this point we have found a page we are going to free.
			 */
#if CONFIG_PHANTOM_CACHE
			if (!object->internal) {
				vm_phantom_cache_add_ghost(m);
			}
#endif
			goto reclaim_page;
		}

		/*
		 * The page may have been dirtied since the last check
		 * for a throttled target queue (which may have been skipped
		 * if the page was clean then). With the dirty page
		 * disconnected here, we can make one final check.
		 */
		if (object->internal) {
			if (VM_PAGE_Q_THROTTLED(iq)) {
				inactive_throttled = TRUE;
			}
		} else if (VM_PAGE_Q_THROTTLED(eq)) {
			inactive_throttled = TRUE;
		}

		if (inactive_throttled == TRUE) {
			goto throttle_inactive;
		}

#if VM_PRESSURE_EVENTS
#if CONFIG_JETSAM

		/*
		 * If Jetsam is enabled, then the sending
		 * of memory pressure notifications is handled
		 * from the same thread that takes care of high-water
		 * and other jetsams i.e. the memorystatus_thread.
		 */

#else /* CONFIG_JETSAM */

		vm_pressure_response();

#endif /* CONFIG_JETSAM */
#endif /* VM_PRESSURE_EVENTS */

		if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
			VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
		}

		if (object->internal) {
			vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
		} else {
			vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
		}

		/*
		 * internal pages will go to the compressor...
		 * external pages will go to the appropriate pager to be cleaned
		 * and upon completion will end up on 'vm_page_queue_cleaned' which
		 * is a preferred queue to steal from
		 */
		vm_pageout_cluster(m);
		inactive_burst_count = 0;

		/*
		 * back to top of pageout scan loop
		 */
	}
}
3770
3771
3772 void
vm_page_free_reserve(int pages)3773 vm_page_free_reserve(
3774 int pages)
3775 {
3776 int free_after_reserve;
3777
3778 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3779 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3780 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3781 } else {
3782 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3783 }
3784 } else {
3785 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3786 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3787 } else {
3788 vm_page_free_reserved += pages;
3789 }
3790 }
3791 free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3792
3793 vm_page_free_min = vm_page_free_reserved +
3794 VM_PAGE_FREE_MIN(free_after_reserve);
3795
3796 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3797 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3798 }
3799
3800 vm_page_free_target = vm_page_free_reserved +
3801 VM_PAGE_FREE_TARGET(free_after_reserve);
3802
3803 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3804 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3805 }
3806
3807 if (vm_page_free_target < vm_page_free_min + 5) {
3808 vm_page_free_target = vm_page_free_min + 5;
3809 }
3810
3811 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3812 }
3813
3814 /*
3815 * vm_pageout is the high level pageout daemon.
3816 */
3817
/*
 * vm_pageout_continue:
 *
 * Continuation body of the pageout daemon.  Each pass runs
 * vm_pageout_scan() and then blocks until free pages are wanted again,
 * restarting itself as its own thread_block() continuation.
 */
void
vm_pageout_continue(void)
{
	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);

	/* mark the daemon as running; protected by the free-page lock */
	vm_free_page_lock();
	vm_pageout_running = TRUE;
	vm_free_page_unlock();

	vm_pageout_scan();
	/*
	 * we hold both the vm_page_queue_free_lock
	 * and the vm_page_queues_lock at this point
	 */
	assert(vm_page_free_wanted == 0);
	assert(vm_page_free_wanted_privileged == 0);
	/* arm the wakeup before dropping the locks so no request can be lost */
	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);

	vm_pageout_running = FALSE;
#if XNU_TARGET_OS_OSX
	/* notify anyone blocked in vm_pageout_wait() that this pass is complete */
	if (vm_pageout_waiter) {
		vm_pageout_waiter = FALSE;
		thread_wakeup((event_t)&vm_pageout_waiter);
	}
#endif /* XNU_TARGET_OS_OSX */

	vm_free_page_unlock();
	vm_page_unlock_queues();

	/* sleep; when woken, re-enter this function from the top */
	thread_block((thread_continue_t)vm_pageout_continue);
	/*NOTREACHED*/
}
3851
#if XNU_TARGET_OS_OSX
/*
 * vm_pageout_wait:
 *
 * Block the caller until the pageout daemon completes its current pass
 * (vm_pageout_running goes FALSE) or 'deadline' (mach absolute time)
 * expires.
 *
 * Returns KERN_SUCCESS when the daemon went idle, or
 * KERN_OPERATION_TIMED_OUT if the deadline fired first.
 */
kern_return_t
vm_pageout_wait(uint64_t deadline)
{
	kern_return_t kr;

	vm_free_page_lock();
	for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
		/* flag is consumed/cleared by vm_pageout_continue() under this lock */
		vm_pageout_waiter = TRUE;
		if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
			    &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
			    (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
			kr = KERN_OPERATION_TIMED_OUT;
		}
	}
	vm_free_page_unlock();

	return kr;
}
#endif /* XNU_TARGET_OS_OSX */
3872
3873
/*
 * vm_pageout_iothread_external_continue:
 *
 * Continuation body of the external pageout I/O thread.  Drains pages
 * from the external pageout queue 'q', handing each one to its object's
 * pager via memory_object_data_return().  When the queue is empty the
 * thread blocks on &q->pgo_pending and restarts itself as its own
 * continuation.
 */
static void
vm_pageout_iothread_external_continue(struct vm_pageout_queue *q, __unused wait_result_t w)
{
	vm_page_t m = NULL;
	vm_object_t object;
	vm_object_offset_t offset;
	memory_object_t pager;

	/* On systems with a compressor, the external IO thread clears its
	 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
	 * creation)
	 */
	if (vm_pageout_state.vm_pageout_internal_iothread != THREAD_NULL) {
		current_thread()->options &= ~TH_OPT_VMPRIV;
	}

	vm_page_lockspin_queues();

	while (!vm_page_queue_empty(&q->pgo_pending)) {
		q->pgo_busy = TRUE;
		vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);

		assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
		VM_PAGE_CHECK(m);
		/*
		 * grab a snapshot of the object and offset this
		 * page is tabled in so that we can relookup this
		 * page after we've taken the object lock - these
		 * fields are stable while we hold the page queues lock
		 * but as soon as we drop it, there is nothing to keep
		 * this page in this object... we hold an activity_in_progress
		 * on this object which will keep it from terminating
		 */
		object = VM_PAGE_OBJECT(m);
		offset = m->vmp_offset;

		m->vmp_q_state = VM_PAGE_NOT_ON_Q;
		VM_PAGE_ZERO_PAGEQ_ENTRY(m);

		vm_page_unlock_queues();

		vm_object_lock(object);

		/* revalidate: the queues lock was dropped, the page may have moved */
		m = vm_page_lookup(object, offset);

		if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
		    !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
			/*
			 * it's either the same page that someone else has
			 * started cleaning (or it's finished cleaning or
			 * been put back on the pageout queue), or
			 * the page has been freed or we have found a
			 * new page at this offset... in all of these cases
			 * we merely need to release the activity_in_progress
			 * we took when we put the page on the pageout queue
			 */
			vm_object_activity_end(object);
			vm_object_unlock(object);

			vm_page_lockspin_queues();
			continue;
		}
		pager = object->pager;

		if (pager == MEMORY_OBJECT_NULL) {
			/*
			 * This pager has been destroyed by either
			 * memory_object_destroy or vm_object_destroy, and
			 * so there is nowhere for the page to go.
			 */
			if (m->vmp_free_when_done) {
				/*
				 * Just free the page... VM_PAGE_FREE takes
				 * care of cleaning up all the state...
				 * including doing the vm_pageout_throttle_up
				 */
				VM_PAGE_FREE(m);
			} else {
				/* keep the page; clear its laundry state and reactivate it */
				vm_page_lockspin_queues();

				vm_pageout_throttle_up(m);
				vm_page_activate(m);

				vm_page_unlock_queues();

				/*
				 * And we are done with it.
				 */
			}
			vm_object_activity_end(object);
			vm_object_unlock(object);

			vm_page_lockspin_queues();
			continue;
		}
#if 0
		/*
		 * we don't hold the page queue lock
		 * so this check isn't safe to make
		 */
		VM_PAGE_CHECK(m);
#endif
		/*
		 * give back the activity_in_progress reference we
		 * took when we queued up this page and replace it
		 * it with a paging_in_progress reference that will
		 * also hold the paging offset from changing and
		 * prevent the object from terminating
		 */
		vm_object_activity_end(object);
		vm_object_paging_begin(object);
		vm_object_unlock(object);

		/*
		 * Send the data to the pager.
		 * any pageout clustering happens there
		 */
		memory_object_data_return(pager,
		    m->vmp_offset + object->paging_offset,
		    PAGE_SIZE,
		    NULL,
		    NULL,
		    FALSE,
		    FALSE,
		    0);

		vm_object_lock(object);
		vm_object_paging_end(object);
		vm_object_unlock(object);

		/* pace ourselves so we don't saturate the paging device */
		vm_pageout_io_throttle();

		vm_page_lockspin_queues();
	}
	/* queue drained (queues lock held): mark idle and park until more work arrives */
	q->pgo_busy = FALSE;
	q->pgo_idle = TRUE;

	assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
	vm_page_unlock_queues();

	thread_block_parameter((thread_continue_t)vm_pageout_iothread_external_continue, (void *) q);
	/*NOTREACHED*/
}
4017
4018
/* max number of locally-batched compressed pages before they're freed in bulk */
#define MAX_FREE_BATCH 32

uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
                                     * this thread.
                                     */


/* forward declaration: continuation body of the compressor (internal pageout) threads */
void
vm_pageout_iothread_internal_continue(struct cq *, __unused wait_result_t);
/*
 * vm_pageout_iothread_internal_continue:
 *
 * Continuation body of one compressor (internal pageout) thread.
 * Repeatedly pulls a batch of pages off its pageout queue into a local
 * list (under the page queues lock), then compresses each page via
 * vm_pageout_compress_page() with the lock dropped, batching the freed
 * pages up to MAX_FREE_BATCH before releasing them.  When the queue is
 * empty it parks on its per-thread event and restarts as its own
 * continuation.
 */
void
vm_pageout_iothread_internal_continue(struct cq *cq, __unused wait_result_t w)
{
	struct vm_pageout_queue *q;
	vm_page_t m = NULL;
	boolean_t pgo_draining;
	vm_page_t local_q;		/* pages claimed from q, private to this thread */
	int local_cnt;
	vm_page_t local_freeq = NULL;	/* compressed pages waiting to be freed in bulk */
	int local_freed = 0;
	int local_batch_size;
#if DEVELOPMENT || DEBUG
	int ncomps = 0;
	boolean_t marked_active = FALSE;
	int num_pages_processed = 0;
#endif
	void *chead = NULL;

	/* closes the idle interval opened before the last thread_block below */
	KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);

	q = cq->q;
#if DEVELOPMENT || DEBUG
	bool benchmark_accounting = false;
	/*
	 * If we're running the compressor perf test, only process the benchmark pages.
	 * We'll get back to our regular queue once the benchmark is done
	 */
	if (compressor_running_perf_test) {
		q = cq->benchmark_q;
		if (!vm_page_queue_empty(&q->pgo_pending)) {
			benchmark_accounting = true;
		} else {
			q = cq->q;
			benchmark_accounting = false;
		}
	}
#endif /* DEVELOPMENT || DEBUG */

#if __AMP__
	if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
		local_batch_size = (q->pgo_maxlaundry >> 3);
		local_batch_size = MAX(local_batch_size, 16);
	} else {
		local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
	}
#else
	local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
#endif

#if RECORD_THE_COMPRESSED_DATA
	if (q->pgo_laundry) {
		c_compressed_record_init();
	}
#endif
	while (TRUE) {
		int pages_left_on_q = 0;

		local_cnt = 0;
		local_q = NULL;

		KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);

		vm_page_lock_queues();
#if DEVELOPMENT || DEBUG
		if (marked_active == FALSE) {
			vmct_active++;
			vmct_state[cq->id] = VMCT_ACTIVE;
			marked_active = TRUE;
			if (vmct_active == 1) {
				/* first compressor thread to go active starts the epoch clock */
				vm_compressor_epoch_start = mach_absolute_time();
			}
		}
#endif
		KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);

		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);

		/* claim up to local_batch_size pages onto the private local list */
		while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
			vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
			assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
			VM_PAGE_CHECK(m);

			m->vmp_q_state = VM_PAGE_NOT_ON_Q;
			VM_PAGE_ZERO_PAGEQ_ENTRY(m);
			m->vmp_laundry = FALSE;

			m->vmp_snext = local_q;
			local_q = m;
			local_cnt++;
		}
		if (local_q == NULL) {
			/* queue empty: fall out to the parking path below (queues lock held) */
			break;
		}

		q->pgo_busy = TRUE;

		if ((pgo_draining = q->pgo_draining) == FALSE) {
			vm_pageout_throttle_up_batch(q, local_cnt);
			pages_left_on_q = q->pgo_laundry;
		} else {
			/* draining: defer the throttle_up until the batch is processed */
			pages_left_on_q = q->pgo_laundry - local_cnt;
		}

		vm_page_unlock_queues();

#if !RECORD_THE_COMPRESSED_DATA
		/* enough work remains: kick the next compressor thread to help */
		if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
			thread_wakeup((event_t) ((uintptr_t)&cq->q->pgo_pending + cq->id + 1));
		}
#endif
		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);

		while (local_q) {
			KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);

			m = local_q;
			local_q = m->vmp_snext;
			m->vmp_snext = NULL;

			/*
			 * Technically we need the pageq locks to manipulate this field.
			 * However, this page has been removed from all queues and is only
			 * known to this compressor thread dealing with this local queue.
			 *
			 * TODO LIONEL: Add a second localq that is the early localq and
			 * put special pages like this one on that queue in the block above
			 * under the pageq lock to avoid this 'works but not clean' logic.
			 */
			void *donate_queue_head;
#if XNU_TARGET_OS_OSX
			donate_queue_head = &cq->current_early_swapout_chead;
#else /* XNU_TARGET_OS_OSX */
			donate_queue_head = &cq->current_late_swapout_chead;
#endif /* XNU_TARGET_OS_OSX */
			if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
				chead = donate_queue_head;
			} else {
				chead = &cq->current_regular_swapout_chead;
			}

			if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
#if DEVELOPMENT || DEBUG
				ncomps++;
#endif
				KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);

				/* compressed OK: stash the page for a batched free */
				m->vmp_snext = local_freeq;
				local_freeq = m;
				local_freed++;

				if (local_freed >= MAX_FREE_BATCH) {
					OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

					vm_page_free_list(local_freeq, TRUE);

					local_freeq = NULL;
					local_freed = 0;
				}
			}
#if DEVELOPMENT || DEBUG
			num_pages_processed++;
#endif /* DEVELOPMENT || DEBUG */
#if !CONFIG_JETSAM
			/*
			 * free memory is critically low: first flush our own batched
			 * pages, then wait for the pageout daemon to replenish the pool
			 */
			while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
				kern_return_t wait_result;
				int need_wakeup = 0;

				if (local_freeq) {
					OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

					vm_page_free_list(local_freeq, TRUE);
					local_freeq = NULL;
					local_freed = 0;

					continue;
				}
				vm_free_page_lock_spin();

				if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
					if (vm_page_free_wanted_privileged++ == 0) {
						need_wakeup = 1;
					}
					wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);

					vm_free_page_unlock();

					if (need_wakeup) {
						thread_wakeup((event_t)&vm_page_free_wanted);
					}

					if (wait_result == THREAD_WAITING) {
						thread_block(THREAD_CONTINUE_NULL);
					}
				} else {
					vm_free_page_unlock();
				}
			}
#endif
		}
		/* flush any remaining batched pages before going back for more work */
		if (local_freeq) {
			OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

			vm_page_free_list(local_freeq, TRUE);
			local_freeq = NULL;
			local_freed = 0;
		}
		if (pgo_draining == TRUE) {
			/* deferred throttle_up now that the drained batch is done */
			vm_page_lockspin_queues();
			vm_pageout_throttle_up_batch(q, local_cnt);
			vm_page_unlock_queues();
		}
	}
	KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * queue lock is held and our q is empty
	 */
	q->pgo_busy = FALSE;
	q->pgo_idle = TRUE;

	assert_wait((event_t) ((uintptr_t)&cq->q->pgo_pending + cq->id), THREAD_UNINT);
#if DEVELOPMENT || DEBUG
	if (marked_active == TRUE) {
		vmct_active--;
		vmct_state[cq->id] = VMCT_IDLE;

		if (vmct_active == 0) {
			/* last active compressor thread closes the epoch */
			vm_compressor_epoch_stop = mach_absolute_time();
			assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
			    "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
			    vm_compressor_epoch_start, vm_compressor_epoch_stop);
			/* This interval includes intervals where one or more
			 * compressor threads were pre-empted
			 */
			vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
		}
	}
	if (compressor_running_perf_test && benchmark_accounting) {
		/*
		 * We could turn ON compressor_running_perf_test while still processing
		 * regular non-benchmark pages. We shouldn't count them here else we
		 * could overshoot. We might also still be populating that benchmark Q
		 * and be under pressure. So we will go back to the regular queues. And
		 * benchmark accounting will be off for that case too.
		 */
		compressor_perf_test_pages_processed += num_pages_processed;
		thread_wakeup(&compressor_perf_test_pages_processed);
	}
#endif
	vm_page_unlock_queues();
#if DEVELOPMENT || DEBUG
	if (__improbable(vm_compressor_time_thread)) {
		vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
		vmct_stats.vmct_pages[cq->id] += ncomps;
		vmct_stats.vmct_iterations[cq->id]++;
		if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
			vmct_stats.vmct_maxpages[cq->id] = ncomps;
		}
		if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
			vmct_stats.vmct_minpages[cq->id] = ncomps;
		}
	}
#endif

	KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);

	thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
	/*NOTREACHED*/
}
4297
4298
/*
 * vm_pageout_compress_page:
 *
 * Hand page 'm' to the compressor pager of its object, creating the
 * pager first if the object doesn't have one yet.  On success the page
 * is removed from its object (caller frees it); on failure the page is
 * reactivated.  In both cases the activity_in_progress reference taken
 * when the page was queued is released and the object ends up unlocked.
 *
 * current_chead / scratch_buf are per-compressor-thread state passed
 * through to vm_compressor_pager_put().
 *
 * Returns KERN_SUCCESS if the page was compressed, KERN_FAILURE (or the
 * pager's error) otherwise.
 */
kern_return_t
vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
{
	vm_object_t object;
	memory_object_t pager;
	int compressed_count_delta;
	kern_return_t retval;

	object = VM_PAGE_OBJECT(m);

	assert(!m->vmp_free_when_done);
	assert(!m->vmp_laundry);

	pager = object->pager;

	if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);

		vm_object_lock(object);

		/*
		 * If there is no memory object for the page, create
		 * one and hand it to the compression pager.
		 */

		if (!object->pager_initialized) {
			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
		}
		if (!object->pager_initialized) {
			vm_object_compressor_pager_create(object);
		}

		/* re-read: the create/collapse above may have installed a pager */
		pager = object->pager;

		if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
			/*
			 * Still no pager for the object,
			 * or the pager has been destroyed.
			 * Reactivate the page.
			 *
			 * Should only happen if there is no
			 * compression pager
			 */
			PAGE_WAKEUP_DONE(m);

			vm_page_lockspin_queues();
			vm_page_activate(m);
			VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
			vm_page_unlock_queues();

			/*
			 * And we are done with it.
			 */
			vm_object_activity_end(object);
			vm_object_unlock(object);

			return KERN_FAILURE;
		}
		vm_object_unlock(object);

		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
	}
	assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
	assert(object->activity_in_progress > 0);

	/* object is unlocked across the actual compression */
	retval = vm_compressor_pager_put(
		pager,
		m->vmp_offset + object->paging_offset,
		VM_PAGE_GET_PHYS_PAGE(m),
		current_chead,
		scratch_buf,
		&compressed_count_delta);

	vm_object_lock(object);

	assert(object->activity_in_progress > 0);
	assert(VM_PAGE_OBJECT(m) == object);
	assert( !VM_PAGE_WIRED(m));

	vm_compressor_pager_count(pager,
	    compressed_count_delta,
	    FALSE, /* shared_lock */
	    object);

	if (retval == KERN_SUCCESS) {
		/*
		 * If the object is purgeable, its owner's
		 * purgeable ledgers will be updated in
		 * vm_page_remove() but the page still
		 * contributes to the owner's memory footprint,
		 * so account for it as such.
		 */
		if ((object->purgable != VM_PURGABLE_DENY ||
		    object->vo_ledger_tag) &&
		    object->vo_owner != NULL) {
			/* one more compressed purgeable/tagged page */
			vm_object_owner_compressed_update(object,
			    +1);
		}
		counter_inc(&vm_statistics_compressions);

		if (m->vmp_tabled) {
			vm_page_remove(m, TRUE);
		}
	} else {
		/* compression failed: wake waiters and put the page back in circulation */
		PAGE_WAKEUP_DONE(m);

		vm_page_lockspin_queues();

		vm_page_activate(m);
		vm_pageout_vminfo.vm_compressor_failed++;

		vm_page_unlock_queues();
	}
	vm_object_activity_end(object);
	vm_object_unlock(object);

	return retval;
}
4418
4419
/*
 * vm_pageout_adjust_eq_iothrottle:
 *
 * Switch the external pageout queue's I/O thread between throttled and
 * unthrottled I/O policy.  Called with the page queues lock held; the
 * lock is dropped around the policy update and reacquired before the
 * queue's state is recorded.  While hibernation cleaning is in progress
 * the thread is always kept unthrottled.
 */
static void
vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *eq, boolean_t req_lowpriority)
{
	uint32_t policy;

	if (hibernate_cleaning_in_progress == TRUE) {
		req_lowpriority = FALSE;
	}

	/* only act once the queue is initialized and the priority actually changes */
	if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority) {
		vm_page_unlock_queues();

		if (req_lowpriority == TRUE) {
			policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
			DTRACE_VM(laundrythrottle);
		} else {
			policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
			DTRACE_VM(laundryunthrottle);
		}
		proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid,
		    TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);

		vm_page_lock_queues();
		/* record the new state under the reacquired queues lock */
		eq->pgo_lowpriority = req_lowpriority;
	}
}
4446
4447
/*
 * vm_pageout_iothread_external:
 *
 * One-time entry point for the external pageout I/O thread.  Sets the
 * thread's privilege and I/O throttle policy, publishes its identity in
 * vm_pageout_queue_external, then enters the continuation loop (which
 * never returns).
 */
static void
vm_pageout_iothread_external(__unused struct cq *c, __unused wait_result_t w)
{
	thread_t self = current_thread();

	/* allow this thread to dip into the VM-privileged page reserve */
	self->options |= TH_OPT_VMPRIV;

	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);

	/* start out with throttled pageout I/O */
	proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
	    TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);

	vm_page_lock_queues();

	vm_pageout_queue_external.pgo_tid = self->thread_id;
	vm_pageout_queue_external.pgo_lowpriority = TRUE;
	vm_pageout_queue_external.pgo_inited = TRUE;

	vm_page_unlock_queues();

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	vm_pageout_iothread_external_continue(&vm_pageout_queue_external, 0);

	/*NOTREACHED*/
}
4476
4477
/*
 * vm_pageout_iothread_internal:
 *
 * One-time entry point for a compressor (internal pageout) thread.
 * Publishes the thread's identity in vm_pageout_queue_internal (and
 * mirrors it to the benchmark queue on DEVELOPMENT/DEBUG builds),
 * applies binding/thread-group policy, then enters the continuation
 * loop (which never returns).
 */
static void
vm_pageout_iothread_internal(struct cq *cq, __unused wait_result_t w)
{
	thread_t self = current_thread();

	/* allow this thread to dip into the VM-privileged page reserve */
	self->options |= TH_OPT_VMPRIV;

	vm_page_lock_queues();

	vm_pageout_queue_internal.pgo_tid = self->thread_id;
	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
	vm_pageout_queue_internal.pgo_inited = TRUE;

#if DEVELOPMENT || DEBUG
	/* the benchmark queue is serviced by this same thread */
	vm_pageout_queue_benchmark.pgo_tid = vm_pageout_queue_internal.pgo_tid;
	vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
	vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
	vm_pageout_queue_benchmark.pgo_idle = TRUE;
	vm_pageout_queue_benchmark.pgo_busy = FALSE;
#endif /* DEVELOPMENT || DEBUG */

	vm_page_unlock_queues();

	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
		thread_vm_bind_group_add();
	}

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
	if (vm_compressor_ebound) {
		/*
		 * Use the soft bound option for vm_compressor to allow it to run on
		 * P-cores if E-cluster is unavailable.
		 */
		thread_bind_cluster_type(self, 'E', true);
	}
#endif /* __AMP__ */

	thread_set_thread_name(current_thread(), "VM_compressor");
#if DEVELOPMENT || DEBUG
	/* start the min-pages stat high so the first sample establishes it */
	vmct_stats.vmct_minpages[cq->id] = INT32_MAX;
#endif
	vm_pageout_iothread_internal_continue(cq, 0);

	/*NOTREACHED*/
}
4527
4528 kern_return_t
vm_set_buffer_cleanup_callout(boolean_t (* func)(int))4529 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4530 {
4531 if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4532 return KERN_SUCCESS;
4533 } else {
4534 return KERN_FAILURE; /* Already set */
4535 }
4536 }
4537
4538 extern boolean_t memorystatus_manual_testing_on;
4539 extern unsigned int memorystatus_level;
4540
4541
4542 #if VM_PRESSURE_EVENTS
4543
4544 boolean_t vm_pressure_events_enabled = FALSE;
4545
4546 extern uint64_t next_warning_notification_sent_at_ts;
4547 extern uint64_t next_critical_notification_sent_at_ts;
4548
4549 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */
4550
4551 /*
4552 * The last time there was change in pressure level OR we forced a check
4553 * because the system is stuck in a non-normal pressure level.
4554 */
4555 uint64_t vm_pressure_last_level_transition_abs = 0;
4556
4557 /*
4558 * This is how the long the system waits 'stuck' in an unchanged non-normal pressure
4559 * level before resending out notifications for that level again.
4560 */
4561 int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4562
/*
 * vm_pressure_response:
 *
 * Re-evaluate the system memory pressure level.  Recomputes
 * memorystatus_level from available memory, runs the pressure-level
 * state machine (normal / warning / critical), and wakes the pressure
 * notification thread when the level changes — or when the system has
 * been stuck at a non-normal level longer than
 * vm_pressure_level_transition_threshold minutes (force_check).
 */
void
vm_pressure_response(void)
{
	vm_pressure_level_t old_level = kVMPressureNormal;
	int new_level = -1;	/* -1 means no level transition this pass */
	unsigned int total_pages;
	uint64_t available_memory = 0;
	uint64_t curr_ts, abs_time_since_level_transition, time_in_ns;
	bool force_check = false;
	int time_in_mins;


	if (vm_pressure_events_enabled == FALSE) {
		return;
	}

#if !XNU_TARGET_OS_OSX

	available_memory = (uint64_t) memorystatus_available_pages;

#else /* !XNU_TARGET_OS_OSX */

	available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
	memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;

#endif /* !XNU_TARGET_OS_OSX */

	total_pages = (unsigned int) atop_64(max_mem);
#if CONFIG_SECLUDED_MEMORY
	total_pages -= vm_page_secluded_count;
#endif /* CONFIG_SECLUDED_MEMORY */
	/* percentage of memory still available */
	memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);

	if (memorystatus_manual_testing_on) {
		/* manual testing drives the pressure level; don't fight it */
		return;
	}

	curr_ts = mach_absolute_time();
	abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;

	absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
	time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
	/* stuck too long at the current level: re-notify even without a transition */
	force_check = (time_in_mins >= vm_pressure_level_transition_threshold);

	old_level = memorystatus_vm_pressure_level;

	switch (memorystatus_vm_pressure_level) {
	case kVMPressureNormal:
	{
		if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			new_level = kVMPressureCritical;
		} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
			new_level = kVMPressureWarning;
		}
		break;
	}

	case kVMPressureWarning:
	case kVMPressureUrgent:
	{
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			new_level = kVMPressureNormal;
		} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			new_level = kVMPressureCritical;
		} else if (force_check) {
			/* re-announce the same level and rearm the warning timestamp */
			new_level = kVMPressureWarning;
			next_warning_notification_sent_at_ts = curr_ts;
		}
		break;
	}

	case kVMPressureCritical:
	{
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			new_level = kVMPressureNormal;
		} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
			new_level = kVMPressureWarning;
		} else if (force_check) {
			/* re-announce the same level and rearm the critical timestamp */
			new_level = kVMPressureCritical;
			next_critical_notification_sent_at_ts = curr_ts;
		}
		break;
	}

	default:
		return;
	}

	if (new_level != -1 || force_check) {
		if (new_level != -1) {
			memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;

			if (new_level != (int) old_level) {
				VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
				    new_level, old_level, 0, 0);
			}
		} else {
			VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
			    new_level, old_level, force_check, 0);
		}

		if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
			/*
			 * We don't want to schedule a wakeup while hibernation is in progress
			 * because that could collide with checks for non-monotonicity in the scheduler.
			 * We do however do all the updates to memorystatus_vm_pressure_level because
			 * we _might_ want to use that for decisions regarding which pages or how
			 * many pages we want to dump in hibernation.
			 */
			return;
		}

		if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
			if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
				thread_wakeup(&vm_pressure_thread);
			}

			if (old_level != memorystatus_vm_pressure_level) {
				thread_wakeup(&vm_pageout_state.vm_pressure_changed);
			}
			vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
		}
	}
}
4687 #endif /* VM_PRESSURE_EVENTS */
4688
4689 /*
4690 * Function called by a kernel thread to either get the current pressure level or
4691 * wait until memory pressure changes from a given level.
4692 */
/*
 * mach_vm_pressure_level_monitor:
 *
 * If 'wait_for_pressure' is FALSE, return the current pressure level in
 * *pressure_level.  If TRUE, block until the pressure level changes
 * from *pressure_level, then return the new level.  As a special case,
 * *pressure_level == kVMPressureJetsam (wait only) blocks until a
 * foreground-band jetsam occurs.
 *
 * Returns KERN_SUCCESS, KERN_INVALID_ARGUMENT on a NULL pointer or an
 * unsupported jetsam query, KERN_ABORTED if the wait was interrupted,
 * or KERN_FAILURE when VM_PRESSURE_EVENTS is not configured.
 */
kern_return_t
mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
{
#if !VM_PRESSURE_EVENTS

	return KERN_FAILURE;

#else /* VM_PRESSURE_EVENTS */

	wait_result_t wr = 0;
	vm_pressure_level_t old_level = memorystatus_vm_pressure_level;

	if (pressure_level == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (*pressure_level == kVMPressureJetsam) {
		/* jetsam level can only be waited for, never polled */
		if (!wait_for_pressure) {
			return KERN_INVALID_ARGUMENT;
		}

		lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
		wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
		    THREAD_INTERRUPTIBLE);
		if (wr == THREAD_WAITING) {
			/* count ourselves as a waiter before blocking */
			++memorystatus_jetsam_fg_band_waiters;
			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
			wr = thread_block(THREAD_CONTINUE_NULL);
		} else {
			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
		}
		if (wr != THREAD_AWAKENED) {
			return KERN_ABORTED;
		}
		*pressure_level = kVMPressureJetsam;
		return KERN_SUCCESS;
	}

	if (wait_for_pressure == TRUE) {
		/* loop: wakeups can be spurious relative to the caller's level */
		while (old_level == *pressure_level) {
			wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
			    THREAD_INTERRUPTIBLE);
			if (wr == THREAD_WAITING) {
				wr = thread_block(THREAD_CONTINUE_NULL);
			}
			if (wr == THREAD_INTERRUPTED) {
				return KERN_ABORTED;
			}

			if (wr == THREAD_AWAKENED) {
				old_level = memorystatus_vm_pressure_level;
			}
		}
	}

	*pressure_level = old_level;
	return KERN_SUCCESS;
#endif /* VM_PRESSURE_EVENTS */
}
4752
#if VM_PRESSURE_EVENTS
/*
 * vm_pressure_thread:
 *
 * Pressure-notification thread, run as its own continuation.  The very
 * first invocation only performs setup (thread group, name) and parks;
 * every subsequent wakeup dispatches pressure events via
 * consider_vm_pressure_events() and parks again.
 */
void
vm_pressure_thread(void)
{
	/* FALSE only on the very first pass, which is setup-only */
	static boolean_t thread_initialized = FALSE;

	if (thread_initialized == TRUE) {
		vm_pageout_state.vm_pressure_thread_running = TRUE;
		consider_vm_pressure_events();
		vm_pageout_state.vm_pressure_thread_running = FALSE;
	}

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	thread_set_thread_name(current_thread(), "VM_pressure");
	thread_initialized = TRUE;
	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
	thread_block((thread_continue_t)vm_pressure_thread);
}
#endif /* VM_PRESSURE_EVENTS */
4775
4776
4777 /*
4778 * called once per-second via "compute_averages"
4779 */
4780 void
compute_pageout_gc_throttle(__unused void * arg)4781 compute_pageout_gc_throttle(__unused void *arg)
4782 {
4783 if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4784 vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4785
4786 thread_wakeup(VM_PAGEOUT_GC_EVENT);
4787 }
4788 }
4789
4790 /*
4791 * vm_pageout_garbage_collect can also be called when the zone allocator needs
4792 * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4793 * jetsams. We need to check if the zone map size is above its jetsam limit to
4794 * decide if this was indeed the case.
4795 *
4796 * We need to do this on a different thread because of the following reasons:
4797 *
4798 * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4799 * itself causing the system to hang. We perform synchronous jetsams if we're
4800 * leaking in the VM map entries zone, so the leaking process could be doing a
4801 * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4802 * jetsam itself. We also need the vm_map lock on the process termination path,
4803 * which would now lead the dying process to deadlock against itself.
4804 *
4805 * 2. The jetsam path might need to allocate zone memory itself. We could try
4806 * using the non-blocking variant of zalloc for this path, but we can still
4807 * end up trying to do a kmem_alloc when the zone maps are almost full.
4808 */
__dead2
void
vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
{
	/* step selects the mode: one-time init vs. a GC pass (see block comment above) */
	assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);

	if (step == VM_PAGEOUT_GC_INIT) {
		/* first time being called is not about GC */
#if CONFIG_THREAD_GROUPS
		thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */
	} else if (zone_map_nearing_exhaustion()) {
		/*
		 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
		 *
		 * Bail out after calling zone_gc (which triggers the
		 * zone-map-exhaustion jetsams). If we fall through, the subsequent
		 * operations that clear out a bunch of caches might allocate zone
		 * memory themselves (for eg. vm_map operations would need VM map
		 * entries). Since the zone map is almost full at this point, we
		 * could end up with a panic. We just need to quickly jetsam a
		 * process and exit here.
		 *
		 * It could so happen that we were woken up to relieve memory
		 * pressure and the zone map also happened to be near its limit at
		 * the time, in which case we'll skip out early. But that should be
		 * ok; if memory pressure persists, the thread will simply be woken
		 * up again.
		 */
		zone_gc(ZONE_GC_JETSAM);
	} else {
		/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
		boolean_t buf_large_zfree = FALSE;
		boolean_t first_try = TRUE;

		stack_collect();

		consider_machine_collect();
		mbuf_drain(FALSE);

		/*
		 * Keep draining the buffer cache as long as it frees large zone
		 * buffers and the free page count remains below target.
		 */
		do {
			if (consider_buffer_cache_collect != NULL) {
				buf_large_zfree = (*consider_buffer_cache_collect)(0);
			}
			if (first_try == TRUE || buf_large_zfree == TRUE) {
				/*
				 * zone_gc should be last, because the other operations
				 * might return memory to zones.
				 */
				zone_gc(ZONE_GC_TRIM);
			}
			first_try = FALSE;
		} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);

		consider_machine_adjust();
	}

	/* park until the next wakeup; restart this continuation in GC mode */
	assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);

	thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
	__builtin_unreachable();
}
4871
4872
4873 #if VM_PAGE_BUCKETS_CHECK
4874 #if VM_PAGE_FAKE_BUCKETS
4875 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4876 #endif /* VM_PAGE_FAKE_BUCKETS */
4877 #endif /* VM_PAGE_BUCKETS_CHECK */
4878
4879
4880
4881 void
vm_set_restrictions(unsigned int num_cpus)4882 vm_set_restrictions(unsigned int num_cpus)
4883 {
4884 int vm_restricted_to_single_processor = 0;
4885
4886 if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
4887 kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
4888 vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
4889 } else {
4890 assert(num_cpus > 0);
4891
4892 if (num_cpus <= 3) {
4893 /*
4894 * on systems with a limited number of CPUS, bind the
4895 * 4 major threads that can free memory and that tend to use
4896 * a fair bit of CPU under pressured conditions to a single processor.
4897 * This insures that these threads don't hog all of the available CPUs
4898 * (important for camera launch), while allowing them to run independently
4899 * w/r to locks... the 4 threads are
4900 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
4901 * vm_compressor_swap_trigger_thread (minor and major compactions),
4902 * memorystatus_thread (jetsams).
4903 *
4904 * the first time the thread is run, it is responsible for checking the
4905 * state of vm_restricted_to_single_processor, and if TRUE it calls
4906 * thread_bind_master... someday this should be replaced with a group
4907 * scheduling mechanism and KPI.
4908 */
4909 vm_pageout_state.vm_restricted_to_single_processor = TRUE;
4910 } else {
4911 vm_pageout_state.vm_restricted_to_single_processor = FALSE;
4912 }
4913 }
4914 }
4915
4916 /*
4917 * Set up vm_config based on the vm_compressor_mode.
4918 * Must run BEFORE the pageout thread starts up.
4919 */
4920 __startup_func
4921 void
vm_config_init(void)4922 vm_config_init(void)
4923 {
4924 bzero(&vm_config, sizeof(vm_config));
4925
4926 switch (vm_compressor_mode) {
4927 case VM_PAGER_DEFAULT:
4928 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
4929 OS_FALLTHROUGH;
4930
4931 case VM_PAGER_COMPRESSOR_WITH_SWAP:
4932 vm_config.compressor_is_present = TRUE;
4933 vm_config.swap_is_present = TRUE;
4934 vm_config.compressor_is_active = TRUE;
4935 vm_config.swap_is_active = TRUE;
4936 break;
4937
4938 case VM_PAGER_COMPRESSOR_NO_SWAP:
4939 vm_config.compressor_is_present = TRUE;
4940 vm_config.swap_is_present = TRUE;
4941 vm_config.compressor_is_active = TRUE;
4942 break;
4943
4944 case VM_PAGER_FREEZER_DEFAULT:
4945 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
4946 OS_FALLTHROUGH;
4947
4948 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
4949 vm_config.compressor_is_present = TRUE;
4950 vm_config.swap_is_present = TRUE;
4951 break;
4952
4953 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
4954 vm_config.compressor_is_present = TRUE;
4955 vm_config.swap_is_present = TRUE;
4956 vm_config.compressor_is_active = TRUE;
4957 vm_config.freezer_swap_is_active = TRUE;
4958 break;
4959
4960 case VM_PAGER_NOT_CONFIGURED:
4961 break;
4962
4963 default:
4964 printf("unknown compressor mode - %x\n", vm_compressor_mode);
4965 break;
4966 }
4967 }
4968
4969 __startup_func
4970 static void
vm_pageout_create_gc_thread(void)4971 vm_pageout_create_gc_thread(void)
4972 {
4973 thread_t thread;
4974
4975 if (kernel_thread_create(vm_pageout_garbage_collect,
4976 VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
4977 panic("vm_pageout_garbage_collect: create failed");
4978 }
4979 thread_set_thread_name(thread, "VM_pageout_garbage_collect");
4980 if (thread->reserved_stack == 0) {
4981 assert(thread->kernel_stack);
4982 thread->reserved_stack = thread->kernel_stack;
4983 }
4984
4985 /* thread is started in vm_pageout() */
4986 vm_pageout_gc_thread = thread;
4987 }
4988 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
4989
4990 void
vm_pageout(void)4991 vm_pageout(void)
4992 {
4993 thread_t self = current_thread();
4994 thread_t thread;
4995 kern_return_t result;
4996 spl_t s;
4997
4998 /*
4999 * Set thread privileges.
5000 */
5001 s = splsched();
5002
5003 #if CONFIG_VPS_DYNAMIC_PRIO
5004
5005 int vps_dynprio_bootarg = 0;
5006
5007 if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) {
5008 vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? TRUE : FALSE);
5009 kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled);
5010 } else {
5011 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
5012 vps_dynamic_priority_enabled = TRUE;
5013 } else {
5014 vps_dynamic_priority_enabled = FALSE;
5015 }
5016 }
5017
5018 if (vps_dynamic_priority_enabled) {
5019 sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
5020 thread_set_eager_preempt(self);
5021 } else {
5022 sched_set_kernel_thread_priority(self, BASEPRI_VM);
5023 }
5024
5025 #else /* CONFIG_VPS_DYNAMIC_PRIO */
5026
5027 vps_dynamic_priority_enabled = FALSE;
5028 sched_set_kernel_thread_priority(self, BASEPRI_VM);
5029
5030 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
5031
5032 thread_lock(self);
5033 self->options |= TH_OPT_VMPRIV;
5034 thread_unlock(self);
5035
5036 if (!self->reserved_stack) {
5037 self->reserved_stack = self->kernel_stack;
5038 }
5039
5040 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
5041 vps_dynamic_priority_enabled == FALSE) {
5042 thread_vm_bind_group_add();
5043 }
5044
5045
5046 #if CONFIG_THREAD_GROUPS
5047 thread_group_vm_add();
5048 #endif /* CONFIG_THREAD_GROUPS */
5049
5050 #if __AMP__
5051 PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
5052 if (vm_pgo_pbound) {
5053 /*
5054 * Use the soft bound option for vm pageout to allow it to run on
5055 * E-cores if P-cluster is unavailable.
5056 */
5057 thread_bind_cluster_type(self, 'P', true);
5058 }
5059 #endif /* __AMP__ */
5060
5061 splx(s);
5062
5063 thread_set_thread_name(current_thread(), "VM_pageout_scan");
5064
5065 /*
5066 * Initialize some paging parameters.
5067 */
5068
5069 vm_pageout_state.vm_pressure_thread_running = FALSE;
5070 vm_pageout_state.vm_pressure_changed = FALSE;
5071 vm_pageout_state.memorystatus_purge_on_warning = 2;
5072 vm_pageout_state.memorystatus_purge_on_urgent = 5;
5073 vm_pageout_state.memorystatus_purge_on_critical = 8;
5074 vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
5075 vm_pageout_state.vm_page_speculative_percentage = 5;
5076 vm_pageout_state.vm_page_speculative_target = 0;
5077
5078 vm_pageout_state.vm_pageout_external_iothread = THREAD_NULL;
5079 vm_pageout_state.vm_pageout_internal_iothread = THREAD_NULL;
5080
5081 vm_pageout_state.vm_pageout_swap_wait = 0;
5082 vm_pageout_state.vm_pageout_idle_wait = 0;
5083 vm_pageout_state.vm_pageout_empty_wait = 0;
5084 vm_pageout_state.vm_pageout_burst_wait = 0;
5085 vm_pageout_state.vm_pageout_deadlock_wait = 0;
5086 vm_pageout_state.vm_pageout_deadlock_relief = 0;
5087 vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
5088
5089 vm_pageout_state.vm_pageout_inactive = 0;
5090 vm_pageout_state.vm_pageout_inactive_used = 0;
5091 vm_pageout_state.vm_pageout_inactive_clean = 0;
5092
5093 vm_pageout_state.vm_memory_pressure = 0;
5094 vm_pageout_state.vm_page_filecache_min = 0;
5095 #if CONFIG_JETSAM
5096 vm_pageout_state.vm_page_filecache_min_divisor = 70;
5097 vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
5098 #else
5099 vm_pageout_state.vm_page_filecache_min_divisor = 27;
5100 vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
5101 #endif
5102 vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
5103
5104 vm_pageout_state.vm_pageout_considered_page_last = 0;
5105
5106 if (vm_pageout_state.vm_pageout_swap_wait == 0) {
5107 vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
5108 }
5109
5110 if (vm_pageout_state.vm_pageout_idle_wait == 0) {
5111 vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
5112 }
5113
5114 if (vm_pageout_state.vm_pageout_burst_wait == 0) {
5115 vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
5116 }
5117
5118 if (vm_pageout_state.vm_pageout_empty_wait == 0) {
5119 vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
5120 }
5121
5122 if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
5123 vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
5124 }
5125
5126 if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
5127 vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
5128 }
5129
5130 if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
5131 vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
5132 }
5133 /*
5134 * even if we've already called vm_page_free_reserve
5135 * call it again here to insure that the targets are
5136 * accurately calculated (it uses vm_page_free_count_init)
5137 * calling it with an arg of 0 will not change the reserve
5138 * but will re-calculate free_min and free_target
5139 */
5140 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
5141 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
5142 } else {
5143 vm_page_free_reserve(0);
5144 }
5145
5146 bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
5147 bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5148
5149 vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5150 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5151 vm_pageout_queue_external.pgo_tid = -1;
5152
5153 vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5154 vm_pageout_queue_internal.pgo_tid = -1;
5155
5156 #if DEVELOPMENT || DEBUG
5157 bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
5158 vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
5159 vm_pageout_queue_internal.pgo_tid = -1;
5160 #endif /* DEVELOPMENT || DEBUG */
5161
5162
5163 /* internal pageout thread started when default pager registered first time */
5164 /* external pageout and garbage collection threads started here */
5165
5166 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
5167 BASEPRI_VM,
5168 &vm_pageout_state.vm_pageout_external_iothread);
5169 if (result != KERN_SUCCESS) {
5170 panic("vm_pageout_iothread_external: create failed");
5171 }
5172 thread_set_thread_name(vm_pageout_state.vm_pageout_external_iothread, "VM_pageout_external_iothread");
5173 thread_deallocate(vm_pageout_state.vm_pageout_external_iothread);
5174
5175 thread_mtx_lock(vm_pageout_gc_thread );
5176 thread_start(vm_pageout_gc_thread );
5177 thread_mtx_unlock(vm_pageout_gc_thread);
5178
5179 #if VM_PRESSURE_EVENTS
5180 result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5181 BASEPRI_DEFAULT,
5182 &thread);
5183
5184 if (result != KERN_SUCCESS) {
5185 panic("vm_pressure_thread: create failed");
5186 }
5187
5188 thread_deallocate(thread);
5189 #endif
5190
5191 vm_object_reaper_init();
5192
5193
5194 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5195 vm_compressor_init();
5196 }
5197
5198 #if VM_PRESSURE_EVENTS
5199 vm_pressure_events_enabled = TRUE;
5200 #endif /* VM_PRESSURE_EVENTS */
5201
5202 #if CONFIG_PHANTOM_CACHE
5203 vm_phantom_cache_init();
5204 #endif
5205 #if VM_PAGE_BUCKETS_CHECK
5206 #if VM_PAGE_FAKE_BUCKETS
5207 printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5208 (uint64_t) vm_page_fake_buckets_start,
5209 (uint64_t) vm_page_fake_buckets_end);
5210 pmap_protect(kernel_pmap,
5211 vm_page_fake_buckets_start,
5212 vm_page_fake_buckets_end,
5213 VM_PROT_READ);
5214 // *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
5215 #endif /* VM_PAGE_FAKE_BUCKETS */
5216 #endif /* VM_PAGE_BUCKETS_CHECK */
5217
5218 #if VM_OBJECT_TRACKING
5219 vm_object_tracking_init();
5220 #endif /* VM_OBJECT_TRACKING */
5221
5222 #if __arm64__
5223 // vm_tests();
5224 #endif /* __arm64__ */
5225
5226 vm_pageout_continue();
5227
5228 /*
5229 * Unreached code!
5230 *
5231 * The vm_pageout_continue() call above never returns, so the code below is never
5232 * executed. We take advantage of this to declare several DTrace VM related probe
5233 * points that our kernel doesn't have an analog for. These are probe points that
5234 * exist in Solaris and are in the DTrace documentation, so people may have written
5235 * scripts that use them. Declaring the probe points here means their scripts will
5236 * compile and execute which we want for portability of the scripts, but since this
5237 * section of code is never reached, the probe points will simply never fire. Yes,
5238 * this is basically a hack. The problem is the DTrace probe points were chosen with
5239 * Solaris specific VM events in mind, not portability to different VM implementations.
5240 */
5241
5242 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5243 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5244 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5245 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5246 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5247 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5248 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5249 /*NOTREACHED*/
5250 }
5251
5252
5253
/*
 * Start the internal (compressor) pageout iothreads.  Picks a compressor
 * thread count from the CPU count and boot-args, sizes the internal
 * laundry queue accordingly, carves one scratch buffer per thread out of
 * a single permanent allocation, and creates the
 * vm_pageout_iothread_internal threads.  Returns the result of the first
 * failed thread creation, or KERN_SUCCESS.
 */
kern_return_t
vm_pageout_internal_start(void)
{
	kern_return_t result = KERN_SUCCESS;
	host_basic_info_data_t hinfo;
	vm_offset_t buf, bufsize;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
#define BSD_HOST 1
	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);

	assert(hinfo.max_cpus > 0);

#if !XNU_TARGET_OS_OSX
	vm_pageout_state.vm_compressor_thread_count = 1;
#else /* !XNU_TARGET_OS_OSX */
	if (hinfo.max_cpus > 4) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	} else {
		vm_pageout_state.vm_compressor_thread_count = 1;
	}
#endif /* !XNU_TARGET_OS_OSX */
	/* boot-arg override for the thread count */
	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
	    sizeof(vm_pageout_state.vm_compressor_thread_count));

#if __AMP__
	PE_parse_boot_argn("vmcomp_ecluster", &vm_compressor_ebound, sizeof(vm_compressor_ebound));
	if (vm_compressor_ebound) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	}
#endif
	/* clamp to [1, MAX_COMPRESSOR_THREAD_COUNT] and below the CPU count */
	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
	}
	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
		vm_pageout_state.vm_compressor_thread_count = 1;
	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
	}

	/* laundry limit scales with the number of compressor threads */
	vm_pageout_queue_internal.pgo_maxlaundry =
	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;

	PE_parse_boot_argn("vmpgoi_maxlaundry",
	    &vm_pageout_queue_internal.pgo_maxlaundry,
	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));

#if DEVELOPMENT || DEBUG
	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
#endif /* DEVELOPMENT || DEBUG */

	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;

	/* single KMA_NOFAIL allocation covering every thread's scratch buffer */
	kmem_alloc(kernel_map, &buf,
	    bufsize * vm_pageout_state.vm_compressor_thread_count,
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_COMPRESSOR);

	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
		ciq[i].id = i;
		ciq[i].q = &vm_pageout_queue_internal;
		ciq[i].current_early_swapout_chead = NULL;
		ciq[i].current_regular_swapout_chead = NULL;
		ciq[i].current_late_swapout_chead = NULL;
		ciq[i].scratch_buf = (char *)(buf + i * bufsize);
#if DEVELOPMENT || DEBUG
		ciq[i].benchmark_q = &vm_pageout_queue_benchmark;
#endif /* DEVELOPMENT || DEBUG */

		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
		    (void *)&ciq[i], BASEPRI_VM,
		    &vm_pageout_state.vm_pageout_internal_iothread);

		if (result == KERN_SUCCESS) {
			thread_deallocate(vm_pageout_state.vm_pageout_internal_iothread);
		} else {
			break;
		}
	}
	return result;
}
5337
5338 #if CONFIG_IOSCHED
5339 /*
5340 * To support I/O Expedite for compressed files we mark the upls with special flags.
5341 * The way decmpfs works is that we create a big upl which marks all the pages needed to
5342 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5343 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5344 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5345 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5346 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
 * by the req upl lock (the reverse link doesn't need synchronization, since we never inspect this link
5348 * unless the real I/O upl is being destroyed).
5349 */
5350
5351
/*
 * Link a real-I/O UPL to the decompression request UPL it is servicing
 * (see the CONFIG_IOSCHED block comment above).  Takes a reference on
 * src_upl; that reference is dropped in upl_destroy() when the real-I/O
 * UPL goes away.  The forward link (src_upl->decmp_io_upl) is protected
 * by src_upl's lock.
 */
static void
upl_set_decmp_info(upl_t upl, upl_t src_upl)
{
	assert((src_upl->flags & UPL_DECMP_REQ) != 0);

	upl_lock(src_upl);
	if (src_upl->decmp_io_upl) {
		/*
		 * If there is already an alive real I/O UPL, ignore this new UPL.
		 * This case should rarely happen and even if it does, it just means
		 * that we might issue a spurious expedite which the driver is expected
		 * to handle.
		 */
		upl_unlock(src_upl);
		return;
	}
	src_upl->decmp_io_upl = (void *)upl;
	src_upl->ref_count++;

	upl->flags |= UPL_DECMP_REAL_IO;
	upl->decmp_io_upl = (void *)src_upl;
	upl_unlock(src_upl);
}
5375 #endif /* CONFIG_IOSCHED */
5376
5377 #if UPL_DEBUG
5378 int upl_debug_enabled = 1;
5379 #else
5380 int upl_debug_enabled = 0;
5381 #endif
5382
/*
 * Allocate and minimally initialize a new UPL.
 *
 * type:  UPL_CREATE_* flags selecting layout and features (internal page
 *        list, lite bitmap, I/O tracking, expedite support).
 * flags: initial UPL_* flags to store in upl->flags.
 * size:  page-aligned size in bytes of the range the UPL will describe.
 *
 * Layout: the upl struct, the inline upl_page_info array (if
 * UPL_CREATE_INTERNAL), then the lite page bitmap (if UPL_CREATE_LITE)
 * are carved out of one zero-filled allocation.  Returned with
 * ref_count == 1; released via upl_deallocate().
 */
static upl_t
upl_create(int type, int flags, upl_size_t size)
{
	upl_t upl;
	vm_size_t page_field_size = 0;
	int upl_flags = 0;
	vm_size_t upl_size = sizeof(struct upl);

	assert(page_aligned(size));

	size = round_page_32(size);

	if (type & UPL_CREATE_LITE) {
		/* one bit per page, rounded up to a 4-byte multiple */
		page_field_size = (atop(size) + 7) >> 3;
		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;

		upl_flags |= UPL_LITE;
	}
	if (type & UPL_CREATE_INTERNAL) {
		/* page info array allocated inline, right after the upl struct */
		upl_size += sizeof(struct upl_page_info) * atop(size);

		upl_flags |= UPL_INTERNAL;
	}
	// rdar://88964158
	/* BEGIN IGNORE CODESTYLE */
	__typed_allocators_ignore_push
	upl = (upl_t)kheap_alloc(KHEAP_DEFAULT, upl_size + page_field_size, Z_WAITOK | Z_ZERO);
	__typed_allocators_ignore_pop
	/* END IGNORE CODESTYLE */

	upl->flags = upl_flags | flags;
	upl->ref_count = 1;
	upl_lock_init(upl);
#if CONFIG_IOSCHED
	if (type & UPL_CREATE_IO_TRACKING) {
		/* capture the creator's effective I/O policy for prioritization */
		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
	}

	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
		/* Only support expedite on internal UPLs */
		thread_t curthread = current_thread();
		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * atop(size), Z_WAITOK | Z_ZERO);
		upl->flags |= UPL_EXPEDITE_SUPPORTED;
		if (curthread->decmp_upl != NULL) {
			/* this UPL services a pending decompression request */
			upl_set_decmp_info(upl, curthread->decmp_upl);
		}
	}
#endif
#if CONFIG_IOSCHED || UPL_DEBUG
	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
		upl->upl_creator = current_thread();
		upl->flags |= UPL_TRACKED_BY_OBJECT;
	}
#endif

#if UPL_DEBUG
	(void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
#endif /* UPL_DEBUG */

	return upl;
}
5444
/*
 * Tear down and free a UPL whose last reference has been dropped
 * (called from upl_deallocate()).  Unlinks any decompression-request
 * association, removes the UPL from its object's tracking queue, drops
 * the map_object reference for shadowed UPLs, and frees the allocation
 * made in upl_create() (struct + inline page list + lite bitmap).
 */
static void
upl_destroy(upl_t upl)
{
	int page_field_size; /* bit field in word size buf */
	int size;

	// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);

	if (upl->ext_ref_count) {
		panic("upl(%p) ext_ref_count", upl);
	}

#if CONFIG_IOSCHED
	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
		/*
		 * Unlink from the decompression request UPL and drop the
		 * reference taken in upl_set_decmp_info().
		 */
		upl_t src_upl;
		src_upl = upl->decmp_io_upl;
		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
		upl_lock(src_upl);
		src_upl->decmp_io_upl = NULL;
		upl_unlock(src_upl);
		upl_deallocate(src_upl);
	}
#endif /* CONFIG_IOSCHED */

#if CONFIG_IOSCHED || UPL_DEBUG
	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
	    !(upl->flags & UPL_VECTOR)) {
		vm_object_t object;

		if (upl->flags & UPL_SHADOWED) {
			object = upl->map_object->shadow;
		} else {
			object = upl->map_object;
		}

		vm_object_lock(object);
		queue_remove(&object->uplq, upl, upl_t, uplq);
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
		vm_object_unlock(object);
	}
#endif
	/*
	 * drop a reference on the map_object whether or
	 * not a pageout object is inserted
	 */
	if (upl->flags & UPL_SHADOWED) {
		vm_object_deallocate(upl->map_object);
	}

	/* recompute the sizes used by upl_create() so the trailing arrays are freed too */
	if (upl->flags & UPL_DEVICE_MEMORY) {
		size = PAGE_SIZE;
	} else {
		size = upl_adjusted_size(upl, PAGE_MASK);
	}
	page_field_size = 0;

	if (upl->flags & UPL_LITE) {
		page_field_size = ((size / PAGE_SIZE) + 7) >> 3;
		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
	}
	upl_lock_destroy(upl);
	/* poison value to help catch use of the UPL after destruction */
	upl->vector_upl = (vector_upl_t) 0xfeedbeef;

#if CONFIG_IOSCHED
	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * (size / PAGE_SIZE));
	}
#endif

	// rdar://88964158
	__typed_allocators_ignore_push
	if (upl->flags & UPL_INTERNAL) {
		kheap_free(KHEAP_DEFAULT, upl,
		    sizeof(struct upl) +
		    (sizeof(struct upl_page_info) * (size / PAGE_SIZE))
		    + page_field_size);
	} else {
		kheap_free(KHEAP_DEFAULT, upl, sizeof(struct upl) + page_field_size);
	}
	__typed_allocators_ignore_pop
}
5527
/*
 * Drop one reference on a UPL.  When the count reaches zero, tear down
 * any vector UPL state, run the iodone callout (if registered), and free
 * the UPL via upl_destroy().
 */
void
upl_deallocate(upl_t upl)
{
	upl_lock(upl);

	if (--upl->ref_count == 0) {
		if (vector_upl_is_valid(upl)) {
			vector_upl_deallocate(upl);
		}
		/*
		 * NOTE(review): upl is still dereferenced after the unlock below;
		 * this relies on us holding the final reference so nobody else
		 * can free it out from under us — confirm no external lookups
		 * can resurrect it here.
		 */
		upl_unlock(upl);

		if (upl->upl_iodone) {
			upl_callout_iodone(upl);
		}

		upl_destroy(upl);
	} else {
		upl_unlock(upl);
	}
}
5548
5549 #if CONFIG_IOSCHED
5550 void
upl_mark_decmp(upl_t upl)5551 upl_mark_decmp(upl_t upl)
5552 {
5553 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5554 upl->flags |= UPL_DECMP_REQ;
5555 upl->upl_creator->decmp_upl = (void *)upl;
5556 }
5557 }
5558
5559 void
upl_unmark_decmp(upl_t upl)5560 upl_unmark_decmp(upl_t upl)
5561 {
5562 if (upl && (upl->flags & UPL_DECMP_REQ)) {
5563 upl->upl_creator->decmp_upl = NULL;
5564 }
5565 }
5566
5567 #endif /* CONFIG_IOSCHED */
5568
5569 #define VM_PAGE_Q_BACKING_UP(q) \
5570 ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5571
5572 boolean_t must_throttle_writes(void);
5573
5574 boolean_t
must_throttle_writes()5575 must_throttle_writes()
5576 {
5577 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5578 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5579 return TRUE;
5580 }
5581
5582 return FALSE;
5583 }
5584
5585 int vm_page_delayed_work_ctx_needed = 0;
5586 KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5587
5588 __startup_func
5589 static void
vm_page_delayed_work_init_ctx(void)5590 vm_page_delayed_work_init_ctx(void)
5591 {
5592 uint16_t min_delayed_work_ctx_allocated = 16;
5593
5594 /*
5595 * try really hard to always keep NCPU elements around in the zone
5596 * in order for the UPL code to almost always get an element.
5597 */
5598 if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5599 min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5600 }
5601
5602 zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5603 }
5604 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5605
5606 struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)5607 vm_page_delayed_work_get_ctx(void)
5608 {
5609 struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5610
5611 dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5612
5613 if (__probable(dw_ctx)) {
5614 dw_ctx->delayed_owner = current_thread();
5615 } else {
5616 vm_page_delayed_work_ctx_needed++;
5617 }
5618 return dw_ctx ? dw_ctx->dwp : NULL;
5619 }
5620
5621 void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work * dwp)5622 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5623 {
5624 struct vm_page_delayed_work_ctx *ldw_ctx;
5625
5626 ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5627 ldw_ctx->delayed_owner = NULL;
5628
5629 zfree(dw_ctx_zone, ldw_ctx);
5630 }
5631
5632 /*
5633 * Routine: vm_object_upl_request
5634 * Purpose:
5635 * Cause the population of a portion of a vm_object.
5636 * Depending on the nature of the request, the pages
 * returned may contain valid data or be uninitialized.
5638 * A page list structure, listing the physical pages
5639 * will be returned upon request.
5640 * This function is called by the file system or any other
5641 * supplier of backing store to a pager.
5642 * IMPORTANT NOTE: The caller must still respect the relationship
5643 * between the vm_object and its backing memory object. The
5644 * caller MUST NOT substitute changes in the backing file
5645 * without first doing a memory_object_lock_request on the
 * target range unless it is known that the pages are not
5647 * shared with another entity at the pager level.
5648 * Copy_in_to:
5649 * if a page list structure is present
5650 * return the mapped physical pages, where a
5651 * page is not present, return a non-initialized
5652 * one. If the no_sync bit is turned on, don't
5653 * call the pager unlock to synchronize with other
5654 * possible copies of the page. Leave pages busy
5655 * in the original object, if a page list structure
5656 * was specified. When a commit of the page list
5657 * pages is done, the dirty bit will be set for each one.
5658 * Copy_out_from:
5659 * If a page list structure is present, return
5660 * all mapped pages. Where a page does not exist
5661 * map a zero filled one. Leave pages busy in
5662 * the original object. If a page list structure
5663 * is not specified, this call is a no-op.
5664 *
5665 * Note: access of default pager objects has a rather interesting
5666 * twist. The caller of this routine, presumably the file system
5667 * page cache handling code, will never actually make a request
5668 * against a default pager backed object. Only the default
5669 * pager will make requests on backing store related vm_objects
5670 * In this way the default pager can maintain the relationship
5671 * between backing store files (abstract memory objects) and
5672 * the vm_objects (cache objects), they support.
5673 *
5674 */
5675
/*
 * Build a UPL describing the pages of "object" in the range
 * [offset, offset+size).  See the routine-level comment above for the
 * overall contract.  "cntrl_flags" selects the mode of operation:
 * UPL_COPYOUT_FROM gathers pages for pageout, otherwise pages are
 * looked up / allocated for pagein; UPL_SET_LITE uses the object's own
 * pages plus a bitmap instead of a shadow object; UPL_SET_INTERNAL
 * allocates the page-info array inline with the upl structure.
 * Returns KERN_SUCCESS, or KERN_INVALID_VALUE for unknown flags.
 */
__private_extern__ kern_return_t
vm_object_upl_request(
	vm_object_t             object,
	vm_object_offset_t      offset,
	upl_size_t              size,
	upl_t                   *upl_ptr,
	upl_page_info_array_t   user_page_list,
	unsigned int            *page_list_count,
	upl_control_flags_t     cntrl_flags,
	vm_tag_t                tag)
{
	vm_page_t               dst_page = VM_PAGE_NULL;
	vm_object_offset_t      dst_offset;
	upl_size_t              xfer_size;
	unsigned int            size_in_pages;
	boolean_t               dirty;
	boolean_t               hw_dirty;
	upl_t                   upl = NULL;
	unsigned int            entry;
	vm_page_t               alias_page = NULL;
	int                     refmod_state = 0;
	wpl_array_t             lite_list = NULL;
	vm_object_t             last_copy_object;
	struct vm_page_delayed_work dw_array;
	struct vm_page_delayed_work *dwp, *dwp_start;
	bool                    dwp_finish_ctx = TRUE;
	int                     dw_count;
	int                     dw_limit;
	int                     io_tracking_flag = 0;
	int                     grab_options;
	int                     page_grab_count = 0;
	ppnum_t                 phys_page;
	pmap_flush_context      pmap_flush_context_storage;
	boolean_t               pmap_flushes_delayed = FALSE;
#if DEVELOPMENT || DEBUG
	task_t                  task = current_task();
#endif /* DEVELOPMENT || DEBUG */

	dwp_start = dwp = NULL;

	if (cntrl_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		return KERN_INVALID_VALUE;
	}
	if ((!object->internal) && (object->paging_offset != 0)) {
		panic("vm_object_upl_request: external object with non-zero paging offset");
	}
	if (object->phys_contiguous) {
		panic("vm_object_upl_request: contiguous object specified");
	}

	assertf(page_aligned(offset) && page_aligned(size),
	    "offset 0x%llx size 0x%x",
	    offset, size);

	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);

	/*
	 * Page-queue manipulations are batched through a delayed-work
	 * context so we don't take the queue locks once per page.
	 */
	dw_count = 0;
	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
	dwp_start = vm_page_delayed_work_get_ctx();
	if (dwp_start == NULL) {
		/* no context available: fall back to a single on-stack entry */
		dwp_start = &dw_array;
		dw_limit = 1;
		dwp_finish_ctx = FALSE;
	}

	dwp = dwp_start;

	if (size > MAX_UPL_SIZE_BYTES) {
		/* silently clamp oversized requests to the maximum UPL size */
		size = MAX_UPL_SIZE_BYTES;
	}

	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
		/* report the capacity of the internal page list to the caller */
		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	}

#if CONFIG_IOSCHED || UPL_DEBUG
	if (object->io_tracking || upl_debug_enabled) {
		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
	}
#endif
#if CONFIG_IOSCHED
	if (object->io_tracking) {
		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
	}
#endif

	if (cntrl_flags & UPL_SET_INTERNAL) {
		if (cntrl_flags & UPL_SET_LITE) {
			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);

			/*
			 * For an internal UPL the page-info array (and, for a
			 * lite UPL, the "lite list" bitmap after it) live
			 * inline, immediately following the upl structure.
			 */
			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
			lite_list = (wpl_array_t)
			    (((uintptr_t)user_page_list) +
			    ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
			if (size == 0) {
				user_page_list = NULL;
				lite_list = NULL;
			}
		} else {
			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);

			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
			if (size == 0) {
				user_page_list = NULL;
			}
		}
	} else {
		if (cntrl_flags & UPL_SET_LITE) {
			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);

			/* external lite UPL: only the bitmap follows the upl structure */
			lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
			if (size == 0) {
				lite_list = NULL;
			}
		} else {
			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
		}
	}
	*upl_ptr = upl;

	if (user_page_list) {
		user_page_list[0].device = FALSE;
	}

	if (cntrl_flags & UPL_SET_LITE) {
		/* lite UPL: map_object is the object itself, pages tracked via bitmap */
		upl->map_object = object;
	} else {
		/* non-lite UPL: create a shadow object to hold alias pages */
		upl->map_object = vm_object_allocate(size);
		/*
		 * No neeed to lock the new object: nobody else knows
		 * about it yet, so it's all ours so far.
		 */
		upl->map_object->shadow = object;
		upl->map_object->pageout = TRUE;
		upl->map_object->can_persist = FALSE;
		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
		upl->map_object->vo_shadow_offset = offset;
		upl->map_object->wimg_bits = object->wimg_bits;
		assertf(page_aligned(upl->map_object->vo_shadow_offset),
		    "object %p shadow_offset 0x%llx",
		    upl->map_object, upl->map_object->vo_shadow_offset);

		/*
		 * fictitious page used as a placeholder in the shadow
		 * object (consumed by vm_pageclean_setup below)
		 */
		alias_page = vm_page_grab_fictitious(TRUE);

		upl->flags |= UPL_SHADOWED;
	}
	if (cntrl_flags & UPL_FOR_PAGEOUT) {
		upl->flags |= UPL_PAGEOUT;
	}

	vm_object_lock(object);
	vm_object_activity_begin(object);

	grab_options = 0;
#if CONFIG_SECLUDED_MEMORY
	if (object->can_grab_secluded) {
		grab_options |= VM_PAGE_GRAB_SECLUDED;
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * we can lock in the paging_offset once paging_in_progress is set
	 */
	upl->u_size = size;
	upl->u_offset = offset + object->paging_offset;

#if CONFIG_IOSCHED || UPL_DEBUG
	if (object->io_tracking || upl_debug_enabled) {
		/* track this UPL on the object's list of outstanding UPLs */
		vm_object_activity_begin(object);
		queue_enter(&object->uplq, upl, upl_t, uplq);
	}
#endif
	if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
		/*
		 * Honor copy-on-write obligations
		 *
		 * The caller is gathering these pages and
		 * might modify their contents. We need to
		 * make sure that the copy object has its own
		 * private copies of these pages before we let
		 * the caller modify them.
		 */
		vm_object_update(object,
		    offset,
		    size,
		    NULL,
		    NULL,
		    FALSE,              /* should_return */
		    MEMORY_OBJECT_COPY_SYNC,
		    VM_PROT_NO_CHANGE);

		VM_PAGEOUT_DEBUG(upl_cow, 1);
		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
	}
	/*
	 * remember which copy object we synchronized with
	 */
	last_copy_object = object->copy;
	entry = 0;

	xfer_size = size;
	dst_offset = offset;
	size_in_pages = size / PAGE_SIZE;

	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
		/*
		 * plenty of free memory (or a small object): clear the
		 * record of collisions with vm_pageout_scan for this object
		 */
		object->scan_collisions = 0;
	}

	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
		boolean_t       isSSD = FALSE;

#if !XNU_TARGET_OS_OSX
		isSSD = TRUE;
#else /* !XNU_TARGET_OS_OSX */
		vnode_pager_get_isSSD(object->pager, &isSSD);
#endif /* !XNU_TARGET_OS_OSX */
		/*
		 * too many dirty pages outstanding: stall this writer for a
		 * while, scaled by the request size (shorter stall for SSDs)
		 */
		vm_object_unlock(object);

		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);

		if (isSSD == TRUE) {
			delay(1000 * size_in_pages);
		} else {
			delay(5000 * size_in_pages);
		}
		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);

		vm_object_lock(object);
	}

	/*
	 * main loop: one iteration per page of the request
	 * (or per retry after sleeping on a busy page)
	 */
	while (xfer_size) {
		dwp->dw_mask = 0;

		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
			/* replenish the alias page consumed by the previous iteration */
			vm_object_unlock(object);
			alias_page = vm_page_grab_fictitious(TRUE);
			vm_object_lock(object);
		}
		if (cntrl_flags & UPL_COPYOUT_FROM) {
			/*
			 * pageout path: gather existing resident pages
			 */
			upl->flags |= UPL_PAGE_SYNC_DONE;

			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
			    dst_page->vmp_fictitious ||
			    dst_page->vmp_absent ||
			    VMP_ERROR_GET(dst_page) ||
			    dst_page->vmp_cleaning ||
			    (VM_PAGE_WIRED(dst_page))) {
				/* page can't be paged out: leave an empty slot */
				if (user_page_list) {
					user_page_list[entry].phys_addr = 0;
				}

				goto try_next_page;
			}
			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

			/*
			 * grab this up front...
			 * a high percentange of the time we're going to
			 * need the hardware modification state a bit later
			 * anyway... so we can eliminate an extra call into
			 * the pmap layer by grabbing it here and recording it
			 */
			if (dst_page->vmp_pmapped) {
				refmod_state = pmap_get_refmod(phys_page);
			} else {
				refmod_state = 0;
			}

			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
				/*
				 * page is on inactive list and referenced...
				 * reactivate it now... this gets it out of the
				 * way of vm_pageout_scan which would have to
				 * reactivate it upon tripping over it
				 */
				dwp->dw_mask |= DW_vm_page_activate;
			}
			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
				/*
				 * we're only asking for DIRTY pages to be returned
				 */
				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
					/*
					 * if we were the page stolen by vm_pageout_scan to be
					 * cleaned (as opposed to a buddy being clustered in
					 * or this request is not being driven by a PAGEOUT cluster
					 * then we only need to check for the page being dirty or
					 * precious to decide whether to return it
					 */
					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
						goto check_busy;
					}
					goto dont_return;
				}
				/*
				 * this is a request for a PAGEOUT cluster and this page
				 * is merely along for the ride as a 'buddy'... not only
				 * does it have to be dirty to be returned, but it also
				 * can't have been referenced recently...
				 */
				if ((hibernate_cleaning_in_progress == TRUE ||
				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
					goto check_busy;
				}
dont_return:
				/*
				 * if we reach here, we're not to return
				 * the page... go on to the next one
				 */
				if (dst_page->vmp_laundry == TRUE) {
					/*
					 * if we get here, the page is not 'cleaning' (filtered out above).
					 * since it has been referenced, remove it from the laundry
					 * so we don't pay the cost of an I/O to clean a page
					 * we're just going to take back
					 */
					vm_page_lockspin_queues();

					vm_pageout_steal_laundry(dst_page, TRUE);
					vm_page_activate(dst_page);

					vm_page_unlock_queues();
				}
				if (user_page_list) {
					user_page_list[entry].phys_addr = 0;
				}

				goto try_next_page;
			}
check_busy:
			if (dst_page->vmp_busy) {
				if (cntrl_flags & UPL_NOBLOCK) {
					/* caller won't wait: skip this page */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}
					/* discard any deferred work queued for this page */
					dwp->dw_mask = 0;

					goto try_next_page;
				}
				/*
				 * someone else is playing with the
				 * page.  We will have to wait.
				 */
				PAGE_SLEEP(object, dst_page, THREAD_UNINT);

				continue;
			}
			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
				vm_page_lockspin_queues();

				/* re-check under the queue lock */
				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
					/*
					 * we've buddied up a page for a clustered pageout
					 * that has already been moved to the pageout
					 * queue by pageout_scan... we need to remove
					 * it from the queue and drop the laundry count
					 * on that queue
					 */
					vm_pageout_throttle_up(dst_page);
				}
				vm_page_unlock_queues();
			}
			hw_dirty = refmod_state & VM_MEM_MODIFIED;
			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;

			if (phys_page > upl->highest_page) {
				upl->highest_page = phys_page;
			}

			assert(!pmap_is_noencrypt(phys_page));

			if (cntrl_flags & UPL_SET_LITE) {
				unsigned int    pg_num;

				/* mark this page's bit in the lite list (32 bits per word) */
				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
				lite_list[pg_num >> 5] |= 1U << (pg_num & 31);

				if (hw_dirty) {
					/* defer the TLB flushes and do them all at once at the end */
					if (pmap_flushes_delayed == FALSE) {
						pmap_flush_context_init(&pmap_flush_context_storage);
						pmap_flushes_delayed = TRUE;
					}
					pmap_clear_refmod_options(phys_page,
					    VM_MEM_MODIFIED,
					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
					    &pmap_flush_context_storage);
				}

				/*
				 * Mark original page as cleaning
				 * in place.
				 */
				dst_page->vmp_cleaning = TRUE;
				dst_page->vmp_precious = FALSE;
			} else {
				/*
				 * use pageclean setup, it is more
				 * convenient even for the pageout
				 * cases here
				 */
				vm_object_lock(upl->map_object);
				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
				vm_object_unlock(upl->map_object);

				/* alias page has been consumed by the shadow object */
				alias_page->vmp_absent = FALSE;
				alias_page = NULL;
			}
			if (dirty) {
				SET_PAGE_DIRTY(dst_page, FALSE);
			} else {
				dst_page->vmp_dirty = FALSE;
			}

			if (!dirty) {
				dst_page->vmp_precious = TRUE;
			}

			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
				if (!VM_PAGE_WIRED(dst_page)) {
					/* free the page once the pageout completes */
					dst_page->vmp_free_when_done = TRUE;
				}
			}
		} else {
			/*
			 * pagein / populate path: look up or allocate pages
			 */
			if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
				/*
				 * Honor copy-on-write obligations
				 *
				 * The copy object has changed since we
				 * last synchronized for copy-on-write.
				 * Another copy object might have been
				 * inserted while we released the object's
				 * lock.  Since someone could have seen the
				 * original contents of the remaining pages
				 * through that new object, we have to
				 * synchronize with it again for the remaining
				 * pages only.  The previous pages are "busy"
				 * so they can not be seen through the new
				 * mapping.  The new mapping will see our
				 * upcoming changes for those previous pages,
				 * but that's OK since they couldn't see what
				 * was there before.  It's just a race anyway
				 * and there's no guarantee of consistency or
				 * atomicity.  We just don't want new mappings
				 * to see both the *before* and *after* pages.
				 */
				if (object->copy != VM_OBJECT_NULL) {
					vm_object_update(
						object,
						dst_offset,/* current offset */
						xfer_size, /* remaining size */
						NULL,
						NULL,
						FALSE,     /* should_return */
						MEMORY_OBJECT_COPY_SYNC,
						VM_PROT_NO_CHANGE);

					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
				}
				/*
				 * remember the copy object we synced with
				 */
				last_copy_object = object->copy;
			}
			dst_page = vm_page_lookup(object, dst_offset);

			if (dst_page != VM_PAGE_NULL) {
				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
					/*
					 * skip over pages already present in the cache
					 */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}

					goto try_next_page;
				}
				if (dst_page->vmp_fictitious) {
					panic("need corner case for fictitious page");
				}

				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
					/*
					 * someone else is playing with the
					 * page.  We will have to wait.
					 */
					PAGE_SLEEP(object, dst_page, THREAD_UNINT);

					continue;
				}
				if (dst_page->vmp_laundry) {
					/* reclaim the page from the laundry before using it */
					vm_pageout_steal_laundry(dst_page, FALSE);
				}
			} else {
				if (object->private) {
					/*
					 * This is a nasty wrinkle for users
					 * of upl who encounter device or
					 * private memory however, it is
					 * unavoidable, only a fault can
					 * resolve the actual backing
					 * physical page by asking the
					 * backing device.
					 */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}

					goto try_next_page;
				}
				if (object->scan_collisions) {
					/*
					 * the pageout_scan thread is trying to steal
					 * pages from this object, but has run into our
					 * lock... grab 2 pages from the head of the object...
					 * the first is freed on behalf of pageout_scan, the
					 * 2nd is for our own use... we use vm_object_page_grab
					 * in both cases to avoid taking pages from the free
					 * list since we are under memory pressure and our
					 * lock on this object is getting in the way of
					 * relieving it
					 */
					dst_page = vm_object_page_grab(object);

					if (dst_page != VM_PAGE_NULL) {
						vm_page_release(dst_page,
						    FALSE);
					}

					dst_page = vm_object_page_grab(object);
				}
				if (dst_page == VM_PAGE_NULL) {
					/*
					 * need to allocate a page
					 */
					dst_page = vm_page_grab_options(grab_options);
					if (dst_page != VM_PAGE_NULL) {
						page_grab_count++;
					}
				}
				if (dst_page == VM_PAGE_NULL) {
					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
						/*
						 * we don't want to stall waiting for pages to come onto the free list
						 * while we're already holding absent pages in this UPL
						 * the caller will deal with the empty slots
						 */
						if (user_page_list) {
							user_page_list[entry].phys_addr = 0;
						}

						goto try_next_page;
					}
					/*
					 * no pages available... wait
					 * then try again for the same
					 * offset...
					 */
					vm_object_unlock(object);

					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

					VM_PAGE_WAIT();
					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);

					vm_object_lock(object);

					continue;
				}
				vm_page_insert(dst_page, object, dst_offset);

				/* freshly allocated page: no valid contents yet */
				dst_page->vmp_absent = TRUE;
				dst_page->vmp_busy = FALSE;

				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
					/*
					 * if UPL_RET_ONLY_ABSENT was specified,
					 * than we're definitely setting up a
					 * upl for a clustered read/pagein
					 * operation... mark the pages as clustered
					 * so upl_commit_range can put them on the
					 * speculative list
					 */
					dst_page->vmp_clustered = TRUE;

					if (!(cntrl_flags & UPL_FILE_IO)) {
						counter_inc(&vm_statistics_pageins);
					}
				}
			}
			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

			dst_page->vmp_overwriting = TRUE;

			if (dst_page->vmp_pmapped) {
				if (!(cntrl_flags & UPL_FILE_IO)) {
					/*
					 * eliminate all mappings from the
					 * original object and its prodigy
					 */
					refmod_state = pmap_disconnect(phys_page);
				} else {
					refmod_state = pmap_get_refmod(phys_page);
				}
			} else {
				refmod_state = 0;
			}

			hw_dirty = refmod_state & VM_MEM_MODIFIED;
			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;

			if (cntrl_flags & UPL_SET_LITE) {
				unsigned int    pg_num;

				/* mark this page's bit in the lite list (32 bits per word) */
				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
				lite_list[pg_num >> 5] |= 1U << (pg_num & 31);

				if (hw_dirty) {
					pmap_clear_modify(phys_page);
				}

				/*
				 * Mark original page as cleaning
				 * in place.
				 */
				dst_page->vmp_cleaning = TRUE;
				dst_page->vmp_precious = FALSE;
			} else {
				/*
				 * use pageclean setup, it is more
				 * convenient even for the pageout
				 * cases here
				 */
				vm_object_lock(upl->map_object);
				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
				vm_object_unlock(upl->map_object);

				/* alias page has been consumed by the shadow object */
				alias_page->vmp_absent = FALSE;
				alias_page = NULL;
			}

			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
				upl->flags &= ~UPL_CLEAR_DIRTY;
				upl->flags |= UPL_SET_DIRTY;
				dirty = TRUE;
				/*
				 * Page belonging to a code-signed object is about to
				 * be written. Mark it tainted and disconnect it from
				 * all pmaps so processes have to fault it back in and
				 * deal with the tainted bit.
				 */
				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
					vm_page_upl_tainted++;
					if (dst_page->vmp_pmapped) {
						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
						if (refmod_state & VM_MEM_REFERENCED) {
							dst_page->vmp_reference = TRUE;
						}
					}
				}
			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
				/*
				 * clean in place for read implies
				 * that a write will be done on all
				 * the pages that are dirty before
				 * a upl commit is done. The caller
				 * is obligated to preserve the
				 * contents of all pages marked dirty
				 */
				upl->flags |= UPL_CLEAR_DIRTY;
			}
			dst_page->vmp_dirty = dirty;

			if (!dirty) {
				dst_page->vmp_precious = TRUE;
			}

			if (!VM_PAGE_WIRED(dst_page)) {
				/*
				 * deny access to the target page while
				 * it is being worked on
				 */
				dst_page->vmp_busy = TRUE;
			} else {
				dwp->dw_mask |= DW_vm_page_wire;
			}

			/*
			 * We might be about to satisfy a fault which has been
			 * requested. So no need for the "restart" bit.
			 */
			dst_page->vmp_restart = FALSE;
			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
				/*
				 * expect the page to be used
				 */
				dwp->dw_mask |= DW_set_reference;
			}
			if (cntrl_flags & UPL_PRECIOUS) {
				if (object->internal) {
					SET_PAGE_DIRTY(dst_page, FALSE);
					dst_page->vmp_precious = FALSE;
				} else {
					dst_page->vmp_precious = TRUE;
				}
			} else {
				dst_page->vmp_precious = FALSE;
			}
		}
		if (dst_page->vmp_busy) {
			upl->flags |= UPL_HAS_BUSY;
		}

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}
		assert(!pmap_is_noencrypt(phys_page));
		if (user_page_list) {
			/* record this page's state in the caller-visible page list */
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
			user_page_list[entry].absent = dst_page->vmp_absent;
			user_page_list[entry].dirty = dst_page->vmp_dirty;
			user_page_list[entry].precious = dst_page->vmp_precious;
			user_page_list[entry].device = FALSE;
			user_page_list[entry].needed = FALSE;
			if (dst_page->vmp_clustered == TRUE) {
				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
			} else {
				user_page_list[entry].speculative = FALSE;
			}
			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
			user_page_list[entry].mark = FALSE;
		}
		/*
		 * if UPL_RET_ONLY_ABSENT is set, then
		 * we are working with a fresh page and we've
		 * just set the clustered flag on it to
		 * indicate that it was drug in as part of a
		 * speculative cluster... so leave it alone
		 */
		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
			/*
			 * someone is explicitly grabbing this page...
			 * update clustered and speculative state
			 *
			 */
			if (dst_page->vmp_clustered) {
				VM_PAGE_CONSUME_CLUSTERED(dst_page);
			}
		}
try_next_page:
		if (dwp->dw_mask) {
			if (dwp->dw_mask & DW_vm_page_activate) {
				counter_inc(&vm_statistics_reactivations);
			}

			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);

			if (dw_count >= dw_limit) {
				/* batch is full: apply all deferred page-queue work now */
				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);

				dwp = dwp_start;
				dw_count = 0;
			}
		}
		entry++;
		dst_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;
	}
	if (dw_count) {
		/* flush any remaining deferred page-queue work */
		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}

	if (alias_page != NULL) {
		/* last pre-grabbed alias page was never consumed */
		VM_PAGE_FREE(alias_page);
	}
	if (pmap_flushes_delayed == TRUE) {
		/* perform the TLB flushes deferred by pmap_clear_refmod_options */
		pmap_flush(&pmap_flush_context_storage);
	}

	if (page_list_count != NULL) {
		if (upl->flags & UPL_INTERNAL) {
			*page_list_count = 0;
		} else if (*page_list_count > entry) {
			*page_list_count = entry;
		}
	}
#if UPL_DEBUG
	upl->upl_state = 1;
#endif
	vm_object_unlock(object);

	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		/* charge the pages grabbed above to the calling task's ledger */
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return KERN_SUCCESS;
}
6500
6501 /*
6502 * Routine: vm_object_super_upl_request
6503 * Purpose:
6504 * Cause the population of a portion of a vm_object
6505 * in much the same way as memory_object_upl_request.
6506 * Depending on the nature of the request, the pages
 *              returned may contain valid data or be uninitialized.
6508 * However, the region may be expanded up to the super
6509 * cluster size provided.
6510 */
6511
6512 __private_extern__ kern_return_t
vm_object_super_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_size_t super_cluster,upl_t * upl,upl_page_info_t * user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)6513 vm_object_super_upl_request(
6514 vm_object_t object,
6515 vm_object_offset_t offset,
6516 upl_size_t size,
6517 upl_size_t super_cluster,
6518 upl_t *upl,
6519 upl_page_info_t *user_page_list,
6520 unsigned int *page_list_count,
6521 upl_control_flags_t cntrl_flags,
6522 vm_tag_t tag)
6523 {
6524 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
6525 return KERN_FAILURE;
6526 }
6527
6528 assert(object->paging_in_progress);
6529 offset = offset - object->paging_offset;
6530
6531 if (super_cluster > size) {
6532 vm_object_offset_t base_offset;
6533 upl_size_t super_size;
6534 vm_object_size_t super_size_64;
6535
6536 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6537 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
6538 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6539 super_size = (upl_size_t) super_size_64;
6540 assert(super_size == super_size_64);
6541
6542 if (offset > (base_offset + super_size)) {
6543 panic("vm_object_super_upl_request: Missed target pageout"
6544 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6545 offset, base_offset, super_size, super_cluster,
6546 size, object->paging_offset);
6547 }
6548 /*
6549 * apparently there is a case where the vm requests a
6550 * page to be written out who's offset is beyond the
6551 * object size
6552 */
6553 if ((offset + size) > (base_offset + super_size)) {
6554 super_size_64 = (offset + size) - base_offset;
6555 super_size = (upl_size_t) super_size_64;
6556 assert(super_size == super_size_64);
6557 }
6558
6559 offset = base_offset;
6560 size = super_size;
6561 }
6562 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6563 }
6564
/*
 * NOTE(review): counter incremented elsewhere when a UPL is created over
 * an executable mapping — confirm against the code-signing path that
 * bumps it.  proc_selfpid()/proc_name_address() are BSD-layer helpers
 * used for diagnostic logging.
 */
int cs_executable_create_upl = 0;
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
6568
6569 kern_return_t
vm_map_create_upl(vm_map_t map,vm_map_address_t offset,upl_size_t * upl_size,upl_t * upl,upl_page_info_array_t page_list,unsigned int * count,upl_control_flags_t * flags,vm_tag_t tag)6570 vm_map_create_upl(
6571 vm_map_t map,
6572 vm_map_address_t offset,
6573 upl_size_t *upl_size,
6574 upl_t *upl,
6575 upl_page_info_array_t page_list,
6576 unsigned int *count,
6577 upl_control_flags_t *flags,
6578 vm_tag_t tag)
6579 {
6580 vm_map_entry_t entry;
6581 upl_control_flags_t caller_flags;
6582 int force_data_sync;
6583 int sync_cow_data;
6584 vm_object_t local_object;
6585 vm_map_offset_t local_offset;
6586 vm_map_offset_t local_start;
6587 kern_return_t ret;
6588 vm_map_address_t original_offset;
6589 vm_map_size_t original_size, adjusted_size;
6590 vm_map_offset_t local_entry_start;
6591 vm_object_offset_t local_entry_offset;
6592 vm_object_offset_t offset_in_mapped_page;
6593 boolean_t release_map = FALSE;
6594
6595 start_with_map:
6596
6597 original_offset = offset;
6598 original_size = *upl_size;
6599 adjusted_size = original_size;
6600
6601 caller_flags = *flags;
6602
6603 if (caller_flags & ~UPL_VALID_FLAGS) {
6604 /*
6605 * For forward compatibility's sake,
6606 * reject any unknown flag.
6607 */
6608 ret = KERN_INVALID_VALUE;
6609 goto done;
6610 }
6611 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6612 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6613
6614 if (upl == NULL) {
6615 ret = KERN_INVALID_ARGUMENT;
6616 goto done;
6617 }
6618
6619 REDISCOVER_ENTRY:
6620 vm_map_lock_read(map);
6621
6622 if (!vm_map_lookup_entry(map, offset, &entry)) {
6623 vm_map_unlock_read(map);
6624 ret = KERN_FAILURE;
6625 goto done;
6626 }
6627
6628 local_entry_start = entry->vme_start;
6629 local_entry_offset = VME_OFFSET(entry);
6630
6631 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6632 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6633 }
6634
6635 if (entry->vme_end - original_offset < adjusted_size) {
6636 adjusted_size = entry->vme_end - original_offset;
6637 assert(adjusted_size > 0);
6638 *upl_size = (upl_size_t) adjusted_size;
6639 assert(*upl_size == adjusted_size);
6640 }
6641
6642 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6643 *flags = 0;
6644
6645 if (!entry->is_sub_map &&
6646 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6647 if (VME_OBJECT(entry)->private) {
6648 *flags = UPL_DEV_MEMORY;
6649 }
6650
6651 if (VME_OBJECT(entry)->phys_contiguous) {
6652 *flags |= UPL_PHYS_CONTIG;
6653 }
6654 }
6655 vm_map_unlock_read(map);
6656 ret = KERN_SUCCESS;
6657 goto done;
6658 }
6659
6660 offset_in_mapped_page = 0;
6661 if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6662 offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6663 *upl_size = (upl_size_t)
6664 (vm_map_round_page(original_offset + adjusted_size,
6665 VM_MAP_PAGE_MASK(map))
6666 - offset);
6667
6668 offset_in_mapped_page = original_offset - offset;
6669 assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6670
6671 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6672 }
6673
6674 if (!entry->is_sub_map) {
6675 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6676 !VME_OBJECT(entry)->phys_contiguous) {
6677 if (*upl_size > MAX_UPL_SIZE_BYTES) {
6678 *upl_size = MAX_UPL_SIZE_BYTES;
6679 }
6680 }
6681
6682 /*
6683 * Create an object if necessary.
6684 */
6685 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6686 if (vm_map_lock_read_to_write(map)) {
6687 goto REDISCOVER_ENTRY;
6688 }
6689
6690 VME_OBJECT_SET(entry,
6691 vm_object_allocate((vm_size_t)
6692 vm_object_round_page((entry->vme_end - entry->vme_start))),
6693 false, 0);
6694 VME_OFFSET_SET(entry, 0);
6695 assert(entry->use_pmap);
6696
6697 vm_map_lock_write_to_read(map);
6698 }
6699
6700 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6701 !(entry->protection & VM_PROT_WRITE)) {
6702 vm_map_unlock_read(map);
6703 ret = KERN_PROTECTION_FAILURE;
6704 goto done;
6705 }
6706 }
6707
6708 #if !XNU_TARGET_OS_OSX
6709 if (map->pmap != kernel_pmap &&
6710 (caller_flags & UPL_COPYOUT_FROM) &&
6711 (entry->protection & VM_PROT_EXECUTE) &&
6712 !(entry->protection & VM_PROT_WRITE)) {
6713 vm_offset_t kaddr;
6714 vm_size_t ksize;
6715
6716 /*
6717 * We're about to create a read-only UPL backed by
6718 * memory from an executable mapping.
6719 * Wiring the pages would result in the pages being copied
6720 * (due to the "MAP_PRIVATE" mapping) and no longer
6721 * code-signed, so no longer eligible for execution.
6722 * Instead, let's copy the data into a kernel buffer and
6723 * create the UPL from this kernel buffer.
6724 * The kernel buffer is then freed, leaving the UPL holding
6725 * the last reference on the VM object, so the memory will
6726 * be released when the UPL is committed.
6727 */
6728
6729 vm_map_unlock_read(map);
6730 entry = VM_MAP_ENTRY_NULL;
6731 /* allocate kernel buffer */
6732 ksize = round_page(*upl_size);
6733 kaddr = 0;
6734 ret = kmem_alloc(kernel_map, &kaddr, ksize,
6735 KMA_PAGEABLE | KMA_DATA, tag);
6736 if (ret == KERN_SUCCESS) {
6737 /* copyin the user data */
6738 ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6739 }
6740 if (ret == KERN_SUCCESS) {
6741 if (ksize > *upl_size) {
6742 /* zero out the extra space in kernel buffer */
6743 memset((void *)(kaddr + *upl_size),
6744 0,
6745 ksize - *upl_size);
6746 }
6747 /* create the UPL from the kernel buffer */
6748 vm_object_offset_t offset_in_object;
6749 vm_object_offset_t offset_in_object_page;
6750
6751 offset_in_object = offset - local_entry_start + local_entry_offset;
6752 offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6753 assert(offset_in_object_page < PAGE_SIZE);
6754 assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6755 *upl_size -= offset_in_object_page + offset_in_mapped_page;
6756 ret = vm_map_create_upl(kernel_map,
6757 (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6758 upl_size, upl, page_list, count, flags, tag);
6759 }
6760 if (kaddr != 0) {
6761 /* free the kernel buffer */
6762 kmem_free(kernel_map, kaddr, ksize);
6763 kaddr = 0;
6764 ksize = 0;
6765 }
6766 #if DEVELOPMENT || DEBUG
6767 DTRACE_VM4(create_upl_from_executable,
6768 vm_map_t, map,
6769 vm_map_address_t, offset,
6770 upl_size_t, *upl_size,
6771 kern_return_t, ret);
6772 #endif /* DEVELOPMENT || DEBUG */
6773 goto done;
6774 }
6775 #endif /* !XNU_TARGET_OS_OSX */
6776
6777 if (!entry->is_sub_map) {
6778 local_object = VME_OBJECT(entry);
6779 assert(local_object != VM_OBJECT_NULL);
6780 }
6781
6782 if (!entry->is_sub_map &&
6783 !entry->needs_copy &&
6784 *upl_size != 0 &&
6785 local_object->vo_size > *upl_size && /* partial UPL */
6786 entry->wired_count == 0 && /* No COW for entries that are wired */
6787 (map->pmap != kernel_pmap) && /* alias checks */
6788 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6789 ||
6790 ( /* case 2 */
6791 local_object->internal &&
6792 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6793 local_object->ref_count > 1))) {
6794 vm_prot_t prot;
6795
6796 /*
6797 * Case 1:
6798 * Set up the targeted range for copy-on-write to avoid
6799 * applying true_share/copy_delay to the entire object.
6800 *
6801 * Case 2:
6802 * This map entry covers only part of an internal
6803 * object. There could be other map entries covering
6804 * other areas of this object and some of these map
6805 * entries could be marked as "needs_copy", which
6806 * assumes that the object is COPY_SYMMETRIC.
6807 * To avoid marking this object as COPY_DELAY and
6808 * "true_share", let's shadow it and mark the new
6809 * (smaller) object as "true_share" and COPY_DELAY.
6810 */
6811
6812 if (vm_map_lock_read_to_write(map)) {
6813 goto REDISCOVER_ENTRY;
6814 }
6815 vm_map_lock_assert_exclusive(map);
6816 assert(VME_OBJECT(entry) == local_object);
6817
6818 vm_map_clip_start(map,
6819 entry,
6820 vm_map_trunc_page(offset,
6821 VM_MAP_PAGE_MASK(map)));
6822 vm_map_clip_end(map,
6823 entry,
6824 vm_map_round_page(offset + *upl_size,
6825 VM_MAP_PAGE_MASK(map)));
6826 if ((entry->vme_end - offset) < *upl_size) {
6827 *upl_size = (upl_size_t) (entry->vme_end - offset);
6828 assert(*upl_size == entry->vme_end - offset);
6829 }
6830
6831 prot = entry->protection & ~VM_PROT_WRITE;
6832 if (override_nx(map, VME_ALIAS(entry)) && prot) {
6833 prot |= VM_PROT_EXECUTE;
6834 }
6835 vm_object_pmap_protect(local_object,
6836 VME_OFFSET(entry),
6837 entry->vme_end - entry->vme_start,
6838 ((entry->is_shared ||
6839 map->mapped_in_other_pmaps)
6840 ? PMAP_NULL
6841 : map->pmap),
6842 VM_MAP_PAGE_SIZE(map),
6843 entry->vme_start,
6844 prot);
6845
6846 assert(entry->wired_count == 0);
6847
6848 /*
6849 * Lock the VM object and re-check its status: if it's mapped
6850 * in another address space, we could still be racing with
6851 * another thread holding that other VM map exclusively.
6852 */
6853 vm_object_lock(local_object);
6854 if (local_object->true_share) {
6855 /* object is already in proper state: no COW needed */
6856 assert(local_object->copy_strategy !=
6857 MEMORY_OBJECT_COPY_SYMMETRIC);
6858 } else {
6859 /* not true_share: ask for copy-on-write below */
6860 assert(local_object->copy_strategy ==
6861 MEMORY_OBJECT_COPY_SYMMETRIC);
6862 entry->needs_copy = TRUE;
6863 }
6864 vm_object_unlock(local_object);
6865
6866 vm_map_lock_write_to_read(map);
6867 }
6868
6869 if (entry->needs_copy) {
6870 /*
6871 * Honor copy-on-write for COPY_SYMMETRIC
6872 * strategy.
6873 */
6874 vm_map_t local_map;
6875 vm_object_t object;
6876 vm_object_offset_t new_offset;
6877 vm_prot_t prot;
6878 boolean_t wired;
6879 vm_map_version_t version;
6880 vm_map_t real_map;
6881 vm_prot_t fault_type;
6882
6883 local_map = map;
6884
6885 if (caller_flags & UPL_COPYOUT_FROM) {
6886 fault_type = VM_PROT_READ | VM_PROT_COPY;
6887 vm_counters.create_upl_extra_cow++;
6888 vm_counters.create_upl_extra_cow_pages +=
6889 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6890 } else {
6891 fault_type = VM_PROT_WRITE;
6892 }
6893 if (vm_map_lookup_and_lock_object(&local_map,
6894 offset, fault_type,
6895 OBJECT_LOCK_EXCLUSIVE,
6896 &version, &object,
6897 &new_offset, &prot, &wired,
6898 NULL,
6899 &real_map, NULL) != KERN_SUCCESS) {
6900 if (fault_type == VM_PROT_WRITE) {
6901 vm_counters.create_upl_lookup_failure_write++;
6902 } else {
6903 vm_counters.create_upl_lookup_failure_copy++;
6904 }
6905 vm_map_unlock_read(local_map);
6906 ret = KERN_FAILURE;
6907 goto done;
6908 }
6909 if (real_map != local_map) {
6910 vm_map_unlock(real_map);
6911 }
6912 vm_map_unlock_read(local_map);
6913
6914 vm_object_unlock(object);
6915
6916 goto REDISCOVER_ENTRY;
6917 }
6918
6919 if (entry->is_sub_map) {
6920 vm_map_t submap;
6921
6922 submap = VME_SUBMAP(entry);
6923 local_start = entry->vme_start;
6924 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6925
6926 vm_map_reference(submap);
6927 vm_map_unlock_read(map);
6928
6929 DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
6930 offset += offset_in_mapped_page;
6931 *upl_size -= offset_in_mapped_page;
6932
6933 if (release_map) {
6934 vm_map_deallocate(map);
6935 }
6936 map = submap;
6937 release_map = TRUE;
6938 offset = local_offset + (offset - local_start);
6939 goto start_with_map;
6940 }
6941
6942 if (sync_cow_data &&
6943 (VME_OBJECT(entry)->shadow ||
6944 VME_OBJECT(entry)->copy)) {
6945 local_object = VME_OBJECT(entry);
6946 local_start = entry->vme_start;
6947 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6948
6949 vm_object_reference(local_object);
6950 vm_map_unlock_read(map);
6951
6952 if (local_object->shadow && local_object->copy) {
6953 vm_object_lock_request(local_object->shadow,
6954 ((vm_object_offset_t)
6955 ((offset - local_start) +
6956 local_offset) +
6957 local_object->vo_shadow_offset),
6958 *upl_size, FALSE,
6959 MEMORY_OBJECT_DATA_SYNC,
6960 VM_PROT_NO_CHANGE);
6961 }
6962 sync_cow_data = FALSE;
6963 vm_object_deallocate(local_object);
6964
6965 goto REDISCOVER_ENTRY;
6966 }
6967 if (force_data_sync) {
6968 local_object = VME_OBJECT(entry);
6969 local_start = entry->vme_start;
6970 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6971
6972 vm_object_reference(local_object);
6973 vm_map_unlock_read(map);
6974
6975 vm_object_lock_request(local_object,
6976 ((vm_object_offset_t)
6977 ((offset - local_start) +
6978 local_offset)),
6979 (vm_object_size_t)*upl_size,
6980 FALSE,
6981 MEMORY_OBJECT_DATA_SYNC,
6982 VM_PROT_NO_CHANGE);
6983
6984 force_data_sync = FALSE;
6985 vm_object_deallocate(local_object);
6986
6987 goto REDISCOVER_ENTRY;
6988 }
6989 if (VME_OBJECT(entry)->private) {
6990 *flags = UPL_DEV_MEMORY;
6991 } else {
6992 *flags = 0;
6993 }
6994
6995 if (VME_OBJECT(entry)->phys_contiguous) {
6996 *flags |= UPL_PHYS_CONTIG;
6997 }
6998
6999 local_object = VME_OBJECT(entry);
7000 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7001 local_start = entry->vme_start;
7002
7003 /*
7004 * Wiring will copy the pages to the shadow object.
7005 * The shadow object will not be code-signed so
7006 * attempting to execute code from these copied pages
7007 * would trigger a code-signing violation.
7008 */
7009 if (entry->protection & VM_PROT_EXECUTE) {
7010 #if MACH_ASSERT
7011 printf("pid %d[%s] create_upl out of executable range from "
7012 "0x%llx to 0x%llx: side effects may include "
7013 "code-signing violations later on\n",
7014 proc_selfpid(),
7015 (get_bsdtask_info(current_task())
7016 ? proc_name_address(get_bsdtask_info(current_task()))
7017 : "?"),
7018 (uint64_t) entry->vme_start,
7019 (uint64_t) entry->vme_end);
7020 #endif /* MACH_ASSERT */
7021 DTRACE_VM2(cs_executable_create_upl,
7022 uint64_t, (uint64_t)entry->vme_start,
7023 uint64_t, (uint64_t)entry->vme_end);
7024 cs_executable_create_upl++;
7025 }
7026
7027 vm_object_lock(local_object);
7028
7029 /*
7030 * Ensure that this object is "true_share" and "copy_delay" now,
7031 * while we're still holding the VM map lock. After we unlock the map,
7032 * anything could happen to that mapping, including some copy-on-write
7033 * activity. We need to make sure that the IOPL will point at the
7034 * same memory as the mapping.
7035 */
7036 if (local_object->true_share) {
7037 assert(local_object->copy_strategy !=
7038 MEMORY_OBJECT_COPY_SYMMETRIC);
7039 } else if (local_object != kernel_object &&
7040 local_object != compressor_object &&
7041 !local_object->phys_contiguous) {
7042 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7043 if (!local_object->true_share &&
7044 vm_object_tracking_btlog) {
7045 btlog_record(vm_object_tracking_btlog, local_object,
7046 VM_OBJECT_TRACKING_OP_TRUESHARE,
7047 btref_get(__builtin_frame_address(0), 0));
7048 }
7049 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7050 local_object->true_share = TRUE;
7051 if (local_object->copy_strategy ==
7052 MEMORY_OBJECT_COPY_SYMMETRIC) {
7053 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7054 }
7055 }
7056
7057 vm_object_reference_locked(local_object);
7058 vm_object_unlock(local_object);
7059
7060 vm_map_unlock_read(map);
7061
7062 offset += offset_in_mapped_page;
7063 assert(*upl_size > offset_in_mapped_page);
7064 *upl_size -= offset_in_mapped_page;
7065
7066 ret = vm_object_iopl_request(local_object,
7067 ((vm_object_offset_t)
7068 ((offset - local_start) + local_offset)),
7069 *upl_size,
7070 upl,
7071 page_list,
7072 count,
7073 caller_flags,
7074 tag);
7075 vm_object_deallocate(local_object);
7076
7077 done:
7078 if (release_map) {
7079 vm_map_deallocate(map);
7080 }
7081
7082 return ret;
7083 }
7084
7085 /*
7086 * Internal routine to enter a UPL into a VM map.
7087 *
7088 * JMM - This should just be doable through the standard
7089 * vm_map_enter() API.
7090 */
7091 kern_return_t
vm_map_enter_upl_range(vm_map_t map,upl_t upl,vm_object_offset_t offset_to_map,upl_size_t size_to_map,vm_prot_t prot_to_map,vm_map_offset_t * dst_addr)7092 vm_map_enter_upl_range(
7093 vm_map_t map,
7094 upl_t upl,
7095 vm_object_offset_t offset_to_map,
7096 upl_size_t size_to_map,
7097 vm_prot_t prot_to_map,
7098 vm_map_offset_t *dst_addr)
7099 {
7100 vm_map_size_t size;
7101 vm_object_offset_t offset;
7102 vm_map_offset_t addr;
7103 vm_page_t m;
7104 kern_return_t kr;
7105 int isVectorUPL = 0, curr_upl = 0;
7106 upl_t vector_upl = NULL;
7107 mach_vm_offset_t vector_upl_dst_addr = 0;
7108 vm_map_t vector_upl_submap = NULL;
7109 upl_offset_t subupl_offset = 0;
7110 upl_size_t subupl_size = 0;
7111
7112 if (upl == UPL_NULL) {
7113 return KERN_INVALID_ARGUMENT;
7114 }
7115
7116 DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7117 assert(map == kernel_map);
7118
7119 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7120 int mapped = 0, valid_upls = 0;
7121 vector_upl = upl;
7122
7123 upl_lock(vector_upl);
7124 for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
7125 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7126 if (upl == NULL) {
7127 continue;
7128 }
7129 valid_upls++;
7130 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7131 mapped++;
7132 }
7133 }
7134
7135 if (mapped) {
7136 if (mapped != valid_upls) {
7137 panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7138 } else {
7139 upl_unlock(vector_upl);
7140 return KERN_FAILURE;
7141 }
7142 }
7143
7144 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7145 panic("TODO4K: vector UPL not implemented");
7146 }
7147
7148 vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7149 vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7150 VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7151 VM_KERN_MEMORY_NONE).kmr_submap;
7152 map = vector_upl_submap;
7153 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7154 curr_upl = 0;
7155 } else {
7156 upl_lock(upl);
7157 }
7158
7159 process_upl_to_enter:
7160 if (isVectorUPL) {
7161 if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
7162 *dst_addr = vector_upl_dst_addr;
7163 upl_unlock(vector_upl);
7164 return KERN_SUCCESS;
7165 }
7166 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7167 if (upl == NULL) {
7168 goto process_upl_to_enter;
7169 }
7170
7171 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7172 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7173 } else {
7174 /*
7175 * check to see if already mapped
7176 */
7177 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7178 upl_unlock(upl);
7179 return KERN_FAILURE;
7180 }
7181 }
7182
7183 if ((!(upl->flags & UPL_SHADOWED)) &&
7184 ((upl->flags & UPL_HAS_BUSY) ||
7185 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7186 vm_object_t object;
7187 vm_page_t alias_page;
7188 vm_object_offset_t new_offset;
7189 unsigned int pg_num;
7190 wpl_array_t lite_list;
7191
7192 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7193 if (upl->flags & UPL_INTERNAL) {
7194 lite_list = (wpl_array_t)
7195 ((((uintptr_t)upl) + sizeof(struct upl))
7196 + ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
7197 } else {
7198 lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
7199 }
7200 object = upl->map_object;
7201 upl->map_object = vm_object_allocate(vm_object_round_page(size));
7202
7203 vm_object_lock(upl->map_object);
7204
7205 upl->map_object->shadow = object;
7206 upl->map_object->pageout = TRUE;
7207 upl->map_object->can_persist = FALSE;
7208 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7209 upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7210 assertf(page_aligned(upl->map_object->vo_shadow_offset),
7211 "object %p shadow_offset 0x%llx",
7212 upl->map_object,
7213 (uint64_t)upl->map_object->vo_shadow_offset);
7214 upl->map_object->wimg_bits = object->wimg_bits;
7215 offset = upl->map_object->vo_shadow_offset;
7216 new_offset = 0;
7217
7218 upl->flags |= UPL_SHADOWED;
7219
7220 while (size) {
7221 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7222 assert(pg_num == new_offset / PAGE_SIZE);
7223
7224 if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
7225 alias_page = vm_page_grab_fictitious(TRUE);
7226
7227 vm_object_lock(object);
7228
7229 m = vm_page_lookup(object, offset);
7230 if (m == VM_PAGE_NULL) {
7231 panic("vm_upl_map: page missing");
7232 }
7233
7234 /*
7235 * Convert the fictitious page to a private
7236 * shadow of the real page.
7237 */
7238 assert(alias_page->vmp_fictitious);
7239 alias_page->vmp_fictitious = FALSE;
7240 alias_page->vmp_private = TRUE;
7241 alias_page->vmp_free_when_done = TRUE;
7242 /*
7243 * since m is a page in the upl it must
7244 * already be wired or BUSY, so it's
7245 * safe to assign the underlying physical
7246 * page to the alias
7247 */
7248 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7249
7250 vm_object_unlock(object);
7251
7252 vm_page_lockspin_queues();
7253 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7254 vm_page_unlock_queues();
7255
7256 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7257
7258 assert(!alias_page->vmp_wanted);
7259 alias_page->vmp_busy = FALSE;
7260 alias_page->vmp_absent = FALSE;
7261 }
7262 size -= PAGE_SIZE;
7263 offset += PAGE_SIZE_64;
7264 new_offset += PAGE_SIZE_64;
7265 }
7266 vm_object_unlock(upl->map_object);
7267 }
7268 if (upl->flags & UPL_SHADOWED) {
7269 if (isVectorUPL) {
7270 offset = 0;
7271 } else {
7272 offset = offset_to_map;
7273 }
7274 } else {
7275 offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7276 if (!isVectorUPL) {
7277 offset += offset_to_map;
7278 }
7279 }
7280
7281 if (isVectorUPL) {
7282 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7283 } else {
7284 size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7285 }
7286
7287 vm_object_reference(upl->map_object);
7288
7289 if (!isVectorUPL) {
7290 *dst_addr = 0;
7291 /*
7292 * NEED A UPL_MAP ALIAS
7293 */
7294 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7295 VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_DATA, VM_KERN_MEMORY_OSFMK,
7296 upl->map_object, offset, FALSE,
7297 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7298
7299 if (kr != KERN_SUCCESS) {
7300 vm_object_deallocate(upl->map_object);
7301 upl_unlock(upl);
7302 return kr;
7303 }
7304 } else {
7305 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7306 VM_FLAGS_FIXED, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_OSFMK,
7307 upl->map_object, offset, FALSE,
7308 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7309 if (kr) {
7310 panic("vm_map_enter failed for a Vector UPL");
7311 }
7312 }
7313 upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7314 /* this will have to be an increment rather than */
7315 /* an assignment. */
7316 vm_object_lock(upl->map_object);
7317
7318 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7319 m = vm_page_lookup(upl->map_object, offset);
7320
7321 if (m) {
7322 m->vmp_pmapped = TRUE;
7323
7324 /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
7325 * but only in kernel space. If this was on a user map,
7326 * we'd have to set the wpmapped bit. */
7327 /* m->vmp_wpmapped = TRUE; */
7328 assert(map->pmap == kernel_pmap);
7329
7330 PMAP_ENTER(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE, kr);
7331
7332 assert(kr == KERN_SUCCESS);
7333 #if KASAN
7334 kasan_notify_address(addr, PAGE_SIZE_64);
7335 #endif
7336 }
7337 offset += PAGE_SIZE_64;
7338 }
7339 vm_object_unlock(upl->map_object);
7340
7341 /*
7342 * hold a reference for the mapping
7343 */
7344 upl->ref_count++;
7345 upl->flags |= UPL_PAGE_LIST_MAPPED;
7346 upl->kaddr = (vm_offset_t) *dst_addr;
7347 assert(upl->kaddr == *dst_addr);
7348
7349 if (isVectorUPL) {
7350 goto process_upl_to_enter;
7351 }
7352
7353 if (!isVectorUPL) {
7354 vm_map_offset_t addr_adjustment;
7355
7356 addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7357 if (addr_adjustment) {
7358 assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7359 DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7360 *dst_addr += addr_adjustment;
7361 }
7362 }
7363
7364 upl_unlock(upl);
7365
7366 return KERN_SUCCESS;
7367 }
7368
7369 kern_return_t
vm_map_enter_upl(vm_map_t map,upl_t upl,vm_map_offset_t * dst_addr)7370 vm_map_enter_upl(
7371 vm_map_t map,
7372 upl_t upl,
7373 vm_map_offset_t *dst_addr)
7374 {
7375 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7376 return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7377 }
7378
7379 /*
7380 * Internal routine to remove a UPL mapping from a VM map.
7381 *
7382 * XXX - This should just be doable through a standard
7383 * vm_map_remove() operation. Otherwise, implicit clean-up
7384 * of the target map won't be able to correctly remove
7385 * these (and release the reference on the UPL). Having
7386 * to do this means we can't map these into user-space
7387 * maps yet.
7388 */
kern_return_t
vm_map_remove_upl_range(
	vm_map_t map,
	upl_t upl,
	__unused vm_object_offset_t offset_to_unmap,
	__unused upl_size_t size_to_unmap)
{
	vm_address_t addr;
	upl_size_t size;
	int isVectorUPL = 0, curr_upl = 0;
	upl_t vector_upl = NULL;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		int unmapped = 0, valid_upls = 0;
		vector_upl = upl;
		/*
		 * Sanity-check the vector UPL: its valid sub-UPLs must either
		 * all be mapped (proceed) or all be unmapped (fail); a mix
		 * indicates corrupted state and panics.
		 */
		upl_lock(vector_upl);
		for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
			upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
			if (upl == NULL) {
				continue;
			}
			valid_upls++;
			if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
				unmapped++;
			}
		}

		if (unmapped) {
			if (unmapped != valid_upls) {
				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
			} else {
				upl_unlock(vector_upl);
				return KERN_FAILURE;
			}
		}
		/* restart the index for the unmap pass below */
		curr_upl = 0;
	} else {
		upl_lock(upl);
	}

process_upl_to_remove:
	if (isVectorUPL) {
		if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
			/*
			 * All sub-UPLs have had their mapping state cleared;
			 * tear down the entire submap in one operation, which
			 * removes every sub-UPL mapping at once.
			 */
			vm_map_t v_upl_submap;
			vm_offset_t v_upl_submap_dst_addr;
			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);

			kmem_free_guard(map, v_upl_submap_dst_addr,
			    vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
			vm_map_deallocate(v_upl_submap);
			upl_unlock(vector_upl);
			return KERN_SUCCESS;
		}

		upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
		if (upl == NULL) {
			goto process_upl_to_remove;
		}
	}

	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
		/* capture the mapping before clearing the UPL's record of it */
		addr = upl->kaddr;
		size = upl->u_mapped_size;

		assert(upl->ref_count > 1);
		upl->ref_count--;               /* removing mapping ref */

		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
		upl->kaddr = (vm_offset_t) 0;
		upl->u_mapped_size = 0;

		if (isVectorUPL) {
			/*
			 * If it's a Vectored UPL, we'll be removing the entire
			 * submap anyways, so no need to remove individual UPL
			 * element mappings from within the submap
			 */
			goto process_upl_to_remove;
		}

		/* drop the UPL lock before calling into vm_map_remove() */
		upl_unlock(upl);

		vm_map_remove(map,
		    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
		    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
		return KERN_SUCCESS;
	}
	upl_unlock(upl);

	/* non-vector UPL that was never mapped */
	return KERN_FAILURE;
}
7484
7485 kern_return_t
vm_map_remove_upl(vm_map_t map,upl_t upl)7486 vm_map_remove_upl(
7487 vm_map_t map,
7488 upl_t upl)
7489 {
7490 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7491 return vm_map_remove_upl_range(map, upl, 0, upl_size);
7492 }
7493
7494 kern_return_t
upl_commit_range(upl_t upl,upl_offset_t offset,upl_size_t size,int flags,upl_page_info_t * page_list,mach_msg_type_number_t count,boolean_t * empty)7495 upl_commit_range(
7496 upl_t upl,
7497 upl_offset_t offset,
7498 upl_size_t size,
7499 int flags,
7500 upl_page_info_t *page_list,
7501 mach_msg_type_number_t count,
7502 boolean_t *empty)
7503 {
7504 upl_size_t xfer_size, subupl_size;
7505 vm_object_t shadow_object;
7506 vm_object_t object;
7507 vm_object_t m_object;
7508 vm_object_offset_t target_offset;
7509 upl_offset_t subupl_offset = offset;
7510 int entry;
7511 wpl_array_t lite_list;
7512 int occupied;
7513 int clear_refmod = 0;
7514 int pgpgout_count = 0;
7515 struct vm_page_delayed_work dw_array;
7516 struct vm_page_delayed_work *dwp, *dwp_start;
7517 bool dwp_finish_ctx = TRUE;
7518 int dw_count;
7519 int dw_limit;
7520 int isVectorUPL = 0;
7521 upl_t vector_upl = NULL;
7522 boolean_t should_be_throttled = FALSE;
7523
7524 vm_page_t nxt_page = VM_PAGE_NULL;
7525 int fast_path_possible = 0;
7526 int fast_path_full_commit = 0;
7527 int throttle_page = 0;
7528 int unwired_count = 0;
7529 int local_queue_count = 0;
7530 vm_page_t first_local, last_local;
7531 vm_object_offset_t obj_start, obj_end, obj_offset;
7532 kern_return_t kr = KERN_SUCCESS;
7533
7534 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7535
7536 dwp_start = dwp = NULL;
7537
7538 subupl_size = size;
7539 *empty = FALSE;
7540
7541 if (upl == UPL_NULL) {
7542 return KERN_INVALID_ARGUMENT;
7543 }
7544
7545 dw_count = 0;
7546 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7547 dwp_start = vm_page_delayed_work_get_ctx();
7548 if (dwp_start == NULL) {
7549 dwp_start = &dw_array;
7550 dw_limit = 1;
7551 dwp_finish_ctx = FALSE;
7552 }
7553
7554 dwp = dwp_start;
7555
7556 if (count == 0) {
7557 page_list = NULL;
7558 }
7559
7560 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7561 vector_upl = upl;
7562 upl_lock(vector_upl);
7563 } else {
7564 upl_lock(upl);
7565 }
7566
7567 process_upl_to_commit:
7568
7569 if (isVectorUPL) {
7570 size = subupl_size;
7571 offset = subupl_offset;
7572 if (size == 0) {
7573 upl_unlock(vector_upl);
7574 kr = KERN_SUCCESS;
7575 goto done;
7576 }
7577 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7578 if (upl == NULL) {
7579 upl_unlock(vector_upl);
7580 kr = KERN_FAILURE;
7581 goto done;
7582 }
7583 page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
7584 subupl_size -= size;
7585 subupl_offset += size;
7586 }
7587
7588 #if UPL_DEBUG
7589 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7590 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7591
7592 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7593 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7594
7595 upl->upl_commit_index++;
7596 }
7597 #endif
7598 if (upl->flags & UPL_DEVICE_MEMORY) {
7599 xfer_size = 0;
7600 } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7601 xfer_size = size;
7602 } else {
7603 if (!isVectorUPL) {
7604 upl_unlock(upl);
7605 } else {
7606 upl_unlock(vector_upl);
7607 }
7608 DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7609 kr = KERN_FAILURE;
7610 goto done;
7611 }
7612 if (upl->flags & UPL_SET_DIRTY) {
7613 flags |= UPL_COMMIT_SET_DIRTY;
7614 }
7615 if (upl->flags & UPL_CLEAR_DIRTY) {
7616 flags |= UPL_COMMIT_CLEAR_DIRTY;
7617 }
7618
7619 if (upl->flags & UPL_INTERNAL) {
7620 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
7621 + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));
7622 } else {
7623 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
7624 }
7625
7626 object = upl->map_object;
7627
7628 if (upl->flags & UPL_SHADOWED) {
7629 vm_object_lock(object);
7630 shadow_object = object->shadow;
7631 } else {
7632 shadow_object = object;
7633 }
7634 entry = offset / PAGE_SIZE;
7635 target_offset = (vm_object_offset_t)offset;
7636
7637 if (upl->flags & UPL_KERNEL_OBJECT) {
7638 vm_object_lock_shared(shadow_object);
7639 } else {
7640 vm_object_lock(shadow_object);
7641 }
7642
7643 VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7644
7645 if (upl->flags & UPL_ACCESS_BLOCKED) {
7646 assert(shadow_object->blocked_access);
7647 shadow_object->blocked_access = FALSE;
7648 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7649 }
7650
7651 if (shadow_object->code_signed) {
7652 /*
7653 * CODE SIGNING:
7654 * If the object is code-signed, do not let this UPL tell
7655 * us if the pages are valid or not. Let the pages be
7656 * validated by VM the normal way (when they get mapped or
7657 * copied).
7658 */
7659 flags &= ~UPL_COMMIT_CS_VALIDATED;
7660 }
7661 if (!page_list) {
7662 /*
7663 * No page list to get the code-signing info from !?
7664 */
7665 flags &= ~UPL_COMMIT_CS_VALIDATED;
7666 }
7667 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7668 should_be_throttled = TRUE;
7669 }
7670
7671 if ((upl->flags & UPL_IO_WIRE) &&
7672 !(flags & UPL_COMMIT_FREE_ABSENT) &&
7673 !isVectorUPL &&
7674 shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7675 shadow_object->purgable != VM_PURGABLE_EMPTY) {
7676 if (!vm_page_queue_empty(&shadow_object->memq)) {
7677 if (size == shadow_object->vo_size) {
7678 nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7679 fast_path_full_commit = 1;
7680 }
7681 fast_path_possible = 1;
7682
7683 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7684 (shadow_object->purgable == VM_PURGABLE_DENY ||
7685 shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7686 shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7687 throttle_page = 1;
7688 }
7689 }
7690 }
7691 first_local = VM_PAGE_NULL;
7692 last_local = VM_PAGE_NULL;
7693
7694 obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7695 obj_end = obj_start + xfer_size;
7696 obj_start = vm_object_trunc_page(obj_start);
7697 obj_end = vm_object_round_page(obj_end);
7698 for (obj_offset = obj_start;
7699 obj_offset < obj_end;
7700 obj_offset += PAGE_SIZE) {
7701 vm_page_t t, m;
7702
7703 dwp->dw_mask = 0;
7704 clear_refmod = 0;
7705
7706 m = VM_PAGE_NULL;
7707
7708 if (upl->flags & UPL_LITE) {
7709 unsigned int pg_num;
7710
7711 if (nxt_page != VM_PAGE_NULL) {
7712 m = nxt_page;
7713 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7714 target_offset = m->vmp_offset;
7715 }
7716 pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7717 assert(pg_num == target_offset / PAGE_SIZE);
7718
7719 if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
7720 lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
7721
7722 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7723 m = vm_page_lookup(shadow_object, obj_offset);
7724 }
7725 } else {
7726 m = NULL;
7727 }
7728 }
7729 if (upl->flags & UPL_SHADOWED) {
7730 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7731 t->vmp_free_when_done = FALSE;
7732
7733 VM_PAGE_FREE(t);
7734
7735 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7736 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7737 }
7738 }
7739 }
7740 if (m == VM_PAGE_NULL) {
7741 goto commit_next_page;
7742 }
7743
7744 m_object = VM_PAGE_OBJECT(m);
7745
7746 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7747 assert(m->vmp_busy);
7748
7749 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7750 goto commit_next_page;
7751 }
7752
7753 if (flags & UPL_COMMIT_CS_VALIDATED) {
7754 /*
7755 * CODE SIGNING:
7756 * Set the code signing bits according to
7757 * what the UPL says they should be.
7758 */
7759 m->vmp_cs_validated |= page_list[entry].cs_validated;
7760 m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7761 m->vmp_cs_nx |= page_list[entry].cs_nx;
7762 }
7763 if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7764 m->vmp_written_by_kernel = TRUE;
7765 }
7766
7767 if (upl->flags & UPL_IO_WIRE) {
7768 if (page_list) {
7769 page_list[entry].phys_addr = 0;
7770 }
7771
7772 if (flags & UPL_COMMIT_SET_DIRTY) {
7773 SET_PAGE_DIRTY(m, FALSE);
7774 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7775 m->vmp_dirty = FALSE;
7776
7777 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7778 m->vmp_cs_validated &&
7779 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7780 /*
7781 * CODE SIGNING:
7782 * This page is no longer dirty
7783 * but could have been modified,
7784 * so it will need to be
7785 * re-validated.
7786 */
7787 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7788
7789 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7790
7791 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7792 }
7793 clear_refmod |= VM_MEM_MODIFIED;
7794 }
7795 if (upl->flags & UPL_ACCESS_BLOCKED) {
7796 /*
7797 * We blocked access to the pages in this UPL.
7798 * Clear the "busy" bit and wake up any waiter
7799 * for this page.
7800 */
7801 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7802 }
7803 if (fast_path_possible) {
7804 assert(m_object->purgable != VM_PURGABLE_EMPTY);
7805 assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7806 if (m->vmp_absent) {
7807 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7808 assert(m->vmp_wire_count == 0);
7809 assert(m->vmp_busy);
7810
7811 m->vmp_absent = FALSE;
7812 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7813 } else {
7814 if (m->vmp_wire_count == 0) {
7815 panic("wire_count == 0, m = %p, obj = %p", m, shadow_object);
7816 }
7817 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7818
7819 /*
7820 * XXX FBDP need to update some other
7821 * counters here (purgeable_wired_count)
7822 * (ledgers), ...
7823 */
7824 assert(m->vmp_wire_count > 0);
7825 m->vmp_wire_count--;
7826
7827 if (m->vmp_wire_count == 0) {
7828 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7829 unwired_count++;
7830 }
7831 }
7832 if (m->vmp_wire_count == 0) {
7833 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7834
7835 if (last_local == VM_PAGE_NULL) {
7836 assert(first_local == VM_PAGE_NULL);
7837
7838 last_local = m;
7839 first_local = m;
7840 } else {
7841 assert(first_local != VM_PAGE_NULL);
7842
7843 m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7844 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7845 first_local = m;
7846 }
7847 local_queue_count++;
7848
7849 if (throttle_page) {
7850 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7851 } else {
7852 if (flags & UPL_COMMIT_INACTIVATE) {
7853 if (shadow_object->internal) {
7854 m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7855 } else {
7856 m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7857 }
7858 } else {
7859 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7860 }
7861 }
7862 }
7863 } else {
7864 if (flags & UPL_COMMIT_INACTIVATE) {
7865 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7866 clear_refmod |= VM_MEM_REFERENCED;
7867 }
7868 if (m->vmp_absent) {
7869 if (flags & UPL_COMMIT_FREE_ABSENT) {
7870 dwp->dw_mask |= DW_vm_page_free;
7871 } else {
7872 m->vmp_absent = FALSE;
7873 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7874
7875 if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7876 dwp->dw_mask |= DW_vm_page_activate;
7877 }
7878 }
7879 } else {
7880 dwp->dw_mask |= DW_vm_page_unwire;
7881 }
7882 }
7883 goto commit_next_page;
7884 }
7885 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7886
7887 if (page_list) {
7888 page_list[entry].phys_addr = 0;
7889 }
7890
7891 /*
7892 * make sure to clear the hardware
7893 * modify or reference bits before
7894 * releasing the BUSY bit on this page
7895 * otherwise we risk losing a legitimate
7896 * change of state
7897 */
7898 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7899 m->vmp_dirty = FALSE;
7900
7901 clear_refmod |= VM_MEM_MODIFIED;
7902 }
7903 if (m->vmp_laundry) {
7904 dwp->dw_mask |= DW_vm_pageout_throttle_up;
7905 }
7906
7907 if (VM_PAGE_WIRED(m)) {
7908 m->vmp_free_when_done = FALSE;
7909 }
7910
7911 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7912 m->vmp_cs_validated &&
7913 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7914 /*
7915 * CODE SIGNING:
7916 * This page is no longer dirty
7917 * but could have been modified,
7918 * so it will need to be
7919 * re-validated.
7920 */
7921 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7922
7923 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7924
7925 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7926 }
7927 if (m->vmp_overwriting) {
7928 /*
7929 * the (COPY_OUT_FROM == FALSE) request_page_list case
7930 */
7931 if (m->vmp_busy) {
7932 #if CONFIG_PHANTOM_CACHE
7933 if (m->vmp_absent && !m_object->internal) {
7934 dwp->dw_mask |= DW_vm_phantom_cache_update;
7935 }
7936 #endif
7937 m->vmp_absent = FALSE;
7938
7939 dwp->dw_mask |= DW_clear_busy;
7940 } else {
7941 /*
7942 * alternate (COPY_OUT_FROM == FALSE) page_list case
7943 * Occurs when the original page was wired
7944 * at the time of the list request
7945 */
7946 assert(VM_PAGE_WIRED(m));
7947
7948 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
7949 }
7950 m->vmp_overwriting = FALSE;
7951 }
7952 m->vmp_cleaning = FALSE;
7953
7954 if (m->vmp_free_when_done) {
7955 /*
7956 * With the clean queue enabled, UPL_PAGEOUT should
7957 * no longer set the pageout bit. Its pages now go
7958 * to the clean queue.
7959 *
7960 * We don't use the cleaned Q anymore and so this
7961 * assert isn't correct. The code for the clean Q
7962 * still exists and might be used in the future. If we
7963 * go back to the cleaned Q, we will re-enable this
7964 * assert.
7965 *
7966 * assert(!(upl->flags & UPL_PAGEOUT));
7967 */
7968 assert(!m_object->internal);
7969
7970 m->vmp_free_when_done = FALSE;
7971
7972 if ((flags & UPL_COMMIT_SET_DIRTY) ||
7973 (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
7974 /*
7975 * page was re-dirtied after we started
7976 * the pageout... reactivate it since
7977 * we don't know whether the on-disk
7978 * copy matches what is now in memory
7979 */
7980 SET_PAGE_DIRTY(m, FALSE);
7981
7982 dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
7983
7984 if (upl->flags & UPL_PAGEOUT) {
7985 counter_inc(&vm_statistics_reactivations);
7986 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
7987 }
7988 } else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) {
7989 /*
7990 * Someone else might still be handling this
7991 * page (vm_fault() for example), so let's not
7992 * free it or "un-busy" it!
7993 * Put that page in the "speculative" queue
7994 * for now (since we would otherwise have freed
7995 * it) and let whoever is keeping the page
7996 * "busy" move it if needed when they're done
7997 * with it.
7998 */
7999 dwp->dw_mask |= DW_vm_page_speculate;
8000 } else {
8001 /*
8002 * page has been successfully cleaned
8003 * go ahead and free it for other use
8004 */
8005 if (m_object->internal) {
8006 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
8007 } else {
8008 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
8009 }
8010 m->vmp_dirty = FALSE;
8011 if (!(upl->flags & UPL_HAS_BUSY)) {
8012 assert(!m->vmp_busy);
8013 }
8014 m->vmp_busy = TRUE;
8015
8016 dwp->dw_mask |= DW_vm_page_free;
8017 }
8018 goto commit_next_page;
8019 }
8020 /*
8021 * It is a part of the semantic of COPYOUT_FROM
8022 * UPLs that a commit implies cache sync
8023 * between the vm page and the backing store
8024 * this can be used to strip the precious bit
8025 * as well as clean
8026 */
8027 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
8028 m->vmp_precious = FALSE;
8029 }
8030
8031 if (flags & UPL_COMMIT_SET_DIRTY) {
8032 SET_PAGE_DIRTY(m, FALSE);
8033 } else {
8034 m->vmp_dirty = FALSE;
8035 }
8036
8037 /* with the clean queue on, move *all* cleaned pages to the clean queue */
8038 if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
8039 pgpgout_count++;
8040
8041 counter_inc(&vm_statistics_pageouts);
8042 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
8043
8044 dwp->dw_mask |= DW_enqueue_cleaned;
8045 } else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
8046 /*
8047 * page coming back in from being 'frozen'...
8048 * it was dirty before it was frozen, so keep it so
8049 * the vm_page_activate will notice that it really belongs
8050 * on the throttle queue and put it there
8051 */
8052 SET_PAGE_DIRTY(m, FALSE);
8053 dwp->dw_mask |= DW_vm_page_activate;
8054 } else {
8055 if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
8056 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8057 clear_refmod |= VM_MEM_REFERENCED;
8058 } else if (!VM_PAGE_PAGEABLE(m)) {
8059 if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
8060 dwp->dw_mask |= DW_vm_page_speculate;
8061 } else if (m->vmp_reference) {
8062 dwp->dw_mask |= DW_vm_page_activate;
8063 } else {
8064 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8065 clear_refmod |= VM_MEM_REFERENCED;
8066 }
8067 }
8068 }
8069 if (upl->flags & UPL_ACCESS_BLOCKED) {
8070 /*
			 * We blocked access to the pages in this UPL.
8072 * Clear the "busy" bit on this page before we
8073 * wake up any waiter.
8074 */
8075 dwp->dw_mask |= DW_clear_busy;
8076 }
8077 /*
8078 * Wakeup any thread waiting for the page to be un-cleaning.
8079 */
8080 dwp->dw_mask |= DW_PAGE_WAKEUP;
8081
8082 commit_next_page:
8083 if (clear_refmod) {
8084 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
8085 }
8086
8087 target_offset += PAGE_SIZE_64;
8088 xfer_size -= PAGE_SIZE;
8089 entry++;
8090
8091 if (dwp->dw_mask) {
8092 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8093 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8094
8095 if (dw_count >= dw_limit) {
8096 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8097
8098 dwp = dwp_start;
8099 dw_count = 0;
8100 }
8101 } else {
8102 if (dwp->dw_mask & DW_clear_busy) {
8103 m->vmp_busy = FALSE;
8104 }
8105
8106 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8107 PAGE_WAKEUP(m);
8108 }
8109 }
8110 }
8111 }
8112 if (dw_count) {
8113 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8114 dwp = dwp_start;
8115 dw_count = 0;
8116 }
8117
8118 if (fast_path_possible) {
8119 assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
8120 assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
8121
8122 if (local_queue_count || unwired_count) {
8123 if (local_queue_count) {
8124 vm_page_t first_target;
8125 vm_page_queue_head_t *target_queue;
8126
8127 if (throttle_page) {
8128 target_queue = &vm_page_queue_throttled;
8129 } else {
8130 if (flags & UPL_COMMIT_INACTIVATE) {
8131 if (shadow_object->internal) {
8132 target_queue = &vm_page_queue_anonymous;
8133 } else {
8134 target_queue = &vm_page_queue_inactive;
8135 }
8136 } else {
8137 target_queue = &vm_page_queue_active;
8138 }
8139 }
8140 /*
8141 * Transfer the entire local queue to a regular LRU page queues.
8142 */
8143 vm_page_lockspin_queues();
8144
8145 first_target = (vm_page_t) vm_page_queue_first(target_queue);
8146
8147 if (vm_page_queue_empty(target_queue)) {
8148 target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8149 } else {
8150 first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8151 }
8152
8153 target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
8154 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
8155 last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
8156
8157 /*
8158 * Adjust the global page counts.
8159 */
8160 if (throttle_page) {
8161 vm_page_throttled_count += local_queue_count;
8162 } else {
8163 if (flags & UPL_COMMIT_INACTIVATE) {
8164 if (shadow_object->internal) {
8165 vm_page_anonymous_count += local_queue_count;
8166 }
8167 vm_page_inactive_count += local_queue_count;
8168
8169 token_new_pagecount += local_queue_count;
8170 } else {
8171 vm_page_active_count += local_queue_count;
8172 }
8173
8174 if (shadow_object->internal) {
8175 vm_page_pageable_internal_count += local_queue_count;
8176 } else {
8177 vm_page_pageable_external_count += local_queue_count;
8178 }
8179 }
8180 } else {
8181 vm_page_lockspin_queues();
8182 }
8183 if (unwired_count) {
8184 vm_page_wire_count -= unwired_count;
8185 VM_CHECK_MEMORYSTATUS;
8186 }
8187 vm_page_unlock_queues();
8188
8189 VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
8190 }
8191 }
8192 occupied = 1;
8193
8194 if (upl->flags & UPL_DEVICE_MEMORY) {
8195 occupied = 0;
8196 } else if (upl->flags & UPL_LITE) {
8197 int pg_num;
8198 int i;
8199
8200 occupied = 0;
8201
8202 if (!fast_path_full_commit) {
8203 pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8204 pg_num = (pg_num + 31) >> 5;
8205
8206 for (i = 0; i < pg_num; i++) {
8207 if (lite_list[i] != 0) {
8208 occupied = 1;
8209 break;
8210 }
8211 }
8212 }
8213 } else {
8214 if (vm_page_queue_empty(&upl->map_object->memq)) {
8215 occupied = 0;
8216 }
8217 }
8218 if (occupied == 0) {
8219 /*
8220 * If this UPL element belongs to a Vector UPL and is
8221 * empty, then this is the right function to deallocate
8222 * it. So go ahead set the *empty variable. The flag
8223 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8224 * should be considered relevant for the Vector UPL and not
8225 * the internal UPLs.
8226 */
8227 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8228 *empty = TRUE;
8229 }
8230
8231 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8232 /*
8233 * this is not a paging object
8234 * so we need to drop the paging reference
8235 * that was taken when we created the UPL
8236 * against this object
8237 */
8238 vm_object_activity_end(shadow_object);
8239 vm_object_collapse(shadow_object, 0, TRUE);
8240 } else {
8241 /*
		 * we donated the paging reference to
8243 * the map object... vm_pageout_object_terminate
8244 * will drop this reference
8245 */
8246 }
8247 }
8248 VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
8249 vm_object_unlock(shadow_object);
8250 if (object != shadow_object) {
8251 vm_object_unlock(object);
8252 }
8253
8254 if (!isVectorUPL) {
8255 upl_unlock(upl);
8256 } else {
8257 /*
8258 * If we completed our operations on an UPL that is
8259 * part of a Vectored UPL and if empty is TRUE, then
8260 * we should go ahead and deallocate this UPL element.
8261 * Then we check if this was the last of the UPL elements
8262 * within that Vectored UPL. If so, set empty to TRUE
8263 * so that in ubc_upl_commit_range or ubc_upl_commit, we
8264 * can go ahead and deallocate the Vector UPL too.
8265 */
8266 if (*empty == TRUE) {
8267 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
8268 upl_deallocate(upl);
8269 }
8270 goto process_upl_to_commit;
8271 }
8272 if (pgpgout_count) {
8273 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
8274 }
8275
8276 kr = KERN_SUCCESS;
8277 done:
8278 if (dwp_start && dwp_finish_ctx) {
8279 vm_page_delayed_work_finish_ctx(dwp_start);
8280 dwp_start = dwp = NULL;
8281 }
8282
8283 return kr;
8284 }
8285
/*
 * upl_abort_range:
 *
 *	Abort the pages covered by the byte range [offset, offset+size) of
 *	the given UPL.  'error' is a mask of UPL_ABORT_* flags selecting how
 *	the pages are disposed of (UPL_ABORT_DUMP_PAGES, UPL_ABORT_RESTART,
 *	UPL_ABORT_UNAVAILABLE, UPL_ABORT_ERROR, UPL_ABORT_REFERENCE).
 *	On return, *empty is TRUE when the UPL no longer covers any pages
 *	and may be deallocated by the caller.  Handles both simple UPLs and
 *	vector UPLs (in which case each sub-UPL in the range is processed
 *	in turn via the process_upl_to_abort loop).
 */
kern_return_t
upl_abort_range(
	upl_t upl,
	upl_offset_t offset,
	upl_size_t size,
	int error,
	boolean_t *empty)
{
	upl_page_info_t *user_page_list = NULL;
	upl_size_t xfer_size, subupl_size;
	vm_object_t shadow_object;
	vm_object_t object;
	vm_object_offset_t target_offset;
	upl_offset_t subupl_offset = offset;
	int entry;
	wpl_array_t lite_list;
	int occupied;
	struct vm_page_delayed_work dw_array;
	struct vm_page_delayed_work *dwp, *dwp_start;
	bool dwp_finish_ctx = TRUE;
	int dw_count;
	int dw_limit;
	int isVectorUPL = 0;
	upl_t vector_upl = NULL;
	vm_object_offset_t obj_start, obj_end, obj_offset;
	kern_return_t kr = KERN_SUCCESS;

	// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);

	dwp_start = dwp = NULL;

	subupl_size = size;
	*empty = FALSE;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * aborting an I/O-wired UPL without dumping its pages is handled
	 * as a commit that frees any still-absent pages
	 */
	if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
		return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
	}

	/*
	 * set up the delayed-work context used to batch per-page queue
	 * operations; fall back to a single on-stack entry if no
	 * pre-allocated context is available
	 */
	dw_count = 0;
	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
	dwp_start = vm_page_delayed_work_get_ctx();
	if (dwp_start == NULL) {
		dwp_start = &dw_array;
		dw_limit = 1;
		dwp_finish_ctx = FALSE;
	}

	dwp = dwp_start;

	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		vector_upl = upl;
		upl_lock(vector_upl);
	} else {
		upl_lock(upl);
	}

process_upl_to_abort:
	/*
	 * for a vector UPL, peel off the next sub-UPL covering the
	 * remaining [subupl_offset, subupl_offset+subupl_size) range;
	 * we loop back here until the whole range has been processed
	 */
	if (isVectorUPL) {
		size = subupl_size;
		offset = subupl_offset;
		if (size == 0) {
			upl_unlock(vector_upl);
			kr = KERN_SUCCESS;
			goto done;
		}
		upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
		if (upl == NULL) {
			upl_unlock(vector_upl);
			kr = KERN_FAILURE;
			goto done;
		}
		subupl_size -= size;
		subupl_offset += size;
	}

	*empty = FALSE;

#if UPL_DEBUG
	/* record this abort (with a backtrace) for debugging */
	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
		(void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);

		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
		upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;

		upl->upl_commit_index++;
	}
#endif
	/* validate the requested range against the UPL's adjusted size */
	if (upl->flags & UPL_DEVICE_MEMORY) {
		xfer_size = 0;
	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
		xfer_size = size;
	} else {
		if (!isVectorUPL) {
			upl_unlock(upl);
		} else {
			upl_unlock(vector_upl);
		}
		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
		kr = KERN_FAILURE;
		goto done;
	}
	/*
	 * an INTERNAL UPL carries both the user page list and the lite
	 * bitmap inline, right after the upl structure; otherwise only
	 * the lite bitmap follows the structure
	 */
	if (upl->flags & UPL_INTERNAL) {
		lite_list = (wpl_array_t)
		    ((((uintptr_t)upl) + sizeof(struct upl))
		    + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));

		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
	} else {
		lite_list = (wpl_array_t)
		    (((uintptr_t)upl) + sizeof(struct upl));
	}
	object = upl->map_object;

	/* a SHADOWED UPL's real pages live in the map object's shadow */
	if (upl->flags & UPL_SHADOWED) {
		vm_object_lock(object);
		shadow_object = object->shadow;
	} else {
		shadow_object = object;
	}

	entry = offset / PAGE_SIZE;
	target_offset = (vm_object_offset_t)offset;

	if (upl->flags & UPL_KERNEL_OBJECT) {
		vm_object_lock_shared(shadow_object);
	} else {
		vm_object_lock(shadow_object);
	}

	/*
	 * access to these pages was blocked while the UPL was in flight;
	 * unblock the object and wake anybody waiting on it
	 */
	if (upl->flags & UPL_ACCESS_BLOCKED) {
		assert(shadow_object->blocked_access);
		shadow_object->blocked_access = FALSE;
		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
	}

	if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
		panic("upl_abort_range: kernel_object being DUMPED");
	}

	/* translate the UPL-relative byte range into page-aligned object offsets */
	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
	obj_end = obj_start + xfer_size;
	obj_start = vm_object_trunc_page(obj_start);
	obj_end = vm_object_round_page(obj_end);
	for (obj_offset = obj_start;
	    obj_offset < obj_end;
	    obj_offset += PAGE_SIZE) {
		vm_page_t t, m;
		unsigned int pg_num;
		boolean_t needed;

		/* pg_num indexes both the lite bitmap and the user page list */
		pg_num = (unsigned int) (target_offset / PAGE_SIZE);
		assert(pg_num == target_offset / PAGE_SIZE);

		needed = FALSE;

		if (user_page_list) {
			needed = user_page_list[pg_num].needed;
		}

		dwp->dw_mask = 0;
		m = VM_PAGE_NULL;

		/* clear this page's bit in the lite bitmap, if it was set */
		if (upl->flags & UPL_LITE) {
			if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
				lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));

				if (!(upl->flags & UPL_KERNEL_OBJECT)) {
					m = vm_page_lookup(shadow_object, obj_offset);
				}
			}
		}
		/*
		 * for a shadowed UPL, free the placeholder page in the map
		 * object and locate the real page in the shadow object
		 */
		if (upl->flags & UPL_SHADOWED) {
			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
				t->vmp_free_when_done = FALSE;

				VM_PAGE_FREE(t);

				if (m == VM_PAGE_NULL) {
					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
				}
			}
		}
		if ((upl->flags & UPL_KERNEL_OBJECT)) {
			goto abort_next_page;
		}

		if (m != VM_PAGE_NULL) {
			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);

			if (m->vmp_absent) {
				boolean_t must_free = TRUE;

				/*
				 * COPYOUT = FALSE case
				 * check for error conditions which must
				 * be passed back to the pages customer
				 */
				if (error & UPL_ABORT_RESTART) {
					m->vmp_restart = TRUE;
					m->vmp_absent = FALSE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				} else if (error & UPL_ABORT_UNAVAILABLE) {
					m->vmp_restart = FALSE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				} else if (error & UPL_ABORT_ERROR) {
					m->vmp_restart = FALSE;
					m->vmp_absent = FALSE;
					m->vmp_error = TRUE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				}
				if (m->vmp_clustered && needed == FALSE) {
					/*
					 * This page was a part of a speculative
					 * read-ahead initiated by the kernel
					 * itself. No one is expecting this
					 * page and no one will clean up its
					 * error state if it ever becomes valid
					 * in the future.
					 * We have to free it here.
					 */
					must_free = TRUE;
				}
				m->vmp_cleaning = FALSE;

				if (m->vmp_overwriting && !m->vmp_busy) {
					/*
					 * this shouldn't happen since
					 * this is an 'absent' page, but
					 * it doesn't hurt to check for
					 * the 'alternate' method of
					 * stabilizing the page...
					 * we will mark 'busy' to be cleared
					 * in the following code which will
					 * take care of the primary stabilization
					 * method (i.e. setting 'busy' to TRUE)
					 */
					dwp->dw_mask |= DW_vm_page_unwire;
				}
				m->vmp_overwriting = FALSE;

				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);

				if (must_free == TRUE) {
					dwp->dw_mask |= DW_vm_page_free;
				} else {
					dwp->dw_mask |= DW_vm_page_activate;
				}
			} else {
				/*
				 * Handle the trusted pager throttle.
				 */
				if (m->vmp_laundry) {
					dwp->dw_mask |= DW_vm_pageout_throttle_up;
				}

				if (upl->flags & UPL_ACCESS_BLOCKED) {
					/*
					 * We blocked access to the pages in this UPL.
					 * Clear the "busy" bit and wake up any waiter
					 * for this page.
					 */
					dwp->dw_mask |= DW_clear_busy;
				}
				if (m->vmp_overwriting) {
					if (m->vmp_busy) {
						dwp->dw_mask |= DW_clear_busy;
					} else {
						/*
						 * deal with the 'alternate' method
						 * of stabilizing the page...
						 * we will either free the page
						 * or mark 'busy' to be cleared
						 * in the following code which will
						 * take care of the primary stabilization
						 * method (i.e. setting 'busy' to TRUE)
						 */
						dwp->dw_mask |= DW_vm_page_unwire;
					}
					m->vmp_overwriting = FALSE;
				}
				m->vmp_free_when_done = FALSE;
				m->vmp_cleaning = FALSE;

				if (error & UPL_ABORT_DUMP_PAGES) {
					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

					dwp->dw_mask |= DW_vm_page_free;
				} else {
					if (!(dwp->dw_mask & DW_vm_page_unwire)) {
						if (error & UPL_ABORT_REFERENCE) {
							/*
							 * we've been told to explicitly
							 * reference this page... for
							 * file I/O, this is done by
							 * implementing an LRU on the inactive q
							 */
							dwp->dw_mask |= DW_vm_page_lru;
						} else if (!VM_PAGE_PAGEABLE(m)) {
							dwp->dw_mask |= DW_vm_page_deactivate_internal;
						}
					}
					dwp->dw_mask |= DW_PAGE_WAKEUP;
				}
			}
		}
abort_next_page:
		target_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;
		entry++;

		/*
		 * queue the accumulated work for this page; flush the batch
		 * when it fills up.  pure busy-clear/wakeup work is done
		 * inline without going through the delayed-work machinery
		 */
		if (dwp->dw_mask) {
			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);

				if (dw_count >= dw_limit) {
					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);

					dwp = dwp_start;
					dw_count = 0;
				}
			} else {
				if (dwp->dw_mask & DW_clear_busy) {
					m->vmp_busy = FALSE;
				}

				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
					PAGE_WAKEUP(m);
				}
			}
		}
	}
	/* flush any remaining batched page work */
	if (dw_count) {
		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}

	/* determine whether this UPL still covers any pages */
	occupied = 1;

	if (upl->flags & UPL_DEVICE_MEMORY) {
		occupied = 0;
	} else if (upl->flags & UPL_LITE) {
		int pg_num;
		int i;

		pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
		pg_num = (pg_num + 31) >> 5;
		occupied = 0;

		for (i = 0; i < pg_num; i++) {
			if (lite_list[i] != 0) {
				occupied = 1;
				break;
			}
		}
	} else {
		if (vm_page_queue_empty(&upl->map_object->memq)) {
			occupied = 0;
		}
	}
	if (occupied == 0) {
		/*
		 * If this UPL element belongs to a Vector UPL and is
		 * empty, then this is the right function to deallocate
		 * it. So go ahead set the *empty variable. The flag
		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
		 * should be considered relevant for the Vector UPL and
		 * not the internal UPLs.
		 */
		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
			*empty = TRUE;
		}

		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
			/*
			 * this is not a paging object
			 * so we need to drop the paging reference
			 * that was taken when we created the UPL
			 * against this object
			 */
			vm_object_activity_end(shadow_object);
			vm_object_collapse(shadow_object, 0, TRUE);
		} else {
			/*
			 * we donated the paging reference to
			 * the map object... vm_pageout_object_terminate
			 * will drop this reference
			 */
		}
	}
	vm_object_unlock(shadow_object);
	if (object != shadow_object) {
		vm_object_unlock(object);
	}

	if (!isVectorUPL) {
		upl_unlock(upl);
	} else {
		/*
		 * If we completed our operations on an UPL that is
		 * part of a Vectored UPL and if empty is TRUE, then
		 * we should go ahead and deallocate this UPL element.
		 * Then we check if this was the last of the UPL elements
		 * within that Vectored UPL. If so, set empty to TRUE
		 * so that in ubc_upl_abort_range or ubc_upl_abort, we
		 * can go ahead and deallocate the Vector UPL too.
		 */
		if (*empty == TRUE) {
			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
			upl_deallocate(upl);
		}
		goto process_upl_to_abort;
	}

	kr = KERN_SUCCESS;

done:
	/* release the delayed-work context if we allocated one */
	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return kr;
}
8718
8719
8720 kern_return_t
upl_abort(upl_t upl,int error)8721 upl_abort(
8722 upl_t upl,
8723 int error)
8724 {
8725 boolean_t empty;
8726
8727 if (upl == UPL_NULL) {
8728 return KERN_INVALID_ARGUMENT;
8729 }
8730
8731 return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8732 }
8733
8734
8735 /* an option on commit should be wire */
8736 kern_return_t
upl_commit(upl_t upl,upl_page_info_t * page_list,mach_msg_type_number_t count)8737 upl_commit(
8738 upl_t upl,
8739 upl_page_info_t *page_list,
8740 mach_msg_type_number_t count)
8741 {
8742 boolean_t empty;
8743
8744 if (upl == UPL_NULL) {
8745 return KERN_INVALID_ARGUMENT;
8746 }
8747
8748 return upl_commit_range(upl, 0, upl->u_size, 0,
8749 page_list, count, &empty);
8750 }
8751
8752
/*
 * iopl_valid_data:
 *
 *	Walk the pages backing an I/O-wired UPL and, for every page that is
 *	still busy+absent (grabbed for the UPL but never filled with data),
 *	mark it as now holding valid data: clear "absent", set "dirty",
 *	wire it, and wake up any waiters.  Object and global wired-page
 *	accounting is adjusted for the pages wired here.
 *
 *	Only a plain UPL_IO_WIRE UPL (not a vector UPL, no device memory,
 *	not shadowed, access not blocked, not internal) against a normal
 *	(non-kernel, non-compressor, non-volatile/empty-purgeable) object
 *	is supported; anything else panics.
 */
void
iopl_valid_data(
	upl_t upl,
	vm_tag_t tag)
{
	vm_object_t object;
	vm_offset_t offset;
	vm_page_t m, nxt_page = VM_PAGE_NULL;
	upl_size_t size;
	int wired_count = 0;

	if (upl == NULL) {
		panic("iopl_valid_data: NULL upl");
	}
	if (vector_upl_is_valid(upl)) {
		panic("iopl_valid_data: vector upl");
	}
	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
		panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
	}

	object = upl->map_object;

	if (object == kernel_object || object == compressor_object) {
		panic("iopl_valid_data: object == kernel or compressor");
	}

	if (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) {
		panic("iopl_valid_data: object %p purgable %d",
		    object, object->purgable);
	}

	size = upl_adjusted_size(upl, PAGE_MASK);

	vm_object_lock(object);
	VM_OBJECT_WIRED_PAGE_UPDATE_START(object);

	bool whole_object;

	/*
	 * if the UPL spans the entire object and every page is resident,
	 * walk the object's resident page list directly instead of doing
	 * a vm_page_lookup() for each page
	 */
	if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
		nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
		whole_object = true;
	} else {
		offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
		whole_object = false;
	}

	while (size) {
		if (whole_object) {
			/*
			 * NOTE(review): relies on the resident_page_count
			 * check above guaranteeing one page per iteration;
			 * 'm' would go stale if the list ran short
			 */
			if (nxt_page != VM_PAGE_NULL) {
				m = nxt_page;
				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
			}
		} else {
			m = vm_page_lookup(object, offset);
			offset += PAGE_SIZE;

			if (m == VM_PAGE_NULL) {
				panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
			}
		}
		if (m->vmp_busy) {
			/* a busy page here must be one we grabbed as "absent" */
			if (!m->vmp_absent) {
				panic("iopl_valid_data: busy page w/o absent");
			}

			if (m->vmp_pageq.next || m->vmp_pageq.prev) {
				panic("iopl_valid_data: busy+absent page on page queue");
			}
			if (m->vmp_reusable) {
				panic("iopl_valid_data: %p is reusable", m);
			}

			/* the page now holds valid data: wire it and mark it dirty */
			m->vmp_absent = FALSE;
			m->vmp_dirty = TRUE;
			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
			assert(m->vmp_wire_count == 0);
			m->vmp_wire_count++;
			assert(m->vmp_wire_count);
			if (m->vmp_wire_count == 1) {
				m->vmp_q_state = VM_PAGE_IS_WIRED;
				wired_count++;
			} else {
				panic("iopl_valid_data: %p already wired", m);
			}

			/* clear "busy" and wake any thread waiting on this page */
			PAGE_WAKEUP_DONE(m);
		}
		size -= PAGE_SIZE;
	}
	/* fold the newly wired pages into object and global accounting */
	if (wired_count) {
		VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
		assert(object->resident_page_count >= object->wired_page_count);

		/* no need to adjust purgeable accounting for this object: */
		assert(object->purgable != VM_PURGABLE_VOLATILE);
		assert(object->purgable != VM_PURGABLE_EMPTY);

		vm_page_lockspin_queues();
		vm_page_wire_count += wired_count;
		vm_page_unlock_queues();
	}
	VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
	vm_object_unlock(object);
}
8859
8860
8861 void
vm_object_set_pmap_cache_attr(vm_object_t object,upl_page_info_array_t user_page_list,unsigned int num_pages,boolean_t batch_pmap_op)8862 vm_object_set_pmap_cache_attr(
8863 vm_object_t object,
8864 upl_page_info_array_t user_page_list,
8865 unsigned int num_pages,
8866 boolean_t batch_pmap_op)
8867 {
8868 unsigned int cache_attr = 0;
8869
8870 cache_attr = object->wimg_bits & VM_WIMG_MASK;
8871 assert(user_page_list);
8872 if (cache_attr != VM_WIMG_USE_DEFAULT) {
8873 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8874 }
8875 }
8876
8877
8878 boolean_t vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t);
8879 kern_return_t vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t, vm_object_offset_t *, int, int*);
8880
8881
8882
/*
 * vm_object_iopl_wire_full:
 *
 *	Fast path used when an IOPL covers an entire object whose pages are
 *	all resident: walk the object's resident page list, wiring every
 *	page, setting its bit in the lite list bitmap and filling in the
 *	optional user page list entry.
 *
 *	Returns TRUE on success.  Returns FALSE as soon as a page is found
 *	in a state that cannot be handled here (busy, fictitious, absent,
 *	error, cleaning, restart, or laundry), or when force-coherency is
 *	requested on a page that was written by the kernel.
 *	NOTE(review): on the FALSE return, pages already processed remain
 *	wired — presumably the caller falls back to the slow path and
 *	undoes this; confirm against the caller.
 */
boolean_t
vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
    wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag)
{
	vm_page_t dst_page;
	unsigned int entry;
	int page_count;
	int delayed_unlock = 0;
	boolean_t retval = TRUE;
	ppnum_t phys_page;

	vm_object_lock_assert_exclusive(object);
	assert(object->purgable != VM_PURGABLE_VOLATILE);
	assert(object->purgable != VM_PURGABLE_EMPTY);
	assert(object->pager == NULL);
	assert(object->copy == NULL);
	assert(object->shadow == NULL);

	page_count = object->resident_page_count;
	dst_page = (vm_page_t)vm_page_queue_first(&object->memq);

	vm_page_lock_queues();

	while (page_count--) {
		/* bail out on any page state this fast path can't handle */
		if (dst_page->vmp_busy ||
		    dst_page->vmp_fictitious ||
		    dst_page->vmp_absent ||
		    VMP_ERROR_GET(dst_page) ||
		    dst_page->vmp_cleaning ||
		    dst_page->vmp_restart ||
		    dst_page->vmp_laundry) {
			retval = FALSE;
			goto done;
		}
		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
			retval = FALSE;
			goto done;
		}
		dst_page->vmp_reference = TRUE;

		vm_page_wire(dst_page, tag, FALSE);

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			SET_PAGE_DIRTY(dst_page, FALSE);
		}
		/* entry is the page's index within the object / the UPL */
		entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
		assert(entry >= 0 && entry < object->resident_page_count);
		lite_list[entry >> 5] |= 1U << (entry & 31);

		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		/* track the highest physical page seen by this UPL */
		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].absent = dst_page->vmp_absent;
			user_page_list[entry].dirty = dst_page->vmp_dirty;
			user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
			user_page_list[entry].precious = dst_page->vmp_precious;
			user_page_list[entry].device = FALSE;
			user_page_list[entry].speculative = FALSE;
			user_page_list[entry].cs_validated = FALSE;
			user_page_list[entry].cs_tainted = FALSE;
			user_page_list[entry].cs_nx = FALSE;
			user_page_list[entry].needed = FALSE;
			user_page_list[entry].mark = FALSE;
		}
		/* periodically yield the page-queues lock to reduce contention */
		if (delayed_unlock++ > 256) {
			delayed_unlock = 0;
			lck_mtx_yield(&vm_page_queue_lock);

			VM_CHECK_MEMORYSTATUS;
		}
		dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
	}
done:
	vm_page_unlock_queues();

	VM_CHECK_MEMORYSTATUS;

	return retval;
}
8967
8968
/*
 * Fast path for vm_object_iopl_request(): the object has no resident pages,
 * so grab fresh pages, insert them at successive offsets starting at
 * *dst_offset, wire them, and record them in the UPL lite list / page list.
 *
 * Parameters:
 *   object/upl/user_page_list/lite_list/cntrl_flags/tag - as in
 *       vm_object_iopl_request().
 *   dst_offset      - in/out: first offset to populate; advanced by
 *                     PAGE_SIZE_64 per page inserted.
 *   page_count      - number of pages to populate.
 *   page_grab_count - out: number of pages actually inserted (for the
 *                     caller's pages_grabbed ledger accounting).
 *
 * Returns KERN_SUCCESS, or MACH_SEND_INTERRUPTED if the wait for free pages
 * was interrupted (the historical Mach convention for "interrupted" here).
 *
 * Called with the object locked exclusive.
 */
kern_return_t
vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
    wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag, vm_object_offset_t *dst_offset,
    int page_count, int* page_grab_count)
{
	vm_page_t dst_page;
	boolean_t no_zero_fill = FALSE;
	int interruptible;
	int pages_wired = 0;
	int pages_inserted = 0;
	int entry = 0;
	uint64_t delayed_ledger_update = 0;
	kern_return_t ret = KERN_SUCCESS;
	int grab_options;
	ppnum_t phys_page;

	vm_object_lock_assert_exclusive(object);
	assert(object->purgable != VM_PURGABLE_VOLATILE);
	assert(object->purgable != VM_PURGABLE_EMPTY);
	assert(object->pager == NULL);
	assert(object->copy == NULL);
	assert(object->shadow == NULL);

	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
		interruptible = THREAD_ABORTSAFE;
	} else {
		interruptible = THREAD_UNINT;
	}

	/* caller will fill the pages via I/O, so skip zero-filling */
	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
		no_zero_fill = TRUE;
	}

	grab_options = 0;
#if CONFIG_SECLUDED_MEMORY
	if (object->can_grab_secluded) {
		grab_options |= VM_PAGE_GRAB_SECLUDED;
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	while (page_count--) {
		/*
		 * Grab a free page, waiting (interruptibly if requested) when
		 * none are available.  vm_upl_wait_for_pages advertises how
		 * many pages we're still waiting on.
		 */
		while ((dst_page = vm_page_grab_options(grab_options))
		    == VM_PAGE_NULL) {
			OSAddAtomic(page_count, &vm_upl_wait_for_pages);

			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

			if (vm_page_wait(interruptible) == FALSE) {
				/*
				 * interrupted case
				 */
				OSAddAtomic(-page_count, &vm_upl_wait_for_pages);

				VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);

				ret = MACH_SEND_INTERRUPTED;
				goto done;
			}
			OSAddAtomic(-page_count, &vm_upl_wait_for_pages);

			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
		}
		if (no_zero_fill == FALSE) {
			vm_page_zero_fill(dst_page);
		} else {
			/* left absent (and busy) until the I/O fills it */
			dst_page->vmp_absent = TRUE;
		}

		dst_page->vmp_reference = TRUE;

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			SET_PAGE_DIRTY(dst_page, FALSE);
		}
		if (dst_page->vmp_absent == FALSE) {
			/*
			 * Wire the page locally; the global wire count is
			 * batched and credited once under the queues lock at
			 * "done:" (pages_wired).
			 */
			assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
			assert(dst_page->vmp_wire_count == 0);
			dst_page->vmp_wire_count++;
			dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
			assert(dst_page->vmp_wire_count);
			pages_wired++;
			PAGE_WAKEUP_DONE(dst_page);
		}
		pages_inserted++;

		/* ledger credit is accumulated into delayed_ledger_update */
		vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);

		lite_list[entry >> 5] |= 1U << (entry & 31);

		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].absent = dst_page->vmp_absent;
			user_page_list[entry].dirty = dst_page->vmp_dirty;
			user_page_list[entry].free_when_done = FALSE;
			user_page_list[entry].precious = FALSE;
			user_page_list[entry].device = FALSE;
			user_page_list[entry].speculative = FALSE;
			user_page_list[entry].cs_validated = FALSE;
			user_page_list[entry].cs_tainted = FALSE;
			user_page_list[entry].cs_nx = FALSE;
			user_page_list[entry].needed = FALSE;
			user_page_list[entry].mark = FALSE;
		}
		entry++;
		*dst_offset += PAGE_SIZE_64;
	}
done:
	/* fold the batched wire count into the global counter */
	if (pages_wired) {
		vm_page_lockspin_queues();
		vm_page_wire_count += pages_wired;
		vm_page_unlock_queues();
	}
	if (pages_inserted) {
		if (object->internal) {
			OSAddAtomic(pages_inserted, &vm_page_internal_count);
		} else {
			OSAddAtomic(pages_inserted, &vm_page_external_count);
		}
	}
	/*
	 * Apply the single batched ledger credit accumulated by
	 * vm_page_insert_internal() against the owning task.
	 */
	if (delayed_ledger_update) {
		task_t owner;
		int ledger_idx_volatile;
		int ledger_idx_nonvolatile;
		int ledger_idx_volatile_compressed;
		int ledger_idx_nonvolatile_compressed;
		boolean_t do_footprint;

		owner = VM_OBJECT_OWNER(object);
		assert(owner);

		vm_object_ledger_tag_ledgers(object,
		    &ledger_idx_volatile,
		    &ledger_idx_nonvolatile,
		    &ledger_idx_volatile_compressed,
		    &ledger_idx_nonvolatile_compressed,
		    &do_footprint);

		/* more non-volatile bytes */
		ledger_credit(owner->ledger,
		    ledger_idx_nonvolatile,
		    delayed_ledger_update);
		if (do_footprint) {
			/* more footprint */
			ledger_credit(owner->ledger,
			    task_ledgers.phys_footprint,
			    delayed_ledger_update);
		}
	}

	assert(page_grab_count);
	*page_grab_count = pages_inserted;

	return ret;
}
9128
9129
9130
/*
 * vm_object_iopl_request:
 *
 * Create a UPL describing [offset, offset+size) of "object" with every page
 * resident and wired for I/O.  Pages are faulted in if necessary (unless
 * UPL_REQUEST_NO_FAULT), recorded in the UPL's lite list and optional
 * user_page_list, and wired via batched delayed work.
 *
 * Two fast paths are tried first when the request covers the whole object
 * and the object is "simple" (no pager/copy/shadow, not volatile):
 * vm_object_iopl_wire_full() when everything is already resident, and
 * vm_object_iopl_wire_empty() when nothing is.
 *
 * Special cases: phys_contiguous objects produce a UPL_DEVICE_MEMORY UPL
 * with no per-page state; kernel-object requests skip most per-page
 * bookkeeping (UPL_KERNEL_OBJECT).
 *
 * On error, every page already processed is unwound (unwired or freed) at
 * "return_err:" and the UPL is destroyed.
 *
 * Returns KERN_SUCCESS or a specific failure code (KERN_INVALID_VALUE,
 * KERN_INVALID_ADDRESS, KERN_MEMORY_ERROR, MACH_SEND_INTERRUPTED, ...).
 */
kern_return_t
vm_object_iopl_request(
	vm_object_t             object,
	vm_object_offset_t      offset,
	upl_size_t              size,
	upl_t                   *upl_ptr,
	upl_page_info_array_t   user_page_list,
	unsigned int            *page_list_count,
	upl_control_flags_t     cntrl_flags,
	vm_tag_t                tag)
{
	vm_page_t dst_page;
	vm_object_offset_t dst_offset;
	upl_size_t xfer_size;
	upl_t upl = NULL;
	unsigned int entry;
	wpl_array_t lite_list = NULL;
	int no_zero_fill = FALSE;
	unsigned int size_in_pages;
	int page_grab_count = 0;
	u_int32_t psize;
	kern_return_t ret;
	vm_prot_t prot;
	struct vm_object_fault_info fault_info = {};
	struct vm_page_delayed_work dw_array;
	struct vm_page_delayed_work *dwp, *dwp_start;
	bool dwp_finish_ctx = TRUE;
	int dw_count;
	int dw_limit;
	int dw_index;
	boolean_t caller_lookup;
	int io_tracking_flag = 0;
	int interruptible;
	ppnum_t phys_page;

	boolean_t set_cache_attr_needed = FALSE;
	boolean_t free_wired_pages = FALSE;
	boolean_t fast_path_empty_req = FALSE;
	boolean_t fast_path_full_req = FALSE;

#if DEVELOPMENT || DEBUG
	task_t task = current_task();
#endif /* DEVELOPMENT || DEBUG */

	dwp_start = dwp = NULL;

	vm_object_offset_t original_offset = offset;
	upl_size_t original_size = size;

	// DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);

	/*
	 * Round the request out to page boundaries (matters when the caller's
	 * offset/size aren't page-aligned, e.g. 4K subranges of 16K pages).
	 */
	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
	offset = vm_object_trunc_page(offset);
	if (size != original_size || offset != original_offset) {
		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
	}

	if (cntrl_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		return KERN_INVALID_VALUE;
	}
	if (vm_lopage_needed == FALSE) {
		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
	}

	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
			return KERN_INVALID_VALUE;
		}

		/* phys_contiguous memory can't be substituted; range-check it */
		if (object->phys_contiguous) {
			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
				return KERN_INVALID_ADDRESS;
			}

			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
				return KERN_INVALID_ADDRESS;
			}
		}
	}
	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
		no_zero_fill = TRUE;
	}

	if (cntrl_flags & UPL_COPYOUT_FROM) {
		prot = VM_PROT_READ;
	} else {
		prot = VM_PROT_READ | VM_PROT_WRITE;
	}

	if ((!object->internal) && (object->paging_offset != 0)) {
		panic("vm_object_iopl_request: external object with non-zero paging offset");
	}


	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);

#if CONFIG_IOSCHED || UPL_DEBUG
	if ((object->io_tracking && object != kernel_object) || upl_debug_enabled) {
		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
	}
#endif

#if CONFIG_IOSCHED
	if (object->io_tracking) {
		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
		if (object != kernel_object) {
			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
		}
	}
#endif

	if (object->phys_contiguous) {
		psize = PAGE_SIZE;
	} else {
		psize = size;

		/*
		 * Set up the delayed-work context used to batch page wiring;
		 * fall back to a single on-stack entry if none is available.
		 */
		dw_count = 0;
		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
		dwp_start = vm_page_delayed_work_get_ctx();
		if (dwp_start == NULL) {
			dwp_start = &dw_array;
			dw_limit = 1;
			dwp_finish_ctx = FALSE;
		}

		dwp = dwp_start;
	}

	if (cntrl_flags & UPL_SET_INTERNAL) {
		/* internal UPL: page list and lite list live inside the upl */
		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);

		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
		lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
		    ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
		if (size == 0) {
			user_page_list = NULL;
			lite_list = NULL;
		}
	} else {
		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);

		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
		if (size == 0) {
			lite_list = NULL;
		}
	}
	if (user_page_list) {
		user_page_list[0].device = FALSE;
	}
	*upl_ptr = upl;

	if (cntrl_flags & UPL_NOZEROFILLIO) {
		DTRACE_VM4(upl_nozerofillio,
		    vm_object_t, object,
		    vm_object_offset_t, offset,
		    upl_size_t, size,
		    upl_t, upl);
	}

	upl->map_object = object;
	upl->u_offset = original_offset;
	upl->u_size = original_size;

	size_in_pages = size / PAGE_SIZE;

	if (object == kernel_object &&
	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
		upl->flags |= UPL_KERNEL_OBJECT;
#if UPL_DEBUG
		vm_object_lock(object);
#else
		vm_object_lock_shared(object);
#endif
	} else {
		vm_object_lock(object);
		vm_object_activity_begin(object);
	}
	/*
	 * paging in progress also protects the paging_offset
	 */
	upl->u_offset = original_offset + object->paging_offset;

	if (cntrl_flags & UPL_BLOCK_ACCESS) {
		/*
		 * The user requested that access to the pages in this UPL
		 * be blocked until the UPL is commited or aborted.
		 */
		upl->flags |= UPL_ACCESS_BLOCKED;
	}

#if CONFIG_IOSCHED || UPL_DEBUG
	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
		vm_object_activity_begin(object);
		queue_enter(&object->uplq, upl, upl_t, uplq);
	}
#endif

	if (object->phys_contiguous) {
		/*
		 * Device-memory shortcut: no per-page state to track;
		 * record the physical range and return immediately.
		 */
		if (upl->flags & UPL_ACCESS_BLOCKED) {
			assert(!object->blocked_access);
			object->blocked_access = TRUE;
		}

		vm_object_unlock(object);

		/*
		 * don't need any shadow mappings for this one
		 * since it is already I/O memory
		 */
		upl->flags |= UPL_DEVICE_MEMORY;

		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);

		if (user_page_list) {
			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
			user_page_list[0].device = TRUE;
		}
		if (page_list_count != NULL) {
			if (upl->flags & UPL_INTERNAL) {
				*page_list_count = 0;
			} else {
				*page_list_count = 1;
			}
		}

		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
		if (task != NULL) {
			ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
		}
#endif /* DEVELOPMENT || DEBUG */
		return KERN_SUCCESS;
	}
	if (object != kernel_object && object != compressor_object) {
		/*
		 * Protect user space from future COW operations
		 */
#if VM_OBJECT_TRACKING_OP_TRUESHARE
		if (!object->true_share &&
		    vm_object_tracking_btlog) {
			btlog_record(vm_object_tracking_btlog, object,
			    VM_OBJECT_TRACKING_OP_TRUESHARE,
			    btref_get(__builtin_frame_address(0), 0));
		}
#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */

		vm_object_lock_assert_exclusive(object);
		object->true_share = TRUE;

		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
	}

	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
	    object->copy != VM_OBJECT_NULL) {
		/*
		 * Honor copy-on-write obligations
		 *
		 * The caller is gathering these pages and
		 * might modify their contents. We need to
		 * make sure that the copy object has its own
		 * private copies of these pages before we let
		 * the caller modify them.
		 *
		 * NOTE: someone else could map the original object
		 * after we've done this copy-on-write here, and they
		 * could then see an inconsistent picture of the memory
		 * while it's being modified via the UPL. To prevent this,
		 * we would have to block access to these pages until the
		 * UPL is released. We could use the UPL_BLOCK_ACCESS
		 * code path for that...
		 */
		vm_object_update(object,
		    offset,
		    size,
		    NULL,
		    NULL,
		    FALSE, /* should_return */
		    MEMORY_OBJECT_COPY_SYNC,
		    VM_PROT_NO_CHANGE);
		VM_PAGEOUT_DEBUG(iopl_cow, 1);
		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
	}
	/*
	 * Fast-path eligibility: whole-object request on a "simple" object
	 * (no pager/copy/shadow, not purgeable-volatile/empty, no 32-bit or
	 * block-access constraints).
	 */
	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
	    object->purgable != VM_PURGABLE_VOLATILE &&
	    object->purgable != VM_PURGABLE_EMPTY &&
	    object->copy == NULL &&
	    size == object->vo_size &&
	    offset == 0 &&
	    object->shadow == NULL &&
	    object->pager == NULL) {
		if (object->resident_page_count == size_in_pages) {
			assert(object != compressor_object);
			assert(object != kernel_object);
			fast_path_full_req = TRUE;
		} else if (object->resident_page_count == 0) {
			assert(object != compressor_object);
			assert(object != kernel_object);
			fast_path_empty_req = TRUE;
			set_cache_attr_needed = TRUE;
		}
	}

	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
		interruptible = THREAD_ABORTSAFE;
	} else {
		interruptible = THREAD_UNINT;
	}

	entry = 0;

	xfer_size = size;
	dst_offset = offset;

	if (fast_path_full_req) {
		if (vm_object_iopl_wire_full(object, upl, user_page_list, lite_list, cntrl_flags, tag) == TRUE) {
			goto finish;
		}
		/*
		 * we couldn't complete the processing of this request on the fast path
		 * so fall through to the slow path and finish up
		 */
	} else if (fast_path_empty_req) {
		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
			ret = KERN_MEMORY_ERROR;
			goto return_err;
		}
		ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);

		if (ret) {
			/* pages inserted by the helper must be freed, not unwired */
			free_wired_pages = TRUE;
			goto return_err;
		}
		goto finish;
	}

	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.lo_offset = offset;
	fault_info.hi_offset = offset + xfer_size;
	fault_info.mark_zf_absent = TRUE;
	fault_info.interruptible = interruptible;
	fault_info.batch_pmap_op = TRUE;

	/*
	 * Slow path: process one page per iteration of this loop.
	 */
	while (xfer_size) {
		vm_fault_return_t result;

		dwp->dw_mask = 0;

		if (fast_path_full_req) {
			/*
			 * if we get here, it means that we ran into a page
			 * state we couldn't handle in the fast path and
			 * bailed out to the slow path... since the order
			 * we look at pages is different between the 2 paths,
			 * the following check is needed to determine whether
			 * this page was already processed in the fast path
			 */
			if (lite_list[entry >> 5] & (1 << (entry & 31))) {
				goto skip_page;
			}
		}
		dst_page = vm_page_lookup(object, dst_offset);

		if (dst_page == VM_PAGE_NULL ||
		    dst_page->vmp_busy ||
		    VMP_ERROR_GET(dst_page) ||
		    dst_page->vmp_restart ||
		    dst_page->vmp_absent ||
		    dst_page->vmp_fictitious) {
			if (object == kernel_object) {
				panic("vm_object_iopl_request: missing/bad page in kernel object");
			}
			if (object == compressor_object) {
				panic("vm_object_iopl_request: missing/bad page in compressor object");
			}

			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
				ret = KERN_MEMORY_ERROR;
				goto return_err;
			}
			set_cache_attr_needed = TRUE;

			/*
			 * We just looked up the page and the result remains valid
			 * until the object lock is release, so send it to
			 * vm_fault_page() (as "dst_page"), to avoid having to
			 * look it up again there.
			 */
			caller_lookup = TRUE;

			do {
				vm_page_t top_page;
				kern_return_t error_code;

				fault_info.cluster_size = xfer_size;

				vm_object_paging_begin(object);

				result = vm_fault_page(object, dst_offset,
				    prot | VM_PROT_WRITE, FALSE,
				    caller_lookup,
				    &prot, &dst_page, &top_page,
				    (int *)0,
				    &error_code, no_zero_fill,
				    &fault_info);

				/* our lookup is no longer valid at this point */
				caller_lookup = FALSE;

				switch (result) {
				case VM_FAULT_SUCCESS:
					page_grab_count++;

					if (!dst_page->vmp_absent) {
						PAGE_WAKEUP_DONE(dst_page);
					} else {
						/*
						 * we only get back an absent page if we
						 * requested that it not be zero-filled
						 * because we are about to fill it via I/O
						 *
						 * absent pages should be left BUSY
						 * to prevent them from being faulted
						 * into an address space before we've
						 * had a chance to complete the I/O on
						 * them since they may contain info that
						 * shouldn't be seen by the faulting task
						 */
					}
					/*
					 * Release paging references and
					 * top-level placeholder page, if any.
					 */
					if (top_page != VM_PAGE_NULL) {
						vm_object_t local_object;

						local_object = VM_PAGE_OBJECT(top_page);

						/*
						 * comparing 2 packed pointers
						 */
						if (top_page->vmp_object != dst_page->vmp_object) {
							vm_object_lock(local_object);
							VM_PAGE_FREE(top_page);
							vm_object_paging_end(local_object);
							vm_object_unlock(local_object);
						} else {
							VM_PAGE_FREE(top_page);
							vm_object_paging_end(local_object);
						}
					}
					vm_object_paging_end(object);
					break;

				case VM_FAULT_RETRY:
					/* vm_fault_page() dropped the object lock; retake it and retry */
					vm_object_lock(object);
					break;

				case VM_FAULT_MEMORY_SHORTAGE:
					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

					if (vm_page_wait(interruptible)) {
						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);

						VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
						vm_object_lock(object);

						break;
					}
					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
					OS_FALLTHROUGH;

				case VM_FAULT_INTERRUPTED:
					error_code = MACH_SEND_INTERRUPTED;
					OS_FALLTHROUGH;
				case VM_FAULT_MEMORY_ERROR:
memory_error:
					ret = (error_code ? error_code: KERN_MEMORY_ERROR);

					vm_object_lock(object);
					goto return_err;

				case VM_FAULT_SUCCESS_NO_VM_PAGE:
					/* success but no page: fail */
					vm_object_paging_end(object);
					vm_object_unlock(object);
					goto memory_error;

				default:
					panic("vm_object_iopl_request: unexpected error"
					    " 0x%x from vm_fault_page()\n", result);
				}
			} while (result != VM_FAULT_SUCCESS);
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		if (upl->flags & UPL_KERNEL_OBJECT) {
			/* kernel object pages need no wiring/dirty bookkeeping */
			goto record_phys_addr;
		}

		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
			dst_page->vmp_busy = TRUE;
			goto record_phys_addr;
		}

		if (dst_page->vmp_cleaning) {
			/*
			 * Someone else is cleaning this page in place.
			 * In theory, we should be able to proceed and use this
			 * page but they'll probably end up clearing the "busy"
			 * bit on it in upl_commit_range() but they didn't set
			 * it, so they would clear our "busy" bit and open
			 * us to race conditions.
			 * We'd better wait for the cleaning to complete and
			 * then try again.
			 */
			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
			PAGE_SLEEP(object, dst_page, THREAD_UNINT);
			continue;
		}
		if (dst_page->vmp_laundry) {
			vm_pageout_steal_laundry(dst_page, FALSE);
		}

		if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
		    phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
			vm_page_t low_page;
			int refmod;

			/*
			 * support devices that can't DMA above 32 bits
			 * by substituting pages from a pool of low address
			 * memory for any pages we find above the 4G mark
			 * can't substitute if the page is already wired because
			 * we don't know whether that physical address has been
			 * handed out to some other 64 bit capable DMA device to use
			 */
			if (VM_PAGE_WIRED(dst_page)) {
				ret = KERN_PROTECTION_FAILURE;
				goto return_err;
			}
			low_page = vm_page_grablo();

			if (low_page == VM_PAGE_NULL) {
				ret = KERN_RESOURCE_SHORTAGE;
				goto return_err;
			}
			/*
			 * from here until the vm_page_replace completes
			 * we musn't drop the object lock... we don't
			 * want anyone refaulting this page in and using
			 * it after we disconnect it... we want the fault
			 * to find the new page being substituted.
			 */
			if (dst_page->vmp_pmapped) {
				refmod = pmap_disconnect(phys_page);
			} else {
				refmod = 0;
			}

			if (!dst_page->vmp_absent) {
				vm_page_copy(dst_page, low_page);
			}

			low_page->vmp_reference = dst_page->vmp_reference;
			low_page->vmp_dirty = dst_page->vmp_dirty;
			low_page->vmp_absent = dst_page->vmp_absent;

			if (refmod & VM_MEM_REFERENCED) {
				low_page->vmp_reference = TRUE;
			}
			if (refmod & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(low_page, FALSE);
			}

			vm_page_replace(low_page, object, dst_offset);

			dst_page = low_page;
			/*
			 * vm_page_grablo returned the page marked
			 * BUSY... we don't need a PAGE_WAKEUP_DONE
			 * here, because we've never dropped the object lock
			 */
			if (!dst_page->vmp_absent) {
				dst_page->vmp_busy = FALSE;
			}

			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
		}
		if (!dst_page->vmp_busy) {
			dwp->dw_mask |= DW_vm_page_wire;
		}

		if (cntrl_flags & UPL_BLOCK_ACCESS) {
			/*
			 * Mark the page "busy" to block any future page fault
			 * on this page in addition to wiring it.
			 * We'll also remove the mapping
			 * of all these pages before leaving this routine.
			 */
			assert(!dst_page->vmp_fictitious);
			dst_page->vmp_busy = TRUE;
		}
		/*
		 * expect the page to be used
		 * page queues lock must be held to set 'reference'
		 */
		dwp->dw_mask |= DW_set_reference;

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			SET_PAGE_DIRTY(dst_page, TRUE);
			/*
			 * Page belonging to a code-signed object is about to
			 * be written. Mark it tainted and disconnect it from
			 * all pmaps so processes have to fault it back in and
			 * deal with the tainted bit.
			 */
			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
				vm_page_iopl_tainted++;
				if (dst_page->vmp_pmapped) {
					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
					if (refmod & VM_MEM_REFERENCED) {
						dst_page->vmp_reference = TRUE;
					}
				}
			}
		}
		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
			pmap_sync_page_attributes_phys(phys_page);
			dst_page->vmp_written_by_kernel = FALSE;
		}

record_phys_addr:
		if (dst_page->vmp_busy) {
			upl->flags |= UPL_HAS_BUSY;
		}

		lite_list[entry >> 5] |= 1U << (entry & 31);

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
			user_page_list[entry].absent = dst_page->vmp_absent;
			user_page_list[entry].dirty = dst_page->vmp_dirty;
			user_page_list[entry].precious = dst_page->vmp_precious;
			user_page_list[entry].device = FALSE;
			user_page_list[entry].needed = FALSE;
			if (dst_page->vmp_clustered == TRUE) {
				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
			} else {
				user_page_list[entry].speculative = FALSE;
			}
			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
			user_page_list[entry].mark = FALSE;
		}
		if (object != kernel_object && object != compressor_object) {
			/*
			 * someone is explicitly grabbing this page...
			 * update clustered and speculative state
			 *
			 */
			if (dst_page->vmp_clustered) {
				VM_PAGE_CONSUME_CLUSTERED(dst_page);
			}
		}
skip_page:
		entry++;
		dst_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;

		/* flush the batched delayed work when the context fills up */
		if (dwp->dw_mask) {
			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);

			if (dw_count >= dw_limit) {
				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);

				dwp = dwp_start;
				dw_count = 0;
			}
		}
	}
	assert(entry == size_in_pages);

	if (dw_count) {
		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}
finish:
	if (user_page_list && set_cache_attr_needed == TRUE) {
		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
	}

	if (page_list_count != NULL) {
		if (upl->flags & UPL_INTERNAL) {
			*page_list_count = 0;
		} else if (*page_list_count > size_in_pages) {
			*page_list_count = size_in_pages;
		}
	}
	vm_object_unlock(object);

	if (cntrl_flags & UPL_BLOCK_ACCESS) {
		/*
		 * We've marked all the pages "busy" so that future
		 * page faults will block.
		 * Now remove the mapping for these pages, so that they
		 * can't be accessed without causing a page fault.
		 */
		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
		    PMAP_NULL,
		    PAGE_SIZE,
		    0, VM_PROT_NONE);
		assert(!object->blocked_access);
		object->blocked_access = TRUE;
	}

	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return KERN_SUCCESS;

return_err:
	/*
	 * Error unwind: walk [offset, dst_offset) -- every page processed so
	 * far -- and either free it (absent, or fast-path-inserted pages when
	 * free_wired_pages) or unwire it.  Pages still sitting in the pending
	 * delayed-work list were never actually wired, so skip unwiring those.
	 */
	dw_index = 0;

	for (; offset < dst_offset; offset += PAGE_SIZE) {
		boolean_t need_unwire;

		dst_page = vm_page_lookup(object, offset);

		if (dst_page == VM_PAGE_NULL) {
			panic("vm_object_iopl_request: Wired page missing.");
		}

		/*
		 * if we've already processed this page in an earlier
		 * dw_do_work, we need to undo the wiring... we will
		 * leave the dirty and reference bits on if they
		 * were set, since we don't have a good way of knowing
		 * what the previous state was and we won't get here
		 * under any normal circumstances... we will always
		 * clear BUSY and wakeup any waiters via vm_page_free
		 * or PAGE_WAKEUP_DONE
		 */
		need_unwire = TRUE;

		if (dw_count) {
			if ((dwp_start)[dw_index].dw_m == dst_page) {
				/*
				 * still in the deferred work list
				 * which means we haven't yet called
				 * vm_page_wire on this page
				 */
				need_unwire = FALSE;

				dw_index++;
				dw_count--;
			}
		}
		vm_page_lock_queues();

		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
			vm_page_free(dst_page);

			need_unwire = FALSE;
		} else {
			if (need_unwire == TRUE) {
				vm_page_unwire(dst_page, TRUE);
			}

			PAGE_WAKEUP_DONE(dst_page);
		}
		vm_page_unlock_queues();

		if (need_unwire == TRUE) {
			counter_inc(&vm_statistics_reactivations);
		}
	}
#if UPL_DEBUG
	upl->upl_state = 2;
#endif
	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
	}
	vm_object_unlock(object);
	upl_destroy(upl);

	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}
	return ret;
}
9958
/*
 * upl_transpose:
 *	Exchange the backing VM objects of two UPLs.  Both UPLs must be
 *	non-NULL, distinct, non-vectored, start at offset 0 of their
 *	objects and have identical sizes (we swap the objects' entire
 *	backing store, so partial swaps are not possible).
 *	Returns KERN_INVALID_ARGUMENT / KERN_INVALID_VALUE on the checks
 *	above, otherwise the result of vm_object_transpose().
 */
kern_return_t
upl_transpose(
	upl_t           upl1,
	upl_t           upl2)
{
	kern_return_t           retval;
	boolean_t               upls_locked;
	vm_object_t             object1, object2;

	/* LD: Should mapped UPLs be eligible for a transpose? */
	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
		return KERN_INVALID_ARGUMENT;
	}

	upls_locked = FALSE;

	/*
	 * Since we need to lock both UPLs at the same time,
	 * avoid deadlocks by always taking locks in the same order.
	 */
	if (upl1 < upl2) {
		upl_lock(upl1);
		upl_lock(upl2);
	} else {
		upl_lock(upl2);
		upl_lock(upl1);
	}
	upls_locked = TRUE;     /* the UPLs will need to be unlocked */

	object1 = upl1->map_object;
	object2 = upl2->map_object;

	if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
	    upl1->u_size != upl2->u_size) {
		/*
		 * We deal only with full objects, not subsets.
		 * That's because we exchange the entire backing store info
		 * for the objects: pager, resident pages, etc... We can't do
		 * only part of it.
		 */
		retval = KERN_INVALID_VALUE;
		goto done;
	}

	/*
	 * Tranpose the VM objects' backing store.
	 */
	retval = vm_object_transpose(object1, object2,
	    upl_adjusted_size(upl1, PAGE_MASK));

	if (retval == KERN_SUCCESS) {
		/*
		 * Make each UPL point to the correct VM object, i.e. the
		 * object holding the pages that the UPL refers to...
		 * Tracked UPLs must also be moved to the other object's
		 * UPL queue so the object<->UPL bookkeeping stays right.
		 */
#if CONFIG_IOSCHED || UPL_DEBUG
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
			vm_object_lock(object1);
			vm_object_lock(object2);
		}
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_remove(&object1->uplq, upl1, upl_t, uplq);
		}
		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_remove(&object2->uplq, upl2, upl_t, uplq);
		}
#endif
		upl1->map_object = object2;
		upl2->map_object = object1;

#if CONFIG_IOSCHED || UPL_DEBUG
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_enter(&object2->uplq, upl1, upl_t, uplq);
		}
		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_enter(&object1->uplq, upl2, upl_t, uplq);
		}
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
			vm_object_unlock(object2);
			vm_object_unlock(object1);
		}
#endif
	}

done:
	/*
	 * Cleanup.
	 */
	if (upls_locked) {
		upl_unlock(upl1);
		upl_unlock(upl2);
		upls_locked = FALSE;
	}

	return retval;
}
10055
10056 void
upl_range_needed(upl_t upl,int index,int count)10057 upl_range_needed(
10058 upl_t upl,
10059 int index,
10060 int count)
10061 {
10062 upl_page_info_t *user_page_list;
10063 int size_in_pages;
10064
10065 if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
10066 return;
10067 }
10068
10069 size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
10070
10071 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
10072
10073 while (count-- && index < size_in_pages) {
10074 user_page_list[index++].needed = TRUE;
10075 }
10076 }
10077
10078
10079 /*
10080 * Reserve of virtual addresses in the kernel address space.
10081 * We need to map the physical pages in the kernel, so that we
10082 * can call the code-signing or slide routines with a kernel
10083 * virtual address. We keep this pool of pre-allocated kernel
10084 * virtual addresses so that we don't have to scan the kernel's
10085 * virtaul address space each time we need to work with
10086 * a physical page.
10087 */
10088 SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
10089 #define VM_PAGING_NUM_PAGES 64
10090 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
10091 bool vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
10092 int vm_paging_max_index = 0;
10093 int vm_paging_page_waiter = 0;
10094 int vm_paging_page_waiter_total = 0;
10095
10096 unsigned long vm_paging_no_kernel_page = 0;
10097 unsigned long vm_paging_objects_mapped = 0;
10098 unsigned long vm_paging_pages_mapped = 0;
10099 unsigned long vm_paging_objects_mapped_slow = 0;
10100 unsigned long vm_paging_pages_mapped_slow = 0;
10101
/*
 * Allocate the pageable kernel VA range backing the vm_paging pool.
 * KMA_PAGEABLE: no physical pages are grabbed up front; pages are
 * entered on demand by vm_paging_map_object().
 */
__startup_func
static void
vm_paging_map_init(void)
{
	kmem_alloc(kernel_map, &vm_paging_base_address,
	    ptoa(VM_PAGING_NUM_PAGES),
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
	    VM_KERN_MEMORY_NONE);
}
STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
10112
10113 /*
10114 * vm_paging_map_object:
10115 * Maps part of a VM object's pages in the kernel
10116 * virtual address space, using the pre-allocated
10117 * kernel virtual addresses, if possible.
10118 * Context:
10119 * The VM object is locked. This lock will get
10120 * dropped and re-acquired though, so the caller
10121 * must make sure the VM object is kept alive
10122 * (by holding a VM map that has a reference
10123 * on it, for example, or taking an extra reference).
10124 * The page should also be kept busy to prevent
10125 * it from being reclaimed.
10126 */
kern_return_t
vm_paging_map_object(
	vm_page_t       page,
	vm_object_t     object,
	vm_object_offset_t offset,
	vm_prot_t       protection,
	boolean_t       can_unlock_object,
	vm_map_size_t   *size,          /* IN/OUT */
	vm_map_offset_t *address,       /* OUT */
	boolean_t       *need_unmap)    /* OUT */
{
	kern_return_t   kr;
	vm_map_offset_t page_map_offset;
	vm_map_size_t   map_size;
	vm_object_offset_t object_offset;
	int             i;

	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
		/* use permanent 1-to-1 kernel mapping of physical memory ? */
		*address = (vm_map_offset_t)
		    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
		*need_unmap = FALSE;
		return KERN_SUCCESS;

		/*
		 * NOTE(review): everything from here to the closing brace
		 * of this "if" block is unreachable because of the
		 * unconditional return above.  Upstream selects between
		 * the physmap fast path and this pre-allocated-pool path
		 * with per-architecture preprocessor conditionals; those
		 * appear to have been collapsed here -- confirm against
		 * the upstream source before relying on the pool path.
		 */
		assert(page->vmp_busy);
		/*
		 * Use one of the pre-allocated kernel virtual addresses
		 * and just enter the VM page in the kernel address space
		 * at that virtual address.
		 */
		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);

		/*
		 * Try and find an available kernel virtual address
		 * from our pre-allocated pool.
		 */
		page_map_offset = 0;
		for (;;) {
			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
				if (vm_paging_page_inuse[i] == FALSE) {
					page_map_offset =
					    vm_paging_base_address +
					    (i * PAGE_SIZE);
					break;
				}
			}
			if (page_map_offset != 0) {
				/* found a space to map our page ! */
				break;
			}

			if (can_unlock_object) {
				/*
				 * If we can afford to unlock the VM object,
				 * let's take the slow path now...
				 */
				break;
			}
			/*
			 * We can't afford to unlock the VM object, so
			 * let's wait for a space to become available...
			 */
			vm_paging_page_waiter_total++;
			vm_paging_page_waiter++;
			kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
			if (kr == THREAD_WAITING) {
				/* drop the spinlock across the block */
				simple_unlock(&vm_paging_lock);
				kr = thread_block(THREAD_CONTINUE_NULL);
				simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
			}
			vm_paging_page_waiter--;
			/* ... and try again */
		}

		if (page_map_offset != 0) {
			/*
			 * We found a kernel virtual address;
			 * map the physical page to that virtual address.
			 */
			if (i > vm_paging_max_index) {
				vm_paging_max_index = i;
			}
			vm_paging_page_inuse[i] = TRUE;
			simple_unlock(&vm_paging_lock);

			page->vmp_pmapped = TRUE;

			/*
			 * Keep the VM object locked over the PMAP_ENTER
			 * and the actual use of the page by the kernel,
			 * or this pmap mapping might get undone by a
			 * vm_object_pmap_protect() call...
			 */
			PMAP_ENTER(kernel_pmap,
			    page_map_offset,
			    page,
			    protection,
			    VM_PROT_NONE,
			    0,
			    TRUE,
			    kr);
			assert(kr == KERN_SUCCESS);
			vm_paging_objects_mapped++;
			vm_paging_pages_mapped++;
			*address = page_map_offset;
			*need_unmap = TRUE;

#if KASAN
			kasan_notify_address(page_map_offset, PAGE_SIZE);
#endif

			/* all done and mapped, ready to use ! */
			return KERN_SUCCESS;
		}

		/*
		 * We ran out of pre-allocated kernel virtual
		 * addresses. Just map the page in the kernel
		 * the slow and regular way.
		 */
		vm_paging_no_kernel_page++;
		simple_unlock(&vm_paging_lock);
	}

	if (!can_unlock_object) {
		/* the slow path below must drop the object lock */
		*address = 0;
		*size = 0;
		*need_unmap = FALSE;
		return KERN_NOT_SUPPORTED;
	}

	object_offset = vm_object_trunc_page(offset);
	map_size = vm_map_round_page(*size,
	    VM_MAP_PAGE_MASK(kernel_map));

	/*
	 * Try and map the required range of the object
	 * in the kernel_map. Given that allocation is
	 * for pageable memory, it shouldn't contain
	 * pointers and is mapped into the data range.
	 */

	vm_object_reference_locked(object); /* for the map entry */
	vm_object_unlock(object);

	kr = vm_map_enter(kernel_map,
	    address,
	    map_size,
	    0,
	    VM_FLAGS_ANYWHERE,
	    VM_MAP_KERNEL_FLAGS_DATA,
	    VM_KERN_MEMORY_NONE,
	    object,
	    object_offset,
	    FALSE,
	    protection,
	    VM_PROT_ALL,
	    VM_INHERIT_NONE);
	if (kr != KERN_SUCCESS) {
		*address = 0;
		*size = 0;
		*need_unmap = FALSE;
		vm_object_deallocate(object); /* for the map entry */
		vm_object_lock(object);
		return kr;
	}

	*size = map_size;

	/*
	 * Enter the mapped pages in the page table now.
	 */
	vm_object_lock(object);
	/*
	 * VM object must be kept locked from before PMAP_ENTER()
	 * until after the kernel is done accessing the page(s).
	 * Otherwise, the pmap mappings in the kernel could be
	 * undone by a call to vm_object_pmap_protect().
	 */

	for (page_map_offset = 0;
	    map_size != 0;
	    map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
		page = vm_page_lookup(object, offset + page_map_offset);
		if (page == VM_PAGE_NULL) {
			/* a page went missing: undo the whole mapping */
			printf("vm_paging_map_object: no page !?");
			vm_object_unlock(object);
			vm_map_remove(kernel_map, *address, *size);
			*address = 0;
			*size = 0;
			*need_unmap = FALSE;
			vm_object_lock(object);
			return KERN_MEMORY_ERROR;
		}
		page->vmp_pmapped = TRUE;

		PMAP_ENTER(kernel_pmap,
		    *address + page_map_offset,
		    page,
		    protection,
		    VM_PROT_NONE,
		    0,
		    TRUE,
		    kr);
		assert(kr == KERN_SUCCESS);
#if KASAN
		kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
#endif
	}

	vm_paging_objects_mapped_slow++;
	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);

	*need_unmap = TRUE;

	return KERN_SUCCESS;
}
10344
10345 /*
10346 * vm_paging_unmap_object:
10347 * Unmaps part of a VM object's pages from the kernel
10348 * virtual address space.
10349 * Context:
10350 * The VM object is locked. This lock will get
10351 * dropped and re-acquired though.
10352 */
void
vm_paging_unmap_object(
	vm_object_t     object,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	int i;

	/* was [start, end) carved from the pre-allocated pool ? */
	if ((vm_paging_base_address == 0) ||
	    (start < vm_paging_base_address) ||
	    (end > (vm_paging_base_address
	    + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
		/*
		 * We didn't use our pre-allocated pool of
		 * kernel virtual address. Deallocate the
		 * virtual memory.
		 * The object lock is dropped around vm_map_remove()
		 * (the documented lock-drop in the header comment).
		 */
		if (object != VM_OBJECT_NULL) {
			vm_object_unlock(object);
		}
		vm_map_remove(kernel_map, start, end);
		if (object != VM_OBJECT_NULL) {
			vm_object_lock(object);
		}
	} else {
		/*
		 * We used a kernel virtual address from our
		 * pre-allocated pool. Put it back in the pool
		 * for next time.
		 */
		assert(end - start == PAGE_SIZE);
		i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
		assert(i >= 0 && i < VM_PAGING_NUM_PAGES);

		/* undo the pmap mapping */
		pmap_remove(kernel_pmap, start, end);

		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
		vm_paging_page_inuse[i] = FALSE;
		/* let one blocked vm_paging_map_object() retry */
		if (vm_paging_page_waiter) {
			thread_wakeup(&vm_paging_page_waiter);
		}
		simple_unlock(&vm_paging_lock);
	}
}
10398
10399
10400 /*
10401 * page->vmp_object must be locked
10402 */
10403 void
vm_pageout_steal_laundry(vm_page_t page,boolean_t queues_locked)10404 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10405 {
10406 if (!queues_locked) {
10407 vm_page_lockspin_queues();
10408 }
10409
10410 page->vmp_free_when_done = FALSE;
10411 /*
10412 * need to drop the laundry count...
10413 * we may also need to remove it
10414 * from the I/O paging queue...
10415 * vm_pageout_throttle_up handles both cases
10416 *
10417 * the laundry and pageout_queue flags are cleared...
10418 */
10419 vm_pageout_throttle_up(page);
10420
10421 if (!queues_locked) {
10422 vm_page_unlock_queues();
10423 }
10424 }
10425
10426 upl_t
vector_upl_create(vm_offset_t upl_offset)10427 vector_upl_create(vm_offset_t upl_offset)
10428 {
10429 int i = 0;
10430 upl_t upl;
10431 vector_upl_t vector_upl = kalloc_type(struct _vector_upl, Z_WAITOK);
10432
10433 upl = upl_create(0, UPL_VECTOR, 0);
10434 upl->vector_upl = vector_upl;
10435 upl->u_offset = upl_offset;
10436 vector_upl->size = 0;
10437 vector_upl->offset = upl_offset;
10438 vector_upl->invalid_upls = 0;
10439 vector_upl->num_upls = 0;
10440 vector_upl->pagelist = NULL;
10441
10442 for (i = 0; i < MAX_VECTOR_UPL_ELEMENTS; i++) {
10443 vector_upl->upl_iostates[i].size = 0;
10444 vector_upl->upl_iostates[i].offset = 0;
10445 }
10446 return upl;
10447 }
10448
10449 void
vector_upl_deallocate(upl_t upl)10450 vector_upl_deallocate(upl_t upl)
10451 {
10452 if (upl) {
10453 vector_upl_t vector_upl = upl->vector_upl;
10454 if (vector_upl) {
10455 if (vector_upl->invalid_upls != vector_upl->num_upls) {
10456 panic("Deallocating non-empty Vectored UPL");
10457 }
10458 kfree_data(vector_upl->pagelist, sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE));
10459 vector_upl->invalid_upls = 0;
10460 vector_upl->num_upls = 0;
10461 vector_upl->pagelist = NULL;
10462 vector_upl->size = 0;
10463 vector_upl->offset = 0;
10464 kfree_type(struct _vector_upl, vector_upl);
10465 vector_upl = (vector_upl_t)0xfeedfeed;
10466 } else {
10467 panic("vector_upl_deallocate was passed a non-vectored upl");
10468 }
10469 } else {
10470 panic("vector_upl_deallocate was passed a NULL upl");
10471 }
10472 }
10473
10474 boolean_t
vector_upl_is_valid(upl_t upl)10475 vector_upl_is_valid(upl_t upl)
10476 {
10477 if (upl && ((upl->flags & UPL_VECTOR) == UPL_VECTOR)) {
10478 vector_upl_t vector_upl = upl->vector_upl;
10479 if (vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef) {
10480 return FALSE;
10481 } else {
10482 return TRUE;
10483 }
10484 }
10485 return FALSE;
10486 }
10487
/*
 * Add or remove a sub-UPL of a vectored UPL.
 *
 * io_size != 0: append "subupl" to the vector, accounting at least one
 *	page of I/O, and grow the vector's size.
 * io_size == 0: mark "subupl" invalid (committed/aborted).  Returns
 *	TRUE once every sub-UPL has been invalidated, i.e. the vector
 *	UPL itself can now be deallocated; FALSE otherwise.
 */
boolean_t
vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
{
	if (vector_upl_is_valid(upl)) {
		vector_upl_t vector_upl = upl->vector_upl;

		if (vector_upl) {
			if (subupl) {
				if (io_size) {
					/* account at least one page per element */
					if (io_size < PAGE_SIZE) {
						io_size = PAGE_SIZE;
					}
					subupl->vector_upl = (void*)vector_upl;
					vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
					vector_upl->size += io_size;
					upl->u_size += io_size;
				} else {
					uint32_t i = 0, invalid_upls = 0;
					/* locate the element being invalidated */
					for (i = 0; i < vector_upl->num_upls; i++) {
						if (vector_upl->upl_elems[i] == subupl) {
							break;
						}
					}
					if (i == vector_upl->num_upls) {
						panic("Trying to remove sub-upl when none exists");
					}

					vector_upl->upl_elems[i] = NULL;
					invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
					    relaxed);
					/* TRUE once all sub-UPLs are gone */
					if (invalid_upls == vector_upl->num_upls) {
						return TRUE;
					} else {
						return FALSE;
					}
				}
			} else {
				panic("vector_upl_set_subupl was passed a NULL upl element");
			}
		} else {
			panic("vector_upl_set_subupl was passed a non-vectored upl");
		}
	} else {
		panic("vector_upl_set_subupl was passed a NULL upl");
	}

	return FALSE;
}
10536
/*
 * Build the flat page-info list of a vectored UPL by concatenating the
 * internal page lists of its sub-UPLs in insertion order, and raise
 * upl->highest_page to the maximum over all sub-UPLs.
 */
void
vector_upl_set_pagelist(upl_t upl)
{
	if (vector_upl_is_valid(upl)) {
		uint32_t i = 0;
		vector_upl_t vector_upl = upl->vector_upl;

		if (vector_upl) {
			vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;

			/* one upl_page_info per page of the whole vector */
			vector_upl->pagelist = kalloc_data(sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE), Z_WAITOK);

			for (i = 0; i < vector_upl->num_upls; i++) {
				/* bytes of page-info carried by this sub-UPL */
				cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upl_elems[i], PAGE_MASK) / PAGE_SIZE;
				bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
				pagelist_size += cur_upl_pagelist_size;
				if (vector_upl->upl_elems[i]->highest_page > upl->highest_page) {
					upl->highest_page = vector_upl->upl_elems[i]->highest_page;
				}
			}
			assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
		} else {
			panic("vector_upl_set_pagelist was passed a non-vectored upl");
		}
	} else {
		panic("vector_upl_set_pagelist was passed a NULL upl");
	}
}
10565
10566 upl_t
vector_upl_subupl_byindex(upl_t upl,uint32_t index)10567 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10568 {
10569 if (vector_upl_is_valid(upl)) {
10570 vector_upl_t vector_upl = upl->vector_upl;
10571 if (vector_upl) {
10572 if (index < vector_upl->num_upls) {
10573 return vector_upl->upl_elems[index];
10574 }
10575 } else {
10576 panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10577 }
10578 }
10579 return NULL;
10580 }
10581
/*
 * Map a byte range of a vectored UPL onto the sub-UPL containing its
 * start.  On success, *upl_offset and *upl_size are rewritten to be
 * relative to (and clipped against) the returned sub-UPL's iostate.
 * Returns NULL if the range falls in an element that has already been
 * committed/aborted (NULL'ed out), or if no element covers it.
 */
upl_t
vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
{
	if (vector_upl_is_valid(upl)) {
		uint32_t i = 0;
		vector_upl_t vector_upl = upl->vector_upl;

		if (vector_upl) {
			upl_t subupl = NULL;
			vector_upl_iostates_t subupl_state;

			for (i = 0; i < vector_upl->num_upls; i++) {
				subupl = vector_upl->upl_elems[i];
				subupl_state = vector_upl->upl_iostates[i];
				if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
					/* We could have been passed an offset/size pair that belongs
					 * to an UPL element that has already been committed/aborted.
					 * If so, return NULL.
					 */
					if (subupl == NULL) {
						return NULL;
					}
					if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
						/* clip to what this element actually covers */
						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
						if (*upl_size > subupl_state.size) {
							*upl_size = subupl_state.size;
						}
					}
					if (*upl_offset >= subupl_state.offset) {
						/* make the offset element-relative */
						*upl_offset -= subupl_state.offset;
					} else if (i) {
						panic("Vector UPL offset miscalculation");
					}
					return subupl;
				}
			}
		} else {
			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
		}
	}
	return NULL;
}
10624
10625 void
vector_upl_get_submap(upl_t upl,vm_map_t * v_upl_submap,vm_offset_t * submap_dst_addr)10626 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10627 {
10628 *v_upl_submap = NULL;
10629
10630 if (vector_upl_is_valid(upl)) {
10631 vector_upl_t vector_upl = upl->vector_upl;
10632 if (vector_upl) {
10633 *v_upl_submap = vector_upl->submap;
10634 *submap_dst_addr = vector_upl->submap_dst_addr;
10635 } else {
10636 panic("vector_upl_get_submap was passed a non-vectored UPL");
10637 }
10638 } else {
10639 panic("vector_upl_get_submap was passed a null UPL");
10640 }
10641 }
10642
10643 void
vector_upl_set_submap(upl_t upl,vm_map_t submap,vm_offset_t submap_dst_addr)10644 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10645 {
10646 if (vector_upl_is_valid(upl)) {
10647 vector_upl_t vector_upl = upl->vector_upl;
10648 if (vector_upl) {
10649 vector_upl->submap = submap;
10650 vector_upl->submap_dst_addr = submap_dst_addr;
10651 } else {
10652 panic("vector_upl_get_submap was passed a non-vectored UPL");
10653 }
10654 } else {
10655 panic("vector_upl_get_submap was passed a NULL UPL");
10656 }
10657 }
10658
/*
 * Record the byte range (offset/size, minimum one page) of the
 * vectored UPL's I/O that "subupl" covers.  Panics if "subupl" is not
 * an element of "upl".
 */
void
vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
{
	if (vector_upl_is_valid(upl)) {
		uint32_t i = 0;
		vector_upl_t vector_upl = upl->vector_upl;

		if (vector_upl) {
			/* locate the element the iostate belongs to */
			for (i = 0; i < vector_upl->num_upls; i++) {
				if (vector_upl->upl_elems[i] == subupl) {
					break;
				}
			}

			if (i == vector_upl->num_upls) {
				panic("setting sub-upl iostate when none exists");
			}

			vector_upl->upl_iostates[i].offset = offset;
			if (size < PAGE_SIZE) {
				/* iostates are tracked with one-page granularity */
				size = PAGE_SIZE;
			}
			vector_upl->upl_iostates[i].size = size;
		} else {
			panic("vector_upl_set_iostate was passed a non-vectored UPL");
		}
	} else {
		panic("vector_upl_set_iostate was passed a NULL UPL");
	}
}
10689
10690 void
vector_upl_get_iostate(upl_t upl,upl_t subupl,upl_offset_t * offset,upl_size_t * size)10691 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10692 {
10693 if (vector_upl_is_valid(upl)) {
10694 uint32_t i = 0;
10695 vector_upl_t vector_upl = upl->vector_upl;
10696
10697 if (vector_upl) {
10698 for (i = 0; i < vector_upl->num_upls; i++) {
10699 if (vector_upl->upl_elems[i] == subupl) {
10700 break;
10701 }
10702 }
10703
10704 if (i == vector_upl->num_upls) {
10705 panic("getting sub-upl iostate when none exists");
10706 }
10707
10708 *offset = vector_upl->upl_iostates[i].offset;
10709 *size = vector_upl->upl_iostates[i].size;
10710 } else {
10711 panic("vector_upl_get_iostate was passed a non-vectored UPL");
10712 }
10713 } else {
10714 panic("vector_upl_get_iostate was passed a NULL UPL");
10715 }
10716 }
10717
10718 void
vector_upl_get_iostate_byindex(upl_t upl,uint32_t index,upl_offset_t * offset,upl_size_t * size)10719 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10720 {
10721 if (vector_upl_is_valid(upl)) {
10722 vector_upl_t vector_upl = upl->vector_upl;
10723 if (vector_upl) {
10724 if (index < vector_upl->num_upls) {
10725 *offset = vector_upl->upl_iostates[index].offset;
10726 *size = vector_upl->upl_iostates[index].size;
10727 } else {
10728 *offset = *size = 0;
10729 }
10730 } else {
10731 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
10732 }
10733 } else {
10734 panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10735 }
10736 }
10737
/* Flat page list of a vectored UPL (built by vector_upl_set_pagelist). */
upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)
{
	return ((vector_upl_t)(upl->vector_upl))->pagelist;
}

/* Raw vector_upl pointer of a UPL. */
void *
upl_get_internal_vectorupl(upl_t upl)
{
	return upl->vector_upl;
}

/*
 * Byte offset from the start of a upl struct to its internal page
 * list, which is laid out immediately after the struct.
 */
vm_size_t
upl_get_internal_pagelist_offset(void)
{
	return sizeof(struct upl);
}
10755
10756 void
upl_clear_dirty(upl_t upl,boolean_t value)10757 upl_clear_dirty(
10758 upl_t upl,
10759 boolean_t value)
10760 {
10761 if (value) {
10762 upl->flags |= UPL_CLEAR_DIRTY;
10763 } else {
10764 upl->flags &= ~UPL_CLEAR_DIRTY;
10765 }
10766 }
10767
/*
 * Adjust a UPL's external reference count under the UPL lock:
 * TRUE takes a reference, FALSE drops one.  Panics on underflow.
 */
void
upl_set_referenced(
	upl_t           upl,
	boolean_t       value)
{
	upl_lock(upl);
	if (value) {
		upl->ext_ref_count++;
	} else {
		if (!upl->ext_ref_count) {
			/* dropping a reference that was never taken */
			panic("upl_set_referenced not %p", upl);
		}
		upl->ext_ref_count--;
	}
	upl_unlock(upl);
}
10784
#if CONFIG_IOSCHED
/*
 * Record the disk block number for every page of the UPL covered by an
 * I/O of "io_size" bytes starting at "upl_offset", for I/O
 * reprioritization.  No-op unless the UPL was created with
 * UPL_EXPEDITE_SUPPORTED.
 */
void
upl_set_blkno(
	upl_t           upl,
	vm_offset_t     upl_offset,
	int             io_size,
	int64_t         blkno)
{
	int i, j;
	if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
		return;
	}

	assert(upl->upl_reprio_info != 0);
	/* one reprio entry per page in the I/O range */
	for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
		UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
	}
}
#endif
10804
10805 void inline
memoryshot(unsigned int event,unsigned int control)10806 memoryshot(unsigned int event, unsigned int control)
10807 {
10808 if (vm_debug_events) {
10809 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10810 vm_page_active_count, vm_page_inactive_count,
10811 vm_page_free_count, vm_page_speculative_count,
10812 vm_page_throttled_count);
10813 } else {
10814 (void) event;
10815 (void) control;
10816 }
10817 }
10818
10819 #ifdef MACH_BSD
10820
/*
 * Thin BSD-facing accessors over the upl_page_info array: each simply
 * evaluates the corresponding UPL_* macro for the given index.
 */
boolean_t
upl_device_page(upl_page_info_t *upl)
{
	return UPL_DEVICE_PAGE(upl);
}
boolean_t
upl_page_present(upl_page_info_t *upl, int index)
{
	return UPL_PAGE_PRESENT(upl, index);
}
boolean_t
upl_speculative_page(upl_page_info_t *upl, int index)
{
	return UPL_SPECULATIVE_PAGE(upl, index);
}
boolean_t
upl_dirty_page(upl_page_info_t *upl, int index)
{
	return UPL_DIRTY_PAGE(upl, index);
}
boolean_t
upl_valid_page(upl_page_info_t *upl, int index)
{
	return UPL_VALID_PAGE(upl, index);
}
ppnum_t
upl_phys_page(upl_page_info_t *upl, int index)
{
	return UPL_PHYS_PAGE(upl, index);
}

/* Set the per-page "mark" bit on one entry of the page list. */
void
upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
{
	upl[index].mark = v;
}

/* Read back the per-page "mark" bit. */
boolean_t
upl_page_get_mark(upl_page_info_t *upl, int index)
{
	return upl[index].mark;
}
10863
10864 void
vm_countdirtypages(void)10865 vm_countdirtypages(void)
10866 {
10867 vm_page_t m;
10868 int dpages;
10869 int pgopages;
10870 int precpages;
10871
10872
10873 dpages = 0;
10874 pgopages = 0;
10875 precpages = 0;
10876
10877 vm_page_lock_queues();
10878 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10879 do {
10880 if (m == (vm_page_t)0) {
10881 break;
10882 }
10883
10884 if (m->vmp_dirty) {
10885 dpages++;
10886 }
10887 if (m->vmp_free_when_done) {
10888 pgopages++;
10889 }
10890 if (m->vmp_precious) {
10891 precpages++;
10892 }
10893
10894 assert(VM_PAGE_OBJECT(m) != kernel_object);
10895 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10896 if (m == (vm_page_t)0) {
10897 break;
10898 }
10899 } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
10900 vm_page_unlock_queues();
10901
10902 vm_page_lock_queues();
10903 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
10904 do {
10905 if (m == (vm_page_t)0) {
10906 break;
10907 }
10908
10909 dpages++;
10910 assert(m->vmp_dirty);
10911 assert(!m->vmp_free_when_done);
10912 assert(VM_PAGE_OBJECT(m) != kernel_object);
10913 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10914 if (m == (vm_page_t)0) {
10915 break;
10916 }
10917 } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
10918 vm_page_unlock_queues();
10919
10920 vm_page_lock_queues();
10921 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10922 do {
10923 if (m == (vm_page_t)0) {
10924 break;
10925 }
10926
10927 if (m->vmp_dirty) {
10928 dpages++;
10929 }
10930 if (m->vmp_free_when_done) {
10931 pgopages++;
10932 }
10933 if (m->vmp_precious) {
10934 precpages++;
10935 }
10936
10937 assert(VM_PAGE_OBJECT(m) != kernel_object);
10938 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10939 if (m == (vm_page_t)0) {
10940 break;
10941 }
10942 } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
10943 vm_page_unlock_queues();
10944
10945 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10946
10947 dpages = 0;
10948 pgopages = 0;
10949 precpages = 0;
10950
10951 vm_page_lock_queues();
10952 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10953
10954 do {
10955 if (m == (vm_page_t)0) {
10956 break;
10957 }
10958 if (m->vmp_dirty) {
10959 dpages++;
10960 }
10961 if (m->vmp_free_when_done) {
10962 pgopages++;
10963 }
10964 if (m->vmp_precious) {
10965 precpages++;
10966 }
10967
10968 assert(VM_PAGE_OBJECT(m) != kernel_object);
10969 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10970 if (m == (vm_page_t)0) {
10971 break;
10972 }
10973 } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
10974 vm_page_unlock_queues();
10975
10976 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
10977 }
10978 #endif /* MACH_BSD */
10979
10980
#if CONFIG_IOSCHED
/*
 * Return a UPL's cached I/O priority tier, or -1 when the UPL is not
 * tracked by its object (so no priority was recorded).
 */
int
upl_get_cached_tier(upl_t upl)
{
	assert(upl);
	return (upl->flags & UPL_TRACKED_BY_OBJECT) ? upl->upl_priority : -1;
}
#endif /* CONFIG_IOSCHED */
10992
10993
10994 void
upl_callout_iodone(upl_t upl)10995 upl_callout_iodone(upl_t upl)
10996 {
10997 struct upl_io_completion *upl_ctx = upl->upl_iodone;
10998
10999 if (upl_ctx) {
11000 void (*iodone_func)(void *, int) = upl_ctx->io_done;
11001
11002 assert(upl_ctx->io_done);
11003
11004 (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
11005 }
11006 }
11007
/*
 * Attach an I/O completion context to a UPL; upl_callout_iodone()
 * invokes it when the I/O finishes.
 */
void
upl_set_iodone(upl_t upl, void *upl_iodone)
{
	upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
}

/*
 * Record an error code in the UPL's completion context (if one is
 * attached) so the iodone callout can observe it.
 */
void
upl_set_iodone_error(upl_t upl, int error)
{
	struct upl_io_completion *upl_ctx = upl->upl_iodone;

	if (upl_ctx) {
		upl_ctx->io_error = error;
	}
}
11023
11024
/* Highest physical page number of any page covered by this UPL. */
ppnum_t
upl_get_highest_page(
	upl_t                      upl)
{
	return upl->highest_page;
}

/* Page-aligned size of the UPL, in bytes. */
upl_size_t
upl_get_size(
	upl_t                      upl)
{
	return upl_adjusted_size(upl, PAGE_MASK);
}
11038
11039 upl_size_t
upl_adjusted_size(upl_t upl,vm_map_offset_t pgmask)11040 upl_adjusted_size(
11041 upl_t upl,
11042 vm_map_offset_t pgmask)
11043 {
11044 vm_object_offset_t start_offset, end_offset;
11045
11046 start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
11047 end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
11048
11049 return (upl_size_t)(end_offset - start_offset);
11050 }
11051
11052 vm_object_offset_t
upl_adjusted_offset(upl_t upl,vm_map_offset_t pgmask)11053 upl_adjusted_offset(
11054 upl_t upl,
11055 vm_map_offset_t pgmask)
11056 {
11057 return trunc_page_mask_64(upl->u_offset, pgmask);
11058 }
11059
11060 vm_object_offset_t
upl_get_data_offset(upl_t upl)11061 upl_get_data_offset(
11062 upl_t upl)
11063 {
11064 return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
11065 }
11066
11067 upl_t
upl_associated_upl(upl_t upl)11068 upl_associated_upl(upl_t upl)
11069 {
11070 return upl->associated_upl;
11071 }
11072
11073 void
upl_set_associated_upl(upl_t upl,upl_t associated_upl)11074 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
11075 {
11076 upl->associated_upl = associated_upl;
11077 }
11078
11079 struct vnode *
upl_lookup_vnode(upl_t upl)11080 upl_lookup_vnode(upl_t upl)
11081 {
11082 if (!upl->map_object->internal) {
11083 return vnode_pager_lookup_vnode(upl->map_object->pager);
11084 } else {
11085 return NULL;
11086 }
11087 }
11088
11089 #if UPL_DEBUG
11090 kern_return_t
upl_ubc_alias_set(upl_t upl,uintptr_t alias1,uintptr_t alias2)11091 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
11092 {
11093 upl->ubc_alias1 = alias1;
11094 upl->ubc_alias2 = alias2;
11095 return KERN_SUCCESS;
11096 }
11097 int
upl_ubc_alias_get(upl_t upl,uintptr_t * al,uintptr_t * al2)11098 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
11099 {
11100 if (al) {
11101 *al = upl->ubc_alias1;
11102 }
11103 if (al2) {
11104 *al2 = upl->ubc_alias2;
11105 }
11106 return KERN_SUCCESS;
11107 }
11108 #endif /* UPL_DEBUG */
11109
11110 #if VM_PRESSURE_EVENTS
11111 /*
11112 * Upward trajectory.
11113 */
11114 extern boolean_t vm_compressor_low_on_space(void);
11115
11116 boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)11117 VM_PRESSURE_NORMAL_TO_WARNING(void)
11118 {
11119 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11120 /* Available pages below our threshold */
11121 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11122 /* No frozen processes to kill */
11123 if (memorystatus_frozen_count == 0) {
11124 /* Not enough suspended processes available. */
11125 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11126 return TRUE;
11127 }
11128 }
11129 }
11130 return FALSE;
11131 } else {
11132 return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
11133 }
11134 }
11135
11136 boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)11137 VM_PRESSURE_WARNING_TO_CRITICAL(void)
11138 {
11139 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11140 /* Available pages below our threshold */
11141 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11142 return TRUE;
11143 }
11144 return FALSE;
11145 } else {
11146 return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11147 }
11148 }
11149
11150 /*
11151 * Downward trajectory.
11152 */
11153 boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)11154 VM_PRESSURE_WARNING_TO_NORMAL(void)
11155 {
11156 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11157 /* Available pages above our threshold */
11158 unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
11159 if (memorystatus_available_pages > target_threshold) {
11160 return TRUE;
11161 }
11162 return FALSE;
11163 } else {
11164 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
11165 }
11166 }
11167
11168 boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)11169 VM_PRESSURE_CRITICAL_TO_WARNING(void)
11170 {
11171 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11172 /* Available pages above our threshold */
11173 unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
11174 if (memorystatus_available_pages > target_threshold) {
11175 return TRUE;
11176 }
11177 return FALSE;
11178 } else {
11179 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11180 }
11181 }
11182 #endif /* VM_PRESSURE_EVENTS */
11183
11184 #if DEVELOPMENT || DEBUG
11185 bool compressor_running_perf_test;
11186 uint64_t compressor_perf_test_pages_processed;
11187
11188 kern_return_t
11189 run_compressor_perf_test(
11190 user_addr_t buf,
11191 size_t buffer_size,
11192 uint64_t *time,
11193 uint64_t *bytes_compressed,
11194 uint64_t *compressor_growth);
11195
/*
 * move_pages_to_queue:
 * Collect the resident pages backing [start_addr, start_addr + buffer_size)
 * in "map", pull them off whatever paging queue they are on, and chain them
 * onto the caller-supplied "queue".  Used to build the private page list
 * consumed by the compressor perf test.
 *
 * The whole range must be top-level, unwired, non-shadowed anonymous
 * memory, and the map's page size must match the kernel's; otherwise
 * KERN_INVALID_ARGUMENT is returned (possibly after moving some pages —
 * "*pages_moved" always reflects the number actually moved).
 *
 * Locking: takes the map read lock for the whole walk; takes each object
 * lock, and the page-queue lock per page, inside it.
 */
static kern_return_t
move_pages_to_queue(
	vm_map_t map,
	user_addr_t start_addr,
	size_t buffer_size,
	vm_page_queue_head_t *queue,
	size_t *pages_moved)
{
	kern_return_t err = KERN_SUCCESS;
	vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
	boolean_t addr_in_map = FALSE;
	user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
	vm_object_t curr_object = VM_OBJECT_NULL;
	*pages_moved = 0;


	if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
		/*
		 * We don't currently support benchmarking maps with a different page size
		 * than the kernel.
		 */
		return KERN_INVALID_ARGUMENT;
	}

	/* Overflow probe only; end_addr is recomputed (page-rounded) below. */
	if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);
	curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
	end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));


	while (curr_addr < end_addr) {
		addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
		if (!addr_in_map) {
			err = KERN_INVALID_ARGUMENT;
			break;
		}
		curr_object = VME_OBJECT(curr_entry);
		if (curr_object) {
			vm_object_lock(curr_object);
			/* We really only want anonymous memory that's in the top level map and object here. */
			if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
			    curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
				err = KERN_INVALID_ARGUMENT;
				vm_object_unlock(curr_object);
				break;
			}
			/*
			 * NOTE(review): start_offset ADDS VME_OFFSET() after subtracting
			 * vme_start, while end_offset subtracts (vme_start + VME_OFFSET()).
			 * The two agree only when VME_OFFSET() is 0 — confirm intent for
			 * entries with a non-zero object offset.
			 */
			vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
			vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
			    (curr_entry->vme_start + VME_OFFSET(curr_entry));
			vm_map_offset_t curr_offset = start_offset;
			vm_page_t curr_page;
			while (curr_offset < end_offset) {
				curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
				if (curr_page != VM_PAGE_NULL) {
					vm_page_lock_queues();
					if (curr_page->vmp_laundry) {
						vm_pageout_steal_laundry(curr_page, TRUE);
					}
					/*
					 * we've already factored out pages in the laundry which
					 * means this page can't be on the pageout queue so it's
					 * safe to do the vm_page_queues_remove
					 */
					bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
					vm_page_queues_remove(curr_page, TRUE);
					if (donate) {
						/*
						 * The compressor needs to see this bit to know
						 * where this page needs to land. Also if stolen,
						 * this bit helps put the page back in the right
						 * special queue where it belongs.
						 */
						curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
					}
					// Clear the referenced bit so we ensure this gets paged out
					curr_page->vmp_reference = false;
					if (curr_page->vmp_pmapped) {
						pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
						    VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
					}
					vm_page_queue_enter(queue, curr_page, vmp_pageq);
					vm_page_unlock_queues();
					*pages_moved += 1;
				}
				curr_offset += PAGE_SIZE_64;
				curr_addr += PAGE_SIZE_64;
			}
		}
		/*
		 * NOTE(review): as rendered here this unlock executes even when
		 * curr_object is NULL, and curr_addr only advances inside the
		 * non-NULL branch — a NULL-object entry would unlock a NULL
		 * object and never make progress.  Verify brace placement
		 * against the upstream source.
		 */
		vm_object_unlock(curr_object);
	}
	vm_map_unlock_read(map);
	return err;
}
11292
11293 /*
11294 * Local queue for processing benchmark pages.
11295 * Can't be allocated on the stack because the pointer has to
11296 * be packable.
11297 */
11298 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
/*
 * run_compressor_perf_test:
 * Benchmark the VM compressor by pushing the pages that back the user
 * buffer [buf, buf + buffer_size) of the current task through
 * vm_pageout_page_queue() and waiting for the compressor threads to
 * consume them.
 *
 * Out parameters:
 *   time               - elapsed nanoseconds for the compression pass
 *                        (written even on error, as 0)
 *   bytes_compressed   - bytes handed to the compressor (pages * page size);
 *                        0 on error
 *   compressor_growth  - growth of c_segment_compressed_bytes during the
 *                        run; 0 on error
 *
 * Errors: KERN_NOT_SUPPORTED if the compressor is inactive,
 * KERN_INVALID_ARGUMENT for the kernel task or a bad buffer range,
 * KERN_RESOURCE_SHORTAGE if another instance is already running.
 */
kern_return_t
run_compressor_perf_test(
	user_addr_t buf,
	size_t buffer_size,
	uint64_t *time,
	uint64_t *bytes_compressed,
	uint64_t *compressor_growth)
{
	kern_return_t err = KERN_SUCCESS;
	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		return KERN_NOT_SUPPORTED;
	}
	if (current_task() == kernel_task) {
		return KERN_INVALID_ARGUMENT;
	}
	vm_page_lock_queues();
	if (compressor_running_perf_test) {
		/* Only run one instance of the benchmark at a time. */
		vm_page_unlock_queues();
		return KERN_RESOURCE_SHORTAGE;
	}
	/*
	 * NOTE(review): the flag is checked here but only set further down
	 * (after move_pages_to_queue), with the queue lock dropped in
	 * between — two concurrent callers could both pass this check.
	 * Confirm an outer serialization exists, or tolerate the race.
	 */
	vm_page_unlock_queues();
	size_t page_count = 0;
	vm_map_t map;
	vm_page_t p, next;
	uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
	uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
	*bytes_compressed = *compressor_growth = 0;

	/* Build the private benchmark queue from the user buffer's pages. */
	vm_page_queue_init(&compressor_perf_test_queue);
	map = current_task()->map;
	err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
	if (err != KERN_SUCCESS) {
		goto out;
	}

	vm_page_lock_queues();
	compressor_running_perf_test = true;
	compressor_perf_test_pages_processed = 0;
	/*
	 * At this point the compressor threads should only process the benchmark queue
	 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
	 * to determine how many compressed bytes we ended up using.
	 */
	compressed_bytes_start = c_segment_compressed_bytes;
	vm_page_unlock_queues();

	compressor_perf_test_start = mach_absolute_time();
	page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);

	/* Sleep until the compressor threads report enough pages processed. */
	vm_page_lock_queues();
	/*
	 * Depending on when this test is run we could overshoot or be right on the mark
	 * with our page_count. So the comparison is of the _less than_ variety.
	 */
	while (compressor_perf_test_pages_processed < page_count) {
		assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
		vm_page_unlock_queues();
		thread_block(THREAD_CONTINUE_NULL);
		vm_page_lock_queues();
	}
	compressor_perf_test_end = mach_absolute_time();
	compressed_bytes_end = c_segment_compressed_bytes;
	vm_page_unlock_queues();


out:
	/*
	 * If we errored out above, then we could still have some pages
	 * on the local queue. Make sure to put them back on the active queue before
	 * returning so they're not orphaned.
	 */
	vm_page_lock_queues();
	/* On the error path both timestamps are 0, so *time reads as 0. */
	absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
	p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
	while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
		next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);

		vm_page_enqueue_active(p, FALSE);
		p = next;
	}

	compressor_running_perf_test = false;
	vm_page_unlock_queues();
	if (err == KERN_SUCCESS) {
		*bytes_compressed = page_count * PAGE_SIZE_64;
		*compressor_growth = compressed_bytes_end - compressed_bytes_start;
	}

	/*
	 * pageout_scan will consider waking the compactor swapper
	 * before it blocks. Do the same thing here before we return
	 * to ensure that back to back benchmark runs can't overly fragment the
	 * compressor pool.
	 */
	vm_consider_waking_compactor_swapper();
	return err;
}
11397 #endif /* DEVELOPMENT || DEBUG */
11398