1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67 #include <ptrauth.h>
68
69 #include <debug.h>
70
71 #include <mach/mach_types.h>
72 #include <mach/memory_object.h>
73 #include <mach/mach_host_server.h>
74 #include <mach/upl.h>
75 #include <mach/vm_map.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/sdt.h>
79
80 #include <kern/kern_types.h>
81 #include <kern/counter.h>
82 #include <kern/host_statistics.h>
83 #include <kern/machine.h>
84 #include <kern/misc_protos.h>
85 #include <kern/sched.h>
86 #include <kern/thread.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/policy_internal.h>
90 #include <kern/thread_group.h>
91
92 #include <os/log.h>
93
94 #include <sys/kdebug_triage.h>
95
96 #include <machine/vm_tuning.h>
97 #include <machine/commpage.h>
98
99 #include <vm/pmap.h>
100 #include <vm/vm_compressor_pager.h>
101 #include <vm/vm_fault.h>
102 #include <vm/vm_map_internal.h>
103 #include <vm/vm_object.h>
104 #include <vm/vm_page.h>
105 #include <vm/vm_pageout.h>
106 #include <vm/vm_protos.h> /* must be last */
107 #include <vm/memory_object.h>
108 #include <vm/vm_purgeable_internal.h>
109 #include <vm/vm_shared_region.h>
110 #include <vm/vm_compressor.h>
111
112 #include <san/kasan.h>
113
114 #if CONFIG_PHANTOM_CACHE
115 #include <vm/vm_phantom_cache.h>
116 #endif
117
118 #if UPL_DEBUG
119 #include <libkern/OSDebug.h>
120 #endif
121
extern int cs_debug;

extern void mbuf_drain(boolean_t);

/*
 * Declarations shared with the memorystatus subsystem; only compiled when
 * VM pressure events are enabled.
 */
#if VM_PRESSURE_EVENTS
/* The available-page counters are 32-bit with CONFIG_JETSAM and 64-bit without it. */
#if CONFIG_JETSAM
extern unsigned int memorystatus_available_pages;
extern unsigned int memorystatus_available_pages_pressure;
extern unsigned int memorystatus_available_pages_critical;
#else /* CONFIG_JETSAM */
extern uint64_t memorystatus_available_pages;
extern uint64_t memorystatus_available_pages_pressure;
extern uint64_t memorystatus_available_pages_critical;
#endif /* CONFIG_JETSAM */

extern unsigned int memorystatus_frozen_count;
extern unsigned int memorystatus_suspended_count;
extern vm_pressure_level_t memorystatus_vm_pressure_level;

extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
extern uint32_t memorystatus_jetsam_fg_band_waiters;

void vm_pressure_response(void);
extern void consider_vm_pressure_events(void);

/* NOTE(review): presumably compared against memorystatus_suspended_count; the use is not visible here. */
#define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
#endif /* VM_PRESSURE_EVENTS */

/* Thread handles for the pageout scan and GC threads, plus two scan-behaviour switches. */
SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
boolean_t vps_dynamic_priority_enabled = FALSE;
boolean_t vps_yield_for_pgqlockwaiters = TRUE;
154
/*
 * Compile-time tunables for the pageout daemon. Every value wrapped in
 * #ifndef can be overridden by defining the macro before this point.
 * Several defaults differ between XNU_TARGET_OS_OSX builds and all others.
 */
#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
#if !XNU_TARGET_OS_OSX
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
#endif /* !XNU_TARGET_OS_OSX */
#endif /* VM_PAGEOUT_BURST_INACTIVE_THROTTLE */

#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
#define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
#endif /* VM_PAGEOUT_DEADLOCK_RELIEF */

#ifndef VM_PAGE_LAUNDRY_MAX
#define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */
#endif /* VM_PAGE_LAUNDRY_MAX */

#ifndef VM_PAGEOUT_BURST_WAIT
#define VM_PAGEOUT_BURST_WAIT 1 /* milliseconds */
#endif /* VM_PAGEOUT_BURST_WAIT */

#ifndef VM_PAGEOUT_EMPTY_WAIT
#define VM_PAGEOUT_EMPTY_WAIT 50 /* milliseconds */
#endif /* VM_PAGEOUT_EMPTY_WAIT */

#ifndef VM_PAGEOUT_DEADLOCK_WAIT
#define VM_PAGEOUT_DEADLOCK_WAIT 100 /* milliseconds */
#endif /* VM_PAGEOUT_DEADLOCK_WAIT */

#ifndef VM_PAGEOUT_IDLE_WAIT
#define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
#endif /* VM_PAGEOUT_IDLE_WAIT */

#ifndef VM_PAGEOUT_SWAP_WAIT
#define VM_PAGEOUT_SWAP_WAIT 10 /* milliseconds */
#endif /* VM_PAGEOUT_SWAP_WAIT */


/*
 * Evaluates to total / (100 / percentage) using integer division, so the
 * result is only an approximation of "percentage% of total".
 * vm_pageout_state.vm_page_speculative_percentage must be in the range
 * 1..100: a value of 0, or any value above 100 (where 100 / percentage
 * truncates to 0), makes this expression divide by zero.
 */
#ifndef VM_PAGE_SPECULATIVE_TARGET
#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
#endif /* VM_PAGE_SPECULATIVE_TARGET */


/*
 * To obtain a reasonable LRU approximation, the inactive queue
 * needs to be large enough to give pages on it a chance to be
 * referenced a second time. This macro defines the fraction
 * of active+inactive pages that should be inactive.
 * The pageout daemon uses it to update vm_page_inactive_target.
 *
 * If vm_page_free_count falls below vm_page_free_target and
 * vm_page_inactive_count is below vm_page_inactive_target,
 * then the pageout daemon starts running.
 */

#ifndef VM_PAGE_INACTIVE_TARGET
#define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
#endif /* VM_PAGE_INACTIVE_TARGET */

/*
 * Once the pageout daemon starts running, it keeps going
 * until vm_page_free_count meets or exceeds vm_page_free_target.
 */

#ifndef VM_PAGE_FREE_TARGET
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
#endif /* !XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_FREE_TARGET */


/*
 * The pageout daemon always starts running once vm_page_free_count
 * falls below vm_page_free_min.
 */

#ifndef VM_PAGE_FREE_MIN
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
#endif /* !XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_FREE_MIN */

/*
 * Fixed limits, in pages, for the reserved / min / target free-page values.
 * NOTE(review): presumably applied as upper bounds where those values are
 * computed; the code that uses them is not visible in this part of the file.
 */
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_RESERVED_LIMIT 100
#define VM_PAGE_FREE_MIN_LIMIT 1500
#define VM_PAGE_FREE_TARGET_LIMIT 2000
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_RESERVED_LIMIT 1700
#define VM_PAGE_FREE_MIN_LIMIT 3500
#define VM_PAGE_FREE_TARGET_LIMIT 4000
#endif /* !XNU_TARGET_OS_OSX */

/*
 * When vm_page_free_count falls below vm_page_free_reserved,
 * only vm-privileged threads can allocate pages. vm-privilege
 * allows the pageout daemon and default pager (and any other
 * associated threads needed for default pageout) to continue
 * operation by dipping into the reserved pool of pages.
 *
 * The macro yields six times VM_PAGE_LAUNDRY_MAX plus "n".
 */

#ifndef VM_PAGE_FREE_RESERVED
#define VM_PAGE_FREE_RESERVED(n) \
((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
#endif /* VM_PAGE_FREE_RESERVED */

/*
 * When we dequeue pages from the inactive list, they are
 * reactivated (ie, put back on the active queue) if referenced.
 * However, it is possible to starve the free list if other
 * processors are referencing pages faster than we can turn off
 * the referenced bit. So we limit the number of reactivations
 * we will make per call of vm_pageout_scan().
 *
 * NOTE(review): on XNU_TARGET_OS_OSX the limit is computed with MAX(), so
 * despite its name VM_PAGE_REACTIVATE_LIMIT_MAX acts as a lower bound on
 * the result, not an upper bound - confirm that this is intended.
 */
#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000

#ifndef VM_PAGE_REACTIVATE_LIMIT
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
#endif /* !XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_REACTIVATE_LIMIT */
#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
int vm_pageout_protect_realtime = true;

extern boolean_t hibernate_cleaning_in_progress;

/*
 * State for the pageout I/O threads: an array for the internal (compressor)
 * threads and a single entry for the external thread.
 * vm_pageout_cluster_to_queue() signals element 0 of the internal array or
 * the external entry after queueing a page.
 */
struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
struct pgo_iothread_state pgo_iothread_external_state;

#if VM_PRESSURE_EVENTS
void vm_pressure_thread(void);

/* Predicates for moving between VM pressure levels (defined elsewhere). */
boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);

boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
#endif

/* Forward declarations of file-local routines that take a pgo_iothread_state. */
static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);

extern void vm_pageout_continue(void);
extern void vm_pageout_scan(void);

boolean_t vm_pageout_running = FALSE;

uint32_t vm_page_upl_tainted = 0;
uint32_t vm_page_iopl_tainted = 0;

#if XNU_TARGET_OS_OSX
static boolean_t vm_pageout_waiter = FALSE;
#endif /* XNU_TARGET_OS_OSX */


#if DEVELOPMENT || DEBUG
struct vm_pageout_debug vm_pageout_debug;
#endif
struct vm_pageout_vminfo vm_pageout_vminfo;
struct vm_pageout_state vm_pageout_state;
struct vm_config vm_config;

/*
 * Pending-pageout queues. vm_pageout_cluster() sends pages of internal
 * objects to the internal queue and all other pages to the external queue.
 * The benchmark queue exists only on DEVELOPMENT/DEBUG kernels.
 */
struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
#if DEVELOPMENT || DEBUG
struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
#endif /* DEVELOPMENT || DEBUG */

int vm_upl_wait_for_pages = 0;
vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;

/* Volatile function-pointer hook taking an int and returning boolean_t; NULL until something installs it. */
boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;

int vm_debug_events = 0;

LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");

#if CONFIG_MEMORYSTATUS
extern boolean_t memorystatus_kill_on_VM_page_shortage(void);

/* NOTE(review): "nr"/"dr" look like numerator/denominator of a 5/2 factor - confirm at the use site. */
uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;

#endif

#if __AMP__

// default for the "vmcomp_ecluster" tunable: bind compressor threads to e-cores
#define VM_COMPRESSOR_EBOUND_DEFAULT 1

TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
int vm_pgo_pbound = 0;
extern void thread_bind_cluster_type(thread_t, char, bool);

#endif /* __AMP__ */
357
358 /*
359 * Routine: vm_pageout_object_terminate
360 * Purpose:
361 * Destroy the pageout_object, and perform all of the
362 * required cleanup actions.
363 *
364 * In/Out conditions:
365 * The object must be locked, and will be returned locked.
366 */
367 void
vm_pageout_object_terminate(vm_object_t object)368 vm_pageout_object_terminate(
369 vm_object_t object)
370 {
371 vm_object_t shadow_object;
372
373 /*
374 * Deal with the deallocation (last reference) of a pageout object
375 * (used for cleaning-in-place) by dropping the paging references/
376 * freeing pages in the original object.
377 */
378
379 assert(object->pageout);
380 shadow_object = object->shadow;
381 vm_object_lock(shadow_object);
382
383 while (!vm_page_queue_empty(&object->memq)) {
384 vm_page_t p, m;
385 vm_object_offset_t offset;
386
387 p = (vm_page_t) vm_page_queue_first(&object->memq);
388
389 assert(p->vmp_private);
390 assert(p->vmp_free_when_done);
391 p->vmp_free_when_done = FALSE;
392 assert(!p->vmp_cleaning);
393 assert(!p->vmp_laundry);
394
395 offset = p->vmp_offset;
396 VM_PAGE_FREE(p);
397 p = VM_PAGE_NULL;
398
399 m = vm_page_lookup(shadow_object,
400 offset + object->vo_shadow_offset);
401
402 if (m == VM_PAGE_NULL) {
403 continue;
404 }
405
406 assert((m->vmp_dirty) || (m->vmp_precious) ||
407 (m->vmp_busy && m->vmp_cleaning));
408
409 /*
410 * Handle the trusted pager throttle.
411 * Also decrement the burst throttle (if external).
412 */
413 vm_page_lock_queues();
414 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
415 vm_pageout_throttle_up(m);
416 }
417
418 /*
419 * Handle the "target" page(s). These pages are to be freed if
420 * successfully cleaned. Target pages are always busy, and are
421 * wired exactly once. The initial target pages are not mapped,
422 * (so cannot be referenced or modified) but converted target
423 * pages may have been modified between the selection as an
424 * adjacent page and conversion to a target.
425 */
426 if (m->vmp_free_when_done) {
427 assert(m->vmp_busy);
428 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
429 assert(m->vmp_wire_count == 1);
430 m->vmp_cleaning = FALSE;
431 m->vmp_free_when_done = FALSE;
432 /*
433 * Revoke all access to the page. Since the object is
434 * locked, and the page is busy, this prevents the page
435 * from being dirtied after the pmap_disconnect() call
436 * returns.
437 *
438 * Since the page is left "dirty" but "not modifed", we
439 * can detect whether the page was redirtied during
440 * pageout by checking the modify state.
441 */
442 if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
443 SET_PAGE_DIRTY(m, FALSE);
444 } else {
445 m->vmp_dirty = FALSE;
446 }
447
448 if (m->vmp_dirty) {
449 vm_page_unwire(m, TRUE); /* reactivates */
450 counter_inc(&vm_statistics_reactivations);
451 PAGE_WAKEUP_DONE(m);
452 } else {
453 vm_page_free(m); /* clears busy, etc. */
454 }
455 vm_page_unlock_queues();
456 continue;
457 }
458 /*
459 * Handle the "adjacent" pages. These pages were cleaned in
460 * place, and should be left alone.
461 * If prep_pin_count is nonzero, then someone is using the
462 * page, so make it active.
463 */
464 if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
465 if (m->vmp_reference) {
466 vm_page_activate(m);
467 } else {
468 vm_page_deactivate(m);
469 }
470 }
471 if (m->vmp_overwriting) {
472 /*
473 * the (COPY_OUT_FROM == FALSE) request_page_list case
474 */
475 if (m->vmp_busy) {
476 /*
477 * We do not re-set m->vmp_dirty !
478 * The page was busy so no extraneous activity
479 * could have occurred. COPY_INTO is a read into the
480 * new pages. CLEAN_IN_PLACE does actually write
481 * out the pages but handling outside of this code
482 * will take care of resetting dirty. We clear the
483 * modify however for the Programmed I/O case.
484 */
485 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
486
487 m->vmp_busy = FALSE;
488 m->vmp_absent = FALSE;
489 } else {
490 /*
491 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
492 * Occurs when the original page was wired
493 * at the time of the list request
494 */
495 assert(VM_PAGE_WIRED(m));
496 vm_page_unwire(m, TRUE); /* reactivates */
497 }
498 m->vmp_overwriting = FALSE;
499 } else {
500 m->vmp_dirty = FALSE;
501 }
502 m->vmp_cleaning = FALSE;
503
504 /*
505 * Wakeup any thread waiting for the page to be un-cleaning.
506 */
507 PAGE_WAKEUP(m);
508 vm_page_unlock_queues();
509 }
510 /*
511 * Account for the paging reference taken in vm_paging_object_allocate.
512 */
513 vm_object_activity_end(shadow_object);
514 vm_object_unlock(shadow_object);
515
516 assert(object->ref_count == 0);
517 assert(object->paging_in_progress == 0);
518 assert(object->activity_in_progress == 0);
519 assert(object->resident_page_count == 0);
520 return;
521 }
522
523 /*
524 * Routine: vm_pageclean_setup
525 *
526 * Purpose: setup a page to be cleaned (made non-dirty), but not
527 * necessarily flushed from the VM page cache.
528 * This is accomplished by cleaning in place.
529 *
530 * The page must not be busy, and new_object
531 * must be locked.
532 *
533 */
534 static void
vm_pageclean_setup(vm_page_t m,vm_page_t new_m,vm_object_t new_object,vm_object_offset_t new_offset)535 vm_pageclean_setup(
536 vm_page_t m,
537 vm_page_t new_m,
538 vm_object_t new_object,
539 vm_object_offset_t new_offset)
540 {
541 assert(!m->vmp_busy);
542 #if 0
543 assert(!m->vmp_cleaning);
544 #endif
545
546 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
547
548 /*
549 * Mark original page as cleaning in place.
550 */
551 m->vmp_cleaning = TRUE;
552 SET_PAGE_DIRTY(m, FALSE);
553 m->vmp_precious = FALSE;
554
555 /*
556 * Convert the fictitious page to a private shadow of
557 * the real page.
558 */
559 assert(new_m->vmp_fictitious);
560 assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
561 new_m->vmp_fictitious = FALSE;
562 new_m->vmp_private = TRUE;
563 new_m->vmp_free_when_done = TRUE;
564 VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
565
566 vm_page_lockspin_queues();
567 vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
568 vm_page_unlock_queues();
569
570 vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
571 assert(!new_m->vmp_wanted);
572 new_m->vmp_busy = FALSE;
573 }
574
575 /*
576 * Routine: vm_pageout_initialize_page
577 * Purpose:
578 * Causes the specified page to be initialized in
579 * the appropriate memory object. This routine is used to push
580 * pages into a copy-object when they are modified in the
581 * permanent object.
582 *
583 * The page is moved to a temporary object and paged out.
584 *
585 * In/out conditions:
586 * The page in question must not be on any pageout queues.
587 * The object to which it belongs must be locked.
588 * The page must be busy, but not hold a paging reference.
589 *
590 * Implementation:
591 * Move this page to a completely new object.
592 */
593 void
vm_pageout_initialize_page(vm_page_t m)594 vm_pageout_initialize_page(
595 vm_page_t m)
596 {
597 vm_object_t object;
598 vm_object_offset_t paging_offset;
599 memory_object_t pager;
600
601 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
602
603 object = VM_PAGE_OBJECT(m);
604
605 assert(m->vmp_busy);
606 assert(object->internal);
607
608 /*
609 * Verify that we really want to clean this page
610 */
611 assert(!m->vmp_absent);
612 assert(m->vmp_dirty);
613
614 /*
615 * Create a paging reference to let us play with the object.
616 */
617 paging_offset = m->vmp_offset + object->paging_offset;
618
619 if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
620 panic("reservation without pageout?"); /* alan */
621
622 VM_PAGE_FREE(m);
623 vm_object_unlock(object);
624
625 return;
626 }
627
628 /*
629 * If there's no pager, then we can't clean the page. This should
630 * never happen since this should be a copy object and therefore not
631 * an external object, so the pager should always be there.
632 */
633
634 pager = object->pager;
635
636 if (pager == MEMORY_OBJECT_NULL) {
637 panic("missing pager for copy object");
638
639 VM_PAGE_FREE(m);
640 return;
641 }
642
643 /*
644 * set the page for future call to vm_fault_list_request
645 */
646 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
647 SET_PAGE_DIRTY(m, FALSE);
648
649 /*
650 * keep the object from collapsing or terminating
651 */
652 vm_object_paging_begin(object);
653 vm_object_unlock(object);
654
655 /*
656 * Write the data to its pager.
657 * Note that the data is passed by naming the new object,
658 * not a virtual address; the pager interface has been
659 * manipulated to use the "internal memory" data type.
660 * [The object reference from its allocation is donated
661 * to the eventual recipient.]
662 */
663 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
664
665 vm_object_lock(object);
666 vm_object_paging_end(object);
667 }
668
669
670 /*
671 * vm_pageout_cluster:
672 *
673 * Given a page, queue it to the appropriate I/O thread,
674 * which will page it out and attempt to clean adjacent pages
675 * in the same operation.
676 *
677 * The object and queues must be locked. We will take a
678 * paging reference to prevent deallocation or collapse when we
679 * release the object lock back at the call site. The I/O thread
680 * is responsible for consuming this reference
681 *
682 * The page must not be on any pageout queue.
683 */
/*
 * Bookkeeping that exists only on DEVELOPMENT/DEBUG kernels.
 * NOTE(review): "vmct" presumably stands for the VM compressor threads
 * (vmct_state has one slot per possible compressor thread) - confirm
 * against the code that updates these.
 */
#if DEVELOPMENT || DEBUG
vmct_stats_t vmct_stats;

int32_t vmct_active = 0;
uint64_t vm_compressor_epoch_start = 0;
uint64_t vm_compressor_epoch_stop = 0;

/* State values recorded in vmct_state[], one entry per thread. */
typedef enum vmct_state_t {
	VMCT_IDLE,
	VMCT_AWAKENED,
	VMCT_ACTIVE,
} vmct_state_t;
vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
#endif
698
699
700
701 static void
vm_pageout_cluster_to_queue(vm_page_t m,struct vm_pageout_queue * q)702 vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
703 {
704 vm_object_t object = VM_PAGE_OBJECT(m);
705
706 VM_PAGE_CHECK(m);
707 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
708 vm_object_lock_assert_exclusive(object);
709
710 /*
711 * Make sure it's OK to page this out.
712 */
713 assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
714 assert(!m->vmp_cleaning && !m->vmp_laundry);
715 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
716
717 /*
718 * protect the object from collapse or termination
719 */
720 vm_object_activity_begin(object);
721
722
723 /*
724 * pgo_laundry count is tied to the laundry bit
725 */
726 m->vmp_laundry = TRUE;
727 q->pgo_laundry++;
728
729 m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
730 vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
731
732 // the benchmark queue will be woken up independently by the benchmark itself
733 if (
734 object->internal == TRUE
735 #if DEVELOPMENT || DEBUG
736 && q != &vm_pageout_queue_benchmark
737 #endif
738 ) {
739 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
740 m->vmp_busy = TRUE;
741 // Wake up the first compressor thread. It will wake subsequent threads if necessary.
742 sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup, pgo_iothread_internal_state[0].pgo_iothread);
743 } else {
744 sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
745 }
746 VM_PAGE_CHECK(m);
747 }
748
749 void
vm_pageout_cluster(vm_page_t m)750 vm_pageout_cluster(vm_page_t m)
751 {
752 struct vm_pageout_queue *q;
753 vm_object_t object = VM_PAGE_OBJECT(m);
754 if (object->internal) {
755 q = &vm_pageout_queue_internal;
756 } else {
757 q = &vm_pageout_queue_external;
758 }
759 vm_pageout_cluster_to_queue(m, q);
760 }
761
762
763 /*
764 * A page is back from laundry or we are stealing it back from
765 * the laundering state. See if there are some pages waiting to
766 * go to laundry and if we can let some of them go now.
767 *
768 * Object and page queues must be locked.
769 */
770 void
vm_pageout_throttle_up(vm_page_t m)771 vm_pageout_throttle_up(
772 vm_page_t m)
773 {
774 struct vm_pageout_queue *q;
775 vm_object_t m_object;
776
777 m_object = VM_PAGE_OBJECT(m);
778
779 assert(m_object != VM_OBJECT_NULL);
780 assert(m_object != kernel_object);
781
782 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
783 vm_object_lock_assert_exclusive(m_object);
784
785 if (m_object->internal == TRUE) {
786 q = &vm_pageout_queue_internal;
787 } else {
788 q = &vm_pageout_queue_external;
789 }
790
791 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
792 vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
793 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
794
795 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
796
797 vm_object_activity_end(m_object);
798
799 VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
800 }
801 if (m->vmp_laundry == TRUE) {
802 m->vmp_laundry = FALSE;
803 q->pgo_laundry--;
804
805 if (q->pgo_throttled == TRUE) {
806 q->pgo_throttled = FALSE;
807 thread_wakeup((event_t) &q->pgo_laundry);
808 }
809 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
810 q->pgo_draining = FALSE;
811 thread_wakeup((event_t) (&q->pgo_laundry + 1));
812 }
813 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
814 }
815 }
816
817
818 static void
vm_pageout_throttle_up_batch(struct vm_pageout_queue * q,int batch_cnt)819 vm_pageout_throttle_up_batch(
820 struct vm_pageout_queue *q,
821 int batch_cnt)
822 {
823 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
824
825 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
826
827 q->pgo_laundry -= batch_cnt;
828
829 if (q->pgo_throttled == TRUE) {
830 q->pgo_throttled = FALSE;
831 thread_wakeup((event_t) &q->pgo_laundry);
832 }
833 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
834 q->pgo_draining = FALSE;
835 thread_wakeup((event_t) (&q->pgo_laundry + 1));
836 }
837 }
838
839
840
841 /*
842 * VM memory pressure monitoring.
843 *
 * vm_pageout_scan() keeps track of the number of pages it considers and
 * reclaims, in the currently active vm_pageout_stats[vm_pageout_stat_now].
 *
 * compute_memory_pressure() is called every second from compute_averages()
 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
 * of reclaimed pages in a new vm_pageout_stats[] bucket.
850 *
851 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
852 * The caller provides the number of seconds ("nsecs") worth of statistics
853 * it wants, up to 30 seconds.
854 * It computes the number of pages reclaimed in the past "nsecs" seconds and
855 * also returns the number of pages the system still needs to reclaim at this
856 * moment in time.
857 */
/*
 * Number of buckets in the vm_pageout_stats[] ring.  mach_vm_pressure_monitor()
 * walks 8 buckets per second requested, so this is 30 seconds (DEVELOPMENT or
 * DEBUG kernels) or 1 second of history, plus one slot for the bucket that is
 * currently being filled.
 *
 * The expansion is fully parenthesized so that the macro behaves as a single
 * value inside any expression (e.g. "2 * VM_PAGEOUT_STAT_SIZE" or
 * "VM_PAGEOUT_STAT_SIZE % 8").
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif

/* One bucket of pageout statistics; vm_pageout_stats[] is a ring of them. */
struct vm_pageout_stat {
	/* page counts */
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	/* event counters */
	unsigned int pages_grabbed;
	unsigned int pages_freed;

	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	/* the four freed_* counters are what the pressure monitor sums up */
	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;

	unsigned int vm_page_realtime_count;
	unsigned int forcereclaimed_sharedcache;
	unsigned int forcereclaimed_realtime;
	unsigned int protected_sharedcache;
	unsigned int protected_realtime;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];

/* index of the bucket currently being filled */
unsigned int vm_pageout_stat_now = 0;

/* previous / next ring index, wrapping at the ends of the array */
#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
928
929 #if VM_PAGE_BUCKETS_CHECK
930 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
931 #endif /* VM_PAGE_BUCKETS_CHECK */
932
933
934 void
935 record_memory_pressure(void);
936 void
record_memory_pressure(void)937 record_memory_pressure(void)
938 {
939 unsigned int vm_pageout_next;
940
941 #if VM_PAGE_BUCKETS_CHECK
942 /* check the consistency of VM page buckets at regular interval */
943 static int counter = 0;
944 if ((++counter % vm_page_buckets_check_interval) == 0) {
945 vm_page_buckets_check();
946 }
947 #endif /* VM_PAGE_BUCKETS_CHECK */
948
949 vm_pageout_state.vm_memory_pressure =
950 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
951 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
952 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
953 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
954
955 commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
956
957 /* move "now" forward */
958 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
959
960 bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
961
962 vm_pageout_stat_now = vm_pageout_next;
963 }
964
965
966 /*
967 * IMPORTANT
968 * mach_vm_ctl_page_free_wanted() is called indirectly, via
969 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
970 * it must be safe in the restricted stackshot context. Locks and/or
971 * blocking are not allowable.
972 */
973 unsigned int
mach_vm_ctl_page_free_wanted(void)974 mach_vm_ctl_page_free_wanted(void)
975 {
976 unsigned int page_free_target, page_free_count, page_free_wanted;
977
978 page_free_target = vm_page_free_target;
979 page_free_count = vm_page_free_count;
980 if (page_free_target > page_free_count) {
981 page_free_wanted = page_free_target - page_free_count;
982 } else {
983 page_free_wanted = 0;
984 }
985
986 return page_free_wanted;
987 }
988
989
990 /*
991 * IMPORTANT:
992 * mach_vm_pressure_monitor() is called when taking a stackshot, with
993 * wait_for_pressure FALSE, so that code path must remain safe in the
994 * restricted stackshot context. No blocking or locks are allowable.
995 * on that code path.
996 */
997
998 kern_return_t
mach_vm_pressure_monitor(boolean_t wait_for_pressure,unsigned int nsecs_monitored,unsigned int * pages_reclaimed_p,unsigned int * pages_wanted_p)999 mach_vm_pressure_monitor(
1000 boolean_t wait_for_pressure,
1001 unsigned int nsecs_monitored,
1002 unsigned int *pages_reclaimed_p,
1003 unsigned int *pages_wanted_p)
1004 {
1005 wait_result_t wr;
1006 unsigned int vm_pageout_then, vm_pageout_now;
1007 unsigned int pages_reclaimed;
1008 unsigned int units_of_monitor;
1009
1010 units_of_monitor = 8 * nsecs_monitored;
1011 /*
1012 * We don't take the vm_page_queue_lock here because we don't want
1013 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1014 * thread when it's trying to reclaim memory. We don't need fully
1015 * accurate monitoring anyway...
1016 */
1017
1018 if (wait_for_pressure) {
1019 /* wait until there's memory pressure */
1020 while (vm_page_free_count >= vm_page_free_target) {
1021 wr = assert_wait((event_t) &vm_page_free_wanted,
1022 THREAD_INTERRUPTIBLE);
1023 if (wr == THREAD_WAITING) {
1024 wr = thread_block(THREAD_CONTINUE_NULL);
1025 }
1026 if (wr == THREAD_INTERRUPTED) {
1027 return KERN_ABORTED;
1028 }
1029 if (wr == THREAD_AWAKENED) {
1030 /*
1031 * The memory pressure might have already
1032 * been relieved but let's not block again
1033 * and let's report that there was memory
1034 * pressure at some point.
1035 */
1036 break;
1037 }
1038 }
1039 }
1040
1041 /* provide the number of pages the system wants to reclaim */
1042 if (pages_wanted_p != NULL) {
1043 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1044 }
1045
1046 if (pages_reclaimed_p == NULL) {
1047 return KERN_SUCCESS;
1048 }
1049
1050 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1051 vm_pageout_now = vm_pageout_stat_now;
1052 pages_reclaimed = 0;
1053 for (vm_pageout_then =
1054 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1055 vm_pageout_then != vm_pageout_now &&
1056 units_of_monitor-- != 0;
1057 vm_pageout_then =
1058 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1059 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1060 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1061 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1062 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1063 }
1064 *pages_reclaimed_p = pages_reclaimed;
1065
1066 return KERN_SUCCESS;
1067 }
1068
1069
1070
1071 #if DEVELOPMENT || DEBUG
1072
static void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);

/*
 * flag (nobody waits on it, so it is not really a condition
 * variable) used to make sure there is only a single sweep going
 * on at a time; vm_pageout_disconnect_all_pages() tests and sets
 * it while holding the page queues lock
 */
boolean_t       vm_pageout_disconnect_all_pages_active = FALSE;
1081
1082
1083 void
vm_pageout_disconnect_all_pages()1084 vm_pageout_disconnect_all_pages()
1085 {
1086 vm_page_lock_queues();
1087
1088 if (vm_pageout_disconnect_all_pages_active == TRUE) {
1089 vm_page_unlock_queues();
1090 return;
1091 }
1092 vm_pageout_disconnect_all_pages_active = TRUE;
1093 vm_page_unlock_queues();
1094
1095 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1096 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1097 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);
1098
1099 vm_pageout_disconnect_all_pages_active = FALSE;
1100 }
1101
1102
/*
 * vm_pageout_disconnect_all_pages_in_queue:
 *
 * Visit up to 'qcount' pages of page queue 'q' and call pmap_disconnect()
 * on each one that is marked pmapped and is not in a transient state
 * (cleaning, laundry, busy, absent, in error, free-when-done, or owned
 * by an object that is no longer alive).  Every visited page is taken
 * from the front of 'q' and re-entered with vm_page_queue_enter(), so
 * queue membership is unchanged.
 *
 * The page queues lock is taken here and held for the walk; it is
 * dropped while pausing after a failed object lock attempt and yielded
 * once more than 128 pages have been handled since the last drop.
 *
 * Start and end kdebug events are emitted; the end event carries the
 * disconnected, object-locked and paused counts.
 */
void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t       m;
	vm_object_t     t_object = NULL;        /* object whose lock we last failed to get */
	vm_object_t     l_object = NULL;        /* object we currently hold locked, if any */
	vm_object_t     m_object = NULL;        /* object owning the page being examined */
	int             delayed_unlock = 0;     /* pages handled since the queues lock was last dropped */
	int             try_failed_count = 0;   /* failed lock attempts on the current page's object */
	int             disconnected_count = 0; /* pmap_disconnect() calls made (traced) */
	int             paused_count = 0;       /* mutex_pause() calls made (traced) */
	int             object_locked_count = 0; /* object locks acquired (traced) */

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
	    q, qcount, 0, 0, 0);

	vm_page_lock_queues();

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				/* new object: start counting failed attempts afresh */
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					/* give up on this page and move on to the next one */
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				paused_count++;

				/*
				 * retry from the top: the queue may have changed
				 * while the page queues lock was dropped
				 */
				t_object = m_object;
				continue;
			}
			object_locked_count++;

			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

			disconnected_count++;
		}
reenter_pg_on_q:
		/*
		 * NOTE(review): presumably vm_page_queue_enter() re-inserts at
		 * the far end so the next vm_page_queue_first() yields a
		 * different page -- confirm against the queue macros.
		 */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);

		qcount--;
		try_failed_count = 0;

		if (delayed_unlock++ > 128) {
			/* let others at the page queues lock; don't hold an object lock across the yield */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
	    q, disconnected_count, object_locked_count, paused_count, 0);
}
1207
1208 extern char* proc_best_name(struct proc* proc);
1209
1210 int
vm_toggle_task_selfdonate_pages(task_t task)1211 vm_toggle_task_selfdonate_pages(task_t task)
1212 {
1213 int state = 0;
1214 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1215 printf("VM Donation mode is OFF on the system\n");
1216 return state;
1217 }
1218 if (task != kernel_task) {
1219 task_lock(task);
1220 if (!task->donates_own_pages) {
1221 printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1222 task->donates_own_pages = true;
1223 state = 1;
1224 } else if (task->donates_own_pages) {
1225 printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1226 task->donates_own_pages = false;
1227 state = 0;
1228 }
1229 task_unlock(task);
1230 }
1231 return state;
1232 }
1233 #endif /* DEVELOPMENT || DEBUG */
1234
1235 void
vm_task_set_selfdonate_pages(task_t task,bool donate)1236 vm_task_set_selfdonate_pages(task_t task, bool donate)
1237 {
1238 assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
1239 assert(task != kernel_task);
1240
1241 task_lock(task);
1242 task->donates_own_pages = donate;
1243 task_unlock(task);
1244 }
1245
1246
1247
static size_t
vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);

/*
 * flag (nobody waits on it, so it is not really a condition
 * variable) used to make sure there is only a single sweep going
 * on at a time; vm_pageout_anonymous_pages() reads and writes it
 * while holding the page queues lock
 */
boolean_t       vm_pageout_anonymous_pages_active = FALSE;
1256
1257
1258 void
vm_pageout_anonymous_pages()1259 vm_pageout_anonymous_pages()
1260 {
1261 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1262 vm_page_lock_queues();
1263
1264 if (vm_pageout_anonymous_pages_active == TRUE) {
1265 vm_page_unlock_queues();
1266 return;
1267 }
1268 vm_pageout_anonymous_pages_active = TRUE;
1269 vm_page_unlock_queues();
1270
1271 vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1272 vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1273 vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1274
1275 if (VM_CONFIG_SWAP_IS_PRESENT) {
1276 vm_consider_swapping();
1277 }
1278
1279 vm_page_lock_queues();
1280 vm_pageout_anonymous_pages_active = FALSE;
1281 vm_page_unlock_queues();
1282 }
1283 }
1284
1285
/*
 * vm_pageout_page_queue:
 *
 * Visit up to 'qcount' pages of page queue 'q' and hand the eligible
 * pages of internal objects to a pageout queue through
 * vm_pageout_cluster_to_queue(): vm_pageout_queue_internal normally, or
 * vm_pageout_queue_benchmark when 'perf_test' is set on a
 * DEVELOPMENT/DEBUG kernel.
 *
 * Per page:
 *  - pages of non-internal or dead objects, and pages in a transient
 *    state, are re-entered on 'q' untouched;
 *  - referenced pages get their reference cleared and are re-entered;
 *  - pages that end up neither dirty nor precious are freed;
 *  - the rest are removed from their queue and sent to the pageout queue.
 *
 * Returns the number of pages handed to the pageout queue.
 *
 * The page queues lock is taken here.  It is dropped temporarily while
 * waiting for a throttled pageout queue, pausing after a failed object
 * lock attempt, freeing a page, or setting up a compressor pager, and is
 * yielded once more than 128 pages have been handled since the last drop.
 */
size_t
vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
{
	vm_page_t       m;
	vm_object_t     t_object = NULL;        /* object whose lock we last failed to get */
	vm_object_t     l_object = NULL;        /* object we currently hold locked, if any */
	vm_object_t     m_object = NULL;        /* object owning the page being examined */
	int             delayed_unlock = 0;     /* pages handled since the queues lock was last dropped */
	int             try_failed_count = 0;   /* failed lock attempts on the current page's object */
	int             refmod_state;
	int             pmap_options;
	struct          vm_pageout_queue *iq;   /* destination pageout queue */
	ppnum_t         phys_page;
	size_t          pages_moved = 0;        /* return value */


	iq = &vm_pageout_queue_internal;

	vm_page_lock_queues();

#if DEVELOPMENT || DEBUG
	if (perf_test) {
		iq = &vm_pageout_queue_benchmark;
		// ensure the benchmark queue isn't throttled
		iq->pgo_maxlaundry = (unsigned int) qcount;
	}
#endif /* DEVELOPMENT || DEBUG */

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		if (VM_PAGE_Q_THROTTLED(iq)) {
			/*
			 * destination queue is throttled: drop our locks and
			 * sleep on the queue's drain event.  The wait result
			 * is ignored; the loop simply re-checks the throttle.
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			iq->pgo_draining = TRUE;

			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
			vm_page_unlock_queues();

			thread_block(THREAD_CONTINUE_NULL);

			vm_page_lock_queues();
			delayed_unlock = 0;
			continue;
		}
		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			if (!m_object->internal) {
				/* only pages of internal objects are of interest */
				goto reenter_pg_on_q;
			}

			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				/* new object: start counting failed attempts afresh */
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					/* give up on this page and move on to the next one */
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				t_object = m_object;
				continue;
			}
			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(m);

		/* pull the pmap's reference/modify bits into the page if needed */
		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(phys_page);

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}
		if (m->vmp_reference == TRUE) {
			/* referenced: clear the reference and leave the page on 'q' */
			m->vmp_reference = FALSE;
			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			if (m->vmp_dirty || m->vmp_precious) {
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		if (!m->vmp_dirty && !m->vmp_precious) {
			/* nothing worth saving: free the page outright */
			vm_page_unlock_queues();
			VM_PAGE_FREE(m);
			vm_page_lock_queues();
			delayed_unlock = 0;

			goto next_pg;
		}
		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
			if (!m_object->pager_initialized) {
				vm_page_unlock_queues();

				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);

				if (!m_object->pager_initialized) {
					vm_object_compressor_pager_create(m_object);
				}

				vm_page_lock_queues();
				delayed_unlock = 0;
			}
			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
				/* still no usable pager: skip this page */
				goto reenter_pg_on_q;
			}
			/*
			 * vm_object_compressor_pager_create will drop the object lock
			 * which means 'm' may no longer be valid to use
			 */
			continue;
		}

		if (!perf_test) {
			/*
			 * we've already factored out pages in the laundry which
			 * means this page can't be on the pageout queue so it's
			 * safe to do the vm_page_queues_remove
			 */
			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			vm_page_queues_remove(m, TRUE);
			if (donate) {
				/*
				 * The compressor needs to see this bit to know
				 * where this page needs to land. Also if stolen,
				 * this bit helps put the page back in the right
				 * special queue where it belongs.
				 */
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
			}
		} else {
			vm_page_queue_remove(q, m, vmp_pageq);
		}

		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		vm_pageout_cluster_to_queue(m, iq);

		pages_moved++;
		goto next_pg;

reenter_pg_on_q:
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);
next_pg:
		qcount--;
		try_failed_count = 0;

		if (delayed_unlock++ > 128) {
			/* let others at the page queues lock; don't hold an object lock across the yield */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();
	return pages_moved;
}
1498
1499
1500
1501 /*
1502 * function in BSD to apply I/O throttle to the pageout thread
1503 */
1504 extern void vm_pageout_io_throttle(void);
1505
/*
 * VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj):
 * 'obj' must be the VM object owning page 'm' (asserted).  If the page
 * is marked reusable, or the whole object is "all_reusable", call
 * vm_object_reuse_pages() on the one-page range covering 'm'.
 */
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
	MACRO_BEGIN                                                     \
	/* \
	 * If a "reusable" page somehow made it back into \
	 * the active queue, it's been re-used and is not \
	 * quite re-usable. \
	 * If the VM object was "all_reusable", consider it \
	 * as "all re-used" instead of converting it to \
	 * "partially re-used", which could be expensive. \
	 */                                                             \
	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
	if ((m)->vmp_reusable ||                                        \
	    (obj)->all_reusable) {                                      \
	        vm_object_reuse_pages((obj),                            \
	                              (m)->vmp_offset,                  \
	                              (m)->vmp_offset + PAGE_SIZE_64,   \
	                              FALSE);                           \
	}                                                               \
	MACRO_END
1525
1526
/*
 * NOTE(review): presumably bounds on how many pages get processed
 * before the page queues lock is dropped; their use is not visible
 * in this part of the file -- confirm at the point of use.
 */
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT  64
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024

/* NOTE(review): presumably the values taken by flow_control.state -- confirm */
#define FCS_IDLE                0
#define FCS_DELAYED             1
#define FCS_DEADLOCK_DETECTED   2

/* a state value paired with a timestamp */
struct flow_control {
	int             state;
	mach_timespec_t ts;
};
1538
1539
/*
 * Event counters; they are updated outside the part of the file
 * visible here.
 */
uint64_t vm_pageout_rejected_bq_internal = 0;
uint64_t vm_pageout_rejected_bq_external = 0;
uint64_t vm_pageout_skipped_bq_internal = 0;
uint64_t vm_pageout_skipped_bq_external = 0;

#define ANONS_GRABBED_LIMIT     2


#if 0
/* prototype of the compiled-out vm_pageout_delayed_unlock() */
static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
#endif
static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);

/* values for the 'action' argument of vm_pageout_prepare_to_block() */
#define VM_PAGEOUT_PB_NO_ACTION                         0       /* just drop and retake the locks */
#define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1       /* call vm_consider_waking_compactor_swapper() */
#define VM_PAGEOUT_PB_THREAD_YIELD                      2       /* call thread_yield_internal(1) */
1557
#if 0
/*
 * NOTE: compiled out (#if 0).
 *
 * vm_pageout_delayed_unlock:
 * If there are locally batched free pages, drop the page queues lock,
 * free the batch with vm_page_free_list() (bracketed by freelist debug
 * events), reset the batch, and retake the lock; otherwise just yield
 * the page queues lock.  Either way '*delayed_unlock' is reset to 1.
 */
static void
vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
{
	if (*local_freeq) {
		vm_page_unlock_queues();

		VM_DEBUG_CONSTANT_EVENT(
			vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
			vm_page_free_count, 0, 0, 1);

		vm_page_free_list(*local_freeq, TRUE);

		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
		    vm_page_free_count, *local_freed, 0, 1);

		*local_freeq = NULL;
		*local_freed = 0;

		vm_page_lock_queues();
	} else {
		lck_mtx_yield(&vm_page_queue_lock);
	}
	*delayed_unlock = 1;
}
#endif
1584
1585
1586 static void
vm_pageout_prepare_to_block(vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int action)1587 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1588 vm_page_t *local_freeq, int *local_freed, int action)
1589 {
1590 vm_page_unlock_queues();
1591
1592 if (*object != NULL) {
1593 vm_object_unlock(*object);
1594 *object = NULL;
1595 }
1596 if (*local_freeq) {
1597 vm_page_free_list(*local_freeq, TRUE);
1598
1599 *local_freeq = NULL;
1600 *local_freed = 0;
1601 }
1602 *delayed_unlock = 1;
1603
1604 switch (action) {
1605 case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1606 vm_consider_waking_compactor_swapper();
1607 break;
1608 case VM_PAGEOUT_PB_THREAD_YIELD:
1609 thread_yield_internal(1);
1610 break;
1611 case VM_PAGEOUT_PB_NO_ACTION:
1612 default:
1613 break;
1614 }
1615 vm_page_lock_queues();
1616 }
1617
1618
/*
 * Values of the vm_pageout_vminfo counters as read by the previous
 * update_vm_info() call; update_vm_info() subtracts them from the
 * current values to obtain per-call deltas.
 */
static struct vm_pageout_vminfo last;

/* vm_page_grab_count as read by the previous update_vm_info() call */
uint64_t last_vm_page_pages_grabbed = 0;

extern uint32_t c_segment_pages_compressed;

extern uint64_t shared_region_pager_reclaimed;
extern struct memory_object_pager_ops shared_region_pager_ops;
1627
1628 void
update_vm_info(void)1629 update_vm_info(void)
1630 {
1631 unsigned long tmp;
1632 uint64_t tmp64;
1633
1634 vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1635 vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1636 vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1637 vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1638
1639 vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1640 vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1641 vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1642
1643 vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1644 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1645 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1646 vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1647 vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1648
1649 tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1650 vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1651 last.vm_pageout_considered_page = tmp;
1652
1653 tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1654 vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1655 last.vm_pageout_compressions = tmp64;
1656
1657 tmp = vm_pageout_vminfo.vm_compressor_failed;
1658 vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1659 last.vm_compressor_failed = tmp;
1660
1661 tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1662 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1663 last.vm_compressor_pages_grabbed = tmp64;
1664
1665 tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1666 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1667 last.vm_phantom_cache_found_ghost = tmp;
1668
1669 tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1670 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1671 last.vm_phantom_cache_added_ghost = tmp;
1672
1673 tmp64 = counter_load(&vm_page_grab_count);
1674 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1675 last_vm_page_pages_grabbed = tmp64;
1676
1677 tmp = vm_pageout_vminfo.vm_page_pages_freed;
1678 vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1679 last.vm_page_pages_freed = tmp;
1680
1681 if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1682 tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1683 vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1684 last.vm_pageout_pages_evicted = tmp;
1685
1686 tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1687 vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1688 last.vm_pageout_pages_purged = tmp;
1689
1690 tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1691 vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1692 last.vm_pageout_freed_speculative = tmp;
1693
1694 tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1695 vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1696 last.vm_pageout_freed_external = tmp;
1697
1698 tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1699 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1700 last.vm_pageout_inactive_referenced = tmp;
1701
1702 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1703 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1704 last.vm_pageout_scan_inactive_throttled_external = tmp;
1705
1706 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1707 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1708 last.vm_pageout_inactive_dirty_external = tmp;
1709
1710 tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1711 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1712 last.vm_pageout_freed_cleaned = tmp;
1713
1714 tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1715 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1716 last.vm_pageout_inactive_nolock = tmp;
1717
1718 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1719 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1720 last.vm_pageout_scan_inactive_throttled_internal = tmp;
1721
1722 tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1723 vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1724 last.vm_pageout_skipped_external = tmp;
1725
1726 tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1727 vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1728 last.vm_pageout_skipped_internal = tmp;
1729
1730 tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1731 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1732 last.vm_pageout_reactivation_limit_exceeded = tmp;
1733
1734 tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1735 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1736 last.vm_pageout_inactive_force_reclaim = tmp;
1737
1738 tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1739 vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1740 last.vm_pageout_freed_internal = tmp;
1741
1742 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1743 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1744 last.vm_pageout_considered_bq_internal = tmp;
1745
1746 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1747 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1748 last.vm_pageout_considered_bq_external = tmp;
1749
1750 tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1751 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1752 last.vm_pageout_filecache_min_reactivated = tmp;
1753
1754 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1755 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1756 last.vm_pageout_inactive_dirty_internal = tmp;
1757
1758 tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1759 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1760 last.vm_pageout_forcereclaimed_sharedcache = tmp;
1761
1762 tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1763 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1764 last.vm_pageout_forcereclaimed_realtime = tmp;
1765
1766 tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1767 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1768 last.vm_pageout_protected_sharedcache = tmp;
1769
1770 tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1771 vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1772 last.vm_pageout_protected_realtime = tmp;
1773 }
1774
1775 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
1776 vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1777 vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1778 vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1779 vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
1780 0);
1781
1782 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
1783 vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1784 vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1785 vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
1786 0,
1787 0);
1788
1789 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
1790 vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1791 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1792 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1793 vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
1794 0);
1795
1796 if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1797 vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1798 vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1799 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
1800 vm_pageout_stats[vm_pageout_stat_now].considered,
1801 vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1802 vm_pageout_stats[vm_pageout_stat_now].freed_external,
1803 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
1804 0);
1805
1806 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
1807 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1808 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1809 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1810 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
1811 0);
1812
1813 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
1814 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1815 vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1816 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1817 vm_pageout_stats[vm_pageout_stat_now].skipped_external,
1818 0);
1819
1820 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
1821 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1822 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1823 vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1824 vm_pageout_stats[vm_pageout_stat_now].freed_internal,
1825 0);
1826
1827 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
1828 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1829 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1830 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1831 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
1832 0);
1833
1834 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO10)) | DBG_FUNC_NONE,
1835 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1836 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1837 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1838 vm_pageout_stats[vm_pageout_stat_now].protected_realtime,
1839 0);
1840 }
1841 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
1842 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1843 vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1844 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1845 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
1846 0);
1847
1848 record_memory_pressure();
1849 }
1850
1851 extern boolean_t hibernation_vmqueues_inspection;
1852
/*
 * Return values for functions called by vm_pageout_scan
 * that control its flow.
 *
 * PROCEED -- vm_pageout_scan will keep making forward progress.
 * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
 * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan, i.e. 'continue'.
 */

#define VM_PAGEOUT_SCAN_PROCEED                 (0)     /* carry on with the current pass */
#define VM_PAGEOUT_SCAN_DONE_RETURN             (1)     /* vm_pageout_scan should return */
#define VM_PAGEOUT_SCAN_NEXT_ITERATION          (2)     /* vm_pageout_scan should 'continue' */
1865
1866 /*
1867 * This function is called only from vm_pageout_scan and
1868 * it moves overflow secluded pages (one-at-a-time) to the
1869 * batched 'local' free Q or active Q.
1870 */
1871 static void
vps_deal_with_secluded_page_overflow(vm_page_t * local_freeq,int * local_freed)1872 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1873 {
1874 #if CONFIG_SECLUDED_MEMORY
1875 /*
1876 * Deal with secluded_q overflow.
1877 */
1878 if (vm_page_secluded_count > vm_page_secluded_target) {
1879 vm_page_t secluded_page;
1880
1881 /*
1882 * SECLUDED_AGING_BEFORE_ACTIVE:
1883 * Excess secluded pages go to the active queue and
1884 * will later go to the inactive queue.
1885 */
1886 assert((vm_page_secluded_count_free +
1887 vm_page_secluded_count_inuse) ==
1888 vm_page_secluded_count);
1889 secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1890 assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1891
1892 vm_page_queues_remove(secluded_page, FALSE);
1893 assert(!secluded_page->vmp_fictitious);
1894 assert(!VM_PAGE_WIRED(secluded_page));
1895
1896 if (secluded_page->vmp_object == 0) {
1897 /* transfer to free queue */
1898 assert(secluded_page->vmp_busy);
1899 secluded_page->vmp_snext = *local_freeq;
1900 *local_freeq = secluded_page;
1901 *local_freed += 1;
1902 } else {
1903 /* transfer to head of active queue */
1904 vm_page_enqueue_active(secluded_page, FALSE);
1905 secluded_page = VM_PAGE_NULL;
1906 }
1907 }
1908 #else /* CONFIG_SECLUDED_MEMORY */
1909
1910 #pragma unused(local_freeq)
1911 #pragma unused(local_freed)
1912
1913 return;
1914
1915 #endif /* CONFIG_SECLUDED_MEMORY */
1916 }
1917
1918 /*
1919 * This function is called only from vm_pageout_scan and
1920 * it initializes the loop targets for vm_pageout_scan().
1921 */
1922 static void
vps_init_page_targets(void)1923 vps_init_page_targets(void)
1924 {
1925 /*
1926 * LD TODO: Other page targets should be calculated here too.
1927 */
1928 vm_page_anonymous_min = vm_page_inactive_target / 20;
1929
1930 if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1931 vm_pageout_state.vm_page_speculative_percentage = 50;
1932 } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1933 vm_pageout_state.vm_page_speculative_percentage = 1;
1934 }
1935
1936 vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1937 vm_page_inactive_count);
1938 }
1939
1940 /*
1941 * This function is called only from vm_pageout_scan and
1942 * it purges a single VM object at-a-time and will either
1943 * make vm_pageout_scan() restart the loop or keeping moving forward.
1944 */
1945 static int
vps_purge_object()1946 vps_purge_object()
1947 {
1948 int force_purge;
1949
1950 assert(available_for_purge >= 0);
1951 force_purge = 0; /* no force-purging */
1952
1953 #if VM_PRESSURE_EVENTS
1954 vm_pressure_level_t pressure_level;
1955
1956 pressure_level = memorystatus_vm_pressure_level;
1957
1958 if (pressure_level > kVMPressureNormal) {
1959 if (pressure_level >= kVMPressureCritical) {
1960 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1961 } else if (pressure_level >= kVMPressureUrgent) {
1962 force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
1963 } else if (pressure_level >= kVMPressureWarning) {
1964 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1965 }
1966 }
1967 #endif /* VM_PRESSURE_EVENTS */
1968
1969 if (available_for_purge || force_purge) {
1970 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
1971
1972 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1973 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
1974 VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
1975 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1976 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1977
1978 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1979 }
1980 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
1981 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1982 }
1983
1984 return VM_PAGEOUT_SCAN_PROCEED;
1985 }
1986
/*
 * This function is called only from vm_pageout_scan and
 * it will try to age the next speculative Q if the oldest
 * one is empty.
 *
 * force_speculative_aging: when TRUE, the selected aging bin is aged
 * regardless of the speculative target and of the bin's timestamp.
 *
 * Returns VM_PAGEOUT_SCAN_NEXT_ITERATION when every aging bin that was
 * probed turned out to be empty, VM_PAGEOUT_SCAN_PROCEED otherwise.
 */
static int
vps_age_speculative_queue(boolean_t force_speculative_aging)
{
/*
 * Once a timestamp check has failed, the clock is not consulted again
 * until delay_speculative_age has counted up to this value (at which
 * point it wraps back to 0).
 */
#define DELAY_SPECULATIVE_AGE   1000

	/*
	 * try to pull pages from the aging bins...
	 * see vm_page.h for an explanation of how
	 * this mechanism works
	 */
	boolean_t                       can_steal = FALSE;
	int                             num_scanned_queues;
	static int                      delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop. */
	mach_timespec_t                 ts;
	struct vm_speculative_age_q     *aq;
	struct vm_speculative_age_q     *sq;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	aq = &vm_page_queue_speculative[speculative_steal_index];

	/*
	 * Advance speculative_steal_index (wrapping from MAX back to MIN)
	 * until a non-empty bin is found.  The counter is only incremented
	 * when the bin just examined was empty, so it reaches
	 * VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1 only if every probe found an
	 * empty bin.
	 */
	num_scanned_queues = 0;
	while (vm_page_queue_empty(&aq->age_q) &&
	    num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
		speculative_steal_index++;

		if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
		}

		aq = &vm_page_queue_speculative[speculative_steal_index];
	}

	if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
		/*
		 * XXX We've scanned all the speculative
		 * queues but still haven't found one
		 * that is not empty, even though
		 * vm_page_speculative_count is not 0.
		 */
		if (!vm_page_queue_empty(&sq->age_q)) {
			/* the aged queue does hold pages: let the caller retry */
			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
		}
#if DEVELOPMENT || DEBUG
		panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
#endif
		/* readjust... */
		vm_page_speculative_count = 0;
		/* ... and continue */
		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
	}

	if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
		can_steal = TRUE;
	} else {
		if (!delay_speculative_age) {
			mach_timespec_t ts_fully_aged;

			/*
			 * ts_fully_aged = the bin's age_ts plus
			 * VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms
			 * milliseconds, split into seconds and nanoseconds.
			 */
			ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
			ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
			    * 1000 * NSEC_PER_USEC;

			ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);

			clock_sec_t sec;
			clock_nsec_t nsec;
			clock_get_system_nanotime(&sec, &nsec);
			ts.tv_sec = (unsigned int) sec;
			ts.tv_nsec = nsec;

			if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
				/* "now" is at or past the fully-aged time */
				can_steal = TRUE;
			} else {
				/* not old enough yet: start the back-off counter */
				delay_speculative_age++;
			}
		} else {
			/* still backing off: count this call, re-arm the check at the limit */
			delay_speculative_age++;
			if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
				delay_speculative_age = 0;
			}
		}
	}
	if (can_steal == TRUE) {
		vm_page_speculate_ageit(aq);
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
2080
2081 /*
2082 * This function is called only from vm_pageout_scan and
2083 * it evicts a single VM object from the cache.
2084 */
2085 static int inline
vps_object_cache_evict(vm_object_t * object_to_unlock)2086 vps_object_cache_evict(vm_object_t *object_to_unlock)
2087 {
2088 static int cache_evict_throttle = 0;
2089 struct vm_speculative_age_q *sq;
2090
2091 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2092
2093 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2094 int pages_evicted;
2095
2096 if (*object_to_unlock != NULL) {
2097 vm_object_unlock(*object_to_unlock);
2098 *object_to_unlock = NULL;
2099 }
2100 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
2101
2102 pages_evicted = vm_object_cache_evict(100, 10);
2103
2104 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
2105
2106 if (pages_evicted) {
2107 vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2108
2109 VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2110 vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2111 memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2112
2113 /*
2114 * we just freed up to 100 pages,
2115 * so go back to the top of the main loop
2116 * and re-evaulate the memory situation
2117 */
2118 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2119 } else {
2120 cache_evict_throttle = 1000;
2121 }
2122 }
2123 if (cache_evict_throttle) {
2124 cache_evict_throttle--;
2125 }
2126
2127 return VM_PAGEOUT_SCAN_PROCEED;
2128 }
2129
2130
2131 /*
2132 * This function is called only from vm_pageout_scan and
2133 * it calculates the filecache min. that needs to be maintained
2134 * as we start to steal pages.
2135 */
2136 static void
vps_calculate_filecache_min(void)2137 vps_calculate_filecache_min(void)
2138 {
2139 int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2140
2141 #if CONFIG_JETSAM
2142 /*
2143 * don't let the filecache_min fall below 15% of available memory
2144 * on systems with an active compressor that isn't nearing its
2145 * limits w/r to accepting new data
2146 *
2147 * on systems w/o the compressor/swapper, the filecache is always
2148 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2149 * since most (if not all) of the anonymous pages are in the
2150 * throttled queue (which isn't counted as available) which
2151 * effectively disables this filter
2152 */
2153 if (vm_compressor_low_on_space() || divisor == 0) {
2154 vm_pageout_state.vm_page_filecache_min = 0;
2155 } else {
2156 vm_pageout_state.vm_page_filecache_min =
2157 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2158 }
2159 #else
2160 if (vm_compressor_out_of_space() || divisor == 0) {
2161 vm_pageout_state.vm_page_filecache_min = 0;
2162 } else {
2163 /*
2164 * don't let the filecache_min fall below the specified critical level
2165 */
2166 vm_pageout_state.vm_page_filecache_min =
2167 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2168 }
2169 #endif
2170 if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2171 vm_pageout_state.vm_page_filecache_min = 0;
2172 }
2173 }
2174
2175 /*
2176 * This function is called only from vm_pageout_scan and
2177 * it updates the flow control time to detect if VM pageoutscan
2178 * isn't making progress.
2179 */
2180 static void
vps_flow_control_reset_deadlock_timer(struct flow_control * flow_control)2181 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2182 {
2183 mach_timespec_t ts;
2184 clock_sec_t sec;
2185 clock_nsec_t nsec;
2186
2187 ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2188 ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2189 clock_get_system_nanotime(&sec, &nsec);
2190 flow_control->ts.tv_sec = (unsigned int) sec;
2191 flow_control->ts.tv_nsec = nsec;
2192 ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2193
2194 flow_control->state = FCS_DELAYED;
2195
2196 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2197 }
2198
/*
 * This function is called only from vm_pageout_scan and
 * it is the flow control logic of VM pageout scan which
 * controls if it should block and for how long.
 * Any blocking of vm_pageout_scan happens ONLY in this function.
 *
 * Parameters:
 *   flow_control                 deadlock-detection state (FCS_IDLE, FCS_DELAYED,
 *                                FCS_DEADLOCK_DETECTED) and its deadline 'ts'.
 *   anons_grabbed                set to ANONS_GRABBED_LIMIT when throttling is
 *                                deferred in favour of stealing filecache pages.
 *   object, delayed_unlock,
 *   local_freeq, local_freed     forwarded to vm_pageout_prepare_to_block();
 *                                *local_freed is also added to the free count when
 *                                it is compared against the free target.
 *   vm_pageout_deadlock_target   written when a deadlock is declared and read
 *                                while in the FCS_DEADLOCK_DETECTED state.
 *   inactive_burst_count         compared against the burst throttle limit.
 *
 * Returns:
 *   VM_PAGEOUT_SCAN_PROCEED         no pause was needed, or the pause was skipped.
 *   VM_PAGEOUT_SCAN_DONE_RETURN     free target met and nobody is waiting for pages.
 *                                   NOTE: this return is taken after
 *                                   vm_free_page_lock() without a matching
 *                                   vm_free_page_unlock() in this function.
 *   VM_PAGEOUT_SCAN_NEXT_ITERATION  the thread blocked and has been resumed.
 */
static int
vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
    vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
{
	boolean_t       exceeded_burst_throttle = FALSE;
	unsigned int    msecs = 0;
	uint32_t        inactive_external_count;
	mach_timespec_t ts;
	struct  vm_pageout_queue *iq;
	struct  vm_pageout_queue *eq;
	struct  vm_speculative_age_q *sq;

	iq = &vm_pageout_queue_internal;
	/* NOTE(review): 'eq' is assigned here but not referenced again in this function. */
	eq = &vm_pageout_queue_external;
	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/*
	 * Sometimes we have to pause:
	 *	1) No inactive pages - nothing to do.
	 *	2) Loop control - no acceptable pages found on the inactive queue
	 *         within the last vm_pageout_burst_inactive_throttle iterations
	 *	3) Flow control - default pageout queue is full
	 */
	if (vm_page_queue_empty(&vm_page_queue_inactive) &&
	    vm_page_queue_empty(&vm_page_queue_anonymous) &&
	    vm_page_queue_empty(&vm_page_queue_cleaned) &&
	    vm_page_queue_empty(&sq->age_q)) {
		VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
		msecs = vm_pageout_state.vm_pageout_empty_wait;
	} else if (inactive_burst_count >=
	    MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
	    (vm_page_inactive_count +
	    vm_page_speculative_count))) {
		VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
		msecs = vm_pageout_state.vm_pageout_burst_wait;

		exceeded_burst_throttle = TRUE;
	} else if (VM_PAGE_Q_THROTTLED(iq) &&
	    VM_DYNAMIC_PAGING_ENABLED()) {
		clock_sec_t sec;
		clock_nsec_t nsec;

		switch (flow_control->state) {
		case FCS_IDLE:
			if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
			    vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
				/*
				 * since the compressor is running independently of vm_pageout_scan
				 * let's not wait for it just yet... as long as we have a healthy supply
				 * of filecache pages to work with, let's keep stealing those.
				 */
				inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;

				if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
				    (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
					*anons_grabbed = ANONS_GRABBED_LIMIT;
					VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
					return VM_PAGEOUT_SCAN_PROCEED;
				}
			}

			/* start timing the throttle: FCS_IDLE -> FCS_DELAYED */
			vps_flow_control_reset_deadlock_timer(flow_control);
			msecs = vm_pageout_state.vm_pageout_deadlock_wait;

			break;

		case FCS_DELAYED:
			clock_get_system_nanotime(&sec, &nsec);
			ts.tv_sec = (unsigned int) sec;
			ts.tv_nsec = nsec;

			if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
				/*
				 * the pageout thread for the default pager is potentially
				 * deadlocked since the
				 * default pager queue has been throttled for more than the
				 * allowable time... we need to move some clean pages or dirty
				 * pages belonging to the external pagers if they aren't throttled
				 * vm_page_free_wanted represents the number of threads currently
				 * blocked waiting for pages... we'll move one page for each of
				 * these plus a fixed amount to break the logjam... once we're done
				 * moving this number of pages, we'll re-enter the FCS_DELAYED state
				 * with a new timeout target since we have no way of knowing
				 * whether we've broken the deadlock except through observation
				 * of the queue associated with the default pager... we need to
				 * stop moving pages and allow the system to run to see what
				 * state it settles into.
				 */

				*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
				    vm_page_free_wanted + vm_page_free_wanted_privileged;
				VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
				flow_control->state = FCS_DEADLOCK_DETECTED;
				thread_wakeup(VM_PAGEOUT_GC_EVENT);
				return VM_PAGEOUT_SCAN_PROCEED;
			}
			/*
			 * just resniff instead of trying
			 * to compute a new delay time... we're going to be
			 * awakened immediately upon a laundry completion,
			 * so we won't wait any longer than necessary
			 */
			msecs = vm_pageout_state.vm_pageout_idle_wait;
			break;

		case FCS_DEADLOCK_DETECTED:
			/* keep scanning until the relief target has been worked off */
			if (*vm_pageout_deadlock_target) {
				return VM_PAGEOUT_SCAN_PROCEED;
			}

			/* relief target exhausted: go back to FCS_DELAYED with a fresh deadline */
			vps_flow_control_reset_deadlock_timer(flow_control);
			msecs = vm_pageout_state.vm_pageout_deadlock_wait;

			break;
		}
	} else {
		/*
		 * No need to pause...
		 */
		return VM_PAGEOUT_SCAN_PROCEED;
	}

	/* from here on we intend to pause for 'msecs' unless a check below bails out */
	vm_pageout_scan_wants_object = VM_OBJECT_NULL;

	vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
	    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);

	if (vm_page_free_count >= vm_page_free_target) {
		/*
		 * we're here because
		 *  1) someone else freed up some pages while we had
		 *     the queues unlocked above
		 * and we've hit one of the 3 conditions that
		 * cause us to pause the pageout scan thread
		 *
		 * since we already have enough free pages,
		 * let's avoid stalling and return normally
		 *
		 * before we return, make sure the pageout I/O threads
		 * are running throttled in case there are still requests
		 * in the laundry... since we have enough free pages
		 * we don't need the laundry to be cleaned in a timely
		 * fashion... so let's avoid interfering with foreground
		 * activity
		 *
		 * we don't want to hold vm_page_queue_free_lock when
		 * calling vm_pageout_adjust_eq_iothrottle (since it
		 * may cause other locks to be taken), we do the initial
		 * check outside of the lock.  Once we take the lock,
		 * we recheck the condition since it may have changed.
		 * if it has, no problem, we will make the threads
		 * non-throttled before actually blocking
		 */
		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
	}
	vm_free_page_lock();

	if (vm_page_free_count >= vm_page_free_target &&
	    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
		/* returns with the free-page lock still taken (no unlock on this path) */
		return VM_PAGEOUT_SCAN_DONE_RETURN;
	}
	vm_free_page_unlock();

	if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
		/*
		 * we're most likely about to block due to one of
		 * the 3 conditions that cause vm_pageout_scan to
		 * not be able to make forward progress w/r
		 * to providing new pages to the free queue,
		 * so unthrottle the I/O threads in case we
		 * have laundry to be cleaned... it needs
		 * to be completed ASAP.
		 *
		 * even if we don't block, we want the io threads
		 * running unthrottled since the sum of free +
		 * clean pages is still under our free target
		 */
		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
	}
	if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
		/*
		 * if we get here we're below our free target and
		 * we're stalling due to a full laundry queue or
		 * we don't have any inactive pages other than
		 * those in the clean queue...
		 * however, we have pages on the clean queue that
		 * can be moved to the free queue, so let's not
		 * stall the pageout scan
		 */
		flow_control->state = FCS_IDLE;
		return VM_PAGEOUT_SCAN_PROCEED;
	}
	if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
		/* the internal queue is no longer throttled: no reason to wait */
		flow_control->state = FCS_IDLE;
		return VM_PAGEOUT_SCAN_PROCEED;
	}

	VM_CHECK_MEMORYSTATUS;

	if (flow_control->state != FCS_IDLE) {
		VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
	}

	/*
	 * Block: wait on the internal queue's laundry event with a timeout
	 * of 'msecs' milliseconds, dropping the page queues lock while asleep.
	 */
	iq->pgo_throttled = TRUE;
	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);

	vm_page_unlock_queues();

	assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);

	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);

	thread_block(THREAD_CONTINUE_NULL);

	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);

	vm_page_lock_queues();

	iq->pgo_throttled = FALSE;

	/* page counts may have changed while we slept: recompute the scan targets */
	vps_init_page_targets();

	return VM_PAGEOUT_SCAN_NEXT_ITERATION;
}
2433
2434 extern boolean_t vm_darkwake_mode;
2435 /*
2436 * This function is called only from vm_pageout_scan and
2437 * it will find and return the most appropriate page to be
2438 * reclaimed.
2439 */
2440 static int
vps_choose_victim_page(vm_page_t * victim_page,int * anons_grabbed,boolean_t * grab_anonymous,boolean_t force_anonymous,boolean_t * is_page_from_bg_q,unsigned int * reactivated_this_call)2441 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2442 boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2443 {
2444 vm_page_t m = NULL;
2445 vm_object_t m_object = VM_OBJECT_NULL;
2446 uint32_t inactive_external_count;
2447 struct vm_speculative_age_q *sq;
2448 struct vm_pageout_queue *iq;
2449 int retval = VM_PAGEOUT_SCAN_PROCEED;
2450
2451 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2452 iq = &vm_pageout_queue_internal;
2453
2454 *is_page_from_bg_q = FALSE;
2455
2456 m = NULL;
2457 m_object = VM_OBJECT_NULL;
2458
2459 if (VM_DYNAMIC_PAGING_ENABLED()) {
2460 assert(vm_page_throttled_count == 0);
2461 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2462 }
2463
2464 /*
2465 * Try for a clean-queue inactive page.
2466 * These are pages that vm_pageout_scan tried to steal earlier, but
2467 * were dirty and had to be cleaned. Pick them up now that they are clean.
2468 */
2469 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2470 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2471
2472 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2473
2474 goto found_page;
2475 }
2476
2477 /*
2478 * The next most eligible pages are ones we paged in speculatively,
2479 * but which have not yet been touched and have been aged out.
2480 */
2481 if (!vm_page_queue_empty(&sq->age_q)) {
2482 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2483
2484 assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2485
2486 if (!m->vmp_dirty || force_anonymous == FALSE) {
2487 goto found_page;
2488 } else {
2489 m = NULL;
2490 }
2491 }
2492
2493 #if !CONFIG_JETSAM
2494 if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2495 if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2496 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2497 assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2498 goto found_page;
2499 }
2500 }
2501 #endif /* !CONFIG_JETSAM */
2502
2503 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2504 vm_object_t bg_m_object = NULL;
2505
2506 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2507
2508 bg_m_object = VM_PAGE_OBJECT(m);
2509
2510 if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2511 /*
2512 * This page is on the background queue
2513 * but not on a pageable queue OR is busy during
2514 * darkwake mode when the target is artificially lowered.
2515 * If it is busy during darkwake mode, and we don't skip it,
2516 * we will just swing back around and try again with the same
2517 * queue and might hit the same page or its neighbor in a
2518 * similar state. Both of these are transient states and will
2519 * get resolved, but, at this point let's ignore this page.
2520 */
2521 if (vm_darkwake_mode && m->vmp_busy) {
2522 if (bg_m_object->internal) {
2523 vm_pageout_skipped_bq_internal++;
2524 } else {
2525 vm_pageout_skipped_bq_external++;
2526 }
2527 }
2528 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2529 if (bg_m_object->internal &&
2530 (VM_PAGE_Q_THROTTLED(iq) ||
2531 vm_compressor_out_of_space() == TRUE ||
2532 vm_page_free_count < (vm_page_free_reserved / 4))) {
2533 vm_pageout_skipped_bq_internal++;
2534 } else {
2535 *is_page_from_bg_q = TRUE;
2536
2537 if (bg_m_object->internal) {
2538 vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2539 } else {
2540 vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2541 }
2542 goto found_page;
2543 }
2544 }
2545 }
2546
2547 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2548
2549 if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2550 (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2551 *grab_anonymous = TRUE;
2552 *anons_grabbed = 0;
2553
2554 if (VM_CONFIG_SWAP_IS_ACTIVE) {
2555 vm_pageout_vminfo.vm_pageout_skipped_external++;
2556 } else {
2557 if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2558 /*
2559 * No swap and we are in dangerously low levels of free memory.
2560 * If we keep going ahead with anonymous pages, we are going to run into a situation
2561 * where the compressor will be stuck waiting for free pages (if it isn't already).
2562 *
2563 * So, pick a file backed page...
2564 */
2565 *grab_anonymous = FALSE;
2566 *anons_grabbed = ANONS_GRABBED_LIMIT;
2567 vm_pageout_vminfo.vm_pageout_skipped_internal++;
2568 }
2569 }
2570 goto want_anonymous;
2571 }
2572 *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2573
2574 #if CONFIG_JETSAM
2575 /* If the file-backed pool has accumulated
2576 * significantly more pages than the jetsam
2577 * threshold, prefer to reclaim those
2578 * inline to minimise compute overhead of reclaiming
2579 * anonymous pages.
2580 * This calculation does not account for the CPU local
2581 * external page queues, as those are expected to be
2582 * much smaller relative to the global pools.
2583 */
2584
2585 struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2586
2587 if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2588 if (vm_page_pageable_external_count >
2589 vm_pageout_state.vm_page_filecache_min) {
2590 if ((vm_page_pageable_external_count *
2591 vm_pageout_memorystatus_fb_factor_dr) >
2592 (memorystatus_available_pages_critical *
2593 vm_pageout_memorystatus_fb_factor_nr)) {
2594 *grab_anonymous = FALSE;
2595
2596 VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2597 }
2598 }
2599 if (*grab_anonymous) {
2600 VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2601 }
2602 }
2603 #endif /* CONFIG_JETSAM */
2604
2605 want_anonymous:
2606 if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2607 if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2608 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2609
2610 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2611 *anons_grabbed = 0;
2612
2613 if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2614 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2615 if ((++(*reactivated_this_call) % 100)) {
2616 vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2617
2618 vm_page_activate(m);
2619 counter_inc(&vm_statistics_reactivations);
2620 #if DEVELOPMENT || DEBUG
2621 if (*is_page_from_bg_q == TRUE) {
2622 if (m_object->internal) {
2623 vm_pageout_rejected_bq_internal++;
2624 } else {
2625 vm_pageout_rejected_bq_external++;
2626 }
2627 }
2628 #endif /* DEVELOPMENT || DEBUG */
2629 vm_pageout_state.vm_pageout_inactive_used++;
2630
2631 m = NULL;
2632 retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2633
2634 goto found_page;
2635 }
2636
2637 /*
2638 * steal 1 of the file backed pages even if
2639 * we are under the limit that has been set
2640 * for a healthy filecache
2641 */
2642 }
2643 }
2644 goto found_page;
2645 }
2646 }
2647 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2648 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2649
2650 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2651 *anons_grabbed += 1;
2652
2653 goto found_page;
2654 }
2655
2656 m = NULL;
2657
2658 found_page:
2659 *victim_page = m;
2660
2661 return retval;
2662 }
2663
2664 /*
2665 * This function is called only from vm_pageout_scan and
2666 * it will put a page back on the active/inactive queue
2667 * if we can't reclaim it for some reason.
2668 */
2669 static void
vps_requeue_page(vm_page_t m,int page_prev_q_state,__unused boolean_t page_from_bg_q)2670 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2671 {
2672 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2673 vm_page_enqueue_inactive(m, FALSE);
2674 } else {
2675 vm_page_activate(m);
2676 }
2677
2678 #if DEVELOPMENT || DEBUG
2679 vm_object_t m_object = VM_PAGE_OBJECT(m);
2680
2681 if (page_from_bg_q == TRUE) {
2682 if (m_object->internal) {
2683 vm_pageout_rejected_bq_internal++;
2684 } else {
2685 vm_pageout_rejected_bq_external++;
2686 }
2687 }
2688 #endif /* DEVELOPMENT || DEBUG */
2689 }
2690
2691 /*
2692 * This function is called only from vm_pageout_scan and
2693 * it will try to grab the victim page's VM object (m_object)
2694 * which differs from the previous victim page's object (object).
2695 */
2696 static int
vps_switch_object(vm_page_t m,vm_object_t m_object,vm_object_t * object,int page_prev_q_state,boolean_t avoid_anon_pages,boolean_t page_from_bg_q)2697 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2698 {
2699 struct vm_speculative_age_q *sq;
2700
2701 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2702
2703 /*
2704 * the object associated with candidate page is
2705 * different from the one we were just working
2706 * with... dump the lock if we still own it
2707 */
2708 if (*object != NULL) {
2709 vm_object_unlock(*object);
2710 *object = NULL;
2711 }
2712 /*
2713 * Try to lock object; since we've alread got the
2714 * page queues lock, we can only 'try' for this one.
2715 * if the 'try' fails, we need to do a mutex_pause
2716 * to allow the owner of the object lock a chance to
2717 * run... otherwise, we're likely to trip over this
2718 * object in the same state as we work our way through
2719 * the queue... clumps of pages associated with the same
2720 * object are fairly typical on the inactive and active queues
2721 */
2722 if (!vm_object_lock_try_scan(m_object)) {
2723 vm_page_t m_want = NULL;
2724
2725 vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2726
2727 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2728 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2729 }
2730
2731 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2732
2733 m->vmp_reference = FALSE;
2734
2735 if (!m_object->object_is_shared_cache) {
2736 /*
2737 * don't apply this optimization if this is the shared cache
2738 * object, it's too easy to get rid of very hot and important
2739 * pages...
2740 * m->vmp_object must be stable since we hold the page queues lock...
2741 * we can update the scan_collisions field sans the object lock
2742 * since it is a separate field and this is the only spot that does
2743 * a read-modify-write operation and it is never executed concurrently...
2744 * we can asynchronously set this field to 0 when creating a UPL, so it
2745 * is possible for the value to be a bit non-determistic, but that's ok
2746 * since it's only used as a hint
2747 */
2748 m_object->scan_collisions = 1;
2749 }
2750 if (page_from_bg_q) {
2751 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2752 } else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2753 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2754 } else if (!vm_page_queue_empty(&sq->age_q)) {
2755 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2756 } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2757 !vm_page_queue_empty(&vm_page_queue_inactive)) {
2758 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2759 } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2760 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2761 }
2762
2763 /*
2764 * this is the next object we're going to be interested in
2765 * try to make sure its available after the mutex_pause
2766 * returns control
2767 */
2768 if (m_want) {
2769 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2770 }
2771
2772 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2773
2774 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2775 } else {
2776 *object = m_object;
2777 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2778 }
2779
2780 return VM_PAGEOUT_SCAN_PROCEED;
2781 }
2782
2783 /*
2784 * This function is called only from vm_pageout_scan and
2785 * it notices that pageout scan may be rendered ineffective
2786 * due to a FS deadlock and will jetsam a process if possible.
2787 * If jetsam isn't supported, it'll move the page to the active
2788 * queue to try and get some different pages pushed onwards so
2789 * we can try to get out of this scenario.
2790 */
2791 static void
vps_deal_with_throttled_queues(vm_page_t m,vm_object_t * object,uint32_t * vm_pageout_inactive_external_forced_reactivate_limit,int * delayed_unlock,boolean_t * force_anonymous,__unused boolean_t is_page_from_bg_q)2792 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2793 int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2794 {
2795 struct vm_pageout_queue *eq;
2796 vm_object_t cur_object = VM_OBJECT_NULL;
2797
2798 cur_object = *object;
2799
2800 eq = &vm_pageout_queue_external;
2801
2802 if (cur_object->internal == FALSE) {
2803 /*
2804 * we need to break up the following potential deadlock case...
2805 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2806 * b) The thread doing the writing is waiting for pages while holding the truncate lock
2807 * c) Most of the pages in the inactive queue belong to this file.
2808 *
2809 * we are potentially in this deadlock because...
2810 * a) the external pageout queue is throttled
2811 * b) we're done with the active queue and moved on to the inactive queue
2812 * c) we've got a dirty external page
2813 *
2814 * since we don't know the reason for the external pageout queue being throttled we
2815 * must suspect that we are deadlocked, so move the current page onto the active queue
2816 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2817 *
2818 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2819 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2820 * pool the next time we select a victim page... if we can make enough new free pages,
2821 * the deadlock will break, the external pageout queue will empty and it will no longer
2822 * be throttled
2823 *
2824 * if we have jetsam configured, keep a count of the pages reactivated this way so
2825 * that we can try to find clean pages in the active/inactive queues before
2826 * deciding to jetsam a process
2827 */
2828 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2829
2830 vm_page_check_pageable_safe(m);
2831 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2832 vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2833 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2834 vm_page_active_count++;
2835 vm_page_pageable_external_count++;
2836
2837 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2838
2839 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2840
2841 #pragma unused(force_anonymous)
2842
2843 *vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2844
2845 if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2846 *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2847 /*
2848 * Possible deadlock scenario so request jetsam action
2849 */
2850
2851 assert(cur_object);
2852 vm_object_unlock(cur_object);
2853
2854 cur_object = VM_OBJECT_NULL;
2855
2856 /*
2857 * VM pageout scan needs to know we have dropped this lock and so set the
2858 * object variable we got passed in to NULL.
2859 */
2860 *object = VM_OBJECT_NULL;
2861
2862 vm_page_unlock_queues();
2863
2864 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
2865 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2866
2867 /* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
2868 if (memorystatus_kill_on_VM_page_shortage() == TRUE) {
2869 VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
2870 }
2871
2872 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
2873 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2874
2875 vm_page_lock_queues();
2876 *delayed_unlock = 1;
2877 }
2878 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2879
2880 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2881 #pragma unused(delayed_unlock)
2882
2883 *force_anonymous = TRUE;
2884 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2885 } else {
2886 vm_page_activate(m);
2887 counter_inc(&vm_statistics_reactivations);
2888
2889 #if DEVELOPMENT || DEBUG
2890 if (is_page_from_bg_q == TRUE) {
2891 if (cur_object->internal) {
2892 vm_pageout_rejected_bq_internal++;
2893 } else {
2894 vm_pageout_rejected_bq_external++;
2895 }
2896 }
2897 #endif /* DEVELOPMENT || DEBUG */
2898
2899 vm_pageout_state.vm_pageout_inactive_used++;
2900 }
2901 }
2902
2903
2904 void
vm_page_balance_inactive(int max_to_move)2905 vm_page_balance_inactive(int max_to_move)
2906 {
2907 vm_page_t m;
2908
2909 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2910
2911 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2912 /*
2913 * It is likely that the hibernation code path is
2914 * dealing with these very queues as we are about
2915 * to move pages around in/from them and completely
2916 * change the linkage of the pages.
2917 *
2918 * And so we skip the rebalancing of these queues.
2919 */
2920 return;
2921 }
2922 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2923 vm_page_inactive_count +
2924 vm_page_speculative_count);
2925
2926 while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2927 VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2928
2929 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2930
2931 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2932 assert(!m->vmp_laundry);
2933 assert(VM_PAGE_OBJECT(m) != kernel_object);
2934 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2935
2936 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2937
2938 /*
2939 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2940 *
2941 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2942 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2943 * new reference happens. If no futher references happen on the page after that remote TLB flushes
2944 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2945 * by pageout_scan, which is just fine since the last reference would have happened quite far
2946 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2947 * have happened before we moved the page
2948 */
2949 if (m->vmp_pmapped == TRUE) {
2950 /*
2951 * We might be holding the page queue lock as a
2952 * spin lock and clearing the "referenced" bit could
2953 * take a while if there are lots of mappings of
2954 * that page, so make sure we acquire the lock as
2955 * as mutex to avoid a spinlock timeout.
2956 */
2957 vm_page_lockconvert_queues();
2958 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2959 }
2960
2961 /*
2962 * The page might be absent or busy,
2963 * but vm_page_deactivate can handle that.
2964 * FALSE indicates that we don't want a H/W clear reference
2965 */
2966 vm_page_deactivate_internal(m, FALSE);
2967 }
2968 }
2969
2970 /*
2971 * vm_pageout_scan does the dirty work for the pageout daemon.
2972 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2973 * held and vm_page_free_wanted == 0.
2974 */
2975 void
vm_pageout_scan(void)2976 vm_pageout_scan(void)
2977 {
2978 unsigned int loop_count = 0;
2979 unsigned int inactive_burst_count = 0;
2980 unsigned int reactivated_this_call;
2981 unsigned int reactivate_limit;
2982 vm_page_t local_freeq = NULL;
2983 int local_freed = 0;
2984 int delayed_unlock;
2985 int delayed_unlock_limit = 0;
2986 int refmod_state = 0;
2987 int vm_pageout_deadlock_target = 0;
2988 struct vm_pageout_queue *iq;
2989 struct vm_pageout_queue *eq;
2990 struct vm_speculative_age_q *sq;
2991 struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
2992 boolean_t inactive_throttled = FALSE;
2993 vm_object_t object = NULL;
2994 uint32_t inactive_reclaim_run;
2995 boolean_t grab_anonymous = FALSE;
2996 boolean_t force_anonymous = FALSE;
2997 boolean_t force_speculative_aging = FALSE;
2998 int anons_grabbed = 0;
2999 int page_prev_q_state = 0;
3000 boolean_t page_from_bg_q = FALSE;
3001 uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
3002 vm_object_t m_object = VM_OBJECT_NULL;
3003 int retval = 0;
3004 boolean_t lock_yield_check = FALSE;
3005
3006
3007 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
3008 vm_pageout_vminfo.vm_pageout_freed_speculative,
3009 vm_pageout_state.vm_pageout_inactive_clean,
3010 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3011 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3012
3013 flow_control.state = FCS_IDLE;
3014 iq = &vm_pageout_queue_internal;
3015 eq = &vm_pageout_queue_external;
3016 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3017
3018 /* Ask the pmap layer to return any pages it no longer needs. */
3019 pmap_release_pages_fast();
3020
3021 vm_page_lock_queues();
3022
3023 delayed_unlock = 1;
3024
3025 /*
3026 * Calculate the max number of referenced pages on the inactive
3027 * queue that we will reactivate.
3028 */
3029 reactivated_this_call = 0;
3030 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3031 vm_page_inactive_count);
3032 inactive_reclaim_run = 0;
3033
3034 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3035
3036 /*
3037 * We must limit the rate at which we send pages to the pagers
3038 * so that we don't tie up too many pages in the I/O queues.
3039 * We implement a throttling mechanism using the laundry count
3040 * to limit the number of pages outstanding to the default
3041 * and external pagers. We can bypass the throttles and look
3042 * for clean pages if the pageout queues don't drain in a timely
3043 * fashion since this may indicate that the pageout paths are
3044 * stalled waiting for memory, which only we can provide.
3045 */
3046
3047 vps_init_page_targets();
3048 assert(object == NULL);
3049 assert(delayed_unlock != 0);
3050
3051 for (;;) {
3052 vm_page_t m;
3053
3054 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3055
3056 if (lock_yield_check) {
3057 lock_yield_check = FALSE;
3058
3059 if (delayed_unlock++ > delayed_unlock_limit) {
3060 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3061 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3062 } else if (vm_pageout_scan_wants_object) {
3063 vm_page_unlock_queues();
3064 mutex_pause(0);
3065 vm_page_lock_queues();
3066 } else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3067 VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3068 }
3069 }
3070
3071 if (vm_upl_wait_for_pages < 0) {
3072 vm_upl_wait_for_pages = 0;
3073 }
3074
3075 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3076
3077 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3078 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3079 }
3080
3081 vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3082
3083 assert(delayed_unlock);
3084
3085 /*
3086 * maintain our balance
3087 */
3088 vm_page_balance_inactive(1);
3089
3090
3091 /**********************************************************************
3092 * above this point we're playing with the active and secluded queues
3093 * below this point we're playing with the throttling mechanisms
3094 * and the inactive queue
3095 **********************************************************************/
3096
3097 if (vm_page_free_count + local_freed >= vm_page_free_target) {
3098 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3099
3100 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3101 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3102 /*
3103 * make sure the pageout I/O threads are running
3104 * throttled in case there are still requests
3105 * in the laundry... since we have met our targets
3106 * we don't need the laundry to be cleaned in a timely
3107 * fashion... so let's avoid interfering with foreground
3108 * activity
3109 */
3110 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3111
3112 vm_free_page_lock();
3113
3114 if ((vm_page_free_count >= vm_page_free_target) &&
3115 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3116 /*
3117 * done - we have met our target *and*
3118 * there is no one waiting for a page.
3119 */
3120 return_from_scan:
3121 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3122
3123 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3124 vm_pageout_state.vm_pageout_inactive,
3125 vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3126 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
3127 vm_pageout_vminfo.vm_pageout_freed_speculative,
3128 vm_pageout_state.vm_pageout_inactive_clean,
3129 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3130 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3131
3132 return;
3133 }
3134 vm_free_page_unlock();
3135 }
3136
3137 /*
3138 * Before anything, we check if we have any ripe volatile
3139 * objects around. If so, try to purge the first object.
3140 * If the purge fails, fall through to reclaim a page instead.
3141 * If the purge succeeds, go back to the top and reevaluate
3142 * the new memory situation.
3143 */
3144 retval = vps_purge_object();
3145
3146 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3147 /*
3148 * Success
3149 */
3150 if (object != NULL) {
3151 vm_object_unlock(object);
3152 object = NULL;
3153 }
3154
3155 lock_yield_check = FALSE;
3156 continue;
3157 }
3158
3159 /*
3160 * If our 'aged' queue is empty and we have some speculative pages
3161 * in the other queues, let's go through and see if we need to age
3162 * them.
3163 *
3164 * If we succeeded in aging a speculative Q or just that everything
3165 * looks normal w.r.t queue age and queue counts, we keep going onward.
3166 *
3167 * If, for some reason, we seem to have a mismatch between the spec.
3168 * page count and the page queues, we reset those variables and
3169 * restart the loop (LD TODO: Track this better?).
3170 */
3171 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3172 retval = vps_age_speculative_queue(force_speculative_aging);
3173
3174 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3175 lock_yield_check = FALSE;
3176 continue;
3177 }
3178 }
3179 force_speculative_aging = FALSE;
3180
3181 /*
3182 * Check to see if we need to evict objects from the cache.
3183 *
3184 * Note: 'object' here doesn't have anything to do with
3185 * the eviction part. We just need to make sure we have dropped
3186 * any object lock we might be holding if we need to go down
3187 * into the eviction logic.
3188 */
3189 retval = vps_object_cache_evict(&object);
3190
3191 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3192 lock_yield_check = FALSE;
3193 continue;
3194 }
3195
3196
3197 /*
3198 * Calculate our filecache_min that will affect the loop
3199 * going forward.
3200 */
3201 vps_calculate_filecache_min();
3202
3203 /*
3204 * LD TODO: Use a structure to hold all state variables for a single
3205 * vm_pageout_scan iteration and pass that structure to this function instead.
3206 */
3207 retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3208 &delayed_unlock, &local_freeq, &local_freed,
3209 &vm_pageout_deadlock_target, inactive_burst_count);
3210
3211 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3212 if (loop_count >= vm_page_inactive_count) {
3213 loop_count = 0;
3214 }
3215
3216 inactive_burst_count = 0;
3217
3218 assert(object == NULL);
3219 assert(delayed_unlock != 0);
3220
3221 lock_yield_check = FALSE;
3222 continue;
3223 } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3224 goto return_from_scan;
3225 }
3226
3227 flow_control.state = FCS_IDLE;
3228
3229 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3230 vm_pageout_inactive_external_forced_reactivate_limit);
3231 loop_count++;
3232 inactive_burst_count++;
3233 vm_pageout_state.vm_pageout_inactive++;
3234
3235 /*
3236 * Choose a victim.
3237 */
3238
3239 m = NULL;
3240 retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3241
3242 if (m == NULL) {
3243 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3244 inactive_burst_count = 0;
3245
3246 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3247 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3248 }
3249
3250 lock_yield_check = TRUE;
3251 continue;
3252 }
3253
3254 /*
3255 * if we've gotten here, we have no victim page.
3256 * check to see if we've not finished balancing the queues
3257 * or we have a page on the aged speculative queue that we
3258 * skipped due to force_anonymous == TRUE.. or we have
3259 * speculative pages that we can prematurely age... if
3260 * one of these cases we'll keep going, else panic
3261 */
3262 force_anonymous = FALSE;
3263 VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3264
3265 if (!vm_page_queue_empty(&sq->age_q)) {
3266 lock_yield_check = TRUE;
3267 continue;
3268 }
3269
3270 if (vm_page_speculative_count) {
3271 force_speculative_aging = TRUE;
3272 lock_yield_check = TRUE;
3273 continue;
3274 }
3275 panic("vm_pageout: no victim");
3276
3277 /* NOTREACHED */
3278 }
3279
3280 assert(VM_PAGE_PAGEABLE(m));
3281 m_object = VM_PAGE_OBJECT(m);
3282 force_anonymous = FALSE;
3283
3284 page_prev_q_state = m->vmp_q_state;
3285 /*
3286 * we just found this page on one of our queues...
3287 * it can't also be on the pageout queue, so safe
3288 * to call vm_page_queues_remove
3289 */
3290 bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3291 vm_page_queues_remove(m, TRUE);
3292 if (donate) {
3293 /*
3294 * The compressor needs to see this bit to know
3295 * where this page needs to land. Also if stolen,
3296 * this bit helps put the page back in the right
3297 * special queue where it belongs.
3298 */
3299 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3300 }
3301
3302 assert(!m->vmp_laundry);
3303 assert(!m->vmp_private);
3304 assert(!m->vmp_fictitious);
3305 assert(m_object != kernel_object);
3306 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
3307
3308 vm_pageout_vminfo.vm_pageout_considered_page++;
3309
3310 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3311
3312 /*
3313 * check to see if we currently are working
3314 * with the same object... if so, we've
3315 * already got the lock
3316 */
3317 if (m_object != object) {
3318 boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3319
3320 /*
3321 * vps_switch_object() will always drop the 'object' lock first
3322 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3323 * either 'm_object' or NULL.
3324 */
3325 retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3326
3327 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3328 lock_yield_check = TRUE;
3329 continue;
3330 }
3331 }
3332 assert(m_object == object);
3333 assert(VM_PAGE_OBJECT(m) == m_object);
3334
3335 if (m->vmp_busy) {
3336 /*
3337 * Somebody is already playing with this page.
3338 * Put it back on the appropriate queue
3339 *
3340 */
3341 VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3342
3343 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3344 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3345 }
3346
3347 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3348
3349 lock_yield_check = TRUE;
3350 continue;
3351 }
3352
3353 /*
3354 * if (m->vmp_cleaning && !m->vmp_free_when_done)
3355 * If already cleaning this page in place
3356 * just leave it off the paging queues.
3357 * We can leave the page mapped, and upl_commit_range
3358 * will put it on the clean queue.
3359 *
3360 * if (m->vmp_free_when_done && !m->vmp_cleaning)
3361 * an msync INVALIDATE is in progress...
3362 * this page has been marked for destruction
3363 * after it has been cleaned,
3364 * but not yet gathered into a UPL
3365 * where 'cleaning' will be set...
3366 * just leave it off the paging queues
3367 *
3368 * if (m->vmp_free_when_done && m->vmp_cleaning)
3369 * an msync INVALIDATE is in progress
3370 * and the UPL has already gathered this page...
3371 * just leave it off the paging queues
3372 */
3373 if (m->vmp_free_when_done || m->vmp_cleaning) {
3374 lock_yield_check = TRUE;
3375 continue;
3376 }
3377
3378
3379 /*
3380 * If it's absent, in error or the object is no longer alive,
3381 * we can reclaim the page... in the no longer alive case,
3382 * there are 2 states the page can be in that preclude us
3383 * from reclaiming it - busy or cleaning - that we've already
3384 * dealt with
3385 */
3386 if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3387 (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3388 if (m->vmp_absent) {
3389 VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3390 } else if (!object->alive ||
3391 (!object->internal &&
3392 object->pager == MEMORY_OBJECT_NULL)) {
3393 VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3394 } else {
3395 VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3396 }
3397 reclaim_page:
3398 if (vm_pageout_deadlock_target) {
3399 VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3400 vm_pageout_deadlock_target--;
3401 }
3402
3403 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3404
3405 if (object->internal) {
3406 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3407 } else {
3408 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3409 }
3410 assert(!m->vmp_cleaning);
3411 assert(!m->vmp_laundry);
3412
3413 if (!object->internal &&
3414 object->pager != NULL &&
3415 object->pager->mo_pager_ops == &shared_region_pager_ops) {
3416 shared_region_pager_reclaimed++;
3417 }
3418
3419 m->vmp_busy = TRUE;
3420
3421 /*
3422 * remove page from object here since we're already
3423 * behind the object lock... defer the rest of the work
3424 * we'd normally do in vm_page_free_prepare_object
3425 * until 'vm_page_free_list' is called
3426 */
3427 if (m->vmp_tabled) {
3428 vm_page_remove(m, TRUE);
3429 }
3430
3431 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3432 m->vmp_snext = local_freeq;
3433 local_freeq = m;
3434 local_freed++;
3435
3436 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3437 vm_pageout_vminfo.vm_pageout_freed_speculative++;
3438 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3439 vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3440 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3441 vm_pageout_vminfo.vm_pageout_freed_internal++;
3442 } else {
3443 vm_pageout_vminfo.vm_pageout_freed_external++;
3444 }
3445
3446 inactive_burst_count = 0;
3447
3448 lock_yield_check = TRUE;
3449 continue;
3450 }
3451 if (object->copy == VM_OBJECT_NULL) {
3452 /*
3453 * No one else can have any interest in this page.
3454 * If this is an empty purgable object, the page can be
3455 * reclaimed even if dirty.
3456 * If the page belongs to a volatile purgable object, we
3457 * reactivate it if the compressor isn't active.
3458 */
3459 if (object->purgable == VM_PURGABLE_EMPTY) {
3460 if (m->vmp_pmapped == TRUE) {
3461 /* unmap the page */
3462 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3463 if (refmod_state & VM_MEM_MODIFIED) {
3464 SET_PAGE_DIRTY(m, FALSE);
3465 }
3466 }
3467 if (m->vmp_dirty || m->vmp_precious) {
3468 /* we saved the cost of cleaning this page ! */
3469 vm_page_purged_count++;
3470 }
3471 goto reclaim_page;
3472 }
3473
3474 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3475 /*
3476 * With the VM compressor, the cost of
3477 * reclaiming a page is much lower (no I/O),
3478 * so if we find a "volatile" page, it's better
3479 * to let it get compressed rather than letting
3480 * it occupy a full page until it gets purged.
3481 * So no need to check for "volatile" here.
3482 */
3483 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
3484 /*
3485 * Avoid cleaning a "volatile" page which might
3486 * be purged soon.
3487 */
3488
3489 /* if it's wired, we can't put it on our queue */
3490 assert(!VM_PAGE_WIRED(m));
3491
3492 /* just stick it back on! */
3493 reactivated_this_call++;
3494
3495 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3496 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3497 }
3498
3499 goto reactivate_page;
3500 }
3501 }
3502 /*
3503 * If it's being used, reactivate.
3504 * (Fictitious pages are either busy or absent.)
3505 * First, update the reference and dirty bits
3506 * to make sure the page is unreferenced.
3507 */
3508 refmod_state = -1;
3509
3510 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3511 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3512
3513 if (refmod_state & VM_MEM_REFERENCED) {
3514 m->vmp_reference = TRUE;
3515 }
3516 if (refmod_state & VM_MEM_MODIFIED) {
3517 SET_PAGE_DIRTY(m, FALSE);
3518 }
3519 }
3520
3521 if (m->vmp_reference || m->vmp_dirty) {
3522 /* deal with a rogue "reusable" page */
3523 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3524 }
3525
3526 if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3527 vm_pageout_state.vm_page_xpmapped_min = 0;
3528 } else {
3529 vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
3530 }
3531
3532 if (!m->vmp_no_cache &&
3533 page_from_bg_q == FALSE &&
3534 (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3535 (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3536 /*
3537 * The page we pulled off the inactive list has
3538 * been referenced. It is possible for other
3539 * processors to be touching pages faster than we
3540 * can clear the referenced bit and traverse the
3541 * inactive queue, so we limit the number of
3542 * reactivations.
3543 */
3544 if (++reactivated_this_call >= reactivate_limit &&
3545 !object->object_is_shared_cache &&
3546 !((m->vmp_realtime ||
3547 object->for_realtime) &&
3548 vm_pageout_protect_realtime)) {
3549 vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3550 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3551 vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3552 if (object->object_is_shared_cache) {
3553 vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3554 } else if (m->vmp_realtime ||
3555 object->for_realtime) {
3556 vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3557 }
3558 } else {
3559 uint32_t isinuse;
3560
3561 if (reactivated_this_call >= reactivate_limit) {
3562 if (object->object_is_shared_cache) {
3563 vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3564 } else if ((m->vmp_realtime ||
3565 object->for_realtime) &&
3566 vm_pageout_protect_realtime) {
3567 vm_pageout_vminfo.vm_pageout_protected_realtime++;
3568 }
3569 }
3570 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3571 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3572 }
3573
3574 vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3575 reactivate_page:
3576 if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3577 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3578 /*
3579 * no explicit mappings of this object exist
3580 * and it's not open via the filesystem
3581 */
3582 vm_page_deactivate(m);
3583 VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3584 } else {
3585 /*
3586 * The page was/is being used, so put back on active list.
3587 */
3588 vm_page_activate(m);
3589 counter_inc(&vm_statistics_reactivations);
3590 inactive_burst_count = 0;
3591 }
3592 #if DEVELOPMENT || DEBUG
3593 if (page_from_bg_q == TRUE) {
3594 if (m_object->internal) {
3595 vm_pageout_rejected_bq_internal++;
3596 } else {
3597 vm_pageout_rejected_bq_external++;
3598 }
3599 }
3600 #endif /* DEVELOPMENT || DEBUG */
3601
3602 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3603 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3604 }
3605 vm_pageout_state.vm_pageout_inactive_used++;
3606
3607 lock_yield_check = TRUE;
3608 continue;
3609 }
3610 /*
3611 * Make sure we call pmap_get_refmod() if it
3612 * wasn't already called just above, to update
3613 * the dirty bit.
3614 */
3615 if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3616 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3617 if (refmod_state & VM_MEM_MODIFIED) {
3618 SET_PAGE_DIRTY(m, FALSE);
3619 }
3620 }
3621 }
3622
3623 /*
3624 * we've got a candidate page to steal...
3625 *
3626 * m->vmp_dirty is up to date courtesy of the
3627 * preceding check for m->vmp_reference... if
3628 * we get here, then m->vmp_reference had to be
3629 * FALSE (or possibly "reactivate_limit" was
3630 * exceeded), but in either case we called
3631 * pmap_get_refmod() and updated both
3632 * m->vmp_reference and m->vmp_dirty
3633 *
3634 * if it's dirty or precious we need to
3635 * see if the target queue is throttled
3636 * if it is, we need to skip over it by moving it back
3637 * to the end of the inactive queue
3638 */
3639
3640 inactive_throttled = FALSE;
3641
3642 if (m->vmp_dirty || m->vmp_precious) {
3643 if (object->internal) {
3644 if (VM_PAGE_Q_THROTTLED(iq)) {
3645 inactive_throttled = TRUE;
3646 }
3647 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3648 inactive_throttled = TRUE;
3649 }
3650 }
3651 throttle_inactive:
3652 if (!VM_DYNAMIC_PAGING_ENABLED() &&
3653 object->internal && m->vmp_dirty &&
3654 (object->purgable == VM_PURGABLE_DENY ||
3655 object->purgable == VM_PURGABLE_NONVOLATILE ||
3656 object->purgable == VM_PURGABLE_VOLATILE)) {
3657 vm_page_check_pageable_safe(m);
3658 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3659 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3660 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3661 vm_page_throttled_count++;
3662
3663 VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3664
3665 inactive_burst_count = 0;
3666
3667 lock_yield_check = TRUE;
3668 continue;
3669 }
3670 if (inactive_throttled == TRUE) {
3671 vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3672 &delayed_unlock, &force_anonymous, page_from_bg_q);
3673
3674 inactive_burst_count = 0;
3675
3676 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3677 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3678 }
3679
3680 lock_yield_check = TRUE;
3681 continue;
3682 }
3683
3684 /*
3685 * we've got a page that we can steal...
3686 * eliminate all mappings and make sure
3687 * we have the up-to-date modified state
3688 *
3689 * if we need to do a pmap_disconnect then we
3690 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3691 * provides the true state atomically... the
3692 * page was still mapped up to the pmap_disconnect
3693 * and may have been dirtied at the last microsecond
3694 *
3695 * Note that if 'pmapped' is FALSE then the page is not
3696 * and has not been in any map, so there is no point calling
3697 * pmap_disconnect(). m->vmp_dirty could have been set in anticipation
3698 * of likely usage of the page.
3699 */
3700 if (m->vmp_pmapped == TRUE) {
3701 int pmap_options;
3702
3703 /*
3704 * Don't count this page as going into the compressor
3705 * if any of these are true:
3706 * 1) compressed pager isn't enabled
3707 * 2) Freezer enabled device with compressed pager
3708 * backend (exclusive use) i.e. most of the VM system
3709 * (including vm_pageout_scan) has no knowledge of
3710 * the compressor
3711 * 3) This page belongs to a file and hence will not be
3712 * sent into the compressor
3713 */
3714 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3715 object->internal == FALSE) {
3716 pmap_options = 0;
3717 } else if (m->vmp_dirty || m->vmp_precious) {
3718 /*
3719 * VM knows that this page is dirty (or
3720 * precious) and needs to be compressed
3721 * rather than freed.
3722 * Tell the pmap layer to count this page
3723 * as "compressed".
3724 */
3725 pmap_options = PMAP_OPTIONS_COMPRESSOR;
3726 } else {
3727 /*
3728 * VM does not know if the page needs to
3729 * be preserved but the pmap layer might tell
3730 * us if any mapping has "modified" it.
3731 * Let the pmap layer count this page
3732 * as compressed if and only if it has been
3733 * modified.
3734 */
3735 pmap_options =
3736 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3737 }
3738 refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3739 pmap_options,
3740 NULL);
3741 if (refmod_state & VM_MEM_MODIFIED) {
3742 SET_PAGE_DIRTY(m, FALSE);
3743 }
3744 }
3745
3746 /*
3747 * reset our count of pages that have been reclaimed
3748 * since the last page was 'stolen'
3749 */
3750 inactive_reclaim_run = 0;
3751
3752 /*
3753 * If it's clean and not precious, we can free the page.
3754 */
3755 if (!m->vmp_dirty && !m->vmp_precious) {
3756 vm_pageout_state.vm_pageout_inactive_clean++;
3757
3758 /*
3759 * OK, at this point we have found a page we are going to free.
3760 */
3761 #if CONFIG_PHANTOM_CACHE
3762 if (!object->internal) {
3763 vm_phantom_cache_add_ghost(m);
3764 }
3765 #endif
3766 goto reclaim_page;
3767 }
3768
3769 /*
3770 * The page may have been dirtied since the last check
3771 * for a throttled target queue (which may have been skipped
3772 * if the page was clean then). With the dirty page
3773 * disconnected here, we can make one final check.
3774 */
3775 if (object->internal) {
3776 if (VM_PAGE_Q_THROTTLED(iq)) {
3777 inactive_throttled = TRUE;
3778 }
3779 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3780 inactive_throttled = TRUE;
3781 }
3782
3783 if (inactive_throttled == TRUE) {
3784 goto throttle_inactive;
3785 }
3786
3787 #if VM_PRESSURE_EVENTS
3788 #if CONFIG_JETSAM
3789
3790 /*
3791 * If Jetsam is enabled, then the sending
3792 * of memory pressure notifications is handled
3793 * from the same thread that takes care of high-water
3794 * and other jetsams i.e. the memorystatus_thread.
3795 */
3796
3797 #else /* CONFIG_JETSAM */
3798
3799 vm_pressure_response();
3800
3801 #endif /* CONFIG_JETSAM */
3802 #endif /* VM_PRESSURE_EVENTS */
3803
3804 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3805 VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3806 }
3807
3808 if (object->internal) {
3809 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3810 } else {
3811 vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3812 }
3813
3814 /*
3815 * internal pages will go to the compressor...
3816 * external pages will go to the appropriate pager to be cleaned
3817 * and upon completion will end up on 'vm_page_queue_cleaned' which
3818 * is a preferred queue to steal from
3819 */
3820 vm_pageout_cluster(m);
3821 inactive_burst_count = 0;
3822
3823 /*
3824 * back to top of pageout scan loop
3825 */
3826 }
3827 }
3828
3829
3830 void
vm_page_free_reserve(int pages)3831 vm_page_free_reserve(
3832 int pages)
3833 {
3834 int free_after_reserve;
3835
3836 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3837 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3838 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3839 } else {
3840 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3841 }
3842 } else {
3843 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3844 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3845 } else {
3846 vm_page_free_reserved += pages;
3847 }
3848 }
3849 free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3850
3851 vm_page_free_min = vm_page_free_reserved +
3852 VM_PAGE_FREE_MIN(free_after_reserve);
3853
3854 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3855 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3856 }
3857
3858 vm_page_free_target = vm_page_free_reserved +
3859 VM_PAGE_FREE_TARGET(free_after_reserve);
3860
3861 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3862 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3863 }
3864
3865 if (vm_page_free_target < vm_page_free_min + 5) {
3866 vm_page_free_target = vm_page_free_min + 5;
3867 }
3868
3869 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3870 }
3871
3872 /*
3873 * vm_pageout is the high level pageout daemon.
3874 */
3875
/*
 * vm_pageout_continue:
 *
 * One cycle of the pageout daemon: flag the daemon as running, run
 * vm_pageout_scan(), then go back to sleep on &vm_page_free_wanted with
 * this function as the continuation, so each wakeup starts a fresh cycle.
 * Does not return to its caller.
 */
3876 void
vm_pageout_continue(void)3877 vm_pageout_continue(void)
3878 {
3879 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3880 VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3881
/* vm_pageout_running is changed under the free page lock; vm_pageout_wait() polls it under the same lock */
3882 vm_free_page_lock();
3883 vm_pageout_running = TRUE;
3884 vm_free_page_unlock();
3885
3886 vm_pageout_scan();
3887 /*
3888 * we hold both the vm_page_queue_free_lock
3889 * and the vm_page_queues_lock at this point
3890 */
3891 assert(vm_page_free_wanted == 0);
3892 assert(vm_page_free_wanted_privileged == 0);
/* register for the next wakeup before either lock is dropped */
3893 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3894
3895 vm_pageout_running = FALSE;
3896 #if XNU_TARGET_OS_OSX
/* release any thread sleeping in vm_pageout_wait() */
3897 if (vm_pageout_waiter) {
3898 vm_pageout_waiter = FALSE;
3899 thread_wakeup((event_t)&vm_pageout_waiter);
3900 }
3901 #endif /* XNU_TARGET_OS_OSX */
3902
3903 vm_free_page_unlock();
3904 vm_page_unlock_queues();
3905
/* block with ourselves as the continuation: the next wakeup re-enters at the top */
3906 thread_block((thread_continue_t)vm_pageout_continue);
3907 /*NOTREACHED*/
3908 }
3909
3910 #if XNU_TARGET_OS_OSX
3911 kern_return_t
vm_pageout_wait(uint64_t deadline)3912 vm_pageout_wait(uint64_t deadline)
3913 {
3914 kern_return_t kr;
3915
3916 vm_free_page_lock();
3917 for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
3918 vm_pageout_waiter = TRUE;
3919 if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3920 &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3921 (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3922 kr = KERN_OPERATION_TIMED_OUT;
3923 }
3924 }
3925 vm_free_page_unlock();
3926
3927 return kr;
3928 }
3929 #endif /* XNU_TARGET_OS_OSX */
3930
/*
 * vm_pageout_iothread_external_continue:
 *
 * Body (and wakeup continuation) of the external pageout I/O thread.
 * Drains ethr->q->pgo_pending one page at a time: each page is re-validated
 * under its object's lock and, if still eligible, its data is pushed to the
 * object's pager with memory_object_data_return().  When the pending list is
 * empty the thread waits on ethr->pgo_wakeup, naming this function as the
 * continuation.
 *
 * ethr - state for this I/O thread (queue to service and wakeup condition).
 * w    - unused wait result.
 *
 * Never returns.
 */
3931 OS_NORETURN
3932 static void
vm_pageout_iothread_external_continue(struct pgo_iothread_state * ethr,__unused wait_result_t w)3933 vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
3934 {
3935 vm_page_t m = NULL;
3936 vm_object_t object;
3937 vm_object_offset_t offset;
3938 memory_object_t pager;
3939 struct vm_pageout_queue *q = ethr->q;
3940
3941 /* On systems with a compressor, the external IO thread clears its
3942 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3943 * creation)
3944 */
3945 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3946 current_thread()->options &= ~TH_OPT_VMPRIV;
3947 }
3948
3949 sched_cond_ack(&(ethr->pgo_wakeup));
3950
3951 while (true) {
3952 vm_page_lockspin_queues();
3953
/* the page queues lock is held at the top of every iteration of this loop */
3954 while (!vm_page_queue_empty(&q->pgo_pending)) {
3955 q->pgo_busy = TRUE;
3956 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3957
3958 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3959 VM_PAGE_CHECK(m);
3960 /*
3961 * grab a snapshot of the object and offset this
3962 * page is tabled in so that we can relookup this
3963 * page after we've taken the object lock - these
3964 * fields are stable while we hold the page queues lock
3965 * but as soon as we drop it, there is nothing to keep
3966 * this page in this object... we hold an activity_in_progress
3967 * on this object which will keep it from terminating
3968 */
3969 object = VM_PAGE_OBJECT(m);
3970 offset = m->vmp_offset;
3971
3972 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3973 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3974
3975 vm_page_unlock_queues();
3976
3977 vm_object_lock(object);
3978
/* re-find the page by (object, offset); the pointer taken above may be stale */
3979 m = vm_page_lookup(object, offset);
3980
3981 if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
3982 !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
3983 /*
3984 * it's either the same page that someone else has
3985 * started cleaning (or it's finished cleaning or
3986 * been put back on the pageout queue), or
3987 * the page has been freed or we have found a
3988 * new page at this offset... in all of these cases
3989 * we merely need to release the activity_in_progress
3990 * we took when we put the page on the pageout queue
3991 */
3992 vm_object_activity_end(object);
3993 vm_object_unlock(object);
3994
3995 vm_page_lockspin_queues();
3996 continue;
3997 }
3998 pager = object->pager;
3999
4000 if (pager == MEMORY_OBJECT_NULL) {
4001 /*
4002 * This pager has been destroyed by either
4003 * memory_object_destroy or vm_object_destroy, and
4004 * so there is nowhere for the page to go.
4005 */
4006 if (m->vmp_free_when_done) {
4007 /*
4008 * Just free the page... VM_PAGE_FREE takes
4009 * care of cleaning up all the state...
4010 * including doing the vm_pageout_throttle_up
4011 */
4012 VM_PAGE_FREE(m);
4013 } else {
4014 vm_page_lockspin_queues();
4015
4016 vm_pageout_throttle_up(m);
4017 vm_page_activate(m);
4018
4019 vm_page_unlock_queues();
4020
4021 /*
4022 * And we are done with it.
4023 */
4024 }
4025 vm_object_activity_end(object);
4026 vm_object_unlock(object);
4027
4028 vm_page_lockspin_queues();
4029 continue;
4030 }
4031 #if 0
4032 /*
4033 * we don't hold the page queue lock
4034 * so this check isn't safe to make
4035 */
4036 VM_PAGE_CHECK(m);
4037 #endif
4038 /*
4039 * give back the activity_in_progress reference we
4040 * took when we queued up this page and replace it
4041 * with a paging_in_progress reference that will
4042 * also hold the paging offset from changing and
4043 * prevent the object from terminating
4044 */
4045 vm_object_activity_end(object);
4046 vm_object_paging_begin(object);
4047 vm_object_unlock(object);
4048
4049 /*
4050 * Send the data to the pager.
4051 * any pageout clustering happens there
4052 */
4053 memory_object_data_return(pager,
4054 m->vmp_offset + object->paging_offset,
4055 PAGE_SIZE,
4056 NULL,
4057 NULL,
4058 FALSE,
4059 FALSE,
4060 0);
4061
/* the pager call is done: drop the paging_in_progress reference taken above */
4062 vm_object_lock(object);
4063 vm_object_paging_end(object);
4064 vm_object_unlock(object);
4065
4066 vm_pageout_io_throttle();
4067
4068 vm_page_lockspin_queues();
4069 }
/* pending list is empty and the page queues lock is still held */
4070 q->pgo_busy = FALSE;
4071
4072 vm_page_unlock_queues();
4073 sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
4074 }
4075 /*NOTREACHED*/
4076 }
4077
4078
/*
 * Number of successfully compressed pages a compressor thread collects on
 * its local free list before passing the list to vm_page_free_list().
 */
4079 #define MAX_FREE_BATCH 32
4080 uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
4081 * this thread.
4082 */
4083
4084
4085 OS_NORETURN
4086 static void
vm_pageout_iothread_internal_continue(struct pgo_iothread_state * cq,__unused wait_result_t w)4087 vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
4088 {
4089 struct vm_pageout_queue *q;
4090 vm_page_t m = NULL;
4091 boolean_t pgo_draining;
4092 vm_page_t local_q;
4093 int local_cnt;
4094 vm_page_t local_freeq = NULL;
4095 int local_freed = 0;
4096 int local_batch_size;
4097 #if DEVELOPMENT || DEBUG
4098 int ncomps = 0;
4099 boolean_t marked_active = FALSE;
4100 int num_pages_processed = 0;
4101 #endif
4102 void *chead = NULL;
4103
4104 KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
4105
4106 sched_cond_ack(&(cq->pgo_wakeup));
4107
4108 q = cq->q;
4109
4110 while (true) {
4111 #if DEVELOPMENT || DEBUG
4112 bool benchmark_accounting = false;
4113 /*
4114 * If we're running the compressor perf test, only process the benchmark pages.
4115 * We'll get back to our regular queue once the benchmark is done
4116 */
4117 if (compressor_running_perf_test) {
4118 q = cq->benchmark_q;
4119 if (!vm_page_queue_empty(&q->pgo_pending)) {
4120 benchmark_accounting = true;
4121 } else {
4122 q = cq->q;
4123 benchmark_accounting = false;
4124 }
4125 }
4126 #endif /* DEVELOPMENT || DEBUG */
4127
4128 #if __AMP__
4129 if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
4130 local_batch_size = (q->pgo_maxlaundry >> 3);
4131 local_batch_size = MAX(local_batch_size, 16);
4132 } else {
4133 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4134 }
4135 #else
4136 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4137 #endif
4138
4139 #if RECORD_THE_COMPRESSED_DATA
4140 if (q->pgo_laundry) {
4141 c_compressed_record_init();
4142 }
4143 #endif
4144 while (true) {
4145 int pages_left_on_q = 0;
4146
4147 local_cnt = 0;
4148 local_q = NULL;
4149
4150 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
4151
4152 vm_page_lock_queues();
4153 #if DEVELOPMENT || DEBUG
4154 if (marked_active == FALSE) {
4155 vmct_active++;
4156 vmct_state[cq->id] = VMCT_ACTIVE;
4157 marked_active = TRUE;
4158 if (vmct_active == 1) {
4159 vm_compressor_epoch_start = mach_absolute_time();
4160 }
4161 }
4162 #endif
4163 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4164
4165 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
4166
4167 while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4168 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4169 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4170 VM_PAGE_CHECK(m);
4171
4172 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4173 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4174 m->vmp_laundry = FALSE;
4175
4176 m->vmp_snext = local_q;
4177 local_q = m;
4178 local_cnt++;
4179 }
4180 if (local_q == NULL) {
4181 break;
4182 }
4183
4184 q->pgo_busy = TRUE;
4185
4186 if ((pgo_draining = q->pgo_draining) == FALSE) {
4187 vm_pageout_throttle_up_batch(q, local_cnt);
4188 pages_left_on_q = q->pgo_laundry;
4189 } else {
4190 pages_left_on_q = q->pgo_laundry - local_cnt;
4191 }
4192
4193 vm_page_unlock_queues();
4194
4195 #if !RECORD_THE_COMPRESSED_DATA
4196 if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4197 // wake up the next compressor thread
4198 sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
4199 pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4200 }
4201 #endif
4202 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
4203
4204 while (local_q) {
4205 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
4206
4207 m = local_q;
4208 local_q = m->vmp_snext;
4209 m->vmp_snext = NULL;
4210
4211 /*
4212 * Technically we need the pageq locks to manipulate this field.
4213 * However, this page has been removed from all queues and is only
4214 * known to this compressor thread dealing with this local queue.
4215 *
4216 * TODO LIONEL: Add a second localq that is the early localq and
4217 * put special pages like this one on that queue in the block above
4218 * under the pageq lock to avoid this 'works but not clean' logic.
4219 */
4220 void *donate_queue_head;
4221 #if XNU_TARGET_OS_OSX
4222 donate_queue_head = &cq->current_early_swapout_chead;
4223 #else /* XNU_TARGET_OS_OSX */
4224 donate_queue_head = &cq->current_late_swapout_chead;
4225 #endif /* XNU_TARGET_OS_OSX */
4226 if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4227 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4228 chead = donate_queue_head;
4229 } else {
4230 chead = &cq->current_regular_swapout_chead;
4231 }
4232
4233 if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4234 #if DEVELOPMENT || DEBUG
4235 ncomps++;
4236 #endif
4237 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);
4238
4239 m->vmp_snext = local_freeq;
4240 local_freeq = m;
4241 local_freed++;
4242
4243 if (local_freed >= MAX_FREE_BATCH) {
4244 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4245
4246 vm_page_free_list(local_freeq, TRUE);
4247
4248 local_freeq = NULL;
4249 local_freed = 0;
4250 }
4251 }
4252 #if DEVELOPMENT || DEBUG
4253 num_pages_processed++;
4254 #endif /* DEVELOPMENT || DEBUG */
4255 #if !CONFIG_JETSAM
4256 while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4257 kern_return_t wait_result;
4258 int need_wakeup = 0;
4259
4260 if (local_freeq) {
4261 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4262
4263 vm_page_free_list(local_freeq, TRUE);
4264 local_freeq = NULL;
4265 local_freed = 0;
4266
4267 continue;
4268 }
4269 vm_free_page_lock_spin();
4270
4271 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4272 if (vm_page_free_wanted_privileged++ == 0) {
4273 need_wakeup = 1;
4274 }
4275 wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4276
4277 vm_free_page_unlock();
4278
4279 if (need_wakeup) {
4280 thread_wakeup((event_t)&vm_page_free_wanted);
4281 }
4282
4283 if (wait_result == THREAD_WAITING) {
4284 thread_block(THREAD_CONTINUE_NULL);
4285 }
4286 } else {
4287 vm_free_page_unlock();
4288 }
4289 }
4290 #endif
4291 }
4292 if (local_freeq) {
4293 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4294
4295 vm_page_free_list(local_freeq, TRUE);
4296 local_freeq = NULL;
4297 local_freed = 0;
4298 }
4299 if (pgo_draining == TRUE) {
4300 vm_page_lockspin_queues();
4301 vm_pageout_throttle_up_batch(q, local_cnt);
4302 vm_page_unlock_queues();
4303 }
4304 }
4305 KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
4306
4307 /*
4308 * queue lock is held and our q is empty
4309 */
4310 q->pgo_busy = FALSE;
4311 #if DEVELOPMENT || DEBUG
4312 if (marked_active == TRUE) {
4313 vmct_active--;
4314 vmct_state[cq->id] = VMCT_IDLE;
4315
4316 if (vmct_active == 0) {
4317 vm_compressor_epoch_stop = mach_absolute_time();
4318 assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4319 "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4320 vm_compressor_epoch_start, vm_compressor_epoch_stop);
4321 /* This interval includes intervals where one or more
4322 * compressor threads were pre-empted
4323 */
4324 vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4325 }
4326 }
4327 if (compressor_running_perf_test && benchmark_accounting) {
4328 /*
4329 * We could turn ON compressor_running_perf_test while still processing
4330 * regular non-benchmark pages. We shouldn't count them here else we
4331 * could overshoot. We might also still be populating that benchmark Q
4332 * and be under pressure. So we will go back to the regular queues. And
4333 * benchmark accounting will be off for that case too.
4334 */
4335 compressor_perf_test_pages_processed += num_pages_processed;
4336 thread_wakeup(&compressor_perf_test_pages_processed);
4337 }
4338 #endif
4339 vm_page_unlock_queues();
4340 #if DEVELOPMENT || DEBUG
4341 if (__improbable(vm_compressor_time_thread)) {
4342 vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
4343 vmct_stats.vmct_pages[cq->id] += ncomps;
4344 vmct_stats.vmct_iterations[cq->id]++;
4345 if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
4346 vmct_stats.vmct_maxpages[cq->id] = ncomps;
4347 }
4348 if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
4349 vmct_stats.vmct_minpages[cq->id] = ncomps;
4350 }
4351 }
4352 #endif
4353
4354 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4355 #if DEVELOPMENT || DEBUG
4356 if (compressor_running_perf_test && benchmark_accounting) {
4357 /*
4358 * We've been exclusively compressing pages from the benchmark queue,
4359 * do 1 pass over the internal queue before blocking.
4360 */
4361 continue;
4362 }
4363 #endif
4364
4365 sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4366 }
4367 /*NOTREACHED*/
4368 }
4369
4370
4371 kern_return_t
vm_pageout_compress_page(void ** current_chead,char * scratch_buf,vm_page_t m)4372 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4373 {
4374 vm_object_t object;
4375 memory_object_t pager;
4376 int compressed_count_delta;
4377 kern_return_t retval;
4378
4379 object = VM_PAGE_OBJECT(m);
4380
4381 assert(!m->vmp_free_when_done);
4382 assert(!m->vmp_laundry);
4383
4384 pager = object->pager;
4385
4386 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4387 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
4388
4389 vm_object_lock(object);
4390
4391 /*
4392 * If there is no memory object for the page, create
4393 * one and hand it to the compression pager.
4394 */
4395
4396 if (!object->pager_initialized) {
4397 vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4398 }
4399 if (!object->pager_initialized) {
4400 vm_object_compressor_pager_create(object);
4401 }
4402
4403 pager = object->pager;
4404
4405 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4406 /*
4407 * Still no pager for the object,
4408 * or the pager has been destroyed.
4409 * Reactivate the page.
4410 *
4411 * Should only happen if there is no
4412 * compression pager
4413 */
4414 PAGE_WAKEUP_DONE(m);
4415
4416 vm_page_lockspin_queues();
4417 vm_page_activate(m);
4418 VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4419 vm_page_unlock_queues();
4420
4421 /*
4422 * And we are done with it.
4423 */
4424 vm_object_activity_end(object);
4425 vm_object_unlock(object);
4426
4427 return KERN_FAILURE;
4428 }
4429 vm_object_unlock(object);
4430
4431 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
4432 }
4433 assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4434 assert(object->activity_in_progress > 0);
4435
4436 retval = vm_compressor_pager_put(
4437 pager,
4438 m->vmp_offset + object->paging_offset,
4439 VM_PAGE_GET_PHYS_PAGE(m),
4440 current_chead,
4441 scratch_buf,
4442 &compressed_count_delta);
4443
4444 vm_object_lock(object);
4445
4446 assert(object->activity_in_progress > 0);
4447 assert(VM_PAGE_OBJECT(m) == object);
4448 assert( !VM_PAGE_WIRED(m));
4449
4450 vm_compressor_pager_count(pager,
4451 compressed_count_delta,
4452 FALSE, /* shared_lock */
4453 object);
4454
4455 if (retval == KERN_SUCCESS) {
4456 /*
4457 * If the object is purgeable, its owner's
4458 * purgeable ledgers will be updated in
4459 * vm_page_remove() but the page still
4460 * contributes to the owner's memory footprint,
4461 * so account for it as such.
4462 */
4463 if ((object->purgable != VM_PURGABLE_DENY ||
4464 object->vo_ledger_tag) &&
4465 object->vo_owner != NULL) {
4466 /* one more compressed purgeable/tagged page */
4467 vm_object_owner_compressed_update(object,
4468 +1);
4469 }
4470 counter_inc(&vm_statistics_compressions);
4471
4472 if (m->vmp_tabled) {
4473 vm_page_remove(m, TRUE);
4474 }
4475 } else {
4476 PAGE_WAKEUP_DONE(m);
4477
4478 vm_page_lockspin_queues();
4479
4480 vm_page_activate(m);
4481 vm_pageout_vminfo.vm_compressor_failed++;
4482
4483 vm_page_unlock_queues();
4484 }
4485 vm_object_activity_end(object);
4486 vm_object_unlock(object);
4487
4488 return retval;
4489 }
4490
4491
4492 static void
vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state * ethr,boolean_t req_lowpriority)4493 vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4494 {
4495 uint32_t policy;
4496
4497 if (hibernate_cleaning_in_progress == TRUE) {
4498 req_lowpriority = FALSE;
4499 }
4500
4501 if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4502 vm_page_unlock_queues();
4503
4504 if (req_lowpriority == TRUE) {
4505 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4506 DTRACE_VM(laundrythrottle);
4507 } else {
4508 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4509 DTRACE_VM(laundryunthrottle);
4510 }
4511 proc_set_thread_policy(ethr->pgo_iothread,
4512 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4513
4514 vm_page_lock_queues();
4515 ethr->q->pgo_lowpriority = req_lowpriority;
4516 }
4517 }
4518
4519 OS_NORETURN
4520 static void
vm_pageout_iothread_external(struct pgo_iothread_state * ethr,__unused wait_result_t w)4521 vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
4522 {
4523 thread_t self = current_thread();
4524
4525 self->options |= TH_OPT_VMPRIV;
4526
4527 DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4528
4529 proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4530 TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4531
4532 vm_page_lock_queues();
4533
4534 vm_pageout_queue_external.pgo_lowpriority = TRUE;
4535 vm_pageout_queue_external.pgo_inited = TRUE;
4536
4537 vm_page_unlock_queues();
4538
4539 #if CONFIG_THREAD_GROUPS
4540 thread_group_vm_add();
4541 #endif /* CONFIG_THREAD_GROUPS */
4542
4543 vm_pageout_iothread_external_continue(ethr, 0);
4544 /*NOTREACHED*/
4545 }
4546
4547
4548 OS_NORETURN
4549 static void
vm_pageout_iothread_internal(struct pgo_iothread_state * cthr,__unused wait_result_t w)4550 vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
4551 {
4552 thread_t self = current_thread();
4553
4554 self->options |= TH_OPT_VMPRIV;
4555
4556 vm_page_lock_queues();
4557
4558 vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4559 vm_pageout_queue_internal.pgo_inited = TRUE;
4560
4561 #if DEVELOPMENT || DEBUG
4562 vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
4563 vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
4564 vm_pageout_queue_benchmark.pgo_busy = FALSE;
4565 #endif /* DEVELOPMENT || DEBUG */
4566
4567 vm_page_unlock_queues();
4568
4569 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4570 thread_vm_bind_group_add();
4571 }
4572
4573 #if CONFIG_THREAD_GROUPS
4574 thread_group_vm_add();
4575 #endif /* CONFIG_THREAD_GROUPS */
4576
4577 #if __AMP__
4578 if (vm_compressor_ebound) {
4579 /*
4580 * Use the soft bound option for vm_compressor to allow it to run on
4581 * P-cores if E-cluster is unavailable.
4582 */
4583 thread_bind_cluster_type(self, 'E', true);
4584 }
4585 #endif /* __AMP__ */
4586
4587 thread_set_thread_name(current_thread(), "VM_compressor");
4588 #if DEVELOPMENT || DEBUG
4589 vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
4590 #endif
4591 vm_pageout_iothread_internal_continue(cthr, 0);
4592
4593 /*NOTREACHED*/
4594 }
4595
4596 kern_return_t
vm_set_buffer_cleanup_callout(boolean_t (* func)(int))4597 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4598 {
4599 if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4600 return KERN_SUCCESS;
4601 } else {
4602 return KERN_FAILURE; /* Already set */
4603 }
4604 }
4605
/*
 * Defined outside this section.  vm_pressure_response() recomputes
 * memorystatus_level on each call and stops before evaluating pressure
 * levels when memorystatus_manual_testing_on is set.
 */
4606 extern boolean_t memorystatus_manual_testing_on;
4607 extern unsigned int memorystatus_level;
4608
4609
4610 #if VM_PRESSURE_EVENTS
4611
/* vm_pressure_response() returns immediately while this is FALSE. */
4612 boolean_t vm_pressure_events_enabled = FALSE;
4613
/*
 * Defined outside this section.  vm_pressure_response() stamps these with
 * the current absolute time when it re-asserts a warning / critical level
 * that has been unchanged for too long.
 */
4614 extern uint64_t next_warning_notification_sent_at_ts;
4615 extern uint64_t next_critical_notification_sent_at_ts;
4616
4617 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */
4618
4619 /*
4620 * The last time there was change in pressure level OR we forced a check
4621 * because the system is stuck in a non-normal pressure level.
4622 */
4623 uint64_t vm_pressure_last_level_transition_abs = 0;
4624
4625 /*
4626 * This is how long the system waits 'stuck' in an unchanged non-normal pressure
4627 * level before resending out notifications for that level again.
4628 */
4629 int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4630
4631 void
vm_pressure_response(void)4632 vm_pressure_response(void)
4633 {
4634 vm_pressure_level_t old_level = kVMPressureNormal;
4635 int new_level = -1;
4636 unsigned int total_pages;
4637 uint64_t available_memory = 0;
4638 uint64_t curr_ts, abs_time_since_level_transition, time_in_ns;
4639 bool force_check = false;
4640 int time_in_mins;
4641
4642
4643 if (vm_pressure_events_enabled == FALSE) {
4644 return;
4645 }
4646
4647 #if !XNU_TARGET_OS_OSX
4648
4649 available_memory = (uint64_t) memorystatus_available_pages;
4650
4651 #else /* !XNU_TARGET_OS_OSX */
4652
4653 available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4654 memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4655
4656 #endif /* !XNU_TARGET_OS_OSX */
4657
4658 total_pages = (unsigned int) atop_64(max_mem);
4659 #if CONFIG_SECLUDED_MEMORY
4660 total_pages -= vm_page_secluded_count;
4661 #endif /* CONFIG_SECLUDED_MEMORY */
4662 memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4663
4664 if (memorystatus_manual_testing_on) {
4665 return;
4666 }
4667
4668 curr_ts = mach_absolute_time();
4669 abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4670
4671 absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4672 time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4673 force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
4674
4675 old_level = memorystatus_vm_pressure_level;
4676
4677 switch (memorystatus_vm_pressure_level) {
4678 case kVMPressureNormal:
4679 {
4680 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4681 new_level = kVMPressureCritical;
4682 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4683 new_level = kVMPressureWarning;
4684 }
4685 break;
4686 }
4687
4688 case kVMPressureWarning:
4689 case kVMPressureUrgent:
4690 {
4691 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4692 new_level = kVMPressureNormal;
4693 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4694 new_level = kVMPressureCritical;
4695 } else if (force_check) {
4696 new_level = kVMPressureWarning;
4697 next_warning_notification_sent_at_ts = curr_ts;
4698 }
4699 break;
4700 }
4701
4702 case kVMPressureCritical:
4703 {
4704 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4705 new_level = kVMPressureNormal;
4706 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4707 new_level = kVMPressureWarning;
4708 } else if (force_check) {
4709 new_level = kVMPressureCritical;
4710 next_critical_notification_sent_at_ts = curr_ts;
4711 }
4712 break;
4713 }
4714
4715 default:
4716 return;
4717 }
4718
4719 if (new_level != -1 || force_check) {
4720 if (new_level != -1) {
4721 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4722
4723 if (new_level != (int) old_level) {
4724 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4725 new_level, old_level, 0, 0);
4726 }
4727 } else {
4728 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4729 new_level, old_level, force_check, 0);
4730 }
4731
4732 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
4733 /*
4734 * We don't want to schedule a wakeup while hibernation is in progress
4735 * because that could collide with checks for non-monotonicity in the scheduler.
4736 * We do however do all the updates to memorystatus_vm_pressure_level because
4737 * we _might_ want to use that for decisions regarding which pages or how
4738 * many pages we want to dump in hibernation.
4739 */
4740 return;
4741 }
4742
4743 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4744 if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4745 thread_wakeup(&vm_pressure_thread);
4746 }
4747
4748 if (old_level != memorystatus_vm_pressure_level) {
4749 thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4750 }
4751 vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4752 }
4753 }
4754 }
4755 #endif /* VM_PRESSURE_EVENTS */
4756
4757
4758 /**
4759 * Called by a kernel thread to ask if a number of pages may be wired.
4760 */
4761 kern_return_t
mach_vm_wire_level_monitor(int64_t requested_pages)4762 mach_vm_wire_level_monitor(int64_t requested_pages)
4763 {
4764 if (requested_pages <= 0) {
4765 return KERN_INVALID_ARGUMENT;
4766 }
4767
4768 const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4769 /**
4770 * Available pages can be negative in the case where more system memory is
4771 * wired than the threshold, so we must use a signed integer.
4772 */
4773 const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4774
4775 if (requested_pages > available_pages) {
4776 return KERN_RESOURCE_SHORTAGE;
4777 }
4778 return KERN_SUCCESS;
4779 }
4780
4781 /*
4782 * Function called by a kernel thread to either get the current pressure level or
4783 * wait until memory pressure changes from a given level.
4784 */
4785 kern_return_t
mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure,__unused unsigned int * pressure_level)4786 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
4787 {
4788 #if !VM_PRESSURE_EVENTS
4789
4790 return KERN_FAILURE;
4791
4792 #else /* VM_PRESSURE_EVENTS */
4793
4794 wait_result_t wr = 0;
4795 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4796
4797 if (pressure_level == NULL) {
4798 return KERN_INVALID_ARGUMENT;
4799 }
4800
4801 if (*pressure_level == kVMPressureJetsam) {
4802 if (!wait_for_pressure) {
4803 return KERN_INVALID_ARGUMENT;
4804 }
4805
4806 lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
4807 wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
4808 THREAD_INTERRUPTIBLE);
4809 if (wr == THREAD_WAITING) {
4810 ++memorystatus_jetsam_fg_band_waiters;
4811 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4812 wr = thread_block(THREAD_CONTINUE_NULL);
4813 } else {
4814 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4815 }
4816 if (wr != THREAD_AWAKENED) {
4817 return KERN_ABORTED;
4818 }
4819 *pressure_level = kVMPressureJetsam;
4820 return KERN_SUCCESS;
4821 }
4822
4823 if (wait_for_pressure == TRUE) {
4824 while (old_level == *pressure_level) {
4825 wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4826 THREAD_INTERRUPTIBLE);
4827 if (wr == THREAD_WAITING) {
4828 wr = thread_block(THREAD_CONTINUE_NULL);
4829 }
4830 if (wr == THREAD_INTERRUPTED) {
4831 return KERN_ABORTED;
4832 }
4833
4834 if (wr == THREAD_AWAKENED) {
4835 old_level = memorystatus_vm_pressure_level;
4836 }
4837 }
4838 }
4839
4840 *pressure_level = old_level;
4841 return KERN_SUCCESS;
4842 #endif /* VM_PRESSURE_EVENTS */
4843 }
4844
4845 #if VM_PRESSURE_EVENTS
4846 void
vm_pressure_thread(void)4847 vm_pressure_thread(void)
4848 {
4849 static boolean_t thread_initialized = FALSE;
4850
4851 if (thread_initialized == TRUE) {
4852 vm_pageout_state.vm_pressure_thread_running = TRUE;
4853 consider_vm_pressure_events();
4854 vm_pageout_state.vm_pressure_thread_running = FALSE;
4855 }
4856
4857 #if CONFIG_THREAD_GROUPS
4858 thread_group_vm_add();
4859 #endif /* CONFIG_THREAD_GROUPS */
4860
4861 thread_set_thread_name(current_thread(), "VM_pressure");
4862 thread_initialized = TRUE;
4863 assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4864 thread_block((thread_continue_t)vm_pressure_thread);
4865 }
4866 #endif /* VM_PRESSURE_EVENTS */
4867
4868
4869 /*
4870 * called once per-second via "compute_averages"
4871 */
4872 void
compute_pageout_gc_throttle(__unused void * arg)4873 compute_pageout_gc_throttle(__unused void *arg)
4874 {
4875 if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4876 vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4877
4878 thread_wakeup(VM_PAGEOUT_GC_EVENT);
4879 }
4880 }
4881
4882 /*
4883 * vm_pageout_garbage_collect can also be called when the zone allocator needs
4884 * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4885 * jetsams. We need to check if the zone map size is above its jetsam limit to
4886 * decide if this was indeed the case.
4887 *
4888 * We need to do this on a different thread because of the following reasons:
4889 *
4890 * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4891 * itself causing the system to hang. We perform synchronous jetsams if we're
4892 * leaking in the VM map entries zone, so the leaking process could be doing a
4893 * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4894 * jetsam itself. We also need the vm_map lock on the process termination path,
4895 * which would now lead the dying process to deadlock against itself.
4896 *
4897 * 2. The jetsam path might need to allocate zone memory itself. We could try
4898 * using the non-blocking variant of zalloc for this path, but we can still
4899 * end up trying to do a kmem_alloc when the zone maps are almost full.
4900 */
4901 __dead2
4902 void
vm_pageout_garbage_collect(void * step,wait_result_t wr __unused)4903 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4904 {
4905 assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4906
4907 if (step == VM_PAGEOUT_GC_INIT) {
4908 /* first time being called is not about GC */
4909 #if CONFIG_THREAD_GROUPS
4910 thread_group_vm_add();
4911 #endif /* CONFIG_THREAD_GROUPS */
4912 } else if (zone_map_nearing_exhaustion()) {
4913 /*
4914 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4915 *
4916 * Bail out after calling zone_gc (which triggers the
4917 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4918 * operations that clear out a bunch of caches might allocate zone
4919 * memory themselves (for eg. vm_map operations would need VM map
4920 * entries). Since the zone map is almost full at this point, we
4921 * could end up with a panic. We just need to quickly jetsam a
4922 * process and exit here.
4923 *
4924 * It could so happen that we were woken up to relieve memory
4925 * pressure and the zone map also happened to be near its limit at
4926 * the time, in which case we'll skip out early. But that should be
4927 * ok; if memory pressure persists, the thread will simply be woken
4928 * up again.
4929 */
4930 zone_gc(ZONE_GC_JETSAM);
4931 } else {
4932 /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4933 boolean_t buf_large_zfree = FALSE;
4934 boolean_t first_try = TRUE;
4935
4936 stack_collect();
4937
4938 consider_machine_collect();
4939 mbuf_drain(FALSE);
4940
4941 do {
4942 if (consider_buffer_cache_collect != NULL) {
4943 buf_large_zfree = (*consider_buffer_cache_collect)(0);
4944 }
4945 if (first_try == TRUE || buf_large_zfree == TRUE) {
4946 /*
4947 * zone_gc should be last, because the other operations
4948 * might return memory to zones.
4949 */
4950 zone_gc(ZONE_GC_TRIM);
4951 }
4952 first_try = FALSE;
4953 } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
4954
4955 consider_machine_adjust();
4956 }
4957
4958 assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);
4959
4960 thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
4961 __builtin_unreachable();
4962 }
4963
4964
#if VM_PAGE_BUCKETS_CHECK
#if VM_PAGE_FAKE_BUCKETS
/*
 * Bounds of the "fake buckets" range, defined elsewhere. vm_pageout()
 * prints this range and restricts it to VM_PROT_READ in the kernel pmap.
 */
extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
#endif /* VM_PAGE_FAKE_BUCKETS */
#endif /* VM_PAGE_BUCKETS_CHECK */
4970
4971
4972
4973 void
vm_set_restrictions(unsigned int num_cpus)4974 vm_set_restrictions(unsigned int num_cpus)
4975 {
4976 int vm_restricted_to_single_processor = 0;
4977
4978 if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
4979 kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
4980 vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
4981 } else {
4982 assert(num_cpus > 0);
4983
4984 if (num_cpus <= 3) {
4985 /*
4986 * on systems with a limited number of CPUS, bind the
4987 * 4 major threads that can free memory and that tend to use
4988 * a fair bit of CPU under pressured conditions to a single processor.
4989 * This insures that these threads don't hog all of the available CPUs
4990 * (important for camera launch), while allowing them to run independently
4991 * w/r to locks... the 4 threads are
4992 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
4993 * vm_compressor_swap_trigger_thread (minor and major compactions),
4994 * memorystatus_thread (jetsams).
4995 *
4996 * the first time the thread is run, it is responsible for checking the
4997 * state of vm_restricted_to_single_processor, and if TRUE it calls
4998 * thread_bind_master... someday this should be replaced with a group
4999 * scheduling mechanism and KPI.
5000 */
5001 vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5002 } else {
5003 vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5004 }
5005 }
5006 }
5007
5008 /*
5009 * Set up vm_config based on the vm_compressor_mode.
5010 * Must run BEFORE the pageout thread starts up.
5011 */
5012 __startup_func
5013 void
vm_config_init(void)5014 vm_config_init(void)
5015 {
5016 bzero(&vm_config, sizeof(vm_config));
5017
5018 switch (vm_compressor_mode) {
5019 case VM_PAGER_DEFAULT:
5020 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
5021 OS_FALLTHROUGH;
5022
5023 case VM_PAGER_COMPRESSOR_WITH_SWAP:
5024 vm_config.compressor_is_present = TRUE;
5025 vm_config.swap_is_present = TRUE;
5026 vm_config.compressor_is_active = TRUE;
5027 vm_config.swap_is_active = TRUE;
5028 break;
5029
5030 case VM_PAGER_COMPRESSOR_NO_SWAP:
5031 vm_config.compressor_is_present = TRUE;
5032 vm_config.swap_is_present = TRUE;
5033 vm_config.compressor_is_active = TRUE;
5034 break;
5035
5036 case VM_PAGER_FREEZER_DEFAULT:
5037 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5038 OS_FALLTHROUGH;
5039
5040 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5041 vm_config.compressor_is_present = TRUE;
5042 vm_config.swap_is_present = TRUE;
5043 break;
5044
5045 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5046 vm_config.compressor_is_present = TRUE;
5047 vm_config.swap_is_present = TRUE;
5048 vm_config.compressor_is_active = TRUE;
5049 vm_config.freezer_swap_is_active = TRUE;
5050 break;
5051
5052 case VM_PAGER_NOT_CONFIGURED:
5053 break;
5054
5055 default:
5056 printf("unknown compressor mode - %x\n", vm_compressor_mode);
5057 break;
5058 }
5059 }
5060
5061 __startup_func
5062 static void
vm_pageout_create_gc_thread(void)5063 vm_pageout_create_gc_thread(void)
5064 {
5065 thread_t thread;
5066
5067 if (kernel_thread_create(vm_pageout_garbage_collect,
5068 VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
5069 panic("vm_pageout_garbage_collect: create failed");
5070 }
5071 thread_set_thread_name(thread, "VM_pageout_garbage_collect");
5072 if (thread->reserved_stack == 0) {
5073 assert(thread->kernel_stack);
5074 thread->reserved_stack = thread->kernel_stack;
5075 }
5076
5077 /* thread is started in vm_pageout() */
5078 vm_pageout_gc_thread = thread;
5079 }
5080 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5081
/*
 * Entry point of the pageout daemon thread (named "VM_pageout_scan").
 *
 * In order, this routine:
 *   - sets the calling thread's priority, VM privilege, reserved stack and
 *     CPU binding;
 *   - initializes the paging parameters in vm_pageout_state and the
 *     pageout queues;
 *   - starts the external pageout I/O thread, the garbage collection
 *     thread and (with VM_PRESSURE_EVENTS) the pressure thread;
 *   - initializes the object reaper, the compressor (if present) and a few
 *     optional subsystems;
 *   - hands control to vm_pageout_continue(), which does not return.
 *
 * Any thread creation failure is fatal (panic).
 */
void
vm_pageout(void)
{
	thread_t        self = current_thread();
	thread_t        thread;
	kern_return_t   result;
	spl_t           s;

	/*
	 * Set thread privileges.
	 */
	s = splsched();

#if CONFIG_VPS_DYNAMIC_PRIO

	int vps_dynprio_bootarg = 0;

	/*
	 * The boot-arg wins when present; otherwise dynamic priority is
	 * enabled exactly when the VM threads are restricted to a single
	 * processor.
	 */
	if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) {
		vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? TRUE : FALSE);
		kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled);
	} else {
		if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
			vps_dynamic_priority_enabled = TRUE;
		} else {
			vps_dynamic_priority_enabled = FALSE;
		}
	}

	if (vps_dynamic_priority_enabled) {
		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
		thread_set_eager_preempt(self);
	} else {
		sched_set_kernel_thread_priority(self, BASEPRI_VM);
	}

#else /* CONFIG_VPS_DYNAMIC_PRIO */

	vps_dynamic_priority_enabled = FALSE;
	sched_set_kernel_thread_priority(self, BASEPRI_VM);

#endif /* CONFIG_VPS_DYNAMIC_PRIO */

	/* Mark this thread as VM-privileged; options is updated under the thread lock. */
	thread_lock(self);
	self->options |= TH_OPT_VMPRIV;
	thread_unlock(self);

	/* Use the current kernel stack as the reserved stack if none is set. */
	if (!self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	/* The bind group is only joined when dynamic priority is not in use. */
	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
	    vps_dynamic_priority_enabled == FALSE) {
		thread_vm_bind_group_add();
	}


#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
	if (vm_pgo_pbound) {
		/*
		 * Use the soft bound option for vm pageout to allow it to run on
		 * E-cores if P-cluster is unavailable.
		 */
		thread_bind_cluster_type(self, 'P', true);
	}
#endif /* __AMP__ */

	PE_parse_boot_argn("vmpgo_protect_realtime",
	    &vm_pageout_protect_realtime,
	    sizeof(vm_pageout_protect_realtime));
	splx(s);

	thread_set_thread_name(current_thread(), "VM_pageout_scan");

	/*
	 * Initialize some paging parameters.
	 */

	vm_pageout_state.vm_pressure_thread_running = FALSE;
	vm_pageout_state.vm_pressure_changed = FALSE;
	vm_pageout_state.memorystatus_purge_on_warning = 2;
	vm_pageout_state.memorystatus_purge_on_urgent = 5;
	vm_pageout_state.memorystatus_purge_on_critical = 8;
	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
	vm_pageout_state.vm_page_speculative_percentage = 5;
	vm_pageout_state.vm_page_speculative_target = 0;

	vm_pageout_state.vm_pageout_swap_wait = 0;
	vm_pageout_state.vm_pageout_idle_wait = 0;
	vm_pageout_state.vm_pageout_empty_wait = 0;
	vm_pageout_state.vm_pageout_burst_wait = 0;
	vm_pageout_state.vm_pageout_deadlock_wait = 0;
	vm_pageout_state.vm_pageout_deadlock_relief = 0;
	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;

	vm_pageout_state.vm_pageout_inactive = 0;
	vm_pageout_state.vm_pageout_inactive_used = 0;
	vm_pageout_state.vm_pageout_inactive_clean = 0;

	vm_pageout_state.vm_memory_pressure = 0;
	vm_pageout_state.vm_page_filecache_min = 0;
#if CONFIG_JETSAM
	vm_pageout_state.vm_page_filecache_min_divisor = 70;
	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
#else
	vm_pageout_state.vm_page_filecache_min_divisor = 27;
	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
#endif
	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;

	vm_pageout_state.vm_pageout_considered_page_last = 0;

	/*
	 * The wait/throttle fields were all zeroed just above, so each of the
	 * checks below installs the corresponding compile-time default.
	 */
	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
	}

	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
	}

	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
	}

	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
	}

	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
	}

	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
	}

	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
	}
	/*
	 * even if we've already called vm_page_free_reserve
	 * call it again here to insure that the targets are
	 * accurately calculated (it uses vm_page_free_count_init)
	 * calling it with an arg of 0 will not change the reserve
	 * but will re-calculate free_min and free_target
	 */
	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
	} else {
		vm_page_free_reserve(0);
	}

	/* Reset both pageout queues and initialize their pending page queues. */
	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));

	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;

	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);

#if DEVELOPMENT || DEBUG
	bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
	vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
#endif /* DEVELOPMENT || DEBUG */


	/* internal pageout thread started when default pager registered first time */
	/* external pageout and garbage collection threads started here */
	struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
	ethr->id = 0;
	ethr->q = &vm_pageout_queue_external;
	ethr->current_early_swapout_chead = NULL;
	ethr->current_regular_swapout_chead = NULL;
	ethr->current_late_swapout_chead = NULL;
	ethr->scratch_buf = NULL;
#if DEVELOPMENT || DEBUG
	ethr->benchmark_q = NULL;
#endif /* DEVELOPMENT || DEBUG */
	sched_cond_init(&(ethr->pgo_wakeup));

	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
	    (void *)ethr, BASEPRI_VM,
	    &(ethr->pgo_iothread));
	if (result != KERN_SUCCESS) {
		panic("vm_pageout: Unable to create external thread (%d)\n", result);
	}
	thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");

	/* Start the GC thread held in vm_pageout_gc_thread, under its thread mutex. */
	thread_mtx_lock(vm_pageout_gc_thread );
	thread_start(vm_pageout_gc_thread );
	thread_mtx_unlock(vm_pageout_gc_thread);

#if VM_PRESSURE_EVENTS
	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
	    BASEPRI_DEFAULT,
	    &thread);

	if (result != KERN_SUCCESS) {
		panic("vm_pressure_thread: create failed");
	}

	/*
	 * NOTE(review): presumably drops the reference handed back by
	 * kernel_thread_start_priority(); the handle is not used afterwards.
	 */
	thread_deallocate(thread);
#endif

	vm_object_reaper_init();


	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
		vm_compressor_init();
	}

#if VM_PRESSURE_EVENTS
	vm_pressure_events_enabled = TRUE;
#endif /* VM_PRESSURE_EVENTS */

#if CONFIG_PHANTOM_CACHE
	vm_phantom_cache_init();
#endif
#if VM_PAGE_BUCKETS_CHECK
#if VM_PAGE_FAKE_BUCKETS
	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
	    (uint64_t) vm_page_fake_buckets_start,
	    (uint64_t) vm_page_fake_buckets_end);
	pmap_protect(kernel_pmap,
	    vm_page_fake_buckets_start,
	    vm_page_fake_buckets_end,
	    VM_PROT_READ);
//	*(char *) vm_page_fake_buckets_start = 'x';	/* panic! */
#endif /* VM_PAGE_FAKE_BUCKETS */
#endif /* VM_PAGE_BUCKETS_CHECK */

#if VM_OBJECT_TRACKING
	vm_object_tracking_init();
#endif /* VM_OBJECT_TRACKING */

#if __arm64__
//	vm_tests();
#endif /* __arm64__ */

	vm_pageout_continue();

	/*
	 * Unreached code!
	 *
	 * The vm_pageout_continue() call above never returns, so the code below is never
	 * executed.  We take advantage of this to declare several DTrace VM related probe
	 * points that our kernel doesn't have an analog for.  These are probe points that
	 * exist in Solaris and are in the DTrace documentation, so people may have written
	 * scripts that use them.  Declaring the probe points here means their scripts will
	 * compile and execute which we want for portability of the scripts, but since this
	 * section of code is never reached, the probe points will simply never fire.  Yes,
	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
	 * Solaris specific VM events in mind, not portability to different VM implementations.
	 */

	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
	/*NOTREACHED*/
}
5350
5351
5352
5353 kern_return_t
vm_pageout_internal_start(void)5354 vm_pageout_internal_start(void)
5355 {
5356 kern_return_t result = KERN_SUCCESS;
5357 host_basic_info_data_t hinfo;
5358 vm_offset_t buf, bufsize;
5359
5360 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5361
5362 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5363 #define BSD_HOST 1
5364 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5365
5366 assert(hinfo.max_cpus > 0);
5367
5368 #if !XNU_TARGET_OS_OSX
5369 vm_pageout_state.vm_compressor_thread_count = 1;
5370 #else /* !XNU_TARGET_OS_OSX */
5371 if (hinfo.max_cpus > 4) {
5372 vm_pageout_state.vm_compressor_thread_count = 2;
5373 } else {
5374 vm_pageout_state.vm_compressor_thread_count = 1;
5375 }
5376 #endif /* !XNU_TARGET_OS_OSX */
5377 #if __AMP__
5378 if (vm_compressor_ebound) {
5379 vm_pageout_state.vm_compressor_thread_count = 2;
5380 }
5381 #endif
5382 PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5383 sizeof(vm_pageout_state.vm_compressor_thread_count));
5384
5385 if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5386 vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5387 }
5388 if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5389 vm_pageout_state.vm_compressor_thread_count = 1;
5390 } else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5391 vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5392 }
5393
5394 vm_pageout_queue_internal.pgo_maxlaundry =
5395 (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5396
5397 PE_parse_boot_argn("vmpgoi_maxlaundry",
5398 &vm_pageout_queue_internal.pgo_maxlaundry,
5399 sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5400
5401 #if DEVELOPMENT || DEBUG
5402 // Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
5403 vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
5404 #endif /* DEVELOPMENT || DEBUG */
5405
5406 bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5407
5408 kmem_alloc(kernel_map, &buf,
5409 bufsize * vm_pageout_state.vm_compressor_thread_count,
5410 KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
5411 VM_KERN_MEMORY_COMPRESSOR);
5412
5413 for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5414 struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
5415 iq->id = i;
5416 iq->q = &vm_pageout_queue_internal;
5417 iq->current_early_swapout_chead = NULL;
5418 iq->current_regular_swapout_chead = NULL;
5419 iq->current_late_swapout_chead = NULL;
5420 iq->scratch_buf = (char *)(buf + i * bufsize);
5421 #if DEVELOPMENT || DEBUG
5422 iq->benchmark_q = &vm_pageout_queue_benchmark;
5423 #endif /* DEVELOPMENT || DEBUG */
5424 sched_cond_init(&(iq->pgo_wakeup));
5425 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5426 (void *)iq, BASEPRI_VM,
5427 &(iq->pgo_iothread));
5428
5429 if (result != KERN_SUCCESS) {
5430 panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
5431 }
5432 }
5433 return result;
5434 }
5435
5436 #if CONFIG_IOSCHED
5437 /*
5438 * To support I/O Expedite for compressed files we mark the upls with special flags.
5439 * The way decmpfs works is that we create a big upl which marks all the pages needed to
5440 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5441 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5442 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5443 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5444 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
 * by the req upl lock (the reverse link doesn't need synchronization since we never inspect this link
 * unless the real I/O upl is being destroyed).
5447 */
5448
5449
5450 static void
upl_set_decmp_info(upl_t upl,upl_t src_upl)5451 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5452 {
5453 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5454
5455 upl_lock(src_upl);
5456 if (src_upl->decmp_io_upl) {
5457 /*
5458 * If there is already an alive real I/O UPL, ignore this new UPL.
5459 * This case should rarely happen and even if it does, it just means
5460 * that we might issue a spurious expedite which the driver is expected
5461 * to handle.
5462 */
5463 upl_unlock(src_upl);
5464 return;
5465 }
5466 src_upl->decmp_io_upl = (void *)upl;
5467 src_upl->ref_count++;
5468
5469 upl->flags |= UPL_DECMP_REAL_IO;
5470 upl->decmp_io_upl = (void *)src_upl;
5471 upl_unlock(src_upl);
5472 }
5473 #endif /* CONFIG_IOSCHED */
5474
/*
 * Non-zero when UPL debugging is compiled in (UPL_DEBUG). upl_create()
 * and upl_destroy() consult it, alongside the per-UPL tracking flags,
 * when deciding whether a UPL is tracked by its object.
 */
#if UPL_DEBUG
int     upl_debug_enabled = 1;
#else
int     upl_debug_enabled = 0;
#endif
5480
5481 static upl_t
upl_create(int type,int flags,upl_size_t size)5482 upl_create(int type, int flags, upl_size_t size)
5483 {
5484 uint32_t pages = (uint32_t)atop(round_page_32(size));
5485 upl_t upl;
5486
5487 assert(page_aligned(size));
5488
5489 /*
5490 * FIXME: this code assumes the allocation always succeeds,
5491 * however `pages` can be up to MAX_UPL_SIZE.
5492 *
5493 * The allocation size is above 32k (resp. 128k)
5494 * on 16k pages (resp. 4k), which kalloc might fail
5495 * to allocate.
5496 */
5497 upl = kalloc_type(struct upl, struct upl_page_info,
5498 (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
5499 if (type & UPL_CREATE_INTERNAL) {
5500 flags |= UPL_INTERNAL;
5501 }
5502
5503 if (type & UPL_CREATE_LITE) {
5504 flags |= UPL_LITE;
5505 if (pages) {
5506 upl->lite_list = bitmap_alloc(pages);
5507 }
5508 }
5509
5510 upl->flags = flags;
5511 upl->ref_count = 1;
5512 upl_lock_init(upl);
5513 #if CONFIG_IOSCHED
5514 if (type & UPL_CREATE_IO_TRACKING) {
5515 upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5516 }
5517
5518 if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5519 /* Only support expedite on internal UPLs */
5520 thread_t curthread = current_thread();
5521 upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
5522 Z_WAITOK | Z_ZERO);
5523 upl->flags |= UPL_EXPEDITE_SUPPORTED;
5524 if (curthread->decmp_upl != NULL) {
5525 upl_set_decmp_info(upl, curthread->decmp_upl);
5526 }
5527 }
5528 #endif
5529 #if CONFIG_IOSCHED || UPL_DEBUG
5530 if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5531 upl->upl_creator = current_thread();
5532 upl->flags |= UPL_TRACKED_BY_OBJECT;
5533 }
5534 #endif
5535
5536 #if UPL_DEBUG
5537 upl->uple_create_btref = btref_get(__builtin_frame_address(0), 0);
5538 #endif /* UPL_DEBUG */
5539
5540 return upl;
5541 }
5542
5543 static void
upl_destroy(upl_t upl)5544 upl_destroy(upl_t upl)
5545 {
5546 uint32_t pages;
5547
5548 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5549
5550 if (upl->ext_ref_count) {
5551 panic("upl(%p) ext_ref_count", upl);
5552 }
5553
5554 #if CONFIG_IOSCHED
5555 if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5556 upl_t src_upl;
5557 src_upl = upl->decmp_io_upl;
5558 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5559 upl_lock(src_upl);
5560 src_upl->decmp_io_upl = NULL;
5561 upl_unlock(src_upl);
5562 upl_deallocate(src_upl);
5563 }
5564 #endif /* CONFIG_IOSCHED */
5565
5566 #if CONFIG_IOSCHED || UPL_DEBUG
5567 if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5568 !(upl->flags & UPL_VECTOR)) {
5569 vm_object_t object;
5570
5571 if (upl->flags & UPL_SHADOWED) {
5572 object = upl->map_object->shadow;
5573 } else {
5574 object = upl->map_object;
5575 }
5576
5577 vm_object_lock(object);
5578 queue_remove(&object->uplq, upl, upl_t, uplq);
5579 vm_object_activity_end(object);
5580 vm_object_collapse(object, 0, TRUE);
5581 vm_object_unlock(object);
5582 }
5583 #endif
5584 /*
5585 * drop a reference on the map_object whether or
5586 * not a pageout object is inserted
5587 */
5588 if (upl->flags & UPL_SHADOWED) {
5589 vm_object_deallocate(upl->map_object);
5590 }
5591
5592 if (upl->flags & UPL_DEVICE_MEMORY) {
5593 pages = 1;
5594 } else {
5595 pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
5596 }
5597
5598 upl_lock_destroy(upl);
5599
5600 #if CONFIG_IOSCHED
5601 if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5602 kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
5603 }
5604 #endif
5605
5606 #if UPL_DEBUG
5607 for (int i = 0; i < upl->upl_commit_index; i++) {
5608 btref_put(upl->upl_commit_records[i].c_btref);
5609 }
5610 btref_put(upl->uple_create_btref);
5611 #endif /* UPL_DEBUG */
5612
5613 if ((upl->flags & UPL_LITE) && pages) {
5614 bitmap_free(upl->lite_list, pages);
5615 }
5616 kfree_type(struct upl, struct upl_page_info,
5617 (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
5618 }
5619
5620 void
upl_deallocate(upl_t upl)5621 upl_deallocate(upl_t upl)
5622 {
5623 upl_lock(upl);
5624
5625 if (--upl->ref_count == 0) {
5626 if (vector_upl_is_valid(upl)) {
5627 vector_upl_deallocate(upl);
5628 }
5629 upl_unlock(upl);
5630
5631 if (upl->upl_iodone) {
5632 upl_callout_iodone(upl);
5633 }
5634
5635 upl_destroy(upl);
5636 } else {
5637 upl_unlock(upl);
5638 }
5639 }
5640
5641 #if CONFIG_IOSCHED
5642 void
upl_mark_decmp(upl_t upl)5643 upl_mark_decmp(upl_t upl)
5644 {
5645 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5646 upl->flags |= UPL_DECMP_REQ;
5647 upl->upl_creator->decmp_upl = (void *)upl;
5648 }
5649 }
5650
5651 void
upl_unmark_decmp(upl_t upl)5652 upl_unmark_decmp(upl_t upl)
5653 {
5654 if (upl && (upl->flags & UPL_DECMP_REQ)) {
5655 upl->upl_creator->decmp_upl = NULL;
5656 }
5657 }
5658
5659 #endif /* CONFIG_IOSCHED */
5660
/*
 * True when pageout queue `q` has at least 80% (8/10, integer arithmetic)
 * of its pgo_maxlaundry in pgo_laundry.
 */
#define VM_PAGE_Q_BACKING_UP(q)         \
	((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5663
5664 boolean_t must_throttle_writes(void);
5665
5666 boolean_t
must_throttle_writes()5667 must_throttle_writes()
5668 {
5669 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5670 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5671 return TRUE;
5672 }
5673
5674 return FALSE;
5675 }
5676
/*
 * Counts the times vm_page_delayed_work_get_ctx() failed to get a context
 * from dw_ctx_zone.
 */
int vm_page_delayed_work_ctx_needed = 0;
/* Zone backing struct vm_page_delayed_work_ctx allocations. */
KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5679
5680 __startup_func
5681 static void
vm_page_delayed_work_init_ctx(void)5682 vm_page_delayed_work_init_ctx(void)
5683 {
5684 uint16_t min_delayed_work_ctx_allocated = 16;
5685
5686 /*
5687 * try really hard to always keep NCPU elements around in the zone
5688 * in order for the UPL code to almost always get an element.
5689 */
5690 if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5691 min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5692 }
5693
5694 zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5695 }
5696 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5697
5698 struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)5699 vm_page_delayed_work_get_ctx(void)
5700 {
5701 struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5702
5703 dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5704
5705 if (__probable(dw_ctx)) {
5706 dw_ctx->delayed_owner = current_thread();
5707 } else {
5708 vm_page_delayed_work_ctx_needed++;
5709 }
5710 return dw_ctx ? dw_ctx->dwp : NULL;
5711 }
5712
5713 void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work * dwp)5714 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5715 {
5716 struct vm_page_delayed_work_ctx *ldw_ctx;
5717
5718 ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5719 ldw_ctx->delayed_owner = NULL;
5720
5721 zfree(dw_ctx_zone, ldw_ctx);
5722 }
5723
5724 /*
5725 * Routine: vm_object_upl_request
5726 * Purpose:
5727 * Cause the population of a portion of a vm_object.
5728 * Depending on the nature of the request, the pages
5729 * returned may contain valid data or be uninitialized.
5730 * A page list structure, listing the physical pages
5731 * will be returned upon request.
5732 * This function is called by the file system or any other
5733 * supplier of backing store to a pager.
5734 * IMPORTANT NOTE: The caller must still respect the relationship
5735 * between the vm_object and its backing memory object. The
5736 * caller MUST NOT substitute changes in the backing file
5737 * without first doing a memory_object_lock_request on the
5738 * target range unless it is known that the pages are not
5739 * shared with another entity at the pager level.
5740 * Copy_in_to:
5741 * if a page list structure is present
5742 * return the mapped physical pages, where a
5743 * page is not present, return a non-initialized
5744 * one. If the no_sync bit is turned on, don't
5745 * call the pager unlock to synchronize with other
5746 * possible copies of the page. Leave pages busy
5747 * in the original object, if a page list structure
5748 * was specified. When a commit of the page list
5749 * pages is done, the dirty bit will be set for each one.
5750 * Copy_out_from:
5751 * If a page list structure is present, return
5752 * all mapped pages. Where a page does not exist
5753 * map a zero filled one. Leave pages busy in
5754 * the original object. If a page list structure
5755 * is not specified, this call is a no-op.
5756 *
5757 * Note: access of default pager objects has a rather interesting
5758 * twist. The caller of this routine, presumably the file system
5759 * page cache handling code, will never actually make a request
5760 * against a default pager backed object. Only the default
5761 * pager will make requests on backing store related vm_objects
5762 * In this way the default pager can maintain the relationship
5763 * between backing store files (abstract memory objects) and
5764 * the vm_objects (cache objects), they support.
5765 *
5766 */
5767
/*
 * Parameters (as used by the code below):
 *	object		object whose pages are gathered; the routine panics
 *			if it is phys_contiguous, or if it is external and
 *			has a non-zero paging_offset
 *	offset, size	range within the object; both are asserted to be
 *			page aligned, size is clamped to MAX_UPL_SIZE_BYTES
 *	upl_ptr		out: receives the newly created UPL
 *	user_page_list	optional array, one entry filled per page; replaced
 *			by the UPL's own page_list when UPL_SET_INTERNAL
 *			is set
 *	page_list_count	optional in/out entry count (see end of routine)
 *	cntrl_flags	UPL_* control flags; any bit outside UPL_VALID_FLAGS
 *			is rejected
 *	tag		passed through to vm_page_do_delayed_work()
 * Returns:
 *	KERN_INVALID_VALUE if cntrl_flags contains unknown bits,
 *	KERN_SUCCESS otherwise.
 */
5768 __private_extern__ kern_return_t
vm_object_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)5769 vm_object_upl_request(
5770 vm_object_t object,
5771 vm_object_offset_t offset,
5772 upl_size_t size,
5773 upl_t *upl_ptr,
5774 upl_page_info_array_t user_page_list,
5775 unsigned int *page_list_count,
5776 upl_control_flags_t cntrl_flags,
5777 vm_tag_t tag)
5778 {
5779 vm_page_t dst_page = VM_PAGE_NULL;
5780 vm_object_offset_t dst_offset;
5781 upl_size_t xfer_size;
5782 unsigned int size_in_pages;
5783 boolean_t dirty;
5784 boolean_t hw_dirty;
5785 upl_t upl = NULL;
5786 unsigned int entry;
5787 vm_page_t alias_page = NULL;
5788 int refmod_state = 0;
5789 vm_object_t last_copy_object;
5790 struct vm_page_delayed_work dw_array;
5791 struct vm_page_delayed_work *dwp, *dwp_start;
5792 bool dwp_finish_ctx = TRUE;
5793 int dw_count;
5794 int dw_limit;
5795 int io_tracking_flag = 0;
5796 int grab_options;
5797 int page_grab_count = 0;
5798 ppnum_t phys_page;
5799 pmap_flush_context pmap_flush_context_storage;
5800 boolean_t pmap_flushes_delayed = FALSE;
5801 #if DEVELOPMENT || DEBUG
5802 task_t task = current_task();
5803 #endif /* DEVELOPMENT || DEBUG */
5804
5805 dwp_start = dwp = NULL;
5806
/*
 * The flag check comes before any allocation, so this early return
 * has nothing to clean up.
 */
5807 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5808 /*
5809 * For forward compatibility's sake,
5810 * reject any unknown flag.
5811 */
5812 return KERN_INVALID_VALUE;
5813 }
5814 if ((!object->internal) && (object->paging_offset != 0)) {
5815 panic("vm_object_upl_request: external object with non-zero paging offset");
5816 }
5817 if (object->phys_contiguous) {
5818 panic("vm_object_upl_request: contiguous object specified");
5819 }
5820
5821 assertf(page_aligned(offset) && page_aligned(size),
5822 "offset 0x%llx size 0x%x",
5823 offset, size);
5824
5825 VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5826
/*
 * Try to get a delayed-work context from the zone.  If none is
 * available, fall back to the single on-stack entry "dw_array" with a
 * limit of 1 and remember not to hand it back to the zone at the end.
 */
5827 dw_count = 0;
5828 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5829 dwp_start = vm_page_delayed_work_get_ctx();
5830 if (dwp_start == NULL) {
5831 dwp_start = &dw_array;
5832 dw_limit = 1;
5833 dwp_finish_ctx = FALSE;
5834 }
5835
5836 dwp = dwp_start;
5837
5838 if (size > MAX_UPL_SIZE_BYTES) {
5839 size = MAX_UPL_SIZE_BYTES;
5840 }
5841
5842 if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5843 *page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5844 }
5845
5846 #if CONFIG_IOSCHED || UPL_DEBUG
5847 if (object->io_tracking || upl_debug_enabled) {
5848 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5849 }
5850 #endif
5851 #if CONFIG_IOSCHED
5852 if (object->io_tracking) {
5853 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5854 }
5855 #endif
5856
/*
 * Create the UPL.  For an internal UPL the caller-supplied
 * user_page_list is ignored in favour of the UPL's own page_list
 * (or NULL when size is 0).
 */
5857 if (cntrl_flags & UPL_SET_INTERNAL) {
5858 if (cntrl_flags & UPL_SET_LITE) {
5859 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5860 } else {
5861 upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5862 }
5863 user_page_list = size ? upl->page_list : NULL;
5864 } else {
5865 if (cntrl_flags & UPL_SET_LITE) {
5866 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5867 } else {
5868 upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5869 }
5870 }
5871 *upl_ptr = upl;
5872
5873 if (user_page_list) {
5874 user_page_list[0].device = FALSE;
5875 }
5876
/*
 * A "lite" UPL refers to the object directly.  Otherwise a new object
 * shadowing "object" at "offset" is allocated as the UPL's map_object,
 * and the UPL is flagged UPL_SHADOWED.
 */
5877 if (cntrl_flags & UPL_SET_LITE) {
5878 upl->map_object = object;
5879 } else {
5880 upl->map_object = vm_object_allocate(size);
5881 /*
5882 * No need to lock the new object: nobody else knows
5883 * about it yet, so it's all ours so far.
5884 */
5885 upl->map_object->shadow = object;
5886 upl->map_object->pageout = TRUE;
5887 upl->map_object->can_persist = FALSE;
5888 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5889 upl->map_object->vo_shadow_offset = offset;
5890 upl->map_object->wimg_bits = object->wimg_bits;
5891 assertf(page_aligned(upl->map_object->vo_shadow_offset),
5892 "object %p shadow_offset 0x%llx",
5893 upl->map_object, upl->map_object->vo_shadow_offset);
5894
/*
 * First alias page is grabbed here, before the object lock is taken;
 * the loop below drops the lock whenever it needs another one.
 */
5895 alias_page = vm_page_grab_fictitious(TRUE);
5896
5897 upl->flags |= UPL_SHADOWED;
5898 }
5899 if (cntrl_flags & UPL_FOR_PAGEOUT) {
5900 upl->flags |= UPL_PAGEOUT;
5901 }
5902
5903 vm_object_lock(object);
5904 vm_object_activity_begin(object);
5905
5906 grab_options = 0;
5907 #if CONFIG_SECLUDED_MEMORY
5908 if (object->can_grab_secluded) {
5909 grab_options |= VM_PAGE_GRAB_SECLUDED;
5910 }
5911 #endif /* CONFIG_SECLUDED_MEMORY */
5912
5913 /*
5914 * we can lock in the paging_offset once paging_in_progress is set
5915 */
5916 upl->u_size = size;
5917 upl->u_offset = offset + object->paging_offset;
5918
5919 #if CONFIG_IOSCHED || UPL_DEBUG
5920 if (object->io_tracking || upl_debug_enabled) {
/*
 * A second activity reference is taken while the UPL is queued on the
 * object's uplq.
 * NOTE(review): presumably dropped when the UPL leaves the queue --
 * not visible in this routine, confirm.
 */
5921 vm_object_activity_begin(object);
5922 queue_enter(&object->uplq, upl, upl_t, uplq);
5923 }
5924 #endif
5925 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
5926 /*
5927 * Honor copy-on-write obligations
5928 *
5929 * The caller is gathering these pages and
5930 * might modify their contents. We need to
5931 * make sure that the copy object has its own
5932 * private copies of these pages before we let
5933 * the caller modify them.
5934 */
5935 vm_object_update(object,
5936 offset,
5937 size,
5938 NULL,
5939 NULL,
5940 FALSE, /* should_return */
5941 MEMORY_OBJECT_COPY_SYNC,
5942 VM_PROT_NO_CHANGE);
5943
5944 VM_PAGEOUT_DEBUG(upl_cow, 1);
5945 VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
5946 }
5947 /*
5948 * remember which copy object we synchronized with
5949 */
5950 last_copy_object = object->copy;
5951 entry = 0;
5952
5953 xfer_size = size;
5954 dst_offset = offset;
5955 size_in_pages = size / PAGE_SIZE;
5956
/*
 * Forget earlier scan collisions when free pages are plentiful relative
 * to this request, or when the object has few resident pages.
 */
5957 if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5958 object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
5959 object->scan_collisions = 0;
5960 }
5961
/*
 * Write throttling: when must_throttle_writes() says so, drop the
 * object lock and delay for a time proportional to the request size
 * (a smaller per-page factor for SSD-backed objects), then re-take
 * the lock.
 */
5962 if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5963 boolean_t isSSD = FALSE;
5964
5965 #if !XNU_TARGET_OS_OSX
5966 isSSD = TRUE;
5967 #else /* !XNU_TARGET_OS_OSX */
5968 vnode_pager_get_isSSD(object->pager, &isSSD);
5969 #endif /* !XNU_TARGET_OS_OSX */
5970 vm_object_unlock(object);
5971
5972 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5973
5974 if (isSSD == TRUE) {
5975 delay(1000 * size_in_pages);
5976 } else {
5977 delay(5000 * size_in_pages);
5978 }
5979 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5980
5981 vm_object_lock(object);
5982 }
5983
/*
 * Main loop: one page per iteration.  "entry", dst_offset and
 * xfer_size only advance at try_next_page, so every "continue" below
 * retries the same offset.
 */
5984 while (xfer_size) {
/* start each iteration with no delayed work requested for this page */
5985 dwp->dw_mask = 0;
5986
5987 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
5988 vm_object_unlock(object);
5989 alias_page = vm_page_grab_fictitious(TRUE);
5990 vm_object_lock(object);
5991 }
5992 if (cntrl_flags & UPL_COPYOUT_FROM) {
5993 upl->flags |= UPL_PAGE_SYNC_DONE;
5994
/*
 * Copy-out only considers pages that are resident and are not
 * fictitious, absent, in error, already being cleaned, or wired;
 * anything else gets an empty slot.
 */
5995 if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
5996 dst_page->vmp_fictitious ||
5997 dst_page->vmp_absent ||
5998 VMP_ERROR_GET(dst_page) ||
5999 dst_page->vmp_cleaning ||
6000 (VM_PAGE_WIRED(dst_page))) {
6001 if (user_page_list) {
6002 user_page_list[entry].phys_addr = 0;
6003 }
6004
6005 goto try_next_page;
6006 }
6007 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6008
6009 /*
6010 * grab this up front...
6011 * a high percentage of the time we're going to
6012 * need the hardware modification state a bit later
6013 * anyway... so we can eliminate an extra call into
6014 * the pmap layer by grabbing it here and recording it
6015 */
6016 if (dst_page->vmp_pmapped) {
6017 refmod_state = pmap_get_refmod(phys_page);
6018 } else {
6019 refmod_state = 0;
6020 }
6021
6022 if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
6023 /*
6024 * page is on inactive list and referenced...
6025 * reactivate it now... this gets it out of the
6026 * way of vm_pageout_scan which would have to
6027 * reactivate it upon tripping over it
6028 */
6029 dwp->dw_mask |= DW_vm_page_activate;
6030 }
6031 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
6032 /*
6033 * we're only asking for DIRTY pages to be returned
6034 */
6035 if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
6036 /*
6037 * if we were the page stolen by vm_pageout_scan to be
6038 * cleaned (as opposed to a buddy being clustered in)
6039 * or this request is not being driven by a PAGEOUT cluster
6040 * then we only need to check for the page being dirty or
6041 * precious to decide whether to return it
6042 */
6043 if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
6044 goto check_busy;
6045 }
6046 goto dont_return;
6047 }
6048 /*
6049 * this is a request for a PAGEOUT cluster and this page
6050 * is merely along for the ride as a 'buddy'... not only
6051 * does it have to be dirty to be returned, but it also
6052 * can't have been referenced recently...
6053 */
6054 if ((hibernate_cleaning_in_progress == TRUE ||
6055 (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
6056 (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
6057 ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
6058 goto check_busy;
6059 }
6060 dont_return:
6061 /*
6062 * if we reach here, we're not to return
6063 * the page... go on to the next one
6064 */
6065 if (dst_page->vmp_laundry == TRUE) {
6066 /*
6067 * if we get here, the page is not 'cleaning' (filtered out above).
6068 * since it has been referenced, remove it from the laundry
6069 * so we don't pay the cost of an I/O to clean a page
6070 * we're just going to take back
6071 */
6072 vm_page_lockspin_queues();
6073
6074 vm_pageout_steal_laundry(dst_page, TRUE);
6075 vm_page_activate(dst_page);
6076
6077 vm_page_unlock_queues();
6078 }
6079 if (user_page_list) {
6080 user_page_list[entry].phys_addr = 0;
6081 }
6082
6083 goto try_next_page;
6084 }
6085 check_busy:
6086 if (dst_page->vmp_busy) {
6087 if (cntrl_flags & UPL_NOBLOCK) {
6088 if (user_page_list) {
6089 user_page_list[entry].phys_addr = 0;
6090 }
/* the page is skipped: drop any delayed work requested for it above */
6091 dwp->dw_mask = 0;
6092
6093 goto try_next_page;
6094 }
6095 /*
6096 * someone else is playing with the
6097 * page. We will have to wait.
6098 */
6099 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6100
6101 continue;
6102 }
6103 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6104 vm_page_lockspin_queues();
6105
/* re-check the queue state now that the page queues lock is held */
6106 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6107 /*
6108 * we've buddied up a page for a clustered pageout
6109 * that has already been moved to the pageout
6110 * queue by pageout_scan... we need to remove
6111 * it from the queue and drop the laundry count
6112 * on that queue
6113 */
6114 vm_pageout_throttle_up(dst_page);
6115 }
6116 vm_page_unlock_queues();
6117 }
/*
 * The page counts as dirty if either the hardware modify bit or the
 * software dirty bit is set.
 */
6118 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6119 dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6120
6121 if (phys_page > upl->highest_page) {
6122 upl->highest_page = phys_page;
6123 }
6124
6125 assert(!pmap_is_noencrypt(phys_page));
6126
6127 if (cntrl_flags & UPL_SET_LITE) {
6128 unsigned int pg_num;
6129
6130 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6131 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6132 bitmap_set(upl->lite_list, pg_num);
6133
6134 if (hw_dirty) {
/*
 * The pmap flush context is initialized lazily on the first hw-dirty
 * page; the accumulated flush is issued once after the loop.
 */
6135 if (pmap_flushes_delayed == FALSE) {
6136 pmap_flush_context_init(&pmap_flush_context_storage);
6137 pmap_flushes_delayed = TRUE;
6138 }
6139 pmap_clear_refmod_options(phys_page,
6140 VM_MEM_MODIFIED,
6141 PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6142 &pmap_flush_context_storage);
6143 }
6144
6145 /*
6146 * Mark original page as cleaning
6147 * in place.
6148 */
6149 dst_page->vmp_cleaning = TRUE;
6150 dst_page->vmp_precious = FALSE;
6151 } else {
6152 /*
6153 * use pageclean setup, it is more
6154 * convenient even for the pageout
6155 * cases here
6156 */
6157 vm_object_lock(upl->map_object);
6158 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6159 vm_object_unlock(upl->map_object);
6160
/*
 * The alias page has been handed to vm_pageclean_setup(); clearing the
 * local pointer makes the top of the loop grab a fresh one for the
 * next page.
 */
6161 alias_page->vmp_absent = FALSE;
6162 alias_page = NULL;
6163 }
6164 if (dirty) {
6165 SET_PAGE_DIRTY(dst_page, FALSE);
6166 } else {
6167 dst_page->vmp_dirty = FALSE;
6168 }
6169
6170 if (!dirty) {
6171 dst_page->vmp_precious = TRUE;
6172 }
6173
6174 if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6175 if (!VM_PAGE_WIRED(dst_page)) {
6176 dst_page->vmp_free_when_done = TRUE;
6177 }
6178 }
6179 } else {
6180 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
6181 /*
6182 * Honor copy-on-write obligations
6183 *
6184 * The copy object has changed since we
6185 * last synchronized for copy-on-write.
6186 * Another copy object might have been
6187 * inserted while we released the object's
6188 * lock. Since someone could have seen the
6189 * original contents of the remaining pages
6190 * through that new object, we have to
6191 * synchronize with it again for the remaining
6192 * pages only. The previous pages are "busy"
6193 * so they can not be seen through the new
6194 * mapping. The new mapping will see our
6195 * upcoming changes for those previous pages,
6196 * but that's OK since they couldn't see what
6197 * was there before. It's just a race anyway
6198 * and there's no guarantee of consistency or
6199 * atomicity. We just don't want new mappings
6200 * to see both the *before* and *after* pages.
6201 */
6202 if (object->copy != VM_OBJECT_NULL) {
6203 vm_object_update(
6204 object,
6205 dst_offset,/* current offset */
6206 xfer_size, /* remaining size */
6207 NULL,
6208 NULL,
6209 FALSE, /* should_return */
6210 MEMORY_OBJECT_COPY_SYNC,
6211 VM_PROT_NO_CHANGE);
6212
6213 VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6214 VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6215 }
6216 /*
6217 * remember the copy object we synced with
6218 */
6219 last_copy_object = object->copy;
6220 }
6221 dst_page = vm_page_lookup(object, dst_offset);
6222
6223 if (dst_page != VM_PAGE_NULL) {
6224 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6225 /*
6226 * skip over pages already present in the cache
6227 */
6228 if (user_page_list) {
6229 user_page_list[entry].phys_addr = 0;
6230 }
6231
6232 goto try_next_page;
6233 }
6234 if (dst_page->vmp_fictitious) {
6235 panic("need corner case for fictitious page");
6236 }
6237
6238 if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6239 /*
6240 * someone else is playing with the
6241 * page. We will have to wait.
6242 */
6243 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6244
6245 continue;
6246 }
6247 if (dst_page->vmp_laundry) {
6248 vm_pageout_steal_laundry(dst_page, FALSE);
6249 }
6250 } else {
6251 if (object->private) {
6252 /*
6253 * This is a nasty wrinkle for users
6254 * of upl who encounter device or
6255 * private memory however, it is
6256 * unavoidable, only a fault can
6257 * resolve the actual backing
6258 * physical page by asking the
6259 * backing device.
6260 */
6261 if (user_page_list) {
6262 user_page_list[entry].phys_addr = 0;
6263 }
6264
6265 goto try_next_page;
6266 }
6267 if (object->scan_collisions) {
6268 /*
6269 * the pageout_scan thread is trying to steal
6270 * pages from this object, but has run into our
6271 * lock... grab 2 pages from the head of the object...
6272 * the first is freed on behalf of pageout_scan, the
6273 * 2nd is for our own use... we use vm_object_page_grab
6274 * in both cases to avoid taking pages from the free
6275 * list since we are under memory pressure and our
6276 * lock on this object is getting in the way of
6277 * relieving it
6278 */
6279 dst_page = vm_object_page_grab(object);
6280
6281 if (dst_page != VM_PAGE_NULL) {
6282 vm_page_release(dst_page,
6283 FALSE);
6284 }
6285
6286 dst_page = vm_object_page_grab(object);
6287 }
6288 if (dst_page == VM_PAGE_NULL) {
6289 /*
6290 * need to allocate a page
6291 */
6292 dst_page = vm_page_grab_options(grab_options);
/* only pages obtained through vm_page_grab_options() are counted */
6293 if (dst_page != VM_PAGE_NULL) {
6294 page_grab_count++;
6295 }
6296 }
6297 if (dst_page == VM_PAGE_NULL) {
6298 if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6299 /*
6300 * we don't want to stall waiting for pages to come onto the free list
6301 * while we're already holding absent pages in this UPL
6302 * the caller will deal with the empty slots
6303 */
6304 if (user_page_list) {
6305 user_page_list[entry].phys_addr = 0;
6306 }
6307
6308 goto try_next_page;
6309 }
6310 /*
6311 * no pages available... wait
6312 * then try again for the same
6313 * offset...
6314 */
6315 vm_object_unlock(object);
6316
6317 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6318
6319 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6320
6321 VM_PAGE_WAIT();
6322 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6323
6324 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6325
6326 vm_object_lock(object);
6327
6328 continue;
6329 }
6330 vm_page_insert(dst_page, object, dst_offset);
6331
/*
 * The new page holds no data yet, so it is marked absent.  Busy is
 * cleared here and set again further down for non-wired pages.
 */
6332 dst_page->vmp_absent = TRUE;
6333 dst_page->vmp_busy = FALSE;
6334
6335 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6336 /*
6337 * if UPL_RET_ONLY_ABSENT was specified,
6338 * then we're definitely setting up a
6339 * upl for a clustered read/pagein
6340 * operation... mark the pages as clustered
6341 * so upl_commit_range can put them on the
6342 * speculative list
6343 */
6344 dst_page->vmp_clustered = TRUE;
6345
6346 if (!(cntrl_flags & UPL_FILE_IO)) {
6347 counter_inc(&vm_statistics_pageins);
6348 }
6349 }
6350 }
6351 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6352
6353 dst_page->vmp_overwriting = TRUE;
6354
6355 if (dst_page->vmp_pmapped) {
6356 if (!(cntrl_flags & UPL_FILE_IO)) {
6357 /*
6358 * eliminate all mappings from the
6359 * original object and its progeny
6360 */
6361 refmod_state = pmap_disconnect(phys_page);
6362 } else {
6363 refmod_state = pmap_get_refmod(phys_page);
6364 }
6365 } else {
6366 refmod_state = 0;
6367 }
6368
6369 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6370 dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6371
6372 if (cntrl_flags & UPL_SET_LITE) {
6373 unsigned int pg_num;
6374
6375 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6376 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6377 bitmap_set(upl->lite_list, pg_num);
6378
6379 if (hw_dirty) {
6380 pmap_clear_modify(phys_page);
6381 }
6382
6383 /*
6384 * Mark original page as cleaning
6385 * in place.
6386 */
6387 dst_page->vmp_cleaning = TRUE;
6388 dst_page->vmp_precious = FALSE;
6389 } else {
6390 /*
6391 * use pageclean setup, it is more
6392 * convenient even for the pageout
6393 * cases here
6394 */
6395 vm_object_lock(upl->map_object);
6396 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6397 vm_object_unlock(upl->map_object);
6398
6399 alias_page->vmp_absent = FALSE;
6400 alias_page = NULL;
6401 }
6402
6403 if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6404 upl->flags &= ~UPL_CLEAR_DIRTY;
6405 upl->flags |= UPL_SET_DIRTY;
6406 dirty = TRUE;
6407 /*
6408 * Page belonging to a code-signed object is about to
6409 * be written. Mark it tainted and disconnect it from
6410 * all pmaps so processes have to fault it back in and
6411 * deal with the tainted bit.
6412 */
6413 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6414 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6415 vm_page_upl_tainted++;
6416 if (dst_page->vmp_pmapped) {
6417 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6418 if (refmod_state & VM_MEM_REFERENCED) {
6419 dst_page->vmp_reference = TRUE;
6420 }
6421 }
6422 }
6423 } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6424 /*
6425 * clean in place for read implies
6426 * that a write will be done on all
6427 * the pages that are dirty before
6428 * a upl commit is done. The caller
6429 * is obligated to preserve the
6430 * contents of all pages marked dirty
6431 */
6432 upl->flags |= UPL_CLEAR_DIRTY;
6433 }
6434 dst_page->vmp_dirty = dirty;
6435
6436 if (!dirty) {
6437 dst_page->vmp_precious = TRUE;
6438 }
6439
6440 if (!VM_PAGE_WIRED(dst_page)) {
6441 /*
6442 * deny access to the target page while
6443 * it is being worked on
6444 */
6445 dst_page->vmp_busy = TRUE;
6446 } else {
6447 dwp->dw_mask |= DW_vm_page_wire;
6448 }
6449
6450 /*
6451 * We might be about to satisfy a fault which has been
6452 * requested. So no need for the "restart" bit.
6453 */
6454 dst_page->vmp_restart = FALSE;
6455 if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6456 /*
6457 * expect the page to be used
6458 */
6459 dwp->dw_mask |= DW_set_reference;
6460 }
/*
 * Final word on "precious": this overrides the values set earlier in
 * this branch.  With UPL_PRECIOUS, pages of internal objects are made
 * dirty rather than precious.
 */
6461 if (cntrl_flags & UPL_PRECIOUS) {
6462 if (object->internal) {
6463 SET_PAGE_DIRTY(dst_page, FALSE);
6464 dst_page->vmp_precious = FALSE;
6465 } else {
6466 dst_page->vmp_precious = TRUE;
6467 }
6468 } else {
6469 dst_page->vmp_precious = FALSE;
6470 }
6471 }
6472 if (dst_page->vmp_busy) {
6473 upl->flags |= UPL_HAS_BUSY;
6474 }
6475
6476 if (phys_page > upl->highest_page) {
6477 upl->highest_page = phys_page;
6478 }
6479 assert(!pmap_is_noencrypt(phys_page));
/* publish this page's state in the caller-visible page list entry */
6480 if (user_page_list) {
6481 user_page_list[entry].phys_addr = phys_page;
6482 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
6483 user_page_list[entry].absent = dst_page->vmp_absent;
6484 user_page_list[entry].dirty = dst_page->vmp_dirty;
6485 user_page_list[entry].precious = dst_page->vmp_precious;
6486 user_page_list[entry].device = FALSE;
6487 user_page_list[entry].needed = FALSE;
6488 if (dst_page->vmp_clustered == TRUE) {
6489 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6490 } else {
6491 user_page_list[entry].speculative = FALSE;
6492 }
6493 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6494 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6495 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6496 user_page_list[entry].mark = FALSE;
6497 }
6498 /*
6499 * if UPL_RET_ONLY_ABSENT is set, then
6500 * we are working with a fresh page and we've
6501 * just set the clustered flag on it to
6502 * indicate that it was dragged in as part of a
6503 * speculative cluster... so leave it alone
6504 */
6505 if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6506 /*
6507 * someone is explicitly grabbing this page...
6508 * update clustered and speculative state
6509 *
6510 */
6511 if (dst_page->vmp_clustered) {
6512 VM_PAGE_CONSUME_CLUSTERED(dst_page);
6513 }
6514 }
6515 try_next_page:
/*
 * Queue any delayed work requested for this page and run the batch
 * once dw_count reaches dw_limit, then advance to the next page.
 */
6516 if (dwp->dw_mask) {
6517 if (dwp->dw_mask & DW_vm_page_activate) {
6518 counter_inc(&vm_statistics_reactivations);
6519 }
6520
6521 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6522
6523 if (dw_count >= dw_limit) {
6524 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6525
6526 dwp = dwp_start;
6527 dw_count = 0;
6528 }
6529 }
6530 entry++;
6531 dst_offset += PAGE_SIZE_64;
6532 xfer_size -= PAGE_SIZE;
6533 }
/* run whatever delayed work is still queued from the last iterations */
6534 if (dw_count) {
6535 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6536 dwp = dwp_start;
6537 dw_count = 0;
6538 }
6539
/* free an alias page that was grabbed but never used */
6540 if (alias_page != NULL) {
6541 VM_PAGE_FREE(alias_page);
6542 }
6543 if (pmap_flushes_delayed == TRUE) {
6544 pmap_flush(&pmap_flush_context_storage);
6545 }
6546
/*
 * Report the entry count: 0 when the UPL has UPL_INTERNAL set,
 * otherwise the caller's value clamped to the number of pages
 * processed.
 */
6547 if (page_list_count != NULL) {
6548 if (upl->flags & UPL_INTERNAL) {
6549 *page_list_count = 0;
6550 } else if (*page_list_count > entry) {
6551 *page_list_count = entry;
6552 }
6553 }
6554 #if UPL_DEBUG
6555 upl->upl_state = 1;
6556 #endif
6557 vm_object_unlock(object);
6558
6559 VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6560 #if DEVELOPMENT || DEBUG
6561 if (task != NULL) {
6562 ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
6563 }
6564 #endif /* DEVELOPMENT || DEBUG */
6565
/*
 * Return the delayed-work context only if it came from
 * vm_page_delayed_work_get_ctx(); the on-stack fallback is not freed.
 */
6566 if (dwp_start && dwp_finish_ctx) {
6567 vm_page_delayed_work_finish_ctx(dwp_start);
6568 dwp_start = dwp = NULL;
6569 }
6570
6571 return KERN_SUCCESS;
6572 }
6573
6574 /*
6575 * Routine: vm_object_super_upl_request
6576 * Purpose:
6577 * Cause the population of a portion of a vm_object
6578 * in much the same way as memory_object_upl_request.
6579 * Depending on the nature of the request, the pages
6580 * returned may contain valid data or be uninitialized.
6581 * However, the region may be expanded up to the super
6582 * cluster size provided.
6583 */
6584
6585 __private_extern__ kern_return_t
vm_object_super_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_size_t super_cluster,upl_t * upl,upl_page_info_t * user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)6586 vm_object_super_upl_request(
6587 vm_object_t object,
6588 vm_object_offset_t offset,
6589 upl_size_t size,
6590 upl_size_t super_cluster,
6591 upl_t *upl,
6592 upl_page_info_t *user_page_list,
6593 unsigned int *page_list_count,
6594 upl_control_flags_t cntrl_flags,
6595 vm_tag_t tag)
6596 {
6597 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
6598 return KERN_FAILURE;
6599 }
6600
6601 assert(object->paging_in_progress);
6602 offset = offset - object->paging_offset;
6603
6604 if (super_cluster > size) {
6605 vm_object_offset_t base_offset;
6606 upl_size_t super_size;
6607 vm_object_size_t super_size_64;
6608
6609 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6610 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
6611 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6612 super_size = (upl_size_t) super_size_64;
6613 assert(super_size == super_size_64);
6614
6615 if (offset > (base_offset + super_size)) {
6616 panic("vm_object_super_upl_request: Missed target pageout"
6617 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6618 offset, base_offset, super_size, super_cluster,
6619 size, object->paging_offset);
6620 }
6621 /*
6622 * apparently there is a case where the vm requests a
6623 * page to be written out who's offset is beyond the
6624 * object size
6625 */
6626 if ((offset + size) > (base_offset + super_size)) {
6627 super_size_64 = (offset + size) - base_offset;
6628 super_size = (upl_size_t) super_size_64;
6629 assert(super_size == super_size_64);
6630 }
6631
6632 offset = base_offset;
6633 size = super_size;
6634 }
6635 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6636 }
6637
6638 int cs_executable_create_upl = 0;
6639 extern int proc_selfpid(void);
6640 extern char *proc_name_address(void *p);
6641
6642 kern_return_t
vm_map_create_upl(vm_map_t map,vm_map_address_t offset,upl_size_t * upl_size,upl_t * upl,upl_page_info_array_t page_list,unsigned int * count,upl_control_flags_t * flags,vm_tag_t tag)6643 vm_map_create_upl(
6644 vm_map_t map,
6645 vm_map_address_t offset,
6646 upl_size_t *upl_size,
6647 upl_t *upl,
6648 upl_page_info_array_t page_list,
6649 unsigned int *count,
6650 upl_control_flags_t *flags,
6651 vm_tag_t tag)
6652 {
6653 vm_map_entry_t entry;
6654 upl_control_flags_t caller_flags;
6655 int force_data_sync;
6656 int sync_cow_data;
6657 vm_object_t local_object;
6658 vm_map_offset_t local_offset;
6659 vm_map_offset_t local_start;
6660 kern_return_t ret;
6661 vm_map_address_t original_offset;
6662 vm_map_size_t original_size, adjusted_size;
6663 vm_map_offset_t local_entry_start;
6664 vm_object_offset_t local_entry_offset;
6665 vm_object_offset_t offset_in_mapped_page;
6666 boolean_t release_map = FALSE;
6667
6668 start_with_map:
6669
6670 original_offset = offset;
6671 original_size = *upl_size;
6672 adjusted_size = original_size;
6673
6674 caller_flags = *flags;
6675
6676 if (caller_flags & ~UPL_VALID_FLAGS) {
6677 /*
6678 * For forward compatibility's sake,
6679 * reject any unknown flag.
6680 */
6681 ret = KERN_INVALID_VALUE;
6682 goto done;
6683 }
6684 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6685 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6686
6687 if (upl == NULL) {
6688 ret = KERN_INVALID_ARGUMENT;
6689 goto done;
6690 }
6691
6692 REDISCOVER_ENTRY:
6693 vm_map_lock_read(map);
6694
6695 if (!vm_map_lookup_entry(map, offset, &entry)) {
6696 vm_map_unlock_read(map);
6697 ret = KERN_FAILURE;
6698 goto done;
6699 }
6700
6701 local_entry_start = entry->vme_start;
6702 local_entry_offset = VME_OFFSET(entry);
6703
6704 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6705 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6706 }
6707
6708 if (entry->vme_end - original_offset < adjusted_size) {
6709 adjusted_size = entry->vme_end - original_offset;
6710 assert(adjusted_size > 0);
6711 *upl_size = (upl_size_t) adjusted_size;
6712 assert(*upl_size == adjusted_size);
6713 }
6714
6715 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6716 *flags = 0;
6717
6718 if (!entry->is_sub_map &&
6719 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6720 if (VME_OBJECT(entry)->private) {
6721 *flags = UPL_DEV_MEMORY;
6722 }
6723
6724 if (VME_OBJECT(entry)->phys_contiguous) {
6725 *flags |= UPL_PHYS_CONTIG;
6726 }
6727 }
6728 vm_map_unlock_read(map);
6729 ret = KERN_SUCCESS;
6730 goto done;
6731 }
6732
6733 offset_in_mapped_page = 0;
6734 if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6735 offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6736 *upl_size = (upl_size_t)
6737 (vm_map_round_page(original_offset + adjusted_size,
6738 VM_MAP_PAGE_MASK(map))
6739 - offset);
6740
6741 offset_in_mapped_page = original_offset - offset;
6742 assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6743
6744 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6745 }
6746
6747 if (!entry->is_sub_map) {
6748 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6749 !VME_OBJECT(entry)->phys_contiguous) {
6750 if (*upl_size > MAX_UPL_SIZE_BYTES) {
6751 *upl_size = MAX_UPL_SIZE_BYTES;
6752 }
6753 }
6754
6755 /*
6756 * Create an object if necessary.
6757 */
6758 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6759 if (vm_map_lock_read_to_write(map)) {
6760 goto REDISCOVER_ENTRY;
6761 }
6762
6763 VME_OBJECT_SET(entry,
6764 vm_object_allocate((vm_size_t)
6765 vm_object_round_page((entry->vme_end - entry->vme_start))),
6766 false, 0);
6767 VME_OFFSET_SET(entry, 0);
6768 assert(entry->use_pmap);
6769
6770 vm_map_lock_write_to_read(map);
6771 }
6772
6773 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6774 !(entry->protection & VM_PROT_WRITE)) {
6775 vm_map_unlock_read(map);
6776 ret = KERN_PROTECTION_FAILURE;
6777 goto done;
6778 }
6779 }
6780
6781 #if !XNU_TARGET_OS_OSX
6782 if (map->pmap != kernel_pmap &&
6783 (caller_flags & UPL_COPYOUT_FROM) &&
6784 (entry->protection & VM_PROT_EXECUTE) &&
6785 !(entry->protection & VM_PROT_WRITE)) {
6786 vm_offset_t kaddr;
6787 vm_size_t ksize;
6788
6789 /*
6790 * We're about to create a read-only UPL backed by
6791 * memory from an executable mapping.
6792 * Wiring the pages would result in the pages being copied
6793 * (due to the "MAP_PRIVATE" mapping) and no longer
6794 * code-signed, so no longer eligible for execution.
6795 * Instead, let's copy the data into a kernel buffer and
6796 * create the UPL from this kernel buffer.
6797 * The kernel buffer is then freed, leaving the UPL holding
6798 * the last reference on the VM object, so the memory will
6799 * be released when the UPL is committed.
6800 */
6801
6802 vm_map_unlock_read(map);
6803 entry = VM_MAP_ENTRY_NULL;
6804 /* allocate kernel buffer */
6805 ksize = round_page(*upl_size);
6806 kaddr = 0;
6807 ret = kmem_alloc(kernel_map, &kaddr, ksize,
6808 KMA_PAGEABLE | KMA_DATA, tag);
6809 if (ret == KERN_SUCCESS) {
6810 /* copyin the user data */
6811 ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6812 }
6813 if (ret == KERN_SUCCESS) {
6814 if (ksize > *upl_size) {
6815 /* zero out the extra space in kernel buffer */
6816 memset((void *)(kaddr + *upl_size),
6817 0,
6818 ksize - *upl_size);
6819 }
6820 /* create the UPL from the kernel buffer */
6821 vm_object_offset_t offset_in_object;
6822 vm_object_offset_t offset_in_object_page;
6823
6824 offset_in_object = offset - local_entry_start + local_entry_offset;
6825 offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6826 assert(offset_in_object_page < PAGE_SIZE);
6827 assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6828 *upl_size -= offset_in_object_page + offset_in_mapped_page;
6829 ret = vm_map_create_upl(kernel_map,
6830 (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6831 upl_size, upl, page_list, count, flags, tag);
6832 }
6833 if (kaddr != 0) {
6834 /* free the kernel buffer */
6835 kmem_free(kernel_map, kaddr, ksize);
6836 kaddr = 0;
6837 ksize = 0;
6838 }
6839 #if DEVELOPMENT || DEBUG
6840 DTRACE_VM4(create_upl_from_executable,
6841 vm_map_t, map,
6842 vm_map_address_t, offset,
6843 upl_size_t, *upl_size,
6844 kern_return_t, ret);
6845 #endif /* DEVELOPMENT || DEBUG */
6846 goto done;
6847 }
6848 #endif /* !XNU_TARGET_OS_OSX */
6849
6850 if (!entry->is_sub_map) {
6851 local_object = VME_OBJECT(entry);
6852 assert(local_object != VM_OBJECT_NULL);
6853 }
6854
6855 if (!entry->is_sub_map &&
6856 !entry->needs_copy &&
6857 *upl_size != 0 &&
6858 local_object->vo_size > *upl_size && /* partial UPL */
6859 entry->wired_count == 0 && /* No COW for entries that are wired */
6860 (map->pmap != kernel_pmap) && /* alias checks */
6861 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6862 ||
6863 ( /* case 2 */
6864 local_object->internal &&
6865 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6866 local_object->ref_count > 1))) {
6867 vm_prot_t prot;
6868
6869 /*
6870 * Case 1:
6871 * Set up the targeted range for copy-on-write to avoid
6872 * applying true_share/copy_delay to the entire object.
6873 *
6874 * Case 2:
6875 * This map entry covers only part of an internal
6876 * object. There could be other map entries covering
6877 * other areas of this object and some of these map
6878 * entries could be marked as "needs_copy", which
6879 * assumes that the object is COPY_SYMMETRIC.
6880 * To avoid marking this object as COPY_DELAY and
6881 * "true_share", let's shadow it and mark the new
6882 * (smaller) object as "true_share" and COPY_DELAY.
6883 */
6884
6885 if (vm_map_lock_read_to_write(map)) {
6886 goto REDISCOVER_ENTRY;
6887 }
6888 vm_map_lock_assert_exclusive(map);
6889 assert(VME_OBJECT(entry) == local_object);
6890
6891 vm_map_clip_start(map,
6892 entry,
6893 vm_map_trunc_page(offset,
6894 VM_MAP_PAGE_MASK(map)));
6895 vm_map_clip_end(map,
6896 entry,
6897 vm_map_round_page(offset + *upl_size,
6898 VM_MAP_PAGE_MASK(map)));
6899 if ((entry->vme_end - offset) < *upl_size) {
6900 *upl_size = (upl_size_t) (entry->vme_end - offset);
6901 assert(*upl_size == entry->vme_end - offset);
6902 }
6903
6904 prot = entry->protection & ~VM_PROT_WRITE;
6905 if (override_nx(map, VME_ALIAS(entry)) && prot) {
6906 prot |= VM_PROT_EXECUTE;
6907 }
6908 vm_object_pmap_protect(local_object,
6909 VME_OFFSET(entry),
6910 entry->vme_end - entry->vme_start,
6911 ((entry->is_shared ||
6912 map->mapped_in_other_pmaps)
6913 ? PMAP_NULL
6914 : map->pmap),
6915 VM_MAP_PAGE_SIZE(map),
6916 entry->vme_start,
6917 prot);
6918
6919 assert(entry->wired_count == 0);
6920
6921 /*
6922 * Lock the VM object and re-check its status: if it's mapped
6923 * in another address space, we could still be racing with
6924 * another thread holding that other VM map exclusively.
6925 */
6926 vm_object_lock(local_object);
6927 if (local_object->true_share) {
6928 /* object is already in proper state: no COW needed */
6929 assert(local_object->copy_strategy !=
6930 MEMORY_OBJECT_COPY_SYMMETRIC);
6931 } else {
6932 /* not true_share: ask for copy-on-write below */
6933 assert(local_object->copy_strategy ==
6934 MEMORY_OBJECT_COPY_SYMMETRIC);
6935 entry->needs_copy = TRUE;
6936 }
6937 vm_object_unlock(local_object);
6938
6939 vm_map_lock_write_to_read(map);
6940 }
6941
6942 if (entry->needs_copy) {
6943 /*
6944 * Honor copy-on-write for COPY_SYMMETRIC
6945 * strategy.
6946 */
6947 vm_map_t local_map;
6948 vm_object_t object;
6949 vm_object_offset_t new_offset;
6950 vm_prot_t prot;
6951 boolean_t wired;
6952 vm_map_version_t version;
6953 vm_map_t real_map;
6954 vm_prot_t fault_type;
6955
6956 local_map = map;
6957
6958 if (caller_flags & UPL_COPYOUT_FROM) {
6959 fault_type = VM_PROT_READ | VM_PROT_COPY;
6960 vm_counters.create_upl_extra_cow++;
6961 vm_counters.create_upl_extra_cow_pages +=
6962 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6963 } else {
6964 fault_type = VM_PROT_WRITE;
6965 }
6966 if (vm_map_lookup_and_lock_object(&local_map,
6967 offset, fault_type,
6968 OBJECT_LOCK_EXCLUSIVE,
6969 &version, &object,
6970 &new_offset, &prot, &wired,
6971 NULL,
6972 &real_map, NULL) != KERN_SUCCESS) {
6973 if (fault_type == VM_PROT_WRITE) {
6974 vm_counters.create_upl_lookup_failure_write++;
6975 } else {
6976 vm_counters.create_upl_lookup_failure_copy++;
6977 }
6978 vm_map_unlock_read(local_map);
6979 ret = KERN_FAILURE;
6980 goto done;
6981 }
6982 if (real_map != local_map) {
6983 vm_map_unlock(real_map);
6984 }
6985 vm_map_unlock_read(local_map);
6986
6987 vm_object_unlock(object);
6988
6989 goto REDISCOVER_ENTRY;
6990 }
6991
6992 if (entry->is_sub_map) {
6993 vm_map_t submap;
6994
6995 submap = VME_SUBMAP(entry);
6996 local_start = entry->vme_start;
6997 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6998
6999 vm_map_reference(submap);
7000 vm_map_unlock_read(map);
7001
7002 DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
7003 offset += offset_in_mapped_page;
7004 *upl_size -= offset_in_mapped_page;
7005
7006 if (release_map) {
7007 vm_map_deallocate(map);
7008 }
7009 map = submap;
7010 release_map = TRUE;
7011 offset = local_offset + (offset - local_start);
7012 goto start_with_map;
7013 }
7014
7015 if (sync_cow_data &&
7016 (VME_OBJECT(entry)->shadow ||
7017 VME_OBJECT(entry)->copy)) {
7018 local_object = VME_OBJECT(entry);
7019 local_start = entry->vme_start;
7020 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7021
7022 vm_object_reference(local_object);
7023 vm_map_unlock_read(map);
7024
7025 if (local_object->shadow && local_object->copy) {
7026 vm_object_lock_request(local_object->shadow,
7027 ((vm_object_offset_t)
7028 ((offset - local_start) +
7029 local_offset) +
7030 local_object->vo_shadow_offset),
7031 *upl_size, FALSE,
7032 MEMORY_OBJECT_DATA_SYNC,
7033 VM_PROT_NO_CHANGE);
7034 }
7035 sync_cow_data = FALSE;
7036 vm_object_deallocate(local_object);
7037
7038 goto REDISCOVER_ENTRY;
7039 }
7040 if (force_data_sync) {
7041 local_object = VME_OBJECT(entry);
7042 local_start = entry->vme_start;
7043 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7044
7045 vm_object_reference(local_object);
7046 vm_map_unlock_read(map);
7047
7048 vm_object_lock_request(local_object,
7049 ((vm_object_offset_t)
7050 ((offset - local_start) +
7051 local_offset)),
7052 (vm_object_size_t)*upl_size,
7053 FALSE,
7054 MEMORY_OBJECT_DATA_SYNC,
7055 VM_PROT_NO_CHANGE);
7056
7057 force_data_sync = FALSE;
7058 vm_object_deallocate(local_object);
7059
7060 goto REDISCOVER_ENTRY;
7061 }
7062 if (VME_OBJECT(entry)->private) {
7063 *flags = UPL_DEV_MEMORY;
7064 } else {
7065 *flags = 0;
7066 }
7067
7068 if (VME_OBJECT(entry)->phys_contiguous) {
7069 *flags |= UPL_PHYS_CONTIG;
7070 }
7071
7072 local_object = VME_OBJECT(entry);
7073 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7074 local_start = entry->vme_start;
7075
7076 /*
7077 * Wiring will copy the pages to the shadow object.
7078 * The shadow object will not be code-signed so
7079 * attempting to execute code from these copied pages
7080 * would trigger a code-signing violation.
7081 */
7082 if (entry->protection & VM_PROT_EXECUTE) {
7083 #if MACH_ASSERT
7084 printf("pid %d[%s] create_upl out of executable range from "
7085 "0x%llx to 0x%llx: side effects may include "
7086 "code-signing violations later on\n",
7087 proc_selfpid(),
7088 (get_bsdtask_info(current_task())
7089 ? proc_name_address(get_bsdtask_info(current_task()))
7090 : "?"),
7091 (uint64_t) entry->vme_start,
7092 (uint64_t) entry->vme_end);
7093 #endif /* MACH_ASSERT */
7094 DTRACE_VM2(cs_executable_create_upl,
7095 uint64_t, (uint64_t)entry->vme_start,
7096 uint64_t, (uint64_t)entry->vme_end);
7097 cs_executable_create_upl++;
7098 }
7099
7100 vm_object_lock(local_object);
7101
7102 /*
7103 * Ensure that this object is "true_share" and "copy_delay" now,
7104 * while we're still holding the VM map lock. After we unlock the map,
7105 * anything could happen to that mapping, including some copy-on-write
7106 * activity. We need to make sure that the IOPL will point at the
7107 * same memory as the mapping.
7108 */
7109 if (local_object->true_share) {
7110 assert(local_object->copy_strategy !=
7111 MEMORY_OBJECT_COPY_SYMMETRIC);
7112 } else if (local_object != kernel_object &&
7113 local_object != compressor_object &&
7114 !local_object->phys_contiguous) {
7115 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7116 if (!local_object->true_share &&
7117 vm_object_tracking_btlog) {
7118 btlog_record(vm_object_tracking_btlog, local_object,
7119 VM_OBJECT_TRACKING_OP_TRUESHARE,
7120 btref_get(__builtin_frame_address(0), 0));
7121 }
7122 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7123 local_object->true_share = TRUE;
7124 if (local_object->copy_strategy ==
7125 MEMORY_OBJECT_COPY_SYMMETRIC) {
7126 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7127 }
7128 }
7129
7130 vm_object_reference_locked(local_object);
7131 vm_object_unlock(local_object);
7132
7133 vm_map_unlock_read(map);
7134
7135 offset += offset_in_mapped_page;
7136 assert(*upl_size > offset_in_mapped_page);
7137 *upl_size -= offset_in_mapped_page;
7138
7139 ret = vm_object_iopl_request(local_object,
7140 ((vm_object_offset_t)
7141 ((offset - local_start) + local_offset)),
7142 *upl_size,
7143 upl,
7144 page_list,
7145 count,
7146 caller_flags,
7147 tag);
7148 vm_object_deallocate(local_object);
7149
7150 done:
7151 if (release_map) {
7152 vm_map_deallocate(map);
7153 }
7154
7155 return ret;
7156 }
7157
7158 /*
7159 * Internal routine to enter a UPL into a VM map.
7160 *
7161 * JMM - This should just be doable through the standard
7162 * vm_map_enter() API.
7163 */
7164 kern_return_t
vm_map_enter_upl_range(vm_map_t map,upl_t upl,vm_object_offset_t offset_to_map,upl_size_t size_to_map,vm_prot_t prot_to_map,vm_map_offset_t * dst_addr)7165 vm_map_enter_upl_range(
7166 vm_map_t map,
7167 upl_t upl,
7168 vm_object_offset_t offset_to_map,
7169 upl_size_t size_to_map,
7170 vm_prot_t prot_to_map,
7171 vm_map_offset_t *dst_addr)
7172 {
7173 vm_map_size_t size;
7174 vm_object_offset_t offset;
7175 vm_map_offset_t addr;
7176 vm_page_t m;
7177 kern_return_t kr;
7178 int isVectorUPL = 0, curr_upl = 0;
7179 upl_t vector_upl = NULL;
7180 mach_vm_offset_t vector_upl_dst_addr = 0;
7181 vm_map_t vector_upl_submap = NULL;
7182 upl_offset_t subupl_offset = 0;
7183 upl_size_t subupl_size = 0;
7184
7185 if (upl == UPL_NULL) {
7186 return KERN_INVALID_ARGUMENT;
7187 }
7188
7189 DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7190 assert(map == kernel_map);
7191
7192 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7193 int mapped = 0, valid_upls = 0;
7194 vector_upl = upl;
7195
7196 upl_lock(vector_upl);
7197 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7198 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7199 if (upl == NULL) {
7200 continue;
7201 }
7202 valid_upls++;
7203 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7204 mapped++;
7205 }
7206 }
7207
7208 if (mapped) {
7209 if (mapped != valid_upls) {
7210 panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7211 } else {
7212 upl_unlock(vector_upl);
7213 return KERN_FAILURE;
7214 }
7215 }
7216
7217 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7218 panic("TODO4K: vector UPL not implemented");
7219 }
7220
7221 vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7222 vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7223 VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7224 VM_KERN_MEMORY_NONE).kmr_submap;
7225 map = vector_upl_submap;
7226 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7227 curr_upl = 0;
7228 } else {
7229 upl_lock(upl);
7230 }
7231
7232 process_upl_to_enter:
7233 if (isVectorUPL) {
7234 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7235 *dst_addr = vector_upl_dst_addr;
7236 upl_unlock(vector_upl);
7237 return KERN_SUCCESS;
7238 }
7239 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7240 if (upl == NULL) {
7241 goto process_upl_to_enter;
7242 }
7243
7244 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7245 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7246 } else {
7247 /*
7248 * check to see if already mapped
7249 */
7250 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7251 upl_unlock(upl);
7252 return KERN_FAILURE;
7253 }
7254 }
7255
7256 if ((!(upl->flags & UPL_SHADOWED)) &&
7257 ((upl->flags & UPL_HAS_BUSY) ||
7258 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7259 vm_object_t object;
7260 vm_page_t alias_page;
7261 vm_object_offset_t new_offset;
7262 unsigned int pg_num;
7263
7264 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7265 object = upl->map_object;
7266 upl->map_object = vm_object_allocate(vm_object_round_page(size));
7267
7268 vm_object_lock(upl->map_object);
7269
7270 upl->map_object->shadow = object;
7271 upl->map_object->pageout = TRUE;
7272 upl->map_object->can_persist = FALSE;
7273 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7274 upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7275 assertf(page_aligned(upl->map_object->vo_shadow_offset),
7276 "object %p shadow_offset 0x%llx",
7277 upl->map_object,
7278 (uint64_t)upl->map_object->vo_shadow_offset);
7279 upl->map_object->wimg_bits = object->wimg_bits;
7280 offset = upl->map_object->vo_shadow_offset;
7281 new_offset = 0;
7282
7283 upl->flags |= UPL_SHADOWED;
7284
7285 while (size) {
7286 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7287 assert(pg_num == new_offset / PAGE_SIZE);
7288
7289 if (bitmap_test(upl->lite_list, pg_num)) {
7290 alias_page = vm_page_grab_fictitious(TRUE);
7291
7292 vm_object_lock(object);
7293
7294 m = vm_page_lookup(object, offset);
7295 if (m == VM_PAGE_NULL) {
7296 panic("vm_upl_map: page missing");
7297 }
7298
7299 /*
7300 * Convert the fictitious page to a private
7301 * shadow of the real page.
7302 */
7303 assert(alias_page->vmp_fictitious);
7304 alias_page->vmp_fictitious = FALSE;
7305 alias_page->vmp_private = TRUE;
7306 alias_page->vmp_free_when_done = TRUE;
7307 /*
7308 * since m is a page in the upl it must
7309 * already be wired or BUSY, so it's
7310 * safe to assign the underlying physical
7311 * page to the alias
7312 */
7313 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7314
7315 vm_object_unlock(object);
7316
7317 vm_page_lockspin_queues();
7318 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7319 vm_page_unlock_queues();
7320
7321 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7322
7323 assert(!alias_page->vmp_wanted);
7324 alias_page->vmp_busy = FALSE;
7325 alias_page->vmp_absent = FALSE;
7326 }
7327 size -= PAGE_SIZE;
7328 offset += PAGE_SIZE_64;
7329 new_offset += PAGE_SIZE_64;
7330 }
7331 vm_object_unlock(upl->map_object);
7332 }
7333 if (upl->flags & UPL_SHADOWED) {
7334 if (isVectorUPL) {
7335 offset = 0;
7336 } else {
7337 offset = offset_to_map;
7338 }
7339 } else {
7340 offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7341 if (!isVectorUPL) {
7342 offset += offset_to_map;
7343 }
7344 }
7345
7346 if (isVectorUPL) {
7347 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7348 } else {
7349 size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7350 }
7351
7352 vm_object_reference(upl->map_object);
7353
7354 if (!isVectorUPL) {
7355 *dst_addr = 0;
7356 /*
7357 * NEED A UPL_MAP ALIAS
7358 */
7359 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7360 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7361 upl->map_object, offset, FALSE,
7362 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7363
7364 if (kr != KERN_SUCCESS) {
7365 vm_object_deallocate(upl->map_object);
7366 upl_unlock(upl);
7367 return kr;
7368 }
7369 } else {
7370 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7371 VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7372 upl->map_object, offset, FALSE,
7373 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7374 if (kr) {
7375 panic("vm_map_enter failed for a Vector UPL");
7376 }
7377 }
7378 upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7379 /* this will have to be an increment rather than */
7380 /* an assignment. */
7381 vm_object_lock(upl->map_object);
7382
7383 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7384 m = vm_page_lookup(upl->map_object, offset);
7385
7386 if (m) {
7387 m->vmp_pmapped = TRUE;
7388
7389 /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
7390 * but only in kernel space. If this was on a user map,
7391 * we'd have to set the wpmapped bit. */
7392 /* m->vmp_wpmapped = TRUE; */
7393 assert(map->pmap == kernel_pmap);
7394
7395 PMAP_ENTER(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE, kr);
7396
7397 assert(kr == KERN_SUCCESS);
7398 #if KASAN
7399 kasan_notify_address(addr, PAGE_SIZE_64);
7400 #endif
7401 }
7402 offset += PAGE_SIZE_64;
7403 }
7404 vm_object_unlock(upl->map_object);
7405
7406 /*
7407 * hold a reference for the mapping
7408 */
7409 upl->ref_count++;
7410 upl->flags |= UPL_PAGE_LIST_MAPPED;
7411 upl->kaddr = (vm_offset_t) *dst_addr;
7412 assert(upl->kaddr == *dst_addr);
7413
7414 if (isVectorUPL) {
7415 goto process_upl_to_enter;
7416 }
7417
7418 if (!isVectorUPL) {
7419 vm_map_offset_t addr_adjustment;
7420
7421 addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7422 if (addr_adjustment) {
7423 assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7424 DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7425 *dst_addr += addr_adjustment;
7426 }
7427 }
7428
7429 upl_unlock(upl);
7430
7431 return KERN_SUCCESS;
7432 }
7433
7434 kern_return_t
vm_map_enter_upl(vm_map_t map,upl_t upl,vm_map_offset_t * dst_addr)7435 vm_map_enter_upl(
7436 vm_map_t map,
7437 upl_t upl,
7438 vm_map_offset_t *dst_addr)
7439 {
7440 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7441 return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7442 }
7443
7444 /*
7445 * Internal routine to remove a UPL mapping from a VM map.
7446 *
7447 * XXX - This should just be doable through a standard
7448 * vm_map_remove() operation. Otherwise, implicit clean-up
7449 * of the target map won't be able to correctly remove
7450 * these (and release the reference on the UPL). Having
7451 * to do this means we can't map these into user-space
7452 * maps yet.
7453 */
7454 kern_return_t
vm_map_remove_upl_range(vm_map_t map,upl_t upl,__unused vm_object_offset_t offset_to_unmap,__unused upl_size_t size_to_unmap)7455 vm_map_remove_upl_range(
7456 vm_map_t map,
7457 upl_t upl,
7458 __unused vm_object_offset_t offset_to_unmap,
7459 __unused upl_size_t size_to_unmap)
7460 {
7461 vm_address_t addr;
7462 upl_size_t size;
7463 int isVectorUPL = 0, curr_upl = 0;
7464 upl_t vector_upl = NULL;
7465
7466 if (upl == UPL_NULL) {
7467 return KERN_INVALID_ARGUMENT;
7468 }
7469
7470 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7471 int unmapped = 0, valid_upls = 0;
7472 vector_upl = upl;
7473 upl_lock(vector_upl);
7474 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7475 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7476 if (upl == NULL) {
7477 continue;
7478 }
7479 valid_upls++;
7480 if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7481 unmapped++;
7482 }
7483 }
7484
7485 if (unmapped) {
7486 if (unmapped != valid_upls) {
7487 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
7488 } else {
7489 upl_unlock(vector_upl);
7490 return KERN_FAILURE;
7491 }
7492 }
7493 curr_upl = 0;
7494 } else {
7495 upl_lock(upl);
7496 }
7497
7498 process_upl_to_remove:
7499 if (isVectorUPL) {
7500 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7501 vm_map_t v_upl_submap;
7502 vm_offset_t v_upl_submap_dst_addr;
7503 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7504
7505 kmem_free_guard(map, v_upl_submap_dst_addr,
7506 vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
7507 vm_map_deallocate(v_upl_submap);
7508 upl_unlock(vector_upl);
7509 return KERN_SUCCESS;
7510 }
7511
7512 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7513 if (upl == NULL) {
7514 goto process_upl_to_remove;
7515 }
7516 }
7517
7518 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7519 addr = upl->kaddr;
7520 size = upl->u_mapped_size;
7521
7522 assert(upl->ref_count > 1);
7523 upl->ref_count--; /* removing mapping ref */
7524
7525 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7526 upl->kaddr = (vm_offset_t) 0;
7527 upl->u_mapped_size = 0;
7528
7529 if (isVectorUPL) {
7530 /*
7531 * If it's a Vectored UPL, we'll be removing the entire
7532 * submap anyways, so no need to remove individual UPL
7533 * element mappings from within the submap
7534 */
7535 goto process_upl_to_remove;
7536 }
7537
7538 upl_unlock(upl);
7539
7540 vm_map_remove(map,
7541 vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
7542 vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
7543 return KERN_SUCCESS;
7544 }
7545 upl_unlock(upl);
7546
7547 return KERN_FAILURE;
7548 }
7549
7550 kern_return_t
vm_map_remove_upl(vm_map_t map,upl_t upl)7551 vm_map_remove_upl(
7552 vm_map_t map,
7553 upl_t upl)
7554 {
7555 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7556 return vm_map_remove_upl_range(map, upl, 0, upl_size);
7557 }
7558
7559 kern_return_t
upl_commit_range(upl_t upl,upl_offset_t offset,upl_size_t size,int flags,upl_page_info_t * page_list,mach_msg_type_number_t count,boolean_t * empty)7560 upl_commit_range(
7561 upl_t upl,
7562 upl_offset_t offset,
7563 upl_size_t size,
7564 int flags,
7565 upl_page_info_t *page_list,
7566 mach_msg_type_number_t count,
7567 boolean_t *empty)
7568 {
7569 upl_size_t xfer_size, subupl_size;
7570 vm_object_t shadow_object;
7571 vm_object_t object;
7572 vm_object_t m_object;
7573 vm_object_offset_t target_offset;
7574 upl_offset_t subupl_offset = offset;
7575 int entry;
7576 int occupied;
7577 int clear_refmod = 0;
7578 int pgpgout_count = 0;
7579 struct vm_page_delayed_work dw_array;
7580 struct vm_page_delayed_work *dwp, *dwp_start;
7581 bool dwp_finish_ctx = TRUE;
7582 int dw_count;
7583 int dw_limit;
7584 int isVectorUPL = 0;
7585 upl_t vector_upl = NULL;
7586 boolean_t should_be_throttled = FALSE;
7587
7588 vm_page_t nxt_page = VM_PAGE_NULL;
7589 int fast_path_possible = 0;
7590 int fast_path_full_commit = 0;
7591 int throttle_page = 0;
7592 int unwired_count = 0;
7593 int local_queue_count = 0;
7594 vm_page_t first_local, last_local;
7595 vm_object_offset_t obj_start, obj_end, obj_offset;
7596 kern_return_t kr = KERN_SUCCESS;
7597
7598 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7599
7600 dwp_start = dwp = NULL;
7601
7602 subupl_size = size;
7603 *empty = FALSE;
7604
7605 if (upl == UPL_NULL) {
7606 return KERN_INVALID_ARGUMENT;
7607 }
7608
7609 dw_count = 0;
7610 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7611 dwp_start = vm_page_delayed_work_get_ctx();
7612 if (dwp_start == NULL) {
7613 dwp_start = &dw_array;
7614 dw_limit = 1;
7615 dwp_finish_ctx = FALSE;
7616 }
7617
7618 dwp = dwp_start;
7619
7620 if (count == 0) {
7621 page_list = NULL;
7622 }
7623
7624 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7625 vector_upl = upl;
7626 upl_lock(vector_upl);
7627 } else {
7628 upl_lock(upl);
7629 }
7630
7631 process_upl_to_commit:
7632
7633 if (isVectorUPL) {
7634 size = subupl_size;
7635 offset = subupl_offset;
7636 if (size == 0) {
7637 upl_unlock(vector_upl);
7638 kr = KERN_SUCCESS;
7639 goto done;
7640 }
7641 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7642 if (upl == NULL) {
7643 upl_unlock(vector_upl);
7644 kr = KERN_FAILURE;
7645 goto done;
7646 }
7647 page_list = upl->page_list;
7648 subupl_size -= size;
7649 subupl_offset += size;
7650 }
7651
7652 #if UPL_DEBUG
7653 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7654 upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0);
7655 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7656 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7657
7658 upl->upl_commit_index++;
7659 }
7660 #endif
7661 if (upl->flags & UPL_DEVICE_MEMORY) {
7662 xfer_size = 0;
7663 } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7664 xfer_size = size;
7665 } else {
7666 if (!isVectorUPL) {
7667 upl_unlock(upl);
7668 } else {
7669 upl_unlock(vector_upl);
7670 }
7671 DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7672 kr = KERN_FAILURE;
7673 goto done;
7674 }
7675 if (upl->flags & UPL_SET_DIRTY) {
7676 flags |= UPL_COMMIT_SET_DIRTY;
7677 }
7678 if (upl->flags & UPL_CLEAR_DIRTY) {
7679 flags |= UPL_COMMIT_CLEAR_DIRTY;
7680 }
7681
7682 object = upl->map_object;
7683
7684 if (upl->flags & UPL_SHADOWED) {
7685 vm_object_lock(object);
7686 shadow_object = object->shadow;
7687 } else {
7688 shadow_object = object;
7689 }
7690 entry = offset / PAGE_SIZE;
7691 target_offset = (vm_object_offset_t)offset;
7692
7693 if (upl->flags & UPL_KERNEL_OBJECT) {
7694 vm_object_lock_shared(shadow_object);
7695 } else {
7696 vm_object_lock(shadow_object);
7697 }
7698
7699 VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7700
7701 if (upl->flags & UPL_ACCESS_BLOCKED) {
7702 assert(shadow_object->blocked_access);
7703 shadow_object->blocked_access = FALSE;
7704 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7705 }
7706
7707 if (shadow_object->code_signed) {
7708 /*
7709 * CODE SIGNING:
7710 * If the object is code-signed, do not let this UPL tell
7711 * us if the pages are valid or not. Let the pages be
7712 * validated by VM the normal way (when they get mapped or
7713 * copied).
7714 */
7715 flags &= ~UPL_COMMIT_CS_VALIDATED;
7716 }
7717 if (!page_list) {
7718 /*
7719 * No page list to get the code-signing info from !?
7720 */
7721 flags &= ~UPL_COMMIT_CS_VALIDATED;
7722 }
7723 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7724 should_be_throttled = TRUE;
7725 }
7726
7727 if ((upl->flags & UPL_IO_WIRE) &&
7728 !(flags & UPL_COMMIT_FREE_ABSENT) &&
7729 !isVectorUPL &&
7730 shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7731 shadow_object->purgable != VM_PURGABLE_EMPTY) {
7732 if (!vm_page_queue_empty(&shadow_object->memq)) {
7733 if (size == shadow_object->vo_size) {
7734 nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7735 fast_path_full_commit = 1;
7736 }
7737 fast_path_possible = 1;
7738
7739 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7740 (shadow_object->purgable == VM_PURGABLE_DENY ||
7741 shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7742 shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7743 throttle_page = 1;
7744 }
7745 }
7746 }
7747 first_local = VM_PAGE_NULL;
7748 last_local = VM_PAGE_NULL;
7749
7750 obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7751 obj_end = obj_start + xfer_size;
7752 obj_start = vm_object_trunc_page(obj_start);
7753 obj_end = vm_object_round_page(obj_end);
7754 for (obj_offset = obj_start;
7755 obj_offset < obj_end;
7756 obj_offset += PAGE_SIZE) {
7757 vm_page_t t, m;
7758
7759 dwp->dw_mask = 0;
7760 clear_refmod = 0;
7761
7762 m = VM_PAGE_NULL;
7763
7764 if (upl->flags & UPL_LITE) {
7765 unsigned int pg_num;
7766
7767 if (nxt_page != VM_PAGE_NULL) {
7768 m = nxt_page;
7769 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7770 target_offset = m->vmp_offset;
7771 }
7772 pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7773 assert(pg_num == target_offset / PAGE_SIZE);
7774
7775 if (bitmap_test(upl->lite_list, pg_num)) {
7776 bitmap_clear(upl->lite_list, pg_num);
7777
7778 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7779 m = vm_page_lookup(shadow_object, obj_offset);
7780 }
7781 } else {
7782 m = NULL;
7783 }
7784 }
7785 if (upl->flags & UPL_SHADOWED) {
7786 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7787 t->vmp_free_when_done = FALSE;
7788
7789 VM_PAGE_FREE(t);
7790
7791 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7792 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7793 }
7794 }
7795 }
7796 if (m == VM_PAGE_NULL) {
7797 goto commit_next_page;
7798 }
7799
7800 m_object = VM_PAGE_OBJECT(m);
7801
7802 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7803 assert(m->vmp_busy);
7804
7805 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7806 goto commit_next_page;
7807 }
7808
7809 if (flags & UPL_COMMIT_CS_VALIDATED) {
7810 /*
7811 * CODE SIGNING:
7812 * Set the code signing bits according to
7813 * what the UPL says they should be.
7814 */
7815 m->vmp_cs_validated |= page_list[entry].cs_validated;
7816 m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7817 m->vmp_cs_nx |= page_list[entry].cs_nx;
7818 }
7819 if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7820 m->vmp_written_by_kernel = TRUE;
7821 }
7822
7823 if (upl->flags & UPL_IO_WIRE) {
7824 if (page_list) {
7825 page_list[entry].phys_addr = 0;
7826 }
7827
7828 if (flags & UPL_COMMIT_SET_DIRTY) {
7829 SET_PAGE_DIRTY(m, FALSE);
7830 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7831 m->vmp_dirty = FALSE;
7832
7833 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7834 m->vmp_cs_validated &&
7835 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7836 /*
7837 * CODE SIGNING:
7838 * This page is no longer dirty
7839 * but could have been modified,
7840 * so it will need to be
7841 * re-validated.
7842 */
7843 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7844
7845 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7846
7847 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7848 }
7849 clear_refmod |= VM_MEM_MODIFIED;
7850 }
7851 if (upl->flags & UPL_ACCESS_BLOCKED) {
7852 /*
7853 * We blocked access to the pages in this UPL.
7854 * Clear the "busy" bit and wake up any waiter
7855 * for this page.
7856 */
7857 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7858 }
7859 if (fast_path_possible) {
7860 assert(m_object->purgable != VM_PURGABLE_EMPTY);
7861 assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7862 if (m->vmp_absent) {
7863 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7864 assert(m->vmp_wire_count == 0);
7865 assert(m->vmp_busy);
7866
7867 m->vmp_absent = FALSE;
7868 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7869 } else {
7870 if (m->vmp_wire_count == 0) {
7871 panic("wire_count == 0, m = %p, obj = %p", m, shadow_object);
7872 }
7873 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7874
7875 /*
7876 * XXX FBDP need to update some other
7877 * counters here (purgeable_wired_count)
7878 * (ledgers), ...
7879 */
7880 assert(m->vmp_wire_count > 0);
7881 m->vmp_wire_count--;
7882
7883 if (m->vmp_wire_count == 0) {
7884 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7885 unwired_count++;
7886 }
7887 }
7888 if (m->vmp_wire_count == 0) {
7889 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7890
7891 if (last_local == VM_PAGE_NULL) {
7892 assert(first_local == VM_PAGE_NULL);
7893
7894 last_local = m;
7895 first_local = m;
7896 } else {
7897 assert(first_local != VM_PAGE_NULL);
7898
7899 m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7900 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7901 first_local = m;
7902 }
7903 local_queue_count++;
7904
7905 if (throttle_page) {
7906 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7907 } else {
7908 if (flags & UPL_COMMIT_INACTIVATE) {
7909 if (shadow_object->internal) {
7910 m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7911 } else {
7912 m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7913 }
7914 } else {
7915 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7916 }
7917 }
7918 }
7919 } else {
7920 if (flags & UPL_COMMIT_INACTIVATE) {
7921 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7922 clear_refmod |= VM_MEM_REFERENCED;
7923 }
7924 if (m->vmp_absent) {
7925 if (flags & UPL_COMMIT_FREE_ABSENT) {
7926 dwp->dw_mask |= DW_vm_page_free;
7927 } else {
7928 m->vmp_absent = FALSE;
7929 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7930
7931 if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7932 dwp->dw_mask |= DW_vm_page_activate;
7933 }
7934 }
7935 } else {
7936 dwp->dw_mask |= DW_vm_page_unwire;
7937 }
7938 }
7939 goto commit_next_page;
7940 }
7941 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7942
7943 if (page_list) {
7944 page_list[entry].phys_addr = 0;
7945 }
7946
7947 /*
7948 * make sure to clear the hardware
7949 * modify or reference bits before
7950 * releasing the BUSY bit on this page
7951 * otherwise we risk losing a legitimate
7952 * change of state
7953 */
7954 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7955 m->vmp_dirty = FALSE;
7956
7957 clear_refmod |= VM_MEM_MODIFIED;
7958 }
7959 if (m->vmp_laundry) {
7960 dwp->dw_mask |= DW_vm_pageout_throttle_up;
7961 }
7962
7963 if (VM_PAGE_WIRED(m)) {
7964 m->vmp_free_when_done = FALSE;
7965 }
7966
7967 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7968 m->vmp_cs_validated &&
7969 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7970 /*
7971 * CODE SIGNING:
7972 * This page is no longer dirty
7973 * but could have been modified,
7974 * so it will need to be
7975 * re-validated.
7976 */
7977 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7978
7979 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7980
7981 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7982 }
7983 if (m->vmp_overwriting) {
7984 /*
7985 * the (COPY_OUT_FROM == FALSE) request_page_list case
7986 */
7987 if (m->vmp_busy) {
7988 #if CONFIG_PHANTOM_CACHE
7989 if (m->vmp_absent && !m_object->internal) {
7990 dwp->dw_mask |= DW_vm_phantom_cache_update;
7991 }
7992 #endif
7993 m->vmp_absent = FALSE;
7994
7995 dwp->dw_mask |= DW_clear_busy;
7996 } else {
7997 /*
7998 * alternate (COPY_OUT_FROM == FALSE) page_list case
7999 * Occurs when the original page was wired
8000 * at the time of the list request
8001 */
8002 assert(VM_PAGE_WIRED(m));
8003
8004 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
8005 }
8006 m->vmp_overwriting = FALSE;
8007 }
8008 m->vmp_cleaning = FALSE;
8009
8010 if (m->vmp_free_when_done) {
8011 /*
8012 * With the clean queue enabled, UPL_PAGEOUT should
8013 * no longer set the pageout bit. Its pages now go
8014 * to the clean queue.
8015 *
8016 * We don't use the cleaned Q anymore and so this
8017 * assert isn't correct. The code for the clean Q
8018 * still exists and might be used in the future. If we
8019 * go back to the cleaned Q, we will re-enable this
8020 * assert.
8021 *
8022 * assert(!(upl->flags & UPL_PAGEOUT));
8023 */
8024 assert(!m_object->internal);
8025
8026 m->vmp_free_when_done = FALSE;
8027
8028 if ((flags & UPL_COMMIT_SET_DIRTY) ||
8029 (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
8030 /*
8031 * page was re-dirtied after we started
8032 * the pageout... reactivate it since
8033 * we don't know whether the on-disk
8034 * copy matches what is now in memory
8035 */
8036 SET_PAGE_DIRTY(m, FALSE);
8037
8038 dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
8039
8040 if (upl->flags & UPL_PAGEOUT) {
8041 counter_inc(&vm_statistics_reactivations);
8042 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
8043 }
8044 } else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) {
8045 /*
8046 * Someone else might still be handling this
8047 * page (vm_fault() for example), so let's not
8048 * free it or "un-busy" it!
8049 * Put that page in the "speculative" queue
8050 * for now (since we would otherwise have freed
8051 * it) and let whoever is keeping the page
8052 * "busy" move it if needed when they're done
8053 * with it.
8054 */
8055 dwp->dw_mask |= DW_vm_page_speculate;
8056 } else {
8057 /*
8058 * page has been successfully cleaned
8059 * go ahead and free it for other use
8060 */
8061 if (m_object->internal) {
8062 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
8063 } else {
8064 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
8065 }
8066 m->vmp_dirty = FALSE;
8067 if (!(upl->flags & UPL_HAS_BUSY)) {
8068 assert(!m->vmp_busy);
8069 }
8070 m->vmp_busy = TRUE;
8071
8072 dwp->dw_mask |= DW_vm_page_free;
8073 }
8074 goto commit_next_page;
8075 }
8076 /*
8077 * It is a part of the semantic of COPYOUT_FROM
8078 * UPLs that a commit implies cache sync
8079 * between the vm page and the backing store
8080 * this can be used to strip the precious bit
8081 * as well as clean
8082 */
8083 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
8084 m->vmp_precious = FALSE;
8085 }
8086
8087 if (flags & UPL_COMMIT_SET_DIRTY) {
8088 SET_PAGE_DIRTY(m, FALSE);
8089 } else {
8090 m->vmp_dirty = FALSE;
8091 }
8092
8093 /* with the clean queue on, move *all* cleaned pages to the clean queue */
8094 if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
8095 pgpgout_count++;
8096
8097 counter_inc(&vm_statistics_pageouts);
8098 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
8099
8100 dwp->dw_mask |= DW_enqueue_cleaned;
8101 } else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
8102 /*
8103 * page coming back in from being 'frozen'...
8104 * it was dirty before it was frozen, so keep it so
8105 * the vm_page_activate will notice that it really belongs
8106 * on the throttle queue and put it there
8107 */
8108 SET_PAGE_DIRTY(m, FALSE);
8109 dwp->dw_mask |= DW_vm_page_activate;
8110 } else {
8111 if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
8112 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8113 clear_refmod |= VM_MEM_REFERENCED;
8114 } else if (!VM_PAGE_PAGEABLE(m)) {
8115 if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
8116 dwp->dw_mask |= DW_vm_page_speculate;
8117 } else if (m->vmp_reference) {
8118 dwp->dw_mask |= DW_vm_page_activate;
8119 } else {
8120 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8121 clear_refmod |= VM_MEM_REFERENCED;
8122 }
8123 }
8124 }
8125 if (upl->flags & UPL_ACCESS_BLOCKED) {
8126 /*
8127 * We blocked access to the pages in this URL.
8128 * Clear the "busy" bit on this page before we
8129 * wake up any waiter.
8130 */
8131 dwp->dw_mask |= DW_clear_busy;
8132 }
8133 /*
8134 * Wakeup any thread waiting for the page to be un-cleaning.
8135 */
8136 dwp->dw_mask |= DW_PAGE_WAKEUP;
8137
8138 commit_next_page:
8139 if (clear_refmod) {
8140 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
8141 }
8142
8143 target_offset += PAGE_SIZE_64;
8144 xfer_size -= PAGE_SIZE;
8145 entry++;
8146
8147 if (dwp->dw_mask) {
8148 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8149 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8150
8151 if (dw_count >= dw_limit) {
8152 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8153
8154 dwp = dwp_start;
8155 dw_count = 0;
8156 }
8157 } else {
8158 if (dwp->dw_mask & DW_clear_busy) {
8159 m->vmp_busy = FALSE;
8160 }
8161
8162 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8163 PAGE_WAKEUP(m);
8164 }
8165 }
8166 }
8167 }
8168 if (dw_count) {
8169 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8170 dwp = dwp_start;
8171 dw_count = 0;
8172 }
8173
8174 if (fast_path_possible) {
8175 assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
8176 assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
8177
8178 if (local_queue_count || unwired_count) {
8179 if (local_queue_count) {
8180 vm_page_t first_target;
8181 vm_page_queue_head_t *target_queue;
8182
8183 if (throttle_page) {
8184 target_queue = &vm_page_queue_throttled;
8185 } else {
8186 if (flags & UPL_COMMIT_INACTIVATE) {
8187 if (shadow_object->internal) {
8188 target_queue = &vm_page_queue_anonymous;
8189 } else {
8190 target_queue = &vm_page_queue_inactive;
8191 }
8192 } else {
8193 target_queue = &vm_page_queue_active;
8194 }
8195 }
8196 /*
8197 * Transfer the entire local queue to a regular LRU page queues.
8198 */
8199 vm_page_lockspin_queues();
8200
8201 first_target = (vm_page_t) vm_page_queue_first(target_queue);
8202
8203 if (vm_page_queue_empty(target_queue)) {
8204 target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8205 } else {
8206 first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8207 }
8208
8209 target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
8210 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
8211 last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
8212
8213 /*
8214 * Adjust the global page counts.
8215 */
8216 if (throttle_page) {
8217 vm_page_throttled_count += local_queue_count;
8218 } else {
8219 if (flags & UPL_COMMIT_INACTIVATE) {
8220 if (shadow_object->internal) {
8221 vm_page_anonymous_count += local_queue_count;
8222 }
8223 vm_page_inactive_count += local_queue_count;
8224
8225 token_new_pagecount += local_queue_count;
8226 } else {
8227 vm_page_active_count += local_queue_count;
8228 }
8229
8230 if (shadow_object->internal) {
8231 vm_page_pageable_internal_count += local_queue_count;
8232 } else {
8233 vm_page_pageable_external_count += local_queue_count;
8234 }
8235 }
8236 } else {
8237 vm_page_lockspin_queues();
8238 }
8239 if (unwired_count) {
8240 vm_page_wire_count -= unwired_count;
8241 VM_CHECK_MEMORYSTATUS;
8242 }
8243 vm_page_unlock_queues();
8244
8245 VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
8246 }
8247 }
8248
8249 if (upl->flags & UPL_DEVICE_MEMORY) {
8250 occupied = 0;
8251 } else if (upl->flags & UPL_LITE) {
8252 uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
8253
8254 occupied = !fast_path_full_commit &&
8255 !bitmap_is_empty(upl->lite_list, pages);
8256 } else {
8257 occupied = !vm_page_queue_empty(&upl->map_object->memq);
8258 }
8259 if (occupied == 0) {
8260 /*
8261 * If this UPL element belongs to a Vector UPL and is
8262 * empty, then this is the right function to deallocate
8263 * it. So go ahead set the *empty variable. The flag
8264 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8265 * should be considered relevant for the Vector UPL and not
8266 * the internal UPLs.
8267 */
8268 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8269 *empty = TRUE;
8270 }
8271
8272 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8273 /*
8274 * this is not a paging object
8275 * so we need to drop the paging reference
8276 * that was taken when we created the UPL
8277 * against this object
8278 */
8279 vm_object_activity_end(shadow_object);
8280 vm_object_collapse(shadow_object, 0, TRUE);
8281 } else {
8282 /*
8283 * we dontated the paging reference to
8284 * the map object... vm_pageout_object_terminate
8285 * will drop this reference
8286 */
8287 }
8288 }
8289 VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
8290 vm_object_unlock(shadow_object);
8291 if (object != shadow_object) {
8292 vm_object_unlock(object);
8293 }
8294
8295 if (!isVectorUPL) {
8296 upl_unlock(upl);
8297 } else {
8298 /*
8299 * If we completed our operations on an UPL that is
8300 * part of a Vectored UPL and if empty is TRUE, then
8301 * we should go ahead and deallocate this UPL element.
8302 * Then we check if this was the last of the UPL elements
8303 * within that Vectored UPL. If so, set empty to TRUE
8304 * so that in ubc_upl_commit_range or ubc_upl_commit, we
8305 * can go ahead and deallocate the Vector UPL too.
8306 */
8307 if (*empty == TRUE) {
8308 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
8309 upl_deallocate(upl);
8310 }
8311 goto process_upl_to_commit;
8312 }
8313 if (pgpgout_count) {
8314 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
8315 }
8316
8317 kr = KERN_SUCCESS;
8318 done:
8319 if (dwp_start && dwp_finish_ctx) {
8320 vm_page_delayed_work_finish_ctx(dwp_start);
8321 dwp_start = dwp = NULL;
8322 }
8323
8324 return kr;
8325 }
8326
/*
 * upl_abort_range:
 *
 * Abort the part of a UPL described by [offset, offset + size): every page
 * the UPL still tracks in that range is released without being committed.
 *
 *	upl	UPL to abort.  May be a vector UPL, in which case the range is
 *		walked one sub-UPL at a time (see process_upl_to_abort).
 *	offset	start of the range, relative to the UPL.
 *	size	length of the range.
 *	error	UPL_ABORT_* flags.  For absent pages, RESTART / UNAVAILABLE /
 *		ERROR record the condition on the page instead of freeing it;
 *		for other pages, DUMP_PAGES frees them and REFERENCE requests
 *		DW_vm_page_lru.
 *	empty	out parameter, always written.  Set to TRUE when the (sub-)UPL
 *		no longer tracks any page and either UPL_COMMIT_NOTIFY_EMPTY is
 *		set in upl->flags or the UPL is part of a vector UPL; for a
 *		vector UPL it is then replaced by the result of
 *		vector_upl_set_subupl().
 *
 * Returns KERN_INVALID_ARGUMENT for a null UPL, KERN_FAILURE when the range
 * does not fit in the (sub-)UPL or no sub-UPL matches the offset, and
 * KERN_SUCCESS otherwise.  An UPL_IO_WIRE UPL aborted without
 * UPL_ABORT_DUMP_PAGES is handed to upl_commit_range() with
 * UPL_COMMIT_FREE_ABSENT and that call's result is returned.
 */
kern_return_t
upl_abort_range(
	upl_t                   upl,
	upl_offset_t            offset,
	upl_size_t              size,
	int                     error,
	boolean_t               *empty)
{
	upl_size_t              xfer_size, subupl_size;
	vm_object_t             shadow_object;
	vm_object_t             object;
	vm_object_offset_t      target_offset;
	upl_offset_t            subupl_offset = offset;
	int                     occupied;
	struct vm_page_delayed_work     dw_array;
	struct vm_page_delayed_work     *dwp, *dwp_start;
	bool                    dwp_finish_ctx = TRUE;
	int                     dw_count;
	int                     dw_limit;
	int                     isVectorUPL = 0;
	upl_t                   vector_upl = NULL;
	vm_object_offset_t      obj_start, obj_end, obj_offset;
	kern_return_t           kr = KERN_SUCCESS;

	// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);

	dwp_start = dwp = NULL;

	subupl_size = size;
	*empty = FALSE;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Wired I/O UPLs that are not being dumped are handled by the
	 * commit path, which is asked to free any absent pages.
	 */
	if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
		return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
	}

	dw_count = 0;
	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
	dwp_start = vm_page_delayed_work_get_ctx();
	if (dwp_start == NULL) {
		/*
		 * No delayed-work context was returned: fall back to the
		 * single on-stack entry.  With a limit of 1 the work is
		 * flushed after every page, and the stack entry must not be
		 * passed to vm_page_delayed_work_finish_ctx() at "done".
		 */
		dwp_start = &dw_array;
		dw_limit = 1;
		dwp_finish_ctx = FALSE;
	}

	dwp = dwp_start;

	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		vector_upl = upl;
		upl_lock(vector_upl);
	} else {
		upl_lock(upl);
	}

process_upl_to_abort:
	if (isVectorUPL) {
		/*
		 * Pick the next sub-UPL covering what is left of the
		 * requested range; "offset" and "size" are rewritten by
		 * vector_upl_subupl_byoffset() for that sub-UPL.
		 */
		size = subupl_size;
		offset = subupl_offset;
		if (size == 0) {
			upl_unlock(vector_upl);
			kr = KERN_SUCCESS;
			goto done;
		}
		upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
		if (upl == NULL) {
			upl_unlock(vector_upl);
			kr = KERN_FAILURE;
			goto done;
		}
		subupl_size -= size;
		subupl_offset += size;
	}

	*empty = FALSE;

#if UPL_DEBUG
	/* record this abort in the UPL's commit/abort history */
	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
		upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0);
		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
		upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;

		upl->upl_commit_index++;
	}
#endif
	if (upl->flags & UPL_DEVICE_MEMORY) {
		/* nothing to walk for device memory */
		xfer_size = 0;
	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
		xfer_size = size;
	} else {
		/* requested range extends past the end of this UPL */
		if (!isVectorUPL) {
			upl_unlock(upl);
		} else {
			upl_unlock(vector_upl);
		}
		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
		kr = KERN_FAILURE;
		goto done;
	}
	object = upl->map_object;

	/*
	 * For a shadowed UPL the map object is locked here and the pages of
	 * interest live in its shadow; otherwise both names refer to the
	 * same object.
	 */
	if (upl->flags & UPL_SHADOWED) {
		vm_object_lock(object);
		shadow_object = object->shadow;
	} else {
		shadow_object = object;
	}

	target_offset = (vm_object_offset_t)offset;

	if (upl->flags & UPL_KERNEL_OBJECT) {
		vm_object_lock_shared(shadow_object);
	} else {
		vm_object_lock(shadow_object);
	}

	if (upl->flags & UPL_ACCESS_BLOCKED) {
		assert(shadow_object->blocked_access);
		shadow_object->blocked_access = FALSE;
		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
	}

	if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
		panic("upl_abort_range: kernel_object being DUMPED");
	}

	/*
	 * Walk the page-aligned object range that corresponds to the
	 * requested UPL range.  "target_offset" advances in step with
	 * "obj_offset" and stays relative to the UPL.
	 */
	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
	obj_end = obj_start + xfer_size;
	obj_start = vm_object_trunc_page(obj_start);
	obj_end = vm_object_round_page(obj_end);
	for (obj_offset = obj_start;
	    obj_offset < obj_end;
	    obj_offset += PAGE_SIZE) {
		vm_page_t       t, m;
		unsigned int    pg_num;
		boolean_t       needed;

		/* page index within the UPL: used for page_list and lite_list */
		pg_num = (unsigned int) (target_offset / PAGE_SIZE);
		assert(pg_num == target_offset / PAGE_SIZE);

		needed = FALSE;

		if (upl->flags & UPL_INTERNAL) {
			needed = upl->page_list[pg_num].needed;
		}

		dwp->dw_mask = 0;
		m = VM_PAGE_NULL;

		if (upl->flags & UPL_LITE) {
			/*
			 * The lite_list bit says whether the UPL tracks this
			 * page; it is cleared as the page is released.
			 */
			if (bitmap_test(upl->lite_list, pg_num)) {
				bitmap_clear(upl->lite_list, pg_num);

				if (!(upl->flags & UPL_KERNEL_OBJECT)) {
					m = vm_page_lookup(shadow_object, obj_offset);
				}
			}
		}
		if (upl->flags & UPL_SHADOWED) {
			/*
			 * Free the page found in the map object, then fall
			 * back to the shadow object's page if no page was
			 * found above.
			 */
			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
				t->vmp_free_when_done = FALSE;

				VM_PAGE_FREE(t);

				if (m == VM_PAGE_NULL) {
					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
				}
			}
		}
		if ((upl->flags & UPL_KERNEL_OBJECT)) {
			/* no per-page state is touched for kernel-object UPLs */
			goto abort_next_page;
		}

		if (m != VM_PAGE_NULL) {
			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);

			if (m->vmp_absent) {
				boolean_t must_free = TRUE;

				/*
				 * COPYOUT = FALSE case
				 * check for error conditions which must
				 * be passed back to the pages customer
				 */
				if (error & UPL_ABORT_RESTART) {
					m->vmp_restart = TRUE;
					m->vmp_absent = FALSE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				} else if (error & UPL_ABORT_UNAVAILABLE) {
					/* the page is left marked absent in this case */
					m->vmp_restart = FALSE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				} else if (error & UPL_ABORT_ERROR) {
					m->vmp_restart = FALSE;
					m->vmp_absent = FALSE;
					m->vmp_error = TRUE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				}
				if (m->vmp_clustered && needed == FALSE) {
					/*
					 * This page was a part of a speculative
					 * read-ahead initiated by the kernel
					 * itself.  No one is expecting this
					 * page and no one will clean up its
					 * error state if it ever becomes valid
					 * in the future.
					 * We have to free it here.
					 */
					must_free = TRUE;
				}
				m->vmp_cleaning = FALSE;

				if (m->vmp_overwriting && !m->vmp_busy) {
					/*
					 * this shouldn't happen since
					 * this is an 'absent' page, but
					 * it doesn't hurt to check for
					 * the 'alternate' method of
					 * stabilizing the page...
					 * we will mark 'busy' to be cleared
					 * in the following code which will
					 * take care of the primary stabilization
					 * method (i.e. setting 'busy' to TRUE)
					 */
					dwp->dw_mask |= DW_vm_page_unwire;
				}
				m->vmp_overwriting = FALSE;

				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);

				if (must_free == TRUE) {
					dwp->dw_mask |= DW_vm_page_free;
				} else {
					dwp->dw_mask |= DW_vm_page_activate;
				}
			} else {
				/*
				 * Handle the trusted pager throttle.
				 */
				if (m->vmp_laundry) {
					dwp->dw_mask |= DW_vm_pageout_throttle_up;
				}

				if (upl->flags & UPL_ACCESS_BLOCKED) {
					/*
					 * We blocked access to the pages in this UPL.
					 * Clear the "busy" bit and wake up any waiter
					 * for this page.
					 */
					dwp->dw_mask |= DW_clear_busy;
				}
				if (m->vmp_overwriting) {
					if (m->vmp_busy) {
						dwp->dw_mask |= DW_clear_busy;
					} else {
						/*
						 * deal with the 'alternate' method
						 * of stabilizing the page...
						 * we will either free the page
						 * or mark 'busy' to be cleared
						 * in the following code which will
						 * take care of the primary stabilization
						 * method (i.e. setting 'busy' to TRUE)
						 */
						dwp->dw_mask |= DW_vm_page_unwire;
					}
					m->vmp_overwriting = FALSE;
				}
				m->vmp_free_when_done = FALSE;
				m->vmp_cleaning = FALSE;

				if (error & UPL_ABORT_DUMP_PAGES) {
					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

					dwp->dw_mask |= DW_vm_page_free;
				} else {
					if (!(dwp->dw_mask & DW_vm_page_unwire)) {
						if (error & UPL_ABORT_REFERENCE) {
							/*
							 * we've been told to explicitly
							 * reference this page... for
							 * file I/O, this is done by
							 * implementing an LRU on the inactive q
							 */
							dwp->dw_mask |= DW_vm_page_lru;
						} else if (!VM_PAGE_PAGEABLE(m)) {
							dwp->dw_mask |= DW_vm_page_deactivate_internal;
						}
					}
					dwp->dw_mask |= DW_PAGE_WAKEUP;
				}
			}
		}
abort_next_page:
		target_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;

		/*
		 * A mask that only asks for clear-busy and/or wakeup is
		 * applied to the page right here; anything else is queued as
		 * delayed work and flushed once dw_limit entries are pending.
		 */
		if (dwp->dw_mask) {
			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);

				if (dw_count >= dw_limit) {
					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);

					dwp = dwp_start;
					dw_count = 0;
				}
			} else {
				if (dwp->dw_mask & DW_clear_busy) {
					m->vmp_busy = FALSE;
				}

				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
					PAGE_WAKEUP(m);
				}
			}
		}
	}
	/* flush whatever delayed work is still pending for this (sub-)UPL */
	if (dw_count) {
		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}

	/* work out whether the UPL still tracks any page */
	if (upl->flags & UPL_DEVICE_MEMORY) {
		occupied = 0;
	} else if (upl->flags & UPL_LITE) {
		uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));

		occupied = !bitmap_is_empty(upl->lite_list, pages);
	} else {
		occupied = !vm_page_queue_empty(&upl->map_object->memq);
	}
	if (occupied == 0) {
		/*
		 * If this UPL element belongs to a Vector UPL and is
		 * empty, then this is the right function to deallocate
		 * it. So go ahead set the *empty variable. The flag
		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
		 * should be considered relevant for the Vector UPL and
		 * not the internal UPLs.
		 */
		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
			*empty = TRUE;
		}

		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
			/*
			 * this is not a paging object
			 * so we need to drop the paging reference
			 * that was taken when we created the UPL
			 * against this object
			 */
			vm_object_activity_end(shadow_object);
			vm_object_collapse(shadow_object, 0, TRUE);
		} else {
			/*
			 * we donated the paging reference to
			 * the map object... vm_pageout_object_terminate
			 * will drop this reference
			 */
		}
	}
	vm_object_unlock(shadow_object);
	if (object != shadow_object) {
		vm_object_unlock(object);
	}

	if (!isVectorUPL) {
		upl_unlock(upl);
	} else {
		/*
		 * If we completed our operations on an UPL that is
		 * part of a Vectored UPL and if empty is TRUE, then
		 * we should go ahead and deallocate this UPL element.
		 * Then we check if this was the last of the UPL elements
		 * within that Vectored UPL. If so, set empty to TRUE
		 * so that in ubc_upl_abort_range or ubc_upl_abort, we
		 * can go ahead and deallocate the Vector UPL too.
		 */
		if (*empty == TRUE) {
			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
			upl_deallocate(upl);
		}
		/* the vector UPL stays locked; go handle the next sub-UPL */
		goto process_upl_to_abort;
	}

	kr = KERN_SUCCESS;

done:
	/* only a context obtained from vm_page_delayed_work_get_ctx() is released */
	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return kr;
}
8729
8730
8731 kern_return_t
upl_abort(upl_t upl,int error)8732 upl_abort(
8733 upl_t upl,
8734 int error)
8735 {
8736 boolean_t empty;
8737
8738 if (upl == UPL_NULL) {
8739 return KERN_INVALID_ARGUMENT;
8740 }
8741
8742 return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8743 }
8744
8745
8746 /* an option on commit should be wire */
8747 kern_return_t
upl_commit(upl_t upl,upl_page_info_t * page_list,mach_msg_type_number_t count)8748 upl_commit(
8749 upl_t upl,
8750 upl_page_info_t *page_list,
8751 mach_msg_type_number_t count)
8752 {
8753 boolean_t empty;
8754
8755 if (upl == UPL_NULL) {
8756 return KERN_INVALID_ARGUMENT;
8757 }
8758
8759 return upl_commit_range(upl, 0, upl->u_size, 0,
8760 page_list, count, &empty);
8761 }
8762
8763
8764 void
iopl_valid_data(upl_t upl,vm_tag_t tag)8765 iopl_valid_data(
8766 upl_t upl,
8767 vm_tag_t tag)
8768 {
8769 vm_object_t object;
8770 vm_offset_t offset;
8771 vm_page_t m, nxt_page = VM_PAGE_NULL;
8772 upl_size_t size;
8773 int wired_count = 0;
8774
8775 if (upl == NULL) {
8776 panic("iopl_valid_data: NULL upl");
8777 }
8778 if (vector_upl_is_valid(upl)) {
8779 panic("iopl_valid_data: vector upl");
8780 }
8781 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
8782 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
8783 }
8784
8785 object = upl->map_object;
8786
8787 if (object == kernel_object || object == compressor_object) {
8788 panic("iopl_valid_data: object == kernel or compressor");
8789 }
8790
8791 if (object->purgable == VM_PURGABLE_VOLATILE ||
8792 object->purgable == VM_PURGABLE_EMPTY) {
8793 panic("iopl_valid_data: object %p purgable %d",
8794 object, object->purgable);
8795 }
8796
8797 size = upl_adjusted_size(upl, PAGE_MASK);
8798
8799 vm_object_lock(object);
8800 VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
8801
8802 bool whole_object;
8803
8804 if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
8805 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
8806 whole_object = true;
8807 } else {
8808 offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
8809 whole_object = false;
8810 }
8811
8812 while (size) {
8813 if (whole_object) {
8814 if (nxt_page != VM_PAGE_NULL) {
8815 m = nxt_page;
8816 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
8817 }
8818 } else {
8819 m = vm_page_lookup(object, offset);
8820 offset += PAGE_SIZE;
8821
8822 if (m == VM_PAGE_NULL) {
8823 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
8824 }
8825 }
8826 if (m->vmp_busy) {
8827 if (!m->vmp_absent) {
8828 panic("iopl_valid_data: busy page w/o absent");
8829 }
8830
8831 if (m->vmp_pageq.next || m->vmp_pageq.prev) {
8832 panic("iopl_valid_data: busy+absent page on page queue");
8833 }
8834 if (m->vmp_reusable) {
8835 panic("iopl_valid_data: %p is reusable", m);
8836 }
8837
8838 m->vmp_absent = FALSE;
8839 m->vmp_dirty = TRUE;
8840 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
8841 assert(m->vmp_wire_count == 0);
8842 m->vmp_wire_count++;
8843 assert(m->vmp_wire_count);
8844 if (m->vmp_wire_count == 1) {
8845 m->vmp_q_state = VM_PAGE_IS_WIRED;
8846 wired_count++;
8847 } else {
8848 panic("iopl_valid_data: %p already wired", m);
8849 }
8850
8851 PAGE_WAKEUP_DONE(m);
8852 }
8853 size -= PAGE_SIZE;
8854 }
8855 if (wired_count) {
8856 VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
8857 assert(object->resident_page_count >= object->wired_page_count);
8858
8859 /* no need to adjust purgeable accounting for this object: */
8860 assert(object->purgable != VM_PURGABLE_VOLATILE);
8861 assert(object->purgable != VM_PURGABLE_EMPTY);
8862
8863 vm_page_lockspin_queues();
8864 vm_page_wire_count += wired_count;
8865 vm_page_unlock_queues();
8866 }
8867 VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
8868 vm_object_unlock(object);
8869 }
8870
8871
8872 void
vm_object_set_pmap_cache_attr(vm_object_t object,upl_page_info_array_t user_page_list,unsigned int num_pages,boolean_t batch_pmap_op)8873 vm_object_set_pmap_cache_attr(
8874 vm_object_t object,
8875 upl_page_info_array_t user_page_list,
8876 unsigned int num_pages,
8877 boolean_t batch_pmap_op)
8878 {
8879 unsigned int cache_attr = 0;
8880
8881 cache_attr = object->wimg_bits & VM_WIMG_MASK;
8882 assert(user_page_list);
8883 if (cache_attr != VM_WIMG_USE_DEFAULT) {
8884 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8885 }
8886 }
8887
8888
8889 static bool
vm_object_iopl_wire_full(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,upl_control_flags_t cntrl_flags,vm_tag_t tag)8890 vm_object_iopl_wire_full(
8891 vm_object_t object,
8892 upl_t upl,
8893 upl_page_info_array_t user_page_list,
8894 upl_control_flags_t cntrl_flags,
8895 vm_tag_t tag)
8896 {
8897 vm_page_t dst_page;
8898 unsigned int entry;
8899 int page_count;
8900 int delayed_unlock = 0;
8901 boolean_t retval = TRUE;
8902 ppnum_t phys_page;
8903
8904 vm_object_lock_assert_exclusive(object);
8905 assert(object->purgable != VM_PURGABLE_VOLATILE);
8906 assert(object->purgable != VM_PURGABLE_EMPTY);
8907 assert(object->pager == NULL);
8908 assert(object->copy == NULL);
8909 assert(object->shadow == NULL);
8910
8911 page_count = object->resident_page_count;
8912 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
8913
8914 vm_page_lock_queues();
8915
8916 while (page_count--) {
8917 if (dst_page->vmp_busy ||
8918 dst_page->vmp_fictitious ||
8919 dst_page->vmp_absent ||
8920 VMP_ERROR_GET(dst_page) ||
8921 dst_page->vmp_cleaning ||
8922 dst_page->vmp_restart ||
8923 dst_page->vmp_laundry) {
8924 retval = FALSE;
8925 goto done;
8926 }
8927 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8928 retval = FALSE;
8929 goto done;
8930 }
8931 dst_page->vmp_reference = TRUE;
8932
8933 vm_page_wire(dst_page, tag, FALSE);
8934
8935 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8936 SET_PAGE_DIRTY(dst_page, FALSE);
8937 }
8938 entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
8939 assert(entry >= 0 && entry < object->resident_page_count);
8940 bitmap_set(upl->lite_list, entry);
8941
8942 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8943
8944 if (phys_page > upl->highest_page) {
8945 upl->highest_page = phys_page;
8946 }
8947
8948 if (user_page_list) {
8949 user_page_list[entry].phys_addr = phys_page;
8950 user_page_list[entry].absent = dst_page->vmp_absent;
8951 user_page_list[entry].dirty = dst_page->vmp_dirty;
8952 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
8953 user_page_list[entry].precious = dst_page->vmp_precious;
8954 user_page_list[entry].device = FALSE;
8955 user_page_list[entry].speculative = FALSE;
8956 user_page_list[entry].cs_validated = FALSE;
8957 user_page_list[entry].cs_tainted = FALSE;
8958 user_page_list[entry].cs_nx = FALSE;
8959 user_page_list[entry].needed = FALSE;
8960 user_page_list[entry].mark = FALSE;
8961 }
8962 if (delayed_unlock++ > 256) {
8963 delayed_unlock = 0;
8964 lck_mtx_yield(&vm_page_queue_lock);
8965
8966 VM_CHECK_MEMORYSTATUS;
8967 }
8968 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
8969 }
8970 done:
8971 vm_page_unlock_queues();
8972
8973 VM_CHECK_MEMORYSTATUS;
8974
8975 return retval;
8976 }
8977
8978
8979 static kern_return_t
vm_object_iopl_wire_empty(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,upl_control_flags_t cntrl_flags,vm_tag_t tag,vm_object_offset_t * dst_offset,int page_count,int * page_grab_count)8980 vm_object_iopl_wire_empty(
8981 vm_object_t object,
8982 upl_t upl,
8983 upl_page_info_array_t user_page_list,
8984 upl_control_flags_t cntrl_flags,
8985 vm_tag_t tag,
8986 vm_object_offset_t *dst_offset,
8987 int page_count,
8988 int *page_grab_count)
8989 {
8990 vm_page_t dst_page;
8991 boolean_t no_zero_fill = FALSE;
8992 int interruptible;
8993 int pages_wired = 0;
8994 int pages_inserted = 0;
8995 int entry = 0;
8996 uint64_t delayed_ledger_update = 0;
8997 kern_return_t ret = KERN_SUCCESS;
8998 int grab_options;
8999 ppnum_t phys_page;
9000
9001 vm_object_lock_assert_exclusive(object);
9002 assert(object->purgable != VM_PURGABLE_VOLATILE);
9003 assert(object->purgable != VM_PURGABLE_EMPTY);
9004 assert(object->pager == NULL);
9005 assert(object->copy == NULL);
9006 assert(object->shadow == NULL);
9007
9008 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9009 interruptible = THREAD_ABORTSAFE;
9010 } else {
9011 interruptible = THREAD_UNINT;
9012 }
9013
9014 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9015 no_zero_fill = TRUE;
9016 }
9017
9018 grab_options = 0;
9019 #if CONFIG_SECLUDED_MEMORY
9020 if (object->can_grab_secluded) {
9021 grab_options |= VM_PAGE_GRAB_SECLUDED;
9022 }
9023 #endif /* CONFIG_SECLUDED_MEMORY */
9024
9025 while (page_count--) {
9026 while ((dst_page = vm_page_grab_options(grab_options))
9027 == VM_PAGE_NULL) {
9028 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
9029
9030 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9031
9032 if (vm_page_wait(interruptible) == FALSE) {
9033 /*
9034 * interrupted case
9035 */
9036 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9037
9038 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9039
9040 ret = MACH_SEND_INTERRUPTED;
9041 goto done;
9042 }
9043 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9044
9045 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9046 }
9047 if (no_zero_fill == FALSE) {
9048 vm_page_zero_fill(dst_page);
9049 } else {
9050 dst_page->vmp_absent = TRUE;
9051 }
9052
9053 dst_page->vmp_reference = TRUE;
9054
9055 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9056 SET_PAGE_DIRTY(dst_page, FALSE);
9057 }
9058 if (dst_page->vmp_absent == FALSE) {
9059 assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
9060 assert(dst_page->vmp_wire_count == 0);
9061 dst_page->vmp_wire_count++;
9062 dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
9063 assert(dst_page->vmp_wire_count);
9064 pages_wired++;
9065 PAGE_WAKEUP_DONE(dst_page);
9066 }
9067 pages_inserted++;
9068
9069 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
9070
9071 bitmap_set(upl->lite_list, entry);
9072
9073 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9074
9075 if (phys_page > upl->highest_page) {
9076 upl->highest_page = phys_page;
9077 }
9078
9079 if (user_page_list) {
9080 user_page_list[entry].phys_addr = phys_page;
9081 user_page_list[entry].absent = dst_page->vmp_absent;
9082 user_page_list[entry].dirty = dst_page->vmp_dirty;
9083 user_page_list[entry].free_when_done = FALSE;
9084 user_page_list[entry].precious = FALSE;
9085 user_page_list[entry].device = FALSE;
9086 user_page_list[entry].speculative = FALSE;
9087 user_page_list[entry].cs_validated = FALSE;
9088 user_page_list[entry].cs_tainted = FALSE;
9089 user_page_list[entry].cs_nx = FALSE;
9090 user_page_list[entry].needed = FALSE;
9091 user_page_list[entry].mark = FALSE;
9092 }
9093 entry++;
9094 *dst_offset += PAGE_SIZE_64;
9095 }
9096 done:
9097 if (pages_wired) {
9098 vm_page_lockspin_queues();
9099 vm_page_wire_count += pages_wired;
9100 vm_page_unlock_queues();
9101 }
9102 if (pages_inserted) {
9103 if (object->internal) {
9104 OSAddAtomic(pages_inserted, &vm_page_internal_count);
9105 } else {
9106 OSAddAtomic(pages_inserted, &vm_page_external_count);
9107 }
9108 }
9109 if (delayed_ledger_update) {
9110 task_t owner;
9111 int ledger_idx_volatile;
9112 int ledger_idx_nonvolatile;
9113 int ledger_idx_volatile_compressed;
9114 int ledger_idx_nonvolatile_compressed;
9115 boolean_t do_footprint;
9116
9117 owner = VM_OBJECT_OWNER(object);
9118 assert(owner);
9119
9120 vm_object_ledger_tag_ledgers(object,
9121 &ledger_idx_volatile,
9122 &ledger_idx_nonvolatile,
9123 &ledger_idx_volatile_compressed,
9124 &ledger_idx_nonvolatile_compressed,
9125 &do_footprint);
9126
9127 /* more non-volatile bytes */
9128 ledger_credit(owner->ledger,
9129 ledger_idx_nonvolatile,
9130 delayed_ledger_update);
9131 if (do_footprint) {
9132 /* more footprint */
9133 ledger_credit(owner->ledger,
9134 task_ledgers.phys_footprint,
9135 delayed_ledger_update);
9136 }
9137 }
9138
9139 assert(page_grab_count);
9140 *page_grab_count = pages_inserted;
9141
9142 return ret;
9143 }
9144
9145
9146
/*
 * vm_object_iopl_request:
 *
 * Create an I/O-wire UPL (created with UPL_IO_WIRE and UPL_CREATE_LITE)
 * describing the pages of "object" that back [offset, offset + size).
 * The range is first expanded to page boundaries.
 *
 * Parameters:
 *   object           object whose pages are requested; it is locked and
 *                    unlocked by this routine
 *   offset, size     requested range within the object
 *   upl_ptr          receives the new UPL; it is assigned right after
 *                    the UPL is created
 *   user_page_list   optional array filled with one entry per page;
 *                    replaced by the UPL's own page_list when
 *                    UPL_SET_INTERNAL is passed
 *   page_list_count  optional; set to 0 for internal UPLs, to 1 for a
 *                    physically contiguous object otherwise, and clamped
 *                    to the number of pages in the remaining cases
 *   cntrl_flags      UPL_* control flags; any bit outside
 *                    UPL_VALID_FLAGS is rejected
 *   tag              VM tag passed along to the wiring routines
 *
 * Returns:
 *   KERN_SUCCESS              UPL built
 *   KERN_INVALID_VALUE        unknown flag, or UPL_NEED_32BIT_ADDR
 *                             without UPL_SET_IO_WIRE | UPL_SET_LITE
 *   KERN_INVALID_ADDRESS      physically contiguous range reaches
 *                             max_valid_dma_address
 *   KERN_MEMORY_ERROR         page unavailable with UPL_REQUEST_NO_FAULT,
 *                             or fault failure without an error code
 *   MACH_SEND_INTERRUPTED     wait for pages or fault was interrupted
 *   KERN_PROTECTION_FAILURE   high page already wired, cannot be
 *                             substituted for UPL_NEED_32BIT_ADDR
 *   KERN_RESOURCE_SHORTAGE    no low page available for substitution
 *   (or the error code reported by vm_fault_page())
 *
 * NOTE(review): on the "return_err" path the UPL is destroyed but
 * *upl_ptr is not reset here -- callers presumably ignore it on
 * failure; confirm.
 */
kern_return_t
vm_object_iopl_request(
	vm_object_t             object,
	vm_object_offset_t      offset,
	upl_size_t              size,
	upl_t                   *upl_ptr,
	upl_page_info_array_t   user_page_list,
	unsigned int            *page_list_count,
	upl_control_flags_t     cntrl_flags,
	vm_tag_t                tag)
{
	vm_page_t               dst_page;
	vm_object_offset_t      dst_offset;
	upl_size_t              xfer_size;
	upl_t                   upl = NULL;
	unsigned int            entry;
	int                     no_zero_fill = FALSE;
	unsigned int            size_in_pages;
	int                     page_grab_count = 0;
	u_int32_t               psize;
	kern_return_t           ret;
	vm_prot_t               prot;
	struct vm_object_fault_info fault_info = {};
	struct vm_page_delayed_work dw_array;
	struct vm_page_delayed_work *dwp, *dwp_start;
	bool                    dwp_finish_ctx = TRUE;
	int                     dw_count;
	int                     dw_limit;
	int                     dw_index;
	boolean_t               caller_lookup;
	int                     io_tracking_flag = 0;
	int                     interruptible;
	ppnum_t                 phys_page;

	boolean_t               set_cache_attr_needed = FALSE;
	boolean_t               free_wired_pages = FALSE;
	boolean_t               fast_path_empty_req = FALSE;
	boolean_t               fast_path_full_req = FALSE;

#if DEVELOPMENT || DEBUG
	task_t                  task = current_task();
#endif /* DEVELOPMENT || DEBUG */

	dwp_start = dwp = NULL;

	vm_object_offset_t original_offset = offset;
	upl_size_t original_size = size;

//	DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);

	/* expand the request to whole pages; the original values are kept for the UPL */
	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
	offset = vm_object_trunc_page(offset);
	if (size != original_size || offset != original_offset) {
		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
	}

	if (cntrl_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		return KERN_INVALID_VALUE;
	}
	if (vm_lopage_needed == FALSE) {
		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
	}

	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
			return KERN_INVALID_VALUE;
		}

		if (object->phys_contiguous) {
			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
				return KERN_INVALID_ADDRESS;
			}

			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
				return KERN_INVALID_ADDRESS;
			}
		}
	}
	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
		no_zero_fill = TRUE;
	}

	if (cntrl_flags & UPL_COPYOUT_FROM) {
		prot = VM_PROT_READ;
	} else {
		prot = VM_PROT_READ | VM_PROT_WRITE;
	}

	if ((!object->internal) && (object->paging_offset != 0)) {
		panic("vm_object_iopl_request: external object with non-zero paging offset");
	}


	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);

#if CONFIG_IOSCHED || UPL_DEBUG
	if ((object->io_tracking && object != kernel_object) || upl_debug_enabled) {
		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
	}
#endif

#if CONFIG_IOSCHED
	if (object->io_tracking) {
		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
		if (object != kernel_object) {
			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
		}
	}
#endif

	if (object->phys_contiguous) {
		/* a single entry describes the whole contiguous range */
		psize = PAGE_SIZE;
	} else {
		psize = size;

		/*
		 * Set up the delayed-work batch; if no context is available,
		 * fall back to the single on-stack entry with a limit of 1.
		 */
		dw_count = 0;
		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
		dwp_start = vm_page_delayed_work_get_ctx();
		if (dwp_start == NULL) {
			dwp_start = &dw_array;
			dw_limit = 1;
			dwp_finish_ctx = FALSE;
		}

		dwp = dwp_start;
	}

	if (cntrl_flags & UPL_SET_INTERNAL) {
		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
		/* internal UPLs use their own page list instead of the caller's */
		user_page_list = size ? upl->page_list : NULL;
	} else {
		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
	}
	if (user_page_list) {
		user_page_list[0].device = FALSE;
	}
	*upl_ptr = upl;

	if (cntrl_flags & UPL_NOZEROFILLIO) {
		DTRACE_VM4(upl_nozerofillio,
		    vm_object_t, object,
		    vm_object_offset_t, offset,
		    upl_size_t, size,
		    upl_t, upl);
	}

	upl->map_object = object;
	upl->u_offset = original_offset;
	upl->u_size = original_size;

	size_in_pages = size / PAGE_SIZE;

	if (object == kernel_object &&
	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
		upl->flags |= UPL_KERNEL_OBJECT;
#if UPL_DEBUG
		vm_object_lock(object);
#else
		vm_object_lock_shared(object);
#endif
	} else {
		vm_object_lock(object);
		vm_object_activity_begin(object);
	}
	/*
	 * paging in progress also protects the paging_offset
	 */
	upl->u_offset = original_offset + object->paging_offset;

	if (cntrl_flags & UPL_BLOCK_ACCESS) {
		/*
		 * The user requested that access to the pages in this UPL
		 * be blocked until the UPL is committed or aborted.
		 */
		upl->flags |= UPL_ACCESS_BLOCKED;
	}

#if CONFIG_IOSCHED || UPL_DEBUG
	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
		vm_object_activity_begin(object);
		queue_enter(&object->uplq, upl, upl_t, uplq);
	}
#endif

	/*
	 * Physically contiguous object: no per-page work is needed, so
	 * describe the range and return right away.
	 */
	if (object->phys_contiguous) {
		if (upl->flags & UPL_ACCESS_BLOCKED) {
			assert(!object->blocked_access);
			object->blocked_access = TRUE;
		}

		vm_object_unlock(object);

		/*
		 * don't need any shadow mappings for this one
		 * since it is already I/O memory
		 */
		upl->flags |= UPL_DEVICE_MEMORY;

		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);

		if (user_page_list) {
			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
			user_page_list[0].device = TRUE;
		}
		if (page_list_count != NULL) {
			if (upl->flags & UPL_INTERNAL) {
				*page_list_count = 0;
			} else {
				*page_list_count = 1;
			}
		}

		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
		if (task != NULL) {
			ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
		}
#endif /* DEVELOPMENT || DEBUG */
		return KERN_SUCCESS;
	}
	if (object != kernel_object && object != compressor_object) {
		/*
		 * Protect user space from future COW operations
		 */
#if VM_OBJECT_TRACKING_OP_TRUESHARE
		if (!object->true_share &&
		    vm_object_tracking_btlog) {
			btlog_record(vm_object_tracking_btlog, object,
			    VM_OBJECT_TRACKING_OP_TRUESHARE,
			    btref_get(__builtin_frame_address(0), 0));
		}
#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */

		vm_object_lock_assert_exclusive(object);
		object->true_share = TRUE;

		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
	}

	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
	    object->copy != VM_OBJECT_NULL) {
		/*
		 * Honor copy-on-write obligations
		 *
		 * The caller is gathering these pages and
		 * might modify their contents.  We need to
		 * make sure that the copy object has its own
		 * private copies of these pages before we let
		 * the caller modify them.
		 *
		 * NOTE:  someone else could map the original object
		 * after we've done this copy-on-write here, and they
		 * could then see an inconsistent picture of the memory
		 * while it's being modified via the UPL.  To prevent this,
		 * we would have to block access to these pages until the
		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
		 * code path for that...
		 */
		vm_object_update(object,
		    offset,
		    size,
		    NULL,
		    NULL,
		    FALSE,              /* should_return */
		    MEMORY_OBJECT_COPY_SYNC,
		    VM_PROT_NO_CHANGE);
		VM_PAGEOUT_DEBUG(iopl_cow, 1);
		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
	}
	/*
	 * Fast-path eligibility: the request covers the entire object, the
	 * object has no pager, shadow or copy object and is not a volatile
	 * or empty purgeable, and it is either fully resident or has no
	 * resident pages at all.
	 */
	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
	    object->purgable != VM_PURGABLE_VOLATILE &&
	    object->purgable != VM_PURGABLE_EMPTY &&
	    object->copy == NULL &&
	    size == object->vo_size &&
	    offset == 0 &&
	    object->shadow == NULL &&
	    object->pager == NULL) {
		if (object->resident_page_count == size_in_pages) {
			assert(object != compressor_object);
			assert(object != kernel_object);
			fast_path_full_req = TRUE;
		} else if (object->resident_page_count == 0) {
			assert(object != compressor_object);
			assert(object != kernel_object);
			fast_path_empty_req = TRUE;
			set_cache_attr_needed = TRUE;
		}
	}

	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
		interruptible = THREAD_ABORTSAFE;
	} else {
		interruptible = THREAD_UNINT;
	}

	entry = 0;

	xfer_size = size;
	dst_offset = offset;

	if (fast_path_full_req) {
		if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
			goto finish;
		}
		/*
		 * we couldn't complete the processing of this request on the fast path
		 * so fall through to the slow path and finish up
		 */
	} else if (fast_path_empty_req) {
		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
			ret = KERN_MEMORY_ERROR;
			goto return_err;
		}
		ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
		    cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);

		if (ret) {
			/* the pages inserted so far are freed by the error path */
			free_wired_pages = TRUE;
			goto return_err;
		}
		goto finish;
	}

	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.lo_offset = offset;
	fault_info.hi_offset = offset + xfer_size;
	fault_info.mark_zf_absent = TRUE;
	fault_info.interruptible = interruptible;
	fault_info.batch_pmap_op = TRUE;

	/*
	 * Slow path: handle the range one page at a time, faulting pages
	 * in as needed and queueing wire/reference updates as delayed work.
	 */
	while (xfer_size) {
		vm_fault_return_t       result;

		dwp->dw_mask = 0;

		if (fast_path_full_req) {
			/*
			 * if we get here, it means that we ran into a page
			 * state we couldn't handle in the fast path and
			 * bailed out to the slow path... since the order
			 * we look at pages is different between the 2 paths,
			 * the following check is needed to determine whether
			 * this page was already processed in the fast path
			 */
			if (bitmap_test(upl->lite_list, entry)) {
				goto skip_page;
			}
		}
		dst_page = vm_page_lookup(object, dst_offset);

		if (dst_page == VM_PAGE_NULL ||
		    dst_page->vmp_busy ||
		    VMP_ERROR_GET(dst_page) ||
		    dst_page->vmp_restart ||
		    dst_page->vmp_absent ||
		    dst_page->vmp_fictitious) {
			if (object == kernel_object) {
				panic("vm_object_iopl_request: missing/bad page in kernel object");
			}
			if (object == compressor_object) {
				panic("vm_object_iopl_request: missing/bad page in compressor object");
			}

			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
				ret = KERN_MEMORY_ERROR;
				goto return_err;
			}
			set_cache_attr_needed = TRUE;

			/*
			 * We just looked up the page and the result remains valid
			 * until the object lock is released, so send it to
			 * vm_fault_page() (as "dst_page"), to avoid having to
			 * look it up again there.
			 */
			caller_lookup = TRUE;

			/* keep calling vm_fault_page() until it succeeds or fails for good */
			do {
				vm_page_t       top_page;
				kern_return_t   error_code;

				fault_info.cluster_size = xfer_size;

				vm_object_paging_begin(object);

				result = vm_fault_page(object, dst_offset,
				    prot | VM_PROT_WRITE, FALSE,
				    caller_lookup,
				    &prot, &dst_page, &top_page,
				    (int *)0,
				    &error_code, no_zero_fill,
				    &fault_info);

				/* our lookup is no longer valid at this point */
				caller_lookup = FALSE;

				switch (result) {
				case VM_FAULT_SUCCESS:
					page_grab_count++;

					if (!dst_page->vmp_absent) {
						PAGE_WAKEUP_DONE(dst_page);
					} else {
						/*
						 * we only get back an absent page if we
						 * requested that it not be zero-filled
						 * because we are about to fill it via I/O
						 *
						 * absent pages should be left BUSY
						 * to prevent them from being faulted
						 * into an address space before we've
						 * had a chance to complete the I/O on
						 * them since they may contain info that
						 * shouldn't be seen by the faulting task
						 */
					}
					/*
					 *	Release paging references and
					 *	top-level placeholder page, if any.
					 */
					if (top_page != VM_PAGE_NULL) {
						vm_object_t local_object;

						local_object = VM_PAGE_OBJECT(top_page);

						/*
						 * comparing 2 packed pointers
						 */
						if (top_page->vmp_object != dst_page->vmp_object) {
							vm_object_lock(local_object);
							VM_PAGE_FREE(top_page);
							vm_object_paging_end(local_object);
							vm_object_unlock(local_object);
						} else {
							VM_PAGE_FREE(top_page);
							vm_object_paging_end(local_object);
						}
					}
					vm_object_paging_end(object);
					break;

				case VM_FAULT_RETRY:
					vm_object_lock(object);
					break;

				case VM_FAULT_MEMORY_SHORTAGE:
					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

					if (vm_page_wait(interruptible)) {
						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);

						VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
						vm_object_lock(object);

						break;
					}
					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
					OS_FALLTHROUGH;

				case VM_FAULT_INTERRUPTED:
					error_code = MACH_SEND_INTERRUPTED;
					OS_FALLTHROUGH;
				case VM_FAULT_MEMORY_ERROR:
memory_error:
					ret = (error_code ? error_code: KERN_MEMORY_ERROR);

					/* the error path expects the object to be locked */
					vm_object_lock(object);
					goto return_err;

				case VM_FAULT_SUCCESS_NO_VM_PAGE:
					/* success but no page: fail */
					vm_object_paging_end(object);
					vm_object_unlock(object);
					goto memory_error;

				default:
					panic("vm_object_iopl_request: unexpected error"
					    " 0x%x from vm_fault_page()\n", result);
				}
			} while (result != VM_FAULT_SUCCESS);
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		if (upl->flags & UPL_KERNEL_OBJECT) {
			goto record_phys_addr;
		}

		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
			dst_page->vmp_busy = TRUE;
			goto record_phys_addr;
		}

		if (dst_page->vmp_cleaning) {
			/*
			 * Someone else is cleaning this page in place.
			 * In theory, we should be able to  proceed and use this
			 * page but they'll probably end up clearing the "busy"
			 * bit on it in upl_commit_range() but they didn't set
			 * it, so they would clear our "busy" bit and open
			 * us to race conditions.
			 * We'd better wait for the cleaning to complete and
			 * then try again.
			 */
			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
			PAGE_SLEEP(object, dst_page, THREAD_UNINT);
			continue;
		}
		if (dst_page->vmp_laundry) {
			vm_pageout_steal_laundry(dst_page, FALSE);
		}

		if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
		    phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
			vm_page_t       low_page;
			int             refmod;

			/*
			 * support devices that can't DMA above 32 bits
			 * by substituting pages from a pool of low address
			 * memory for any pages we find above the 4G mark
			 * can't substitute if the page is already wired because
			 * we don't know whether that physical address has been
			 * handed out to some other 64 bit capable DMA device to use
			 */
			if (VM_PAGE_WIRED(dst_page)) {
				ret = KERN_PROTECTION_FAILURE;
				goto return_err;
			}
			low_page = vm_page_grablo();

			if (low_page == VM_PAGE_NULL) {
				ret = KERN_RESOURCE_SHORTAGE;
				goto return_err;
			}
			/*
			 * from here until the vm_page_replace completes
			 * we mustn't drop the object lock... we don't
			 * want anyone refaulting this page in and using
			 * it after we disconnect it... we want the fault
			 * to find the new page being substituted.
			 */
			if (dst_page->vmp_pmapped) {
				refmod = pmap_disconnect(phys_page);
			} else {
				refmod = 0;
			}

			if (!dst_page->vmp_absent) {
				vm_page_copy(dst_page, low_page);
			}

			low_page->vmp_reference = dst_page->vmp_reference;
			low_page->vmp_dirty     = dst_page->vmp_dirty;
			low_page->vmp_absent    = dst_page->vmp_absent;

			if (refmod & VM_MEM_REFERENCED) {
				low_page->vmp_reference = TRUE;
			}
			if (refmod & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(low_page, FALSE);
			}

			vm_page_replace(low_page, object, dst_offset);

			dst_page = low_page;
			/*
			 * vm_page_grablo returned the page marked
			 * BUSY... we don't need a PAGE_WAKEUP_DONE
			 * here, because we've never dropped the object lock
			 */
			if (!dst_page->vmp_absent) {
				dst_page->vmp_busy = FALSE;
			}

			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
		}
		/* busy pages are not queued for wiring here */
		if (!dst_page->vmp_busy) {
			dwp->dw_mask |= DW_vm_page_wire;
		}

		if (cntrl_flags & UPL_BLOCK_ACCESS) {
			/*
			 * Mark the page "busy" to block any future page fault
			 * on this page in addition to wiring it.
			 * We'll also remove the mapping
			 * of all these pages before leaving this routine.
			 */
			assert(!dst_page->vmp_fictitious);
			dst_page->vmp_busy = TRUE;
		}
		/*
		 * expect the page to be used
		 * page queues lock must be held to set 'reference'
		 */
		dwp->dw_mask |= DW_set_reference;

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			SET_PAGE_DIRTY(dst_page, TRUE);
			/*
			 * Page belonging to a code-signed object is about to
			 * be written. Mark it tainted and disconnect it from
			 * all pmaps so processes have to fault it back in and
			 * deal with the tainted bit.
			 */
			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
				vm_page_iopl_tainted++;
				if (dst_page->vmp_pmapped) {
					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
					if (refmod & VM_MEM_REFERENCED) {
						dst_page->vmp_reference = TRUE;
					}
				}
			}
		}
		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
			pmap_sync_page_attributes_phys(phys_page);
			dst_page->vmp_written_by_kernel = FALSE;
		}

record_phys_addr:
		if (dst_page->vmp_busy) {
			upl->flags |= UPL_HAS_BUSY;
		}

		bitmap_set(upl->lite_list, entry);

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
			user_page_list[entry].absent    = dst_page->vmp_absent;
			user_page_list[entry].dirty     = dst_page->vmp_dirty;
			user_page_list[entry].precious  = dst_page->vmp_precious;
			user_page_list[entry].device    = FALSE;
			user_page_list[entry].needed    = FALSE;
			if (dst_page->vmp_clustered == TRUE) {
				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
			} else {
				user_page_list[entry].speculative = FALSE;
			}
			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
			user_page_list[entry].mark      = FALSE;
		}
		if (object != kernel_object && object != compressor_object) {
			/*
			 * someone is explicitly grabbing this page...
			 * update clustered and speculative state
			 *
			 */
			if (dst_page->vmp_clustered) {
				VM_PAGE_CONSUME_CLUSTERED(dst_page);
			}
		}
skip_page:
		entry++;
		dst_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;

		/* dw_mask is still 0 for pages skipped above, so nothing is queued for them */
		if (dwp->dw_mask) {
			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);

			if (dw_count >= dw_limit) {
				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);

				dwp = dwp_start;
				dw_count = 0;
			}
		}
	}
	assert(entry == size_in_pages);

	/* flush whatever delayed work is still pending */
	if (dw_count) {
		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}
finish:
	if (user_page_list && set_cache_attr_needed == TRUE) {
		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
	}

	if (page_list_count != NULL) {
		if (upl->flags & UPL_INTERNAL) {
			*page_list_count = 0;
		} else if (*page_list_count > size_in_pages) {
			*page_list_count = size_in_pages;
		}
	}
	vm_object_unlock(object);

	if (cntrl_flags & UPL_BLOCK_ACCESS) {
		/*
		 * We've marked all the pages "busy" so that future
		 * page faults will block.
		 * Now remove the mapping for these pages, so that they
		 * can't be accessed without causing a page fault.
		 */
		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
		    PMAP_NULL,
		    PAGE_SIZE,
		    0, VM_PROT_NONE);
		assert(!object->blocked_access);
		object->blocked_access = TRUE;
	}

	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	/* only release the delayed-work context if one was actually obtained */
	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return KERN_SUCCESS;

return_err:
	/*
	 * Error unwind: revisit every page in [offset, dst_offset), i.e.
	 * the pages processed before the failure, then tear down the UPL.
	 */
	dw_index = 0;

	for (; offset < dst_offset; offset += PAGE_SIZE) {
		boolean_t need_unwire;

		dst_page = vm_page_lookup(object, offset);

		if (dst_page == VM_PAGE_NULL) {
			panic("vm_object_iopl_request: Wired page missing.");
		}

		/*
		 * if we've already processed this page in an earlier
		 * dw_do_work, we need to undo the wiring... we will
		 * leave the dirty and reference bits on if they
		 * were set, since we don't have a good way of knowing
		 * what the previous state was and we won't get here
		 * under any normal circumstances...  we will always
		 * clear BUSY and wakeup any waiters via vm_page_free
		 * or PAGE_WAKEUP_DONE
		 */
		need_unwire = TRUE;

		if (dw_count) {
			if ((dwp_start)[dw_index].dw_m == dst_page) {
				/*
				 * still in the deferred work list
				 * which means we haven't yet called
				 * vm_page_wire on this page
				 */
				need_unwire = FALSE;

				dw_index++;
				dw_count--;
			}
		}
		vm_page_lock_queues();

		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
			vm_page_free(dst_page);

			need_unwire = FALSE;
		} else {
			if (need_unwire == TRUE) {
				vm_page_unwire(dst_page, TRUE);
			}

			PAGE_WAKEUP_DONE(dst_page);
		}
		vm_page_unlock_queues();

		if (need_unwire == TRUE) {
			counter_inc(&vm_statistics_reactivations);
		}
	}
#if UPL_DEBUG
	upl->upl_state = 2;
#endif
	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
	}
	vm_object_unlock(object);
	upl_destroy(upl);

	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}
	return ret;
}
9962
9963 kern_return_t
upl_transpose(upl_t upl1,upl_t upl2)9964 upl_transpose(
9965 upl_t upl1,
9966 upl_t upl2)
9967 {
9968 kern_return_t retval;
9969 boolean_t upls_locked;
9970 vm_object_t object1, object2;
9971
9972 /* LD: Should mapped UPLs be eligible for a transpose? */
9973 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
9974 return KERN_INVALID_ARGUMENT;
9975 }
9976
9977 upls_locked = FALSE;
9978
9979 /*
9980 * Since we need to lock both UPLs at the same time,
9981 * avoid deadlocks by always taking locks in the same order.
9982 */
9983 if (upl1 < upl2) {
9984 upl_lock(upl1);
9985 upl_lock(upl2);
9986 } else {
9987 upl_lock(upl2);
9988 upl_lock(upl1);
9989 }
9990 upls_locked = TRUE; /* the UPLs will need to be unlocked */
9991
9992 object1 = upl1->map_object;
9993 object2 = upl2->map_object;
9994
9995 if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
9996 upl1->u_size != upl2->u_size) {
9997 /*
9998 * We deal only with full objects, not subsets.
9999 * That's because we exchange the entire backing store info
10000 * for the objects: pager, resident pages, etc... We can't do
10001 * only part of it.
10002 */
10003 retval = KERN_INVALID_VALUE;
10004 goto done;
10005 }
10006
10007 /*
10008 * Tranpose the VM objects' backing store.
10009 */
10010 retval = vm_object_transpose(object1, object2,
10011 upl_adjusted_size(upl1, PAGE_MASK));
10012
10013 if (retval == KERN_SUCCESS) {
10014 /*
10015 * Make each UPL point to the correct VM object, i.e. the
10016 * object holding the pages that the UPL refers to...
10017 */
10018 #if CONFIG_IOSCHED || UPL_DEBUG
10019 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10020 vm_object_lock(object1);
10021 vm_object_lock(object2);
10022 }
10023 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10024 queue_remove(&object1->uplq, upl1, upl_t, uplq);
10025 }
10026 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10027 queue_remove(&object2->uplq, upl2, upl_t, uplq);
10028 }
10029 #endif
10030 upl1->map_object = object2;
10031 upl2->map_object = object1;
10032
10033 #if CONFIG_IOSCHED || UPL_DEBUG
10034 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10035 queue_enter(&object2->uplq, upl1, upl_t, uplq);
10036 }
10037 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10038 queue_enter(&object1->uplq, upl2, upl_t, uplq);
10039 }
10040 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10041 vm_object_unlock(object2);
10042 vm_object_unlock(object1);
10043 }
10044 #endif
10045 }
10046
10047 done:
10048 /*
10049 * Cleanup.
10050 */
10051 if (upls_locked) {
10052 upl_unlock(upl1);
10053 upl_unlock(upl2);
10054 upls_locked = FALSE;
10055 }
10056
10057 return retval;
10058 }
10059
10060 void
upl_range_needed(upl_t upl,int index,int count)10061 upl_range_needed(
10062 upl_t upl,
10063 int index,
10064 int count)
10065 {
10066 int size_in_pages;
10067
10068 if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
10069 return;
10070 }
10071
10072 size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
10073
10074 while (count-- && index < size_in_pages) {
10075 upl->page_list[index++].needed = TRUE;
10076 }
10077 }
10078
10079
/*
 * Reserve of virtual addresses in the kernel address space.
 * We need to map the physical pages in the kernel, so that we
 * can call the code-signing or slide routines with a kernel
 * virtual address.  We keep this pool of pre-allocated kernel
 * virtual addresses so that we don't have to scan the kernel's
 * virtual address space each time we need to work with
 * a physical page.
 */
/* taken around accesses to vm_paging_page_inuse[] and the waiter counters */
SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
/* number of page-sized slots in the pool */
#define VM_PAGING_NUM_PAGES     64
/* base address of the pool; filled in by vm_paging_map_init() */
SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
/* vm_paging_page_inuse[i] is TRUE while slot i of the pool is handed out */
bool            vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
/* highest slot index that vm_paging_map_object() has recorded so far */
int             vm_paging_max_index = 0;
/* threads currently waiting for a slot; its address is also the wait event */
int             vm_paging_page_waiter = 0;
/* running total of waits for a slot */
int             vm_paging_page_waiter_total = 0;

/* statistics counters updated by vm_paging_map_object() */
unsigned long   vm_paging_no_kernel_page = 0;           /* no free slot in the pool */
unsigned long   vm_paging_objects_mapped = 0;           /* mappings made from the pool */
unsigned long   vm_paging_pages_mapped = 0;             /* pages mapped from the pool */
unsigned long   vm_paging_objects_mapped_slow = 0;      /* mappings made via vm_map_enter() */
unsigned long   vm_paging_pages_mapped_slow = 0;        /* pages mapped via vm_map_enter() */
10102
/*
 * vm_paging_map_init:
 *	Startup hook that allocates VM_PAGING_NUM_PAGES pages worth of
 *	address space from kernel_map and stores the base of that range in
 *	vm_paging_base_address.
 *
 *	The kmem_alloc() result is not checked; presumably KMA_NOFAIL makes
 *	that safe -- confirm against kmem_alloc().
 */
__startup_func
static void
vm_paging_map_init(void)
{
	kmem_alloc(kernel_map, &vm_paging_base_address,
	    ptoa(VM_PAGING_NUM_PAGES),
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
	    VM_KERN_MEMORY_NONE);
}
/* run vm_paging_map_init() in the ZALLOC startup phase, at STARTUP_RANK_LAST */
STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
10113
10114 /*
10115 * vm_paging_map_object:
10116 * Maps part of a VM object's pages in the kernel
10117 * virtual address space, using the pre-allocated
10118 * kernel virtual addresses, if possible.
10119 * Context:
10120 * The VM object is locked. This lock will get
10121 * dropped and re-acquired though, so the caller
10122 * must make sure the VM object is kept alive
10123 * (by holding a VM map that has a reference
10124 * on it, for example, or taking an extra reference).
10125 * The page should also be kept busy to prevent
10126 * it from being reclaimed.
10127 */
10128 kern_return_t
vm_paging_map_object(vm_page_t page,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection,boolean_t can_unlock_object,vm_map_size_t * size,vm_map_offset_t * address,boolean_t * need_unmap)10129 vm_paging_map_object(
10130 vm_page_t page,
10131 vm_object_t object,
10132 vm_object_offset_t offset,
10133 vm_prot_t protection,
10134 boolean_t can_unlock_object,
10135 vm_map_size_t *size, /* IN/OUT */
10136 vm_map_offset_t *address, /* OUT */
10137 boolean_t *need_unmap) /* OUT */
10138 {
10139 kern_return_t kr;
10140 vm_map_offset_t page_map_offset;
10141 vm_map_size_t map_size;
10142 vm_object_offset_t object_offset;
10143 int i;
10144
10145 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
10146 /* use permanent 1-to-1 kernel mapping of physical memory ? */
10147 *address = (vm_map_offset_t)
10148 phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
10149 *need_unmap = FALSE;
10150 return KERN_SUCCESS;
10151
10152 assert(page->vmp_busy);
10153 /*
10154 * Use one of the pre-allocated kernel virtual addresses
10155 * and just enter the VM page in the kernel address space
10156 * at that virtual address.
10157 */
10158 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10159
10160 /*
10161 * Try and find an available kernel virtual address
10162 * from our pre-allocated pool.
10163 */
10164 page_map_offset = 0;
10165 for (;;) {
10166 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
10167 if (vm_paging_page_inuse[i] == FALSE) {
10168 page_map_offset =
10169 vm_paging_base_address +
10170 (i * PAGE_SIZE);
10171 break;
10172 }
10173 }
10174 if (page_map_offset != 0) {
10175 /* found a space to map our page ! */
10176 break;
10177 }
10178
10179 if (can_unlock_object) {
10180 /*
10181 * If we can afford to unlock the VM object,
10182 * let's take the slow path now...
10183 */
10184 break;
10185 }
10186 /*
10187 * We can't afford to unlock the VM object, so
10188 * let's wait for a space to become available...
10189 */
10190 vm_paging_page_waiter_total++;
10191 vm_paging_page_waiter++;
10192 kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
10193 if (kr == THREAD_WAITING) {
10194 simple_unlock(&vm_paging_lock);
10195 kr = thread_block(THREAD_CONTINUE_NULL);
10196 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10197 }
10198 vm_paging_page_waiter--;
10199 /* ... and try again */
10200 }
10201
10202 if (page_map_offset != 0) {
10203 /*
10204 * We found a kernel virtual address;
10205 * map the physical page to that virtual address.
10206 */
10207 if (i > vm_paging_max_index) {
10208 vm_paging_max_index = i;
10209 }
10210 vm_paging_page_inuse[i] = TRUE;
10211 simple_unlock(&vm_paging_lock);
10212
10213 page->vmp_pmapped = TRUE;
10214
10215 /*
10216 * Keep the VM object locked over the PMAP_ENTER
10217 * and the actual use of the page by the kernel,
10218 * or this pmap mapping might get undone by a
10219 * vm_object_pmap_protect() call...
10220 */
10221 PMAP_ENTER(kernel_pmap,
10222 page_map_offset,
10223 page,
10224 protection,
10225 VM_PROT_NONE,
10226 0,
10227 TRUE,
10228 kr);
10229 assert(kr == KERN_SUCCESS);
10230 vm_paging_objects_mapped++;
10231 vm_paging_pages_mapped++;
10232 *address = page_map_offset;
10233 *need_unmap = TRUE;
10234
10235 #if KASAN
10236 kasan_notify_address(page_map_offset, PAGE_SIZE);
10237 #endif
10238
10239 /* all done and mapped, ready to use ! */
10240 return KERN_SUCCESS;
10241 }
10242
10243 /*
10244 * We ran out of pre-allocated kernel virtual
10245 * addresses. Just map the page in the kernel
10246 * the slow and regular way.
10247 */
10248 vm_paging_no_kernel_page++;
10249 simple_unlock(&vm_paging_lock);
10250 }
10251
10252 if (!can_unlock_object) {
10253 *address = 0;
10254 *size = 0;
10255 *need_unmap = FALSE;
10256 return KERN_NOT_SUPPORTED;
10257 }
10258
10259 object_offset = vm_object_trunc_page(offset);
10260 map_size = vm_map_round_page(*size,
10261 VM_MAP_PAGE_MASK(kernel_map));
10262
10263 /*
10264 * Try and map the required range of the object
10265 * in the kernel_map. Given that allocation is
10266 * for pageable memory, it shouldn't contain
10267 * pointers and is mapped into the data range.
10268 */
10269
10270 vm_object_reference_locked(object); /* for the map entry */
10271 vm_object_unlock(object);
10272
10273 kr = vm_map_enter(kernel_map,
10274 address,
10275 map_size,
10276 0,
10277 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
10278 object,
10279 object_offset,
10280 FALSE,
10281 protection,
10282 VM_PROT_ALL,
10283 VM_INHERIT_NONE);
10284 if (kr != KERN_SUCCESS) {
10285 *address = 0;
10286 *size = 0;
10287 *need_unmap = FALSE;
10288 vm_object_deallocate(object); /* for the map entry */
10289 vm_object_lock(object);
10290 return kr;
10291 }
10292
10293 *size = map_size;
10294
10295 /*
10296 * Enter the mapped pages in the page table now.
10297 */
10298 vm_object_lock(object);
10299 /*
10300 * VM object must be kept locked from before PMAP_ENTER()
10301 * until after the kernel is done accessing the page(s).
10302 * Otherwise, the pmap mappings in the kernel could be
10303 * undone by a call to vm_object_pmap_protect().
10304 */
10305
10306 for (page_map_offset = 0;
10307 map_size != 0;
10308 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
10309 page = vm_page_lookup(object, offset + page_map_offset);
10310 if (page == VM_PAGE_NULL) {
10311 printf("vm_paging_map_object: no page !?");
10312 vm_object_unlock(object);
10313 vm_map_remove(kernel_map, *address, *size);
10314 *address = 0;
10315 *size = 0;
10316 *need_unmap = FALSE;
10317 vm_object_lock(object);
10318 return KERN_MEMORY_ERROR;
10319 }
10320 page->vmp_pmapped = TRUE;
10321
10322 PMAP_ENTER(kernel_pmap,
10323 *address + page_map_offset,
10324 page,
10325 protection,
10326 VM_PROT_NONE,
10327 0,
10328 TRUE,
10329 kr);
10330 assert(kr == KERN_SUCCESS);
10331 #if KASAN
10332 kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
10333 #endif
10334 }
10335
10336 vm_paging_objects_mapped_slow++;
10337 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
10338
10339 *need_unmap = TRUE;
10340
10341 return KERN_SUCCESS;
10342 }
10343
10344 /*
10345 * vm_paging_unmap_object:
10346 * Unmaps part of a VM object's pages from the kernel
10347 * virtual address space.
10348 * Context:
10349 * The VM object is locked. This lock will get
10350 * dropped and re-acquired though.
10351 */
10352 void
vm_paging_unmap_object(vm_object_t object,vm_map_offset_t start,vm_map_offset_t end)10353 vm_paging_unmap_object(
10354 vm_object_t object,
10355 vm_map_offset_t start,
10356 vm_map_offset_t end)
10357 {
10358 int i;
10359
10360 if ((vm_paging_base_address == 0) ||
10361 (start < vm_paging_base_address) ||
10362 (end > (vm_paging_base_address
10363 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
10364 /*
10365 * We didn't use our pre-allocated pool of
10366 * kernel virtual address. Deallocate the
10367 * virtual memory.
10368 */
10369 if (object != VM_OBJECT_NULL) {
10370 vm_object_unlock(object);
10371 }
10372 vm_map_remove(kernel_map, start, end);
10373 if (object != VM_OBJECT_NULL) {
10374 vm_object_lock(object);
10375 }
10376 } else {
10377 /*
10378 * We used a kernel virtual address from our
10379 * pre-allocated pool. Put it back in the pool
10380 * for next time.
10381 */
10382 assert(end - start == PAGE_SIZE);
10383 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
10384 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
10385
10386 /* undo the pmap mapping */
10387 pmap_remove(kernel_pmap, start, end);
10388
10389 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10390 vm_paging_page_inuse[i] = FALSE;
10391 if (vm_paging_page_waiter) {
10392 thread_wakeup(&vm_paging_page_waiter);
10393 }
10394 simple_unlock(&vm_paging_lock);
10395 }
10396 }
10397
10398
10399 /*
10400 * page->vmp_object must be locked
10401 */
10402 void
vm_pageout_steal_laundry(vm_page_t page,boolean_t queues_locked)10403 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10404 {
10405 if (!queues_locked) {
10406 vm_page_lockspin_queues();
10407 }
10408
10409 page->vmp_free_when_done = FALSE;
10410 /*
10411 * need to drop the laundry count...
10412 * we may also need to remove it
10413 * from the I/O paging queue...
10414 * vm_pageout_throttle_up handles both cases
10415 *
10416 * the laundry and pageout_queue flags are cleared...
10417 */
10418 vm_pageout_throttle_up(page);
10419
10420 if (!queues_locked) {
10421 vm_page_unlock_queues();
10422 }
10423 }
10424
10425 #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
10426
10427 upl_t
vector_upl_create(vm_offset_t upl_offset,uint32_t max_upls)10428 vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
10429 {
10430 int i = 0;
10431 upl_t upl;
10432
10433 assert(max_upls > 0);
10434 if (max_upls == 0) {
10435 return NULL;
10436 }
10437
10438 if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
10439 max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
10440 }
10441 vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
10442
10443 upl = upl_create(0, UPL_VECTOR, 0);
10444 upl->vector_upl = vector_upl;
10445 upl->u_offset = upl_offset;
10446 vector_upl->size = 0;
10447 vector_upl->offset = upl_offset;
10448 vector_upl->invalid_upls = 0;
10449 vector_upl->num_upls = 0;
10450 vector_upl->pagelist = NULL;
10451 vector_upl->max_upls = max_upls;
10452
10453 for (i = 0; i < max_upls; i++) {
10454 vector_upl->upls[i].iostate.size = 0;
10455 vector_upl->upls[i].iostate.offset = 0;
10456 }
10457 return upl;
10458 }
10459
10460 uint32_t
vector_upl_max_upls(const upl_t upl)10461 vector_upl_max_upls(const upl_t upl)
10462 {
10463 if (!vector_upl_is_valid(upl)) {
10464 return 0;
10465 }
10466 return ((vector_upl_t)(upl->vector_upl))->max_upls;
10467 }
10468
/*
 * vector_upl_deallocate:
 *	Free the vector bookkeeping attached to "upl" -- the combined page
 *	list and the _vector_upl structure itself -- and clear
 *	upl->vector_upl.
 *
 *	Panics if invalid_upls != num_upls, i.e. if not every sub-UPL that
 *	was added has been removed again.
 */
void
vector_upl_deallocate(upl_t upl)
{
	/* NOTE(review): "upl" is dereferenced here, before the validity assert below */
	vector_upl_t vector_upl = upl->vector_upl;

	assert(vector_upl_is_valid(upl));

	if (vector_upl->invalid_upls != vector_upl->num_upls) {
		panic("Deallocating non-empty Vectored UPL");
	}
	/*
	 * The element count is copied to a local before the frees;
	 * presumably so that kfree_type() does not have to read it through
	 * the allocation it is releasing -- confirm against kfree_type().
	 */
	uint32_t max_upls = vector_upl->max_upls;
	/* same element count as the kalloc_type() in vector_upl_set_pagelist() */
	kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
	kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
	upl->vector_upl = NULL;
}
10484
10485 boolean_t
vector_upl_is_valid(upl_t upl)10486 vector_upl_is_valid(upl_t upl)
10487 {
10488 return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
10489 }
10490
10491 boolean_t
vector_upl_set_subupl(upl_t upl,upl_t subupl,uint32_t io_size)10492 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
10493 {
10494 if (vector_upl_is_valid(upl)) {
10495 vector_upl_t vector_upl = upl->vector_upl;
10496
10497 if (vector_upl) {
10498 if (subupl) {
10499 if (io_size) {
10500 if (io_size < PAGE_SIZE) {
10501 io_size = PAGE_SIZE;
10502 }
10503 subupl->vector_upl = (void*)vector_upl;
10504 vector_upl->upls[vector_upl->num_upls++].elem = subupl;
10505 vector_upl->size += io_size;
10506 upl->u_size += io_size;
10507 } else {
10508 uint32_t i = 0, invalid_upls = 0;
10509 for (i = 0; i < vector_upl->num_upls; i++) {
10510 if (vector_upl->upls[i].elem == subupl) {
10511 break;
10512 }
10513 }
10514 if (i == vector_upl->num_upls) {
10515 panic("Trying to remove sub-upl when none exists");
10516 }
10517
10518 vector_upl->upls[i].elem = NULL;
10519 invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
10520 relaxed);
10521 if (invalid_upls == vector_upl->num_upls) {
10522 return TRUE;
10523 } else {
10524 return FALSE;
10525 }
10526 }
10527 } else {
10528 panic("vector_upl_set_subupl was passed a NULL upl element");
10529 }
10530 } else {
10531 panic("vector_upl_set_subupl was passed a non-vectored upl");
10532 }
10533 } else {
10534 panic("vector_upl_set_subupl was passed a NULL upl");
10535 }
10536
10537 return FALSE;
10538 }
10539
10540 void
vector_upl_set_pagelist(upl_t upl)10541 vector_upl_set_pagelist(upl_t upl)
10542 {
10543 if (vector_upl_is_valid(upl)) {
10544 uint32_t i = 0;
10545 vector_upl_t vector_upl = upl->vector_upl;
10546
10547 if (vector_upl) {
10548 vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
10549
10550 vector_upl->pagelist = kalloc_type(struct upl_page_info,
10551 atop(vector_upl->size), Z_WAITOK);
10552
10553 for (i = 0; i < vector_upl->num_upls; i++) {
10554 cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
10555 bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10556 pagelist_size += cur_upl_pagelist_size;
10557 if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
10558 upl->highest_page = vector_upl->upls[i].elem->highest_page;
10559 }
10560 }
10561 assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10562 } else {
10563 panic("vector_upl_set_pagelist was passed a non-vectored upl");
10564 }
10565 } else {
10566 panic("vector_upl_set_pagelist was passed a NULL upl");
10567 }
10568 }
10569
10570 upl_t
vector_upl_subupl_byindex(upl_t upl,uint32_t index)10571 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10572 {
10573 if (vector_upl_is_valid(upl)) {
10574 vector_upl_t vector_upl = upl->vector_upl;
10575 if (vector_upl) {
10576 if (index < vector_upl->num_upls) {
10577 return vector_upl->upls[index].elem;
10578 }
10579 } else {
10580 panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10581 }
10582 }
10583 return NULL;
10584 }
10585
10586 upl_t
vector_upl_subupl_byoffset(upl_t upl,upl_offset_t * upl_offset,upl_size_t * upl_size)10587 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
10588 {
10589 if (vector_upl_is_valid(upl)) {
10590 uint32_t i = 0;
10591 vector_upl_t vector_upl = upl->vector_upl;
10592
10593 if (vector_upl) {
10594 upl_t subupl = NULL;
10595 vector_upl_iostates_t subupl_state;
10596
10597 for (i = 0; i < vector_upl->num_upls; i++) {
10598 subupl = vector_upl->upls[i].elem;
10599 subupl_state = vector_upl->upls[i].iostate;
10600 if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
10601 /* We could have been passed an offset/size pair that belongs
10602 * to an UPL element that has already been committed/aborted.
10603 * If so, return NULL.
10604 */
10605 if (subupl == NULL) {
10606 return NULL;
10607 }
10608 if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
10609 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
10610 if (*upl_size > subupl_state.size) {
10611 *upl_size = subupl_state.size;
10612 }
10613 }
10614 if (*upl_offset >= subupl_state.offset) {
10615 *upl_offset -= subupl_state.offset;
10616 } else if (i) {
10617 panic("Vector UPL offset miscalculation");
10618 }
10619 return subupl;
10620 }
10621 }
10622 } else {
10623 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
10624 }
10625 }
10626 return NULL;
10627 }
10628
10629 void
vector_upl_get_submap(upl_t upl,vm_map_t * v_upl_submap,vm_offset_t * submap_dst_addr)10630 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10631 {
10632 *v_upl_submap = NULL;
10633
10634 if (vector_upl_is_valid(upl)) {
10635 vector_upl_t vector_upl = upl->vector_upl;
10636 if (vector_upl) {
10637 *v_upl_submap = vector_upl->submap;
10638 *submap_dst_addr = vector_upl->submap_dst_addr;
10639 } else {
10640 panic("vector_upl_get_submap was passed a non-vectored UPL");
10641 }
10642 } else {
10643 panic("vector_upl_get_submap was passed a null UPL");
10644 }
10645 }
10646
10647 void
vector_upl_set_submap(upl_t upl,vm_map_t submap,vm_offset_t submap_dst_addr)10648 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10649 {
10650 if (vector_upl_is_valid(upl)) {
10651 vector_upl_t vector_upl = upl->vector_upl;
10652 if (vector_upl) {
10653 vector_upl->submap = submap;
10654 vector_upl->submap_dst_addr = submap_dst_addr;
10655 } else {
10656 panic("vector_upl_get_submap was passed a non-vectored UPL");
10657 }
10658 } else {
10659 panic("vector_upl_get_submap was passed a NULL UPL");
10660 }
10661 }
10662
10663 void
vector_upl_set_iostate(upl_t upl,upl_t subupl,upl_offset_t offset,upl_size_t size)10664 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10665 {
10666 if (vector_upl_is_valid(upl)) {
10667 uint32_t i = 0;
10668 vector_upl_t vector_upl = upl->vector_upl;
10669
10670 if (vector_upl) {
10671 for (i = 0; i < vector_upl->num_upls; i++) {
10672 if (vector_upl->upls[i].elem == subupl) {
10673 break;
10674 }
10675 }
10676
10677 if (i == vector_upl->num_upls) {
10678 panic("setting sub-upl iostate when none exists");
10679 }
10680
10681 vector_upl->upls[i].iostate.offset = offset;
10682 if (size < PAGE_SIZE) {
10683 size = PAGE_SIZE;
10684 }
10685 vector_upl->upls[i].iostate.size = size;
10686 } else {
10687 panic("vector_upl_set_iostate was passed a non-vectored UPL");
10688 }
10689 } else {
10690 panic("vector_upl_set_iostate was passed a NULL UPL");
10691 }
10692 }
10693
10694 void
vector_upl_get_iostate(upl_t upl,upl_t subupl,upl_offset_t * offset,upl_size_t * size)10695 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10696 {
10697 if (vector_upl_is_valid(upl)) {
10698 uint32_t i = 0;
10699 vector_upl_t vector_upl = upl->vector_upl;
10700
10701 if (vector_upl) {
10702 for (i = 0; i < vector_upl->num_upls; i++) {
10703 if (vector_upl->upls[i].elem == subupl) {
10704 break;
10705 }
10706 }
10707
10708 if (i == vector_upl->num_upls) {
10709 panic("getting sub-upl iostate when none exists");
10710 }
10711
10712 *offset = vector_upl->upls[i].iostate.offset;
10713 *size = vector_upl->upls[i].iostate.size;
10714 } else {
10715 panic("vector_upl_get_iostate was passed a non-vectored UPL");
10716 }
10717 } else {
10718 panic("vector_upl_get_iostate was passed a NULL UPL");
10719 }
10720 }
10721
10722 void
vector_upl_get_iostate_byindex(upl_t upl,uint32_t index,upl_offset_t * offset,upl_size_t * size)10723 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10724 {
10725 if (vector_upl_is_valid(upl)) {
10726 vector_upl_t vector_upl = upl->vector_upl;
10727 if (vector_upl) {
10728 if (index < vector_upl->num_upls) {
10729 *offset = vector_upl->upls[index].iostate.offset;
10730 *size = vector_upl->upls[index].iostate.size;
10731 } else {
10732 *offset = *size = 0;
10733 }
10734 } else {
10735 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
10736 }
10737 } else {
10738 panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10739 }
10740 }
10741
10742 void *
upl_get_internal_vectorupl(upl_t upl)10743 upl_get_internal_vectorupl(upl_t upl)
10744 {
10745 return upl->vector_upl;
10746 }
10747
10748 upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)10749 upl_get_internal_vectorupl_pagelist(upl_t upl)
10750 {
10751 return upl->vector_upl->pagelist;
10752 }
10753
10754 upl_page_info_t *
upl_get_internal_page_list(upl_t upl)10755 upl_get_internal_page_list(upl_t upl)
10756 {
10757 return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
10758 }
10759
10760 void
upl_clear_dirty(upl_t upl,boolean_t value)10761 upl_clear_dirty(
10762 upl_t upl,
10763 boolean_t value)
10764 {
10765 if (value) {
10766 upl->flags |= UPL_CLEAR_DIRTY;
10767 } else {
10768 upl->flags &= ~UPL_CLEAR_DIRTY;
10769 }
10770 }
10771
10772 void
upl_set_referenced(upl_t upl,boolean_t value)10773 upl_set_referenced(
10774 upl_t upl,
10775 boolean_t value)
10776 {
10777 upl_lock(upl);
10778 if (value) {
10779 upl->ext_ref_count++;
10780 } else {
10781 if (!upl->ext_ref_count) {
10782 panic("upl_set_referenced not %p", upl);
10783 }
10784 upl->ext_ref_count--;
10785 }
10786 upl_unlock(upl);
10787 }
10788
10789 #if CONFIG_IOSCHED
10790 void
upl_set_blkno(upl_t upl,vm_offset_t upl_offset,int io_size,int64_t blkno)10791 upl_set_blkno(
10792 upl_t upl,
10793 vm_offset_t upl_offset,
10794 int io_size,
10795 int64_t blkno)
10796 {
10797 int i, j;
10798 if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
10799 return;
10800 }
10801
10802 assert(upl->upl_reprio_info != 0);
10803 for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
10804 UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
10805 }
10806 }
10807 #endif
10808
10809 void inline
memoryshot(unsigned int event,unsigned int control)10810 memoryshot(unsigned int event, unsigned int control)
10811 {
10812 if (vm_debug_events) {
10813 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10814 vm_page_active_count, vm_page_inactive_count,
10815 vm_page_free_count, vm_page_speculative_count,
10816 vm_page_throttled_count);
10817 } else {
10818 (void) event;
10819 (void) control;
10820 }
10821 }
10822
10823 #ifdef MACH_BSD
10824
10825 boolean_t
upl_device_page(upl_page_info_t * upl)10826 upl_device_page(upl_page_info_t *upl)
10827 {
10828 return UPL_DEVICE_PAGE(upl);
10829 }
10830 boolean_t
upl_page_present(upl_page_info_t * upl,int index)10831 upl_page_present(upl_page_info_t *upl, int index)
10832 {
10833 return UPL_PAGE_PRESENT(upl, index);
10834 }
10835 boolean_t
upl_speculative_page(upl_page_info_t * upl,int index)10836 upl_speculative_page(upl_page_info_t *upl, int index)
10837 {
10838 return UPL_SPECULATIVE_PAGE(upl, index);
10839 }
10840 boolean_t
upl_dirty_page(upl_page_info_t * upl,int index)10841 upl_dirty_page(upl_page_info_t *upl, int index)
10842 {
10843 return UPL_DIRTY_PAGE(upl, index);
10844 }
10845 boolean_t
upl_valid_page(upl_page_info_t * upl,int index)10846 upl_valid_page(upl_page_info_t *upl, int index)
10847 {
10848 return UPL_VALID_PAGE(upl, index);
10849 }
10850 ppnum_t
upl_phys_page(upl_page_info_t * upl,int index)10851 upl_phys_page(upl_page_info_t *upl, int index)
10852 {
10853 return UPL_PHYS_PAGE(upl, index);
10854 }
10855
10856 void
upl_page_set_mark(upl_page_info_t * upl,int index,boolean_t v)10857 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
10858 {
10859 upl[index].mark = v;
10860 }
10861
10862 boolean_t
upl_page_get_mark(upl_page_info_t * upl,int index)10863 upl_page_get_mark(upl_page_info_t *upl, int index)
10864 {
10865 return upl[index].mark;
10866 }
10867
/*
 * vm_countdirtypages:
 *	Walk the inactive, throttled and anonymous page queues and print
 *	one "IN Q" line with the number of dirty, free-when-done and
 *	precious pages found; then do the same for the active queue and
 *	print an "AC Q" line.  Each queue is walked with the page queues
 *	lock held.
 *
 *	NOTE(review): every loop is a do/while that examines the first
 *	entry before testing vm_page_queue_end(); whether that is correct
 *	for an empty queue depends on what vm_page_queue_first() returns
 *	in that case -- confirm.
 */
void
vm_countdirtypages(void)
{
	vm_page_t m;
	int dpages;     /* dirty pages */
	int pgopages;   /* pages with vmp_free_when_done set */
	int precpages;  /* precious pages */


	dpages = 0;
	pgopages = 0;
	precpages = 0;

	/* inactive queue */
	vm_page_lock_queues();
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
	do {
		if (m == (vm_page_t)0) {
			break;
		}

		if (m->vmp_dirty) {
			dpages++;
		}
		if (m->vmp_free_when_done) {
			pgopages++;
		}
		if (m->vmp_precious) {
			precpages++;
		}

		assert(VM_PAGE_OBJECT(m) != kernel_object);
		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
		if (m == (vm_page_t)0) {
			break;
		}
	} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
	vm_page_unlock_queues();

	/* throttled queue: every page is counted as dirty (and asserted to be) */
	vm_page_lock_queues();
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
	do {
		if (m == (vm_page_t)0) {
			break;
		}

		dpages++;
		assert(m->vmp_dirty);
		assert(!m->vmp_free_when_done);
		assert(VM_PAGE_OBJECT(m) != kernel_object);
		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
		if (m == (vm_page_t)0) {
			break;
		}
	} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
	vm_page_unlock_queues();

	/* anonymous queue */
	vm_page_lock_queues();
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
	do {
		if (m == (vm_page_t)0) {
			break;
		}

		if (m->vmp_dirty) {
			dpages++;
		}
		if (m->vmp_free_when_done) {
			pgopages++;
		}
		if (m->vmp_precious) {
			precpages++;
		}

		assert(VM_PAGE_OBJECT(m) != kernel_object);
		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
		if (m == (vm_page_t)0) {
			break;
		}
	} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
	vm_page_unlock_queues();

	/* totals for the inactive + throttled + anonymous queues */
	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);

	dpages = 0;
	pgopages = 0;
	precpages = 0;

	/* active queue */
	vm_page_lock_queues();
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);

	do {
		if (m == (vm_page_t)0) {
			break;
		}
		if (m->vmp_dirty) {
			dpages++;
		}
		if (m->vmp_free_when_done) {
			pgopages++;
		}
		if (m->vmp_precious) {
			precpages++;
		}

		assert(VM_PAGE_OBJECT(m) != kernel_object);
		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
		if (m == (vm_page_t)0) {
			break;
		}
	} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
	vm_page_unlock_queues();

	/* totals for the active queue */
	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
}
10982 #endif /* MACH_BSD */
10983
10984
10985 #if CONFIG_IOSCHED
10986 int
upl_get_cached_tier(upl_t upl)10987 upl_get_cached_tier(upl_t upl)
10988 {
10989 assert(upl);
10990 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
10991 return upl->upl_priority;
10992 }
10993 return -1;
10994 }
10995 #endif /* CONFIG_IOSCHED */
10996
10997
10998 void
upl_callout_iodone(upl_t upl)10999 upl_callout_iodone(upl_t upl)
11000 {
11001 struct upl_io_completion *upl_ctx = upl->upl_iodone;
11002
11003 if (upl_ctx) {
11004 void (*iodone_func)(void *, int) = upl_ctx->io_done;
11005
11006 assert(upl_ctx->io_done);
11007
11008 (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
11009 }
11010 }
11011
11012 void
upl_set_iodone(upl_t upl,void * upl_iodone)11013 upl_set_iodone(upl_t upl, void *upl_iodone)
11014 {
11015 upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
11016 }
11017
11018 void
upl_set_iodone_error(upl_t upl,int error)11019 upl_set_iodone_error(upl_t upl, int error)
11020 {
11021 struct upl_io_completion *upl_ctx = upl->upl_iodone;
11022
11023 if (upl_ctx) {
11024 upl_ctx->io_error = error;
11025 }
11026 }
11027
11028
11029 ppnum_t
upl_get_highest_page(upl_t upl)11030 upl_get_highest_page(
11031 upl_t upl)
11032 {
11033 return upl->highest_page;
11034 }
11035
11036 upl_size_t
upl_get_size(upl_t upl)11037 upl_get_size(
11038 upl_t upl)
11039 {
11040 return upl_adjusted_size(upl, PAGE_MASK);
11041 }
11042
11043 upl_size_t
upl_adjusted_size(upl_t upl,vm_map_offset_t pgmask)11044 upl_adjusted_size(
11045 upl_t upl,
11046 vm_map_offset_t pgmask)
11047 {
11048 vm_object_offset_t start_offset, end_offset;
11049
11050 start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
11051 end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
11052
11053 return (upl_size_t)(end_offset - start_offset);
11054 }
11055
11056 vm_object_offset_t
upl_adjusted_offset(upl_t upl,vm_map_offset_t pgmask)11057 upl_adjusted_offset(
11058 upl_t upl,
11059 vm_map_offset_t pgmask)
11060 {
11061 return trunc_page_mask_64(upl->u_offset, pgmask);
11062 }
11063
11064 vm_object_offset_t
upl_get_data_offset(upl_t upl)11065 upl_get_data_offset(
11066 upl_t upl)
11067 {
11068 return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
11069 }
11070
11071 upl_t
upl_associated_upl(upl_t upl)11072 upl_associated_upl(upl_t upl)
11073 {
11074 return upl->associated_upl;
11075 }
11076
11077 void
upl_set_associated_upl(upl_t upl,upl_t associated_upl)11078 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
11079 {
11080 upl->associated_upl = associated_upl;
11081 }
11082
11083 struct vnode *
upl_lookup_vnode(upl_t upl)11084 upl_lookup_vnode(upl_t upl)
11085 {
11086 if (!upl->map_object->internal) {
11087 return vnode_pager_lookup_vnode(upl->map_object->pager);
11088 } else {
11089 return NULL;
11090 }
11091 }
11092
11093 #if UPL_DEBUG
11094 kern_return_t
upl_ubc_alias_set(upl_t upl,uintptr_t alias1,uintptr_t alias2)11095 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
11096 {
11097 upl->ubc_alias1 = alias1;
11098 upl->ubc_alias2 = alias2;
11099 return KERN_SUCCESS;
11100 }
11101 int
upl_ubc_alias_get(upl_t upl,uintptr_t * al,uintptr_t * al2)11102 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
11103 {
11104 if (al) {
11105 *al = upl->ubc_alias1;
11106 }
11107 if (al2) {
11108 *al2 = upl->ubc_alias2;
11109 }
11110 return KERN_SUCCESS;
11111 }
11112 #endif /* UPL_DEBUG */
11113
11114 #if VM_PRESSURE_EVENTS
11115 /*
11116 * Upward trajectory.
11117 */
11118 extern boolean_t vm_compressor_low_on_space(void);
11119
11120 boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)11121 VM_PRESSURE_NORMAL_TO_WARNING(void)
11122 {
11123 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11124 /* Available pages below our threshold */
11125 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11126 /* No frozen processes to kill */
11127 if (memorystatus_frozen_count == 0) {
11128 /* Not enough suspended processes available. */
11129 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11130 return TRUE;
11131 }
11132 }
11133 }
11134 return FALSE;
11135 } else {
11136 return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
11137 }
11138 }
11139
11140 boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)11141 VM_PRESSURE_WARNING_TO_CRITICAL(void)
11142 {
11143 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11144 /* Available pages below our threshold */
11145 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11146 return TRUE;
11147 }
11148 return FALSE;
11149 } else {
11150 return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11151 }
11152 }
11153
11154 /*
11155 * Downward trajectory.
11156 */
11157 boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)11158 VM_PRESSURE_WARNING_TO_NORMAL(void)
11159 {
11160 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11161 /* Available pages above our threshold */
11162 unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
11163 if (memorystatus_available_pages > target_threshold) {
11164 return TRUE;
11165 }
11166 return FALSE;
11167 } else {
11168 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
11169 }
11170 }
11171
11172 boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)11173 VM_PRESSURE_CRITICAL_TO_WARNING(void)
11174 {
11175 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11176 /* Available pages above our threshold */
11177 unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
11178 if (memorystatus_available_pages > target_threshold) {
11179 return TRUE;
11180 }
11181 return FALSE;
11182 } else {
11183 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11184 }
11185 }
11186 #endif /* VM_PRESSURE_EVENTS */
11187
11188 #if DEVELOPMENT || DEBUG
11189 bool compressor_running_perf_test;
11190 uint64_t compressor_perf_test_pages_processed;
11191
11192 kern_return_t
11193 run_compressor_perf_test(
11194 user_addr_t buf,
11195 size_t buffer_size,
11196 uint64_t *time,
11197 uint64_t *bytes_compressed,
11198 uint64_t *compressor_growth);
11199
11200 static kern_return_t
move_pages_to_queue(vm_map_t map,user_addr_t start_addr,size_t buffer_size,vm_page_queue_head_t * queue,size_t * pages_moved)11201 move_pages_to_queue(
11202 vm_map_t map,
11203 user_addr_t start_addr,
11204 size_t buffer_size,
11205 vm_page_queue_head_t *queue,
11206 size_t *pages_moved)
11207 {
11208 kern_return_t err = KERN_SUCCESS;
11209 vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
11210 boolean_t addr_in_map = FALSE;
11211 user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
11212 vm_object_t curr_object = VM_OBJECT_NULL;
11213 *pages_moved = 0;
11214
11215
11216 if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
11217 /*
11218 * We don't currently support benchmarking maps with a different page size
11219 * than the kernel.
11220 */
11221 return KERN_INVALID_ARGUMENT;
11222 }
11223
11224 if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
11225 return KERN_INVALID_ARGUMENT;
11226 }
11227
11228 vm_map_lock_read(map);
11229 curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
11230 end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
11231
11232
11233 while (curr_addr < end_addr) {
11234 addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
11235 if (!addr_in_map) {
11236 err = KERN_INVALID_ARGUMENT;
11237 break;
11238 }
11239 curr_object = VME_OBJECT(curr_entry);
11240 if (curr_object) {
11241 vm_object_lock(curr_object);
11242 /* We really only want anonymous memory that's in the top level map and object here. */
11243 if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
11244 curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
11245 err = KERN_INVALID_ARGUMENT;
11246 vm_object_unlock(curr_object);
11247 break;
11248 }
11249 vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
11250 vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
11251 (curr_entry->vme_start + VME_OFFSET(curr_entry));
11252 vm_map_offset_t curr_offset = start_offset;
11253 vm_page_t curr_page;
11254 while (curr_offset < end_offset) {
11255 curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
11256 if (curr_page != VM_PAGE_NULL) {
11257 vm_page_lock_queues();
11258 if (curr_page->vmp_laundry) {
11259 vm_pageout_steal_laundry(curr_page, TRUE);
11260 }
11261 /*
11262 * we've already factored out pages in the laundry which
11263 * means this page can't be on the pageout queue so it's
11264 * safe to do the vm_page_queues_remove
11265 */
11266 bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
11267 vm_page_queues_remove(curr_page, TRUE);
11268 if (donate) {
11269 /*
11270 * The compressor needs to see this bit to know
11271 * where this page needs to land. Also if stolen,
11272 * this bit helps put the page back in the right
11273 * special queue where it belongs.
11274 */
11275 curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
11276 }
11277 // Clear the referenced bit so we ensure this gets paged out
11278 curr_page->vmp_reference = false;
11279 if (curr_page->vmp_pmapped) {
11280 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
11281 VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
11282 }
11283 vm_page_queue_enter(queue, curr_page, vmp_pageq);
11284 vm_page_unlock_queues();
11285 *pages_moved += 1;
11286 }
11287 curr_offset += PAGE_SIZE_64;
11288 curr_addr += PAGE_SIZE_64;
11289 }
11290 }
11291 vm_object_unlock(curr_object);
11292 }
11293 vm_map_unlock_read(map);
11294 return err;
11295 }
11296
11297 /*
11298 * Local queue for processing benchmark pages.
11299 * Can't be allocated on the stack because the pointer has to
11300 * be packable.
11301 */
11302 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
11303 kern_return_t
run_compressor_perf_test(user_addr_t buf,size_t buffer_size,uint64_t * time,uint64_t * bytes_compressed,uint64_t * compressor_growth)11304 run_compressor_perf_test(
11305 user_addr_t buf,
11306 size_t buffer_size,
11307 uint64_t *time,
11308 uint64_t *bytes_compressed,
11309 uint64_t *compressor_growth)
11310 {
11311 kern_return_t err = KERN_SUCCESS;
11312 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11313 return KERN_NOT_SUPPORTED;
11314 }
11315 if (current_task() == kernel_task) {
11316 return KERN_INVALID_ARGUMENT;
11317 }
11318 vm_page_lock_queues();
11319 if (compressor_running_perf_test) {
11320 /* Only run one instance of the benchmark at a time. */
11321 vm_page_unlock_queues();
11322 return KERN_RESOURCE_SHORTAGE;
11323 }
11324 vm_page_unlock_queues();
11325 size_t page_count = 0;
11326 vm_map_t map;
11327 vm_page_t p, next;
11328 uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
11329 uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
11330 *bytes_compressed = *compressor_growth = 0;
11331
11332 vm_page_queue_init(&compressor_perf_test_queue);
11333 map = current_task()->map;
11334 err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
11335 if (err != KERN_SUCCESS) {
11336 goto out;
11337 }
11338
11339 vm_page_lock_queues();
11340 compressor_running_perf_test = true;
11341 compressor_perf_test_pages_processed = 0;
11342 /*
11343 * At this point the compressor threads should only process the benchmark queue
11344 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
11345 * to determine how many compressed bytes we ended up using.
11346 */
11347 compressed_bytes_start = c_segment_compressed_bytes;
11348 vm_page_unlock_queues();
11349
11350 page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);
11351
11352 vm_page_lock_queues();
11353 compressor_perf_test_start = mach_absolute_time();
11354
11355 // Wake up the compressor thread(s)
11356 sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
11357 pgo_iothread_internal_state[0].pgo_iothread);
11358
11359 /*
11360 * Depending on when this test is run we could overshoot or be right on the mark
11361 * with our page_count. So the comparison is of the _less than_ variety.
11362 */
11363 while (compressor_perf_test_pages_processed < page_count) {
11364 assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
11365 vm_page_unlock_queues();
11366 thread_block(THREAD_CONTINUE_NULL);
11367 vm_page_lock_queues();
11368 }
11369 compressor_perf_test_end = mach_absolute_time();
11370 compressed_bytes_end = c_segment_compressed_bytes;
11371 vm_page_unlock_queues();
11372
11373
11374 out:
11375 /*
11376 * If we errored out above, then we could still have some pages
11377 * on the local queue. Make sure to put them back on the active queue before
11378 * returning so they're not orphaned.
11379 */
11380 vm_page_lock_queues();
11381 absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
11382 p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
11383 while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
11384 next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);
11385
11386 vm_page_enqueue_active(p, FALSE);
11387 p = next;
11388 }
11389
11390 compressor_running_perf_test = false;
11391 vm_page_unlock_queues();
11392 if (err == KERN_SUCCESS) {
11393 *bytes_compressed = page_count * PAGE_SIZE_64;
11394 *compressor_growth = compressed_bytes_end - compressed_bytes_start;
11395 }
11396
11397 /*
11398 * pageout_scan will consider waking the compactor swapper
11399 * before it blocks. Do the same thing here before we return
11400 * to ensure that back to back benchmark runs can't overly fragment the
11401 * compressor pool.
11402 */
11403 vm_consider_waking_compactor_swapper();
11404 return err;
11405 }
11406 #endif /* DEVELOPMENT || DEBUG */
11407