1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67 #include <ptrauth.h>
68
69 #include <debug.h>
70
71 #include <mach/mach_types.h>
72 #include <mach/memory_object.h>
73 #include <mach/mach_host_server.h>
74 #include <mach/upl.h>
75 #include <mach/vm_map.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/sdt.h>
79
80 #include <kern/kern_types.h>
81 #include <kern/counter.h>
82 #include <kern/host_statistics.h>
83 #include <kern/machine.h>
84 #include <kern/misc_protos.h>
85 #include <kern/sched.h>
86 #include <kern/thread.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/policy_internal.h>
90 #include <kern/thread_group.h>
91
92 #include <os/log.h>
93
94 #include <sys/kdebug_triage.h>
95
96 #include <machine/vm_tuning.h>
97 #include <machine/commpage.h>
98
99 #include <vm/pmap.h>
100 #include <vm/vm_compressor_pager.h>
101 #include <vm/vm_fault.h>
102 #include <vm/vm_map_internal.h>
103 #include <vm/vm_object.h>
104 #include <vm/vm_page.h>
105 #include <vm/vm_pageout.h>
106 #include <vm/vm_protos.h> /* must be last */
107 #include <vm/memory_object.h>
108 #include <vm/vm_purgeable_internal.h>
109 #include <vm/vm_shared_region.h>
110 #include <vm/vm_compressor.h>
111
112 #include <san/kasan.h>
113
114 #if CONFIG_PHANTOM_CACHE
115 #include <vm/vm_phantom_cache.h>
116 #endif
117
118 #if UPL_DEBUG
119 #include <libkern/OSDebug.h>
120 #endif
121
122 extern int cs_debug;
123
124 #if CONFIG_MBUF_MCACHE
125 extern void mbuf_drain(boolean_t);
126 #endif /* CONFIG_MBUF_MCACHE */
127
128 #if VM_PRESSURE_EVENTS
129 #if CONFIG_JETSAM
130 extern unsigned int memorystatus_available_pages;
131 extern unsigned int memorystatus_available_pages_pressure;
132 extern unsigned int memorystatus_available_pages_critical;
133 #else /* CONFIG_JETSAM */
134 extern uint64_t memorystatus_available_pages;
135 extern uint64_t memorystatus_available_pages_pressure;
136 extern uint64_t memorystatus_available_pages_critical;
137 #endif /* CONFIG_JETSAM */
138
139 extern unsigned int memorystatus_frozen_count;
140 extern unsigned int memorystatus_suspended_count;
141 extern vm_pressure_level_t memorystatus_vm_pressure_level;
142
143 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
144 extern uint32_t memorystatus_jetsam_fg_band_waiters;
145
146 void vm_pressure_response(void);
147 extern void consider_vm_pressure_events(void);
148
149 #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
150 #endif /* VM_PRESSURE_EVENTS */
151
152 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
153 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
154 #if CONFIG_VPS_DYNAMIC_PRIO
155 TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
156 #else
157 const bool vps_dynamic_priority_enabled = false;
158 #endif
159 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
160
/*
 * Pageout daemon tuning knobs.
 *
 * Each knob guarded by #ifndef keeps any definition that is already in
 * effect when this point is reached.  Several defaults differ between
 * XNU_TARGET_OS_OSX builds and all other targets.
 */

/* maximum iterations of the inactive queue w/o stealing/cleaning a page */
#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
#if XNU_TARGET_OS_OSX
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
#endif /* XNU_TARGET_OS_OSX */
#endif /* VM_PAGEOUT_BURST_INACTIVE_THROTTLE */

/* number of pages to move to break deadlock */
#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
#define VM_PAGEOUT_DEADLOCK_RELIEF 100
#endif /* VM_PAGEOUT_DEADLOCK_RELIEF */

/* maximum pageouts on a given pageout queue */
#ifndef VM_PAGE_LAUNDRY_MAX
#define VM_PAGE_LAUNDRY_MAX 128UL
#endif /* VM_PAGE_LAUNDRY_MAX */

/*
 * Wait intervals, all in milliseconds.
 */
#ifndef VM_PAGEOUT_BURST_WAIT
#define VM_PAGEOUT_BURST_WAIT 1
#endif /* VM_PAGEOUT_BURST_WAIT */

#ifndef VM_PAGEOUT_EMPTY_WAIT
#define VM_PAGEOUT_EMPTY_WAIT 50
#endif /* VM_PAGEOUT_EMPTY_WAIT */

#ifndef VM_PAGEOUT_DEADLOCK_WAIT
#define VM_PAGEOUT_DEADLOCK_WAIT 100
#endif /* VM_PAGEOUT_DEADLOCK_WAIT */

#ifndef VM_PAGEOUT_IDLE_WAIT
#define VM_PAGEOUT_IDLE_WAIT 10
#endif /* VM_PAGEOUT_IDLE_WAIT */

#ifndef VM_PAGEOUT_SWAP_WAIT
#define VM_PAGEOUT_SWAP_WAIT 10
#endif /* VM_PAGEOUT_SWAP_WAIT */

/*
 * Speculative page target, derived from
 * vm_pageout_state.vm_page_speculative_percentage.
 *
 * NOTE(review): the divisor is (100 / percentage); if that is an integer
 * division it is zero for a percentage of 0 or above 100 -- confirm the
 * value is range-checked wherever it is set.
 */
#ifndef VM_PAGE_SPECULATIVE_TARGET
#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
#endif /* VM_PAGE_SPECULATIVE_TARGET */

/*
 * To obtain a reasonable LRU approximation, the inactive queue must be
 * large enough to give pages on it a chance to be referenced a second
 * time.  This macro is the fraction of active+inactive pages that should
 * be inactive; the pageout daemon uses it to update
 * vm_page_inactive_target.
 *
 * If vm_page_free_count falls below vm_page_free_target and
 * vm_page_inactive_count is below vm_page_inactive_target, the pageout
 * daemon starts running.
 */
#ifndef VM_PAGE_INACTIVE_TARGET
#define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
#endif /* VM_PAGE_INACTIVE_TARGET */

/*
 * Once the pageout daemon starts running, it keeps going until
 * vm_page_free_count meets or exceeds vm_page_free_target.
 */
#ifndef VM_PAGE_FREE_TARGET
#if XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
#endif /* XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_FREE_TARGET */

/*
 * The pageout daemon always starts running once vm_page_free_count
 * falls below vm_page_free_min.
 */
#ifndef VM_PAGE_FREE_MIN
#if XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
#endif /* XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_FREE_MIN */

#if XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_RESERVED_LIMIT 1700
#define VM_PAGE_FREE_MIN_LIMIT 3500
#define VM_PAGE_FREE_TARGET_LIMIT 4000
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_RESERVED_LIMIT 100
#define VM_PAGE_FREE_MIN_LIMIT 1500
#define VM_PAGE_FREE_TARGET_LIMIT 2000
#endif /* XNU_TARGET_OS_OSX */

/*
 * When vm_page_free_count falls below vm_page_free_reserved, only
 * vm-privileged threads can allocate pages.  vm-privilege allows the
 * pageout daemon and default pager (and any other associated threads
 * needed for default pageout) to continue operation by dipping into the
 * reserved pool of pages.
 */
#ifndef VM_PAGE_FREE_RESERVED
#define VM_PAGE_FREE_RESERVED(n) ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
#endif /* VM_PAGE_FREE_RESERVED */

/*
 * Pages dequeued from the inactive list are reactivated (put back on the
 * active queue) if referenced.  Other processors may reference pages
 * faster than the referenced bit can be turned off, which would starve
 * the free list, so the number of reactivations made per call of
 * vm_pageout_scan() is limited.
 *
 * NOTE(review): the XNU_TARGET_OS_OSX variant combines its operands with
 * MAX(), so VM_PAGE_REACTIVATE_LIMIT_MAX acts as a lower bound on the
 * result there, despite its name -- confirm this is intended.
 */
#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000

#ifndef VM_PAGE_REACTIVATE_LIMIT
#if XNU_TARGET_OS_OSX
#define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
#endif /* XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_REACTIVATE_LIMIT */

#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
287
288 int vm_pageout_protect_realtime = true;
289
290 extern boolean_t hibernate_cleaning_in_progress;
291
292 struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
293 struct pgo_iothread_state pgo_iothread_external_state;
294
295 #if VM_PRESSURE_EVENTS
296 void vm_pressure_thread(void);
297
298 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
299 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
300
301 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
302 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
303 #endif
304
305 static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
306 static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
307 static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);
308
309 extern void vm_pageout_continue(void);
310 extern void vm_pageout_scan(void);
311
312 boolean_t vm_pageout_running = FALSE;
313
314 uint32_t vm_page_upl_tainted = 0;
315 uint32_t vm_page_iopl_tainted = 0;
316
317 #if XNU_TARGET_OS_OSX
318 static boolean_t vm_pageout_waiter = FALSE;
319 #endif /* XNU_TARGET_OS_OSX */
320
321
322 #if DEVELOPMENT || DEBUG
323 struct vm_pageout_debug vm_pageout_debug;
324 #endif
325 struct vm_pageout_vminfo vm_pageout_vminfo;
326 struct vm_pageout_state vm_pageout_state;
327 struct vm_config vm_config;
328
329 struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
330 struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
331 #if DEVELOPMENT || DEBUG
332 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
333 #endif /* DEVELOPMENT || DEBUG */
334
335 int vm_upl_wait_for_pages = 0;
336 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
337
338 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
339
340 int vm_debug_events = 0;
341
342 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
343
344 #if CONFIG_MEMORYSTATUS
345 extern boolean_t memorystatus_kill_on_VM_page_shortage(void);
346
347 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
348 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
349
350 #endif
351
352 #if __AMP__
353
354
355 /*
356 * Bind compressor threads to e-cores unless there are multiple non-e clusters
357 */
358 #if (MAX_CPU_CLUSTERS > 2)
359 #define VM_COMPRESSOR_EBOUND_DEFAULT false
360 #else
361 #define VM_COMPRESSOR_EBOUND_DEFAULT true
362 #endif
363
364 TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
365 int vm_pgo_pbound = 0;
366 extern void thread_bind_cluster_type(thread_t, char, bool);
367
368 #endif /* __AMP__ */
369
370
371 /*
372 * Routine: vm_pageout_object_terminate
373 * Purpose:
374 * Destroy the pageout_object, and perform all of the
375 * required cleanup actions.
376 *
377 * In/Out conditions:
378 * The object must be locked, and will be returned locked.
379 */
380 void
vm_pageout_object_terminate(vm_object_t object)381 vm_pageout_object_terminate(
382 vm_object_t object)
383 {
384 vm_object_t shadow_object;
385
386 /*
387 * Deal with the deallocation (last reference) of a pageout object
388 * (used for cleaning-in-place) by dropping the paging references/
389 * freeing pages in the original object.
390 */
391
392 assert(object->pageout);
393 shadow_object = object->shadow;
394 vm_object_lock(shadow_object);
395
396 while (!vm_page_queue_empty(&object->memq)) {
397 vm_page_t p, m;
398 vm_object_offset_t offset;
399
400 p = (vm_page_t) vm_page_queue_first(&object->memq);
401
402 assert(p->vmp_private);
403 assert(p->vmp_free_when_done);
404 p->vmp_free_when_done = FALSE;
405 assert(!p->vmp_cleaning);
406 assert(!p->vmp_laundry);
407
408 offset = p->vmp_offset;
409 VM_PAGE_FREE(p);
410 p = VM_PAGE_NULL;
411
412 m = vm_page_lookup(shadow_object,
413 offset + object->vo_shadow_offset);
414
415 if (m == VM_PAGE_NULL) {
416 continue;
417 }
418
419 assert((m->vmp_dirty) || (m->vmp_precious) ||
420 (m->vmp_busy && m->vmp_cleaning));
421
422 /*
423 * Handle the trusted pager throttle.
424 * Also decrement the burst throttle (if external).
425 */
426 vm_page_lock_queues();
427 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
428 vm_pageout_throttle_up(m);
429 }
430
431 /*
432 * Handle the "target" page(s). These pages are to be freed if
433 * successfully cleaned. Target pages are always busy, and are
434 * wired exactly once. The initial target pages are not mapped,
435 * (so cannot be referenced or modified) but converted target
436 * pages may have been modified between the selection as an
437 * adjacent page and conversion to a target.
438 */
439 if (m->vmp_free_when_done) {
440 assert(m->vmp_busy);
441 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
442 assert(m->vmp_wire_count == 1);
443 m->vmp_cleaning = FALSE;
444 m->vmp_free_when_done = FALSE;
445 /*
446 * Revoke all access to the page. Since the object is
447 * locked, and the page is busy, this prevents the page
448 * from being dirtied after the pmap_disconnect() call
449 * returns.
450 *
451 * Since the page is left "dirty" but "not modifed", we
452 * can detect whether the page was redirtied during
453 * pageout by checking the modify state.
454 */
455 if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
456 SET_PAGE_DIRTY(m, FALSE);
457 } else {
458 m->vmp_dirty = FALSE;
459 }
460
461 if (m->vmp_dirty) {
462 vm_page_unwire(m, TRUE); /* reactivates */
463 counter_inc(&vm_statistics_reactivations);
464 PAGE_WAKEUP_DONE(m);
465 } else {
466 vm_page_free(m); /* clears busy, etc. */
467 }
468 vm_page_unlock_queues();
469 continue;
470 }
471 /*
472 * Handle the "adjacent" pages. These pages were cleaned in
473 * place, and should be left alone.
474 * If prep_pin_count is nonzero, then someone is using the
475 * page, so make it active.
476 */
477 if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
478 if (m->vmp_reference) {
479 vm_page_activate(m);
480 } else {
481 vm_page_deactivate(m);
482 }
483 }
484 if (m->vmp_overwriting) {
485 /*
486 * the (COPY_OUT_FROM == FALSE) request_page_list case
487 */
488 if (m->vmp_busy) {
489 /*
490 * We do not re-set m->vmp_dirty !
491 * The page was busy so no extraneous activity
492 * could have occurred. COPY_INTO is a read into the
493 * new pages. CLEAN_IN_PLACE does actually write
494 * out the pages but handling outside of this code
495 * will take care of resetting dirty. We clear the
496 * modify however for the Programmed I/O case.
497 */
498 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
499
500 m->vmp_busy = FALSE;
501 m->vmp_absent = FALSE;
502 } else {
503 /*
504 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
505 * Occurs when the original page was wired
506 * at the time of the list request
507 */
508 assert(VM_PAGE_WIRED(m));
509 vm_page_unwire(m, TRUE); /* reactivates */
510 }
511 m->vmp_overwriting = FALSE;
512 } else {
513 m->vmp_dirty = FALSE;
514 }
515 m->vmp_cleaning = FALSE;
516
517 /*
518 * Wakeup any thread waiting for the page to be un-cleaning.
519 */
520 PAGE_WAKEUP(m);
521 vm_page_unlock_queues();
522 }
523 /*
524 * Account for the paging reference taken in vm_paging_object_allocate.
525 */
526 vm_object_activity_end(shadow_object);
527 vm_object_unlock(shadow_object);
528
529 assert(object->ref_count == 0);
530 assert(object->paging_in_progress == 0);
531 assert(object->activity_in_progress == 0);
532 assert(object->resident_page_count == 0);
533 return;
534 }
535
536 /*
537 * Routine: vm_pageclean_setup
538 *
539 * Purpose: setup a page to be cleaned (made non-dirty), but not
540 * necessarily flushed from the VM page cache.
541 * This is accomplished by cleaning in place.
542 *
543 * The page must not be busy, and new_object
544 * must be locked.
545 *
546 */
547 static void
vm_pageclean_setup(vm_page_t m,vm_page_t new_m,vm_object_t new_object,vm_object_offset_t new_offset)548 vm_pageclean_setup(
549 vm_page_t m,
550 vm_page_t new_m,
551 vm_object_t new_object,
552 vm_object_offset_t new_offset)
553 {
554 assert(!m->vmp_busy);
555 #if 0
556 assert(!m->vmp_cleaning);
557 #endif
558
559 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
560
561 /*
562 * Mark original page as cleaning in place.
563 */
564 m->vmp_cleaning = TRUE;
565 SET_PAGE_DIRTY(m, FALSE);
566 m->vmp_precious = FALSE;
567
568 /*
569 * Convert the fictitious page to a private shadow of
570 * the real page.
571 */
572 assert(new_m->vmp_fictitious);
573 assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
574 new_m->vmp_fictitious = FALSE;
575 new_m->vmp_private = TRUE;
576 new_m->vmp_free_when_done = TRUE;
577 VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
578
579 vm_page_lockspin_queues();
580 vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
581 vm_page_unlock_queues();
582
583 vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
584 assert(!new_m->vmp_wanted);
585 new_m->vmp_busy = FALSE;
586 }
587
588 /*
589 * Routine: vm_pageout_initialize_page
590 * Purpose:
591 * Causes the specified page to be initialized in
592 * the appropriate memory object. This routine is used to push
593 * pages into a copy-object when they are modified in the
594 * permanent object.
595 *
596 * The page is moved to a temporary object and paged out.
597 *
598 * In/out conditions:
599 * The page in question must not be on any pageout queues.
600 * The object to which it belongs must be locked.
601 * The page must be busy, but not hold a paging reference.
602 *
603 * Implementation:
604 * Move this page to a completely new object.
605 */
606 void
vm_pageout_initialize_page(vm_page_t m)607 vm_pageout_initialize_page(
608 vm_page_t m)
609 {
610 vm_object_t object;
611 vm_object_offset_t paging_offset;
612 memory_object_t pager;
613
614 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
615
616 object = VM_PAGE_OBJECT(m);
617
618 assert(m->vmp_busy);
619 assert(object->internal);
620
621 /*
622 * Verify that we really want to clean this page
623 */
624 assert(!m->vmp_absent);
625 assert(m->vmp_dirty);
626
627 /*
628 * Create a paging reference to let us play with the object.
629 */
630 paging_offset = m->vmp_offset + object->paging_offset;
631
632 if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
633 panic("reservation without pageout?"); /* alan */
634
635 VM_PAGE_FREE(m);
636 vm_object_unlock(object);
637
638 return;
639 }
640
641 /*
642 * If there's no pager, then we can't clean the page. This should
643 * never happen since this should be a copy object and therefore not
644 * an external object, so the pager should always be there.
645 */
646
647 pager = object->pager;
648
649 if (pager == MEMORY_OBJECT_NULL) {
650 panic("missing pager for copy object");
651
652 VM_PAGE_FREE(m);
653 return;
654 }
655
656 /*
657 * set the page for future call to vm_fault_list_request
658 */
659 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
660 SET_PAGE_DIRTY(m, FALSE);
661
662 /*
663 * keep the object from collapsing or terminating
664 */
665 vm_object_paging_begin(object);
666 vm_object_unlock(object);
667
668 /*
669 * Write the data to its pager.
670 * Note that the data is passed by naming the new object,
671 * not a virtual address; the pager interface has been
672 * manipulated to use the "internal memory" data type.
673 * [The object reference from its allocation is donated
674 * to the eventual recipient.]
675 */
676 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
677
678 vm_object_lock(object);
679 vm_object_paging_end(object);
680 }
681
682
683 /*
684 * vm_pageout_cluster:
685 *
686 * Given a page, queue it to the appropriate I/O thread,
687 * which will page it out and attempt to clean adjacent pages
688 * in the same operation.
689 *
690 * The object and queues must be locked. We will take a
691 * paging reference to prevent deallocation or collapse when we
692 * release the object lock back at the call site. The I/O thread
693 * is responsible for consuming this reference
694 *
695 * The page must not be on any pageout queue.
696 */
#if DEVELOPMENT || DEBUG
/*
 * "vmct" instrumentation, present on DEVELOPMENT/DEBUG builds only.
 * vmct_state[] has one slot per MAX_COMPRESSOR_THREAD_COUNT entry
 * (presumably one per compressor thread -- confirm against its users).
 */
vmct_stats_t vmct_stats;

int32_t vmct_active = 0;
uint64_t vm_compressor_epoch_start = 0;
uint64_t vm_compressor_epoch_stop = 0;

typedef enum vmct_state_t {
	VMCT_IDLE,
	VMCT_AWAKENED,
	VMCT_ACTIVE,
} vmct_state_t;

vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
#endif /* DEVELOPMENT || DEBUG */
711
712
713
714 static void
vm_pageout_cluster_to_queue(vm_page_t m,struct vm_pageout_queue * q)715 vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
716 {
717 vm_object_t object = VM_PAGE_OBJECT(m);
718
719 VM_PAGE_CHECK(m);
720 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
721 vm_object_lock_assert_exclusive(object);
722
723 /*
724 * Make sure it's OK to page this out.
725 */
726 assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
727 assert(!m->vmp_cleaning && !m->vmp_laundry);
728 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
729
730 /*
731 * protect the object from collapse or termination
732 */
733 vm_object_activity_begin(object);
734
735
736 /*
737 * pgo_laundry count is tied to the laundry bit
738 */
739 m->vmp_laundry = TRUE;
740 q->pgo_laundry++;
741
742 m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
743 vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
744
745 // the benchmark queue will be woken up independently by the benchmark itself
746 if (
747 object->internal == TRUE
748 #if DEVELOPMENT || DEBUG
749 && q != &vm_pageout_queue_benchmark
750 #endif
751 ) {
752 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
753 m->vmp_busy = TRUE;
754 // Wake up the first compressor thread. It will wake subsequent threads if necessary.
755 sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup, pgo_iothread_internal_state[0].pgo_iothread);
756 } else {
757 sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
758 }
759 VM_PAGE_CHECK(m);
760 }
761
762 void
vm_pageout_cluster(vm_page_t m)763 vm_pageout_cluster(vm_page_t m)
764 {
765 struct vm_pageout_queue *q;
766 vm_object_t object = VM_PAGE_OBJECT(m);
767 if (object->internal) {
768 q = &vm_pageout_queue_internal;
769 } else {
770 q = &vm_pageout_queue_external;
771 }
772 vm_pageout_cluster_to_queue(m, q);
773 }
774
775
776 /*
777 * A page is back from laundry or we are stealing it back from
778 * the laundering state. See if there are some pages waiting to
779 * go to laundry and if we can let some of them go now.
780 *
781 * Object and page queues must be locked.
782 */
783 void
vm_pageout_throttle_up(vm_page_t m)784 vm_pageout_throttle_up(
785 vm_page_t m)
786 {
787 struct vm_pageout_queue *q;
788 vm_object_t m_object;
789
790 m_object = VM_PAGE_OBJECT(m);
791
792 assert(m_object != VM_OBJECT_NULL);
793 assert(!is_kernel_object(m_object));
794
795 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
796 vm_object_lock_assert_exclusive(m_object);
797
798 if (m_object->internal == TRUE) {
799 q = &vm_pageout_queue_internal;
800 } else {
801 q = &vm_pageout_queue_external;
802 }
803
804 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
805 vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
806 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
807
808 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
809
810 vm_object_activity_end(m_object);
811
812 VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
813 }
814 if (m->vmp_laundry == TRUE) {
815 m->vmp_laundry = FALSE;
816 q->pgo_laundry--;
817
818 if (q->pgo_throttled == TRUE) {
819 q->pgo_throttled = FALSE;
820 thread_wakeup((event_t) &q->pgo_laundry);
821 }
822 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
823 q->pgo_draining = FALSE;
824 thread_wakeup((event_t) (&q->pgo_laundry + 1));
825 }
826 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
827 }
828 }
829
830
831 static void
vm_pageout_throttle_up_batch(struct vm_pageout_queue * q,int batch_cnt)832 vm_pageout_throttle_up_batch(
833 struct vm_pageout_queue *q,
834 int batch_cnt)
835 {
836 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
837
838 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
839
840 q->pgo_laundry -= batch_cnt;
841
842 if (q->pgo_throttled == TRUE) {
843 q->pgo_throttled = FALSE;
844 thread_wakeup((event_t) &q->pgo_laundry);
845 }
846 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
847 q->pgo_draining = FALSE;
848 thread_wakeup((event_t) (&q->pgo_laundry + 1));
849 }
850 }
851
852
853
/*
 * VM memory pressure monitoring.
 *
 * vm_pageout_scan() keeps track of the number of pages it considers and
 * reclaims, in the currently active vm_pageout_stats[vm_pageout_stat_now].
 *
 * record_memory_pressure() moves "vm_pageout_stat_now" forward, to start
 * accumulating the number of reclaimed pages in a new vm_pageout_stats[]
 * bucket.
 *
 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
 * The caller provides the number of seconds ("nsecs") worth of statistics
 * it wants; it is converted to buckets at 8 buckets per second.  The ring
 * below holds 30 seconds of history on DEVELOPMENT/DEBUG builds and
 * 1 second otherwise.
 * It computes the number of pages reclaimed in the past "nsecs" seconds and
 * also returns the number of pages the system still needs to reclaim at this
 * moment in time.
 */

/*
 * Number of buckets in the ring: 8 per second of history, plus one for
 * the bucket currently being filled (which mach_vm_pressure_monitor()
 * skips).  The expansion is fully parenthesized so the macro behaves as
 * a single value inside any larger expression.
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else /* !(DEVELOPMENT || DEBUG) */
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif /* DEVELOPMENT || DEBUG */

/* One bucket of pageout statistics. */
struct vm_pageout_stat {
	/* page counts */
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	/* event counters */
	unsigned int pages_grabbed;
	unsigned int pages_freed;

	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	/* the four freed_* counters are summed as "pages reclaimed" */
	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;

	unsigned int vm_page_realtime_count;
	unsigned int forcereclaimed_sharedcache;
	unsigned int forcereclaimed_realtime;
	unsigned int protected_sharedcache;
	unsigned int protected_realtime;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];

/* Index of the bucket currently being filled. */
unsigned int vm_pageout_stat_now = 0;

/* Ring-index helpers: previous / next bucket, wrapping at the ends. */
#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
941
942 #if VM_PAGE_BUCKETS_CHECK
943 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
944 #endif /* VM_PAGE_BUCKETS_CHECK */
945
946
947 void
948 record_memory_pressure(void);
949 void
record_memory_pressure(void)950 record_memory_pressure(void)
951 {
952 unsigned int vm_pageout_next;
953
954 #if VM_PAGE_BUCKETS_CHECK
955 /* check the consistency of VM page buckets at regular interval */
956 static int counter = 0;
957 if ((++counter % vm_page_buckets_check_interval) == 0) {
958 vm_page_buckets_check();
959 }
960 #endif /* VM_PAGE_BUCKETS_CHECK */
961
962 vm_pageout_state.vm_memory_pressure =
963 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
964 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
965 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
966 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
967
968 commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
969
970 /* move "now" forward */
971 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
972
973 bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
974
975 vm_pageout_stat_now = vm_pageout_next;
976 }
977
978
979 /*
980 * IMPORTANT
981 * mach_vm_ctl_page_free_wanted() is called indirectly, via
982 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
983 * it must be safe in the restricted stackshot context. Locks and/or
984 * blocking are not allowable.
985 */
986 unsigned int
mach_vm_ctl_page_free_wanted(void)987 mach_vm_ctl_page_free_wanted(void)
988 {
989 unsigned int page_free_target, page_free_count, page_free_wanted;
990
991 page_free_target = vm_page_free_target;
992 page_free_count = vm_page_free_count;
993 if (page_free_target > page_free_count) {
994 page_free_wanted = page_free_target - page_free_count;
995 } else {
996 page_free_wanted = 0;
997 }
998
999 return page_free_wanted;
1000 }
1001
1002
1003 /*
1004 * IMPORTANT:
1005 * mach_vm_pressure_monitor() is called when taking a stackshot, with
1006 * wait_for_pressure FALSE, so that code path must remain safe in the
1007 * restricted stackshot context. No blocking or locks are allowable
1008 * on that code path.
1009 */
1010
1011 kern_return_t
mach_vm_pressure_monitor(boolean_t wait_for_pressure,unsigned int nsecs_monitored,unsigned int * pages_reclaimed_p,unsigned int * pages_wanted_p)1012 mach_vm_pressure_monitor(
1013 boolean_t wait_for_pressure,
1014 unsigned int nsecs_monitored,
1015 unsigned int *pages_reclaimed_p,
1016 unsigned int *pages_wanted_p)
1017 {
1018 wait_result_t wr;
1019 unsigned int vm_pageout_then, vm_pageout_now;
1020 unsigned int pages_reclaimed;
1021 unsigned int units_of_monitor;
1022
1023 units_of_monitor = 8 * nsecs_monitored;
1024 /*
1025 * We don't take the vm_page_queue_lock here because we don't want
1026 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1027 * thread when it's trying to reclaim memory. We don't need fully
1028 * accurate monitoring anyway...
1029 */
1030
1031 if (wait_for_pressure) {
1032 /* wait until there's memory pressure */
1033 while (vm_page_free_count >= vm_page_free_target) {
1034 wr = assert_wait((event_t) &vm_page_free_wanted,
1035 THREAD_INTERRUPTIBLE);
1036 if (wr == THREAD_WAITING) {
1037 wr = thread_block(THREAD_CONTINUE_NULL);
1038 }
1039 if (wr == THREAD_INTERRUPTED) {
1040 return KERN_ABORTED;
1041 }
1042 if (wr == THREAD_AWAKENED) {
1043 /*
1044 * The memory pressure might have already
1045 * been relieved but let's not block again
1046 * and let's report that there was memory
1047 * pressure at some point.
1048 */
1049 break;
1050 }
1051 }
1052 }
1053
1054 /* provide the number of pages the system wants to reclaim */
1055 if (pages_wanted_p != NULL) {
1056 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1057 }
1058
1059 if (pages_reclaimed_p == NULL) {
1060 return KERN_SUCCESS;
1061 }
1062
1063 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1064 vm_pageout_now = vm_pageout_stat_now;
1065 pages_reclaimed = 0;
1066 for (vm_pageout_then =
1067 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1068 vm_pageout_then != vm_pageout_now &&
1069 units_of_monitor-- != 0;
1070 vm_pageout_then =
1071 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1072 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1073 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1074 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1075 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1076 }
1077 *pages_reclaimed_p = pages_reclaimed;
1078
1079 return KERN_SUCCESS;
1080 }
1081
1082
1083
1084 #if DEVELOPMENT || DEBUG
1085
1086 static void
1087 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1088
/*
 * flag (not a true condition variable) used to make sure
 * there is only a single sweep going on at a time
 */
1093 bool vm_pageout_disconnect_all_pages_active = false;
1094
1095 void
vm_pageout_disconnect_all_pages()1096 vm_pageout_disconnect_all_pages()
1097 {
1098 vm_page_lock_queues();
1099
1100 if (vm_pageout_disconnect_all_pages_active) {
1101 vm_page_unlock_queues();
1102 return;
1103 }
1104 vm_pageout_disconnect_all_pages_active = true;
1105
1106 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled,
1107 vm_page_throttled_count);
1108 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous,
1109 vm_page_anonymous_count);
1110 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive,
1111 (vm_page_inactive_count - vm_page_anonymous_count));
1112 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active,
1113 vm_page_active_count);
1114 #ifdef CONFIG_SECLUDED_MEMORY
1115 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded,
1116 vm_page_secluded_count);
1117 #endif /* CONFIG_SECLUDED_MEMORY */
1118 vm_page_unlock_queues();
1119
1120 vm_pageout_disconnect_all_pages_active = false;
1121 }
1122
1123 /* NB: assumes the page_queues lock is held on entry, returns with page queue lock held */
1124 void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t * q,int qcount)1125 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1126 {
1127 vm_page_t m;
1128 vm_object_t t_object = NULL;
1129 vm_object_t l_object = NULL;
1130 vm_object_t m_object = NULL;
1131 int delayed_unlock = 0;
1132 int try_failed_count = 0;
1133 int disconnected_count = 0;
1134 int paused_count = 0;
1135 int object_locked_count = 0;
1136
1137 KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1138 DBG_FUNC_START),
1139 q, qcount);
1140
1141 while (qcount && !vm_page_queue_empty(q)) {
1142 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1143
1144 m = (vm_page_t) vm_page_queue_first(q);
1145 m_object = VM_PAGE_OBJECT(m);
1146
1147 /*
1148 * check to see if we currently are working
1149 * with the same object... if so, we've
1150 * already got the lock
1151 */
1152 if (m_object != l_object) {
1153 /*
1154 * the object associated with candidate page is
1155 * different from the one we were just working
1156 * with... dump the lock if we still own it
1157 */
1158 if (l_object != NULL) {
1159 vm_object_unlock(l_object);
1160 l_object = NULL;
1161 }
1162 if (m_object != t_object) {
1163 try_failed_count = 0;
1164 }
1165
1166 /*
1167 * Try to lock object; since we've alread got the
1168 * page queues lock, we can only 'try' for this one.
1169 * if the 'try' fails, we need to do a mutex_pause
1170 * to allow the owner of the object lock a chance to
1171 * run...
1172 */
1173 if (!vm_object_lock_try_scan(m_object)) {
1174 if (try_failed_count > 20) {
1175 goto reenter_pg_on_q;
1176 }
1177 vm_page_unlock_queues();
1178 mutex_pause(try_failed_count++);
1179 vm_page_lock_queues();
1180 delayed_unlock = 0;
1181
1182 paused_count++;
1183
1184 t_object = m_object;
1185 continue;
1186 }
1187 object_locked_count++;
1188
1189 l_object = m_object;
1190 }
1191 if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry ||
1192 m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) ||
1193 m->vmp_free_when_done) {
1194 /*
1195 * put it back on the head of its queue
1196 */
1197 goto reenter_pg_on_q;
1198 }
1199 if (m->vmp_pmapped == TRUE) {
1200 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1201
1202 disconnected_count++;
1203 }
1204 reenter_pg_on_q:
1205 vm_page_queue_remove(q, m, vmp_pageq);
1206 vm_page_queue_enter(q, m, vmp_pageq);
1207
1208 qcount--;
1209 try_failed_count = 0;
1210
1211 if (delayed_unlock++ > 128) {
1212 if (l_object != NULL) {
1213 vm_object_unlock(l_object);
1214 l_object = NULL;
1215 }
1216 lck_mtx_yield(&vm_page_queue_lock);
1217 delayed_unlock = 0;
1218 }
1219 }
1220 if (l_object != NULL) {
1221 vm_object_unlock(l_object);
1222 l_object = NULL;
1223 }
1224
1225 KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1226 DBG_FUNC_END),
1227 q, disconnected_count, object_locked_count, paused_count);
1228 }
1229
1230 extern char* proc_best_name(struct proc* proc);
1231
1232 int
vm_toggle_task_selfdonate_pages(task_t task)1233 vm_toggle_task_selfdonate_pages(task_t task)
1234 {
1235 int state = 0;
1236 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1237 printf("VM Donation mode is OFF on the system\n");
1238 return state;
1239 }
1240 if (task != kernel_task) {
1241 task_lock(task);
1242 if (!task->donates_own_pages) {
1243 printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1244 task->donates_own_pages = true;
1245 state = 1;
1246 } else if (task->donates_own_pages) {
1247 printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1248 task->donates_own_pages = false;
1249 state = 0;
1250 }
1251 task_unlock(task);
1252 }
1253 return state;
1254 }
1255 #endif /* DEVELOPMENT || DEBUG */
1256
/*
 * Set the "donates_own_pages" flag of 'task' to 'donate'.
 *
 * Callers must not pass the kernel task and must not call this while
 * page donation is disabled; both conditions are only checked with
 * assert().
 */
void
vm_task_set_selfdonate_pages(task_t task, bool donate)
{
	assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
	assert(task != kernel_task);

	/* the flag is updated under the task lock */
	task_lock(task);
	task->donates_own_pages = donate;
	task_unlock(task);
}
1267
1268
1269
1270 static size_t
1271 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1272
/*
 * flag (not a true condition variable) used to make sure
 * there is only a single sweep going on at a time
 */
1277 boolean_t vm_pageout_anonymous_pages_active = FALSE;
1278
1279
1280 void
vm_pageout_anonymous_pages()1281 vm_pageout_anonymous_pages()
1282 {
1283 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1284 vm_page_lock_queues();
1285
1286 if (vm_pageout_anonymous_pages_active == TRUE) {
1287 vm_page_unlock_queues();
1288 return;
1289 }
1290 vm_pageout_anonymous_pages_active = TRUE;
1291 vm_page_unlock_queues();
1292
1293 vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1294 vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1295 vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1296
1297 if (VM_CONFIG_SWAP_IS_PRESENT) {
1298 vm_consider_swapping();
1299 }
1300
1301 vm_page_lock_queues();
1302 vm_pageout_anonymous_pages_active = FALSE;
1303 vm_page_unlock_queues();
1304 }
1305 }
1306
1307
/*
 * vm_pageout_page_queue:
 *
 * Walk up to 'qcount' pages of queue 'q' and hand the eligible ones to
 * vm_pageout_cluster_to_queue().  The destination is
 * vm_pageout_queue_internal, or vm_pageout_queue_benchmark when
 * 'perf_test' is set on DEVELOPMENT/DEBUG kernels.
 *
 * Takes the page queues lock itself and releases it before returning;
 * the lock is dropped and retaken several times along the way.
 *
 * For each page visited:
 *  - pages of non-internal objects, pages in a transient state and pages
 *    whose object lock cannot be obtained are requeued on 'q';
 *  - referenced pages get their reference cleared and are requeued;
 *  - pages that are neither dirty nor precious are freed;
 *  - the remaining pages are removed from their queue and passed to
 *    vm_pageout_cluster_to_queue().
 *
 * Returns the number of pages passed to vm_pageout_cluster_to_queue().
 */
size_t
vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
{
	vm_page_t       m;
	vm_object_t     t_object = NULL;        /* object whose try-lock last failed */
	vm_object_t     l_object = NULL;        /* object whose lock we currently hold */
	vm_object_t     m_object = NULL;        /* object of the page being examined */
	int             delayed_unlock = 0;     /* pages handled since the queues lock was last yielded */
	int             try_failed_count = 0;
	int             refmod_state;
	int             pmap_options;
	struct vm_pageout_queue *iq;
	ppnum_t         phys_page;
	size_t          pages_moved = 0;


	iq = &vm_pageout_queue_internal;

	vm_page_lock_queues();

#if DEVELOPMENT || DEBUG
	if (perf_test) {
		iq = &vm_pageout_queue_benchmark;
		// ensure the benchmark queue isn't throttled
		iq->pgo_maxlaundry = (unsigned int) qcount;
	}
#endif /* DEVELOPMENT ||DEBUG */

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		if (VM_PAGE_Q_THROTTLED(iq)) {
			/*
			 * the destination queue is throttled: drop every
			 * lock we hold and sleep on the queue's laundry
			 * event before looking at 'q' again
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			iq->pgo_draining = TRUE;

			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
			vm_page_unlock_queues();

			thread_block(THREAD_CONTINUE_NULL);

			vm_page_lock_queues();
			delayed_unlock = 0;
			continue;
		}
		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/* only pages of internal objects are candidates */
			if (!m_object->internal) {
				goto reenter_pg_on_q;
			}

			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				/* give up on this page after too many failed tries */
				if (try_failed_count > 20) {
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				t_object = m_object;
				continue;
			}
			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * page is not to be cleaned
			 * requeue it via vm_page_queue_remove/vm_page_queue_enter
			 */
			goto reenter_pg_on_q;
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(m);

		/* fold the pmap's referenced/modified bits into the page */
		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(phys_page);

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}
		/* referenced page: clear the reference and leave it on 'q' */
		if (m->vmp_reference == TRUE) {
			m->vmp_reference = FALSE;
			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
			goto reenter_pg_on_q;
		}
		/* remove the mappings, picking up a late "modified" bit if any */
		if (m->vmp_pmapped == TRUE) {
			if (m->vmp_dirty || m->vmp_precious) {
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		/* clean, non-precious page: just free it */
		if (!m->vmp_dirty && !m->vmp_precious) {
			vm_page_unlock_queues();
			VM_PAGE_FREE(m);
			vm_page_lock_queues();
			delayed_unlock = 0;

			goto next_pg;
		}
		/* the object needs an initialized pager before the page can be queued */
		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
			if (!m_object->pager_initialized) {
				vm_page_unlock_queues();

				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);

				if (!m_object->pager_initialized) {
					vm_object_compressor_pager_create(m_object);
				}

				vm_page_lock_queues();
				delayed_unlock = 0;
			}
			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
				goto reenter_pg_on_q;
			}
			/*
			 * vm_object_compressor_pager_create will drop the object lock
			 * which means 'm' may no longer be valid to use
			 */
			continue;
		}

		if (!perf_test) {
			/*
			 * we've already factored out pages in the laundry which
			 * means this page can't be on the pageout queue so it's
			 * safe to do the vm_page_queues_remove
			 */
			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			vm_page_queues_remove(m, TRUE);
			if (donate) {
				/*
				 * The compressor needs to see this bit to know
				 * where this page needs to land. Also if stolen,
				 * this bit helps put the page back in the right
				 * special queue where it belongs.
				 */
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
			}
		} else {
			vm_page_queue_remove(q, m, vmp_pageq);
		}

		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		vm_pageout_cluster_to_queue(m, iq);

		pages_moved++;
		goto next_pg;

reenter_pg_on_q:
		/* page stays on 'q': remove it and enter it again */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);
next_pg:
		qcount--;
		try_failed_count = 0;

		/* periodically give other threads a chance at the queues lock */
		if (delayed_unlock++ > 128) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();
	return pages_moved;
}
1520
1521
1522
1523 /*
1524 * function in BSD to apply I/O throttle to the pageout thread
1525 */
1526 extern void vm_pageout_io_throttle(void);
1527
/*
 * VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj):
 *   m   - a page
 *   obj - the VM object that page belongs to (asserted)
 * When the page is marked reusable, or its object is "all_reusable",
 * calls vm_object_reuse_pages() on the one-page range starting at the
 * page's offset.
 */
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
	MACRO_BEGIN                                                     \
	/* \
	 * If a "reusable" page somehow made it back into \
	 * the active queue, it's been re-used and is not \
	 * quite re-usable. \
	 * If the VM object was "all_reusable", consider it \
	 * as "all re-used" instead of converting it to \
	 * "partially re-used", which could be expensive. \
	 */                                                             \
	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
	if ((m)->vmp_reusable ||                                        \
	    (obj)->all_reusable) {                                      \
	        vm_object_reuse_pages((obj),                            \
	                              (m)->vmp_offset,                  \
	                              (m)->vmp_offset + PAGE_SIZE_64,   \
	                              FALSE);                           \
	}                                                               \
	MACRO_END
1547
1548
1549 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
1550 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024
1551
1552 #define FCS_IDLE 0
1553 #define FCS_DELAYED 1
1554 #define FCS_DEADLOCK_DETECTED 2
1555
/*
 * Flow-control bookkeeping.
 * NOTE(review): 'state' presumably holds one of the FCS_* values defined
 * just above and 'ts' a time associated with that state; the users of
 * this struct are not visible here -- confirm against them.
 */
struct flow_control {
	int             state;
	mach_timespec_t ts;
};
1560
1561
1562 uint64_t vm_pageout_rejected_bq_internal = 0;
1563 uint64_t vm_pageout_rejected_bq_external = 0;
1564 uint64_t vm_pageout_skipped_bq_internal = 0;
1565 uint64_t vm_pageout_skipped_bq_external = 0;
1566
1567 #define ANONS_GRABBED_LIMIT 2
1568
1569
1570 #if 0
1571 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1572 #endif
1573 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1574
1575 #define VM_PAGEOUT_PB_NO_ACTION 0
1576 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1577 #define VM_PAGEOUT_PB_THREAD_YIELD 2
1578
1579
#if 0
/*
 * NOTE: compiled out by the enclosing "#if 0".
 *
 * If a local free list has accumulated, drop the page queues lock, free
 * the list with vm_page_free_list(), reset the list and its counter and
 * retake the lock; otherwise just yield the page queues lock.  In both
 * cases *delayed_unlock is reset to 1.
 */
static void
vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
{
	if (*local_freeq) {
		vm_page_unlock_queues();

		VM_DEBUG_CONSTANT_EVENT(
			vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
			vm_page_free_count, 0, 0, 1);

		vm_page_free_list(*local_freeq, TRUE);

		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
		    vm_page_free_count, *local_freed, 0, 1);

		*local_freeq = NULL;
		*local_freed = 0;

		vm_page_lock_queues();
	} else {
		lck_mtx_yield(&vm_page_queue_lock);
	}
	*delayed_unlock = 1;
}
#endif
1606
1607
1608 static void
vm_pageout_prepare_to_block(vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int action)1609 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1610 vm_page_t *local_freeq, int *local_freed, int action)
1611 {
1612 vm_page_unlock_queues();
1613
1614 if (*object != NULL) {
1615 vm_object_unlock(*object);
1616 *object = NULL;
1617 }
1618 if (*local_freeq) {
1619 vm_page_free_list(*local_freeq, TRUE);
1620
1621 *local_freeq = NULL;
1622 *local_freed = 0;
1623 }
1624 *delayed_unlock = 1;
1625
1626 switch (action) {
1627 case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1628 vm_consider_waking_compactor_swapper();
1629 break;
1630 case VM_PAGEOUT_PB_THREAD_YIELD:
1631 thread_yield_internal(1);
1632 break;
1633 case VM_PAGEOUT_PB_NO_ACTION:
1634 default:
1635 break;
1636 }
1637 vm_page_lock_queues();
1638 }
1639
1640
1641 static struct vm_pageout_vminfo last;
1642
1643 uint64_t last_vm_page_pages_grabbed = 0;
1644
1645 extern uint32_t c_segment_pages_compressed;
1646
1647 extern uint64_t shared_region_pager_reclaimed;
1648 extern struct memory_object_pager_ops shared_region_pager_ops;
1649
1650 void
update_vm_info(void)1651 update_vm_info(void)
1652 {
1653 unsigned long tmp;
1654 uint64_t tmp64;
1655
1656 vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1657 vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1658 vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1659 vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1660
1661 vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1662 vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1663 vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1664
1665 vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1666 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1667 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1668 vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1669 vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1670
1671 tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1672 vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1673 last.vm_pageout_considered_page = tmp;
1674
1675 tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1676 vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1677 last.vm_pageout_compressions = tmp64;
1678
1679 tmp = vm_pageout_vminfo.vm_compressor_failed;
1680 vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1681 last.vm_compressor_failed = tmp;
1682
1683 tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1684 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1685 last.vm_compressor_pages_grabbed = tmp64;
1686
1687 tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1688 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1689 last.vm_phantom_cache_found_ghost = tmp;
1690
1691 tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1692 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1693 last.vm_phantom_cache_added_ghost = tmp;
1694
1695 tmp64 = counter_load(&vm_page_grab_count);
1696 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1697 last_vm_page_pages_grabbed = tmp64;
1698
1699 tmp = vm_pageout_vminfo.vm_page_pages_freed;
1700 vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1701 last.vm_page_pages_freed = tmp;
1702
1703 if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1704 tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1705 vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1706 last.vm_pageout_pages_evicted = tmp;
1707
1708 tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1709 vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1710 last.vm_pageout_pages_purged = tmp;
1711
1712 tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1713 vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1714 last.vm_pageout_freed_speculative = tmp;
1715
1716 tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1717 vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1718 last.vm_pageout_freed_external = tmp;
1719
1720 tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1721 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1722 last.vm_pageout_inactive_referenced = tmp;
1723
1724 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1725 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1726 last.vm_pageout_scan_inactive_throttled_external = tmp;
1727
1728 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1729 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1730 last.vm_pageout_inactive_dirty_external = tmp;
1731
1732 tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1733 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1734 last.vm_pageout_freed_cleaned = tmp;
1735
1736 tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1737 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1738 last.vm_pageout_inactive_nolock = tmp;
1739
1740 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1741 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1742 last.vm_pageout_scan_inactive_throttled_internal = tmp;
1743
1744 tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1745 vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1746 last.vm_pageout_skipped_external = tmp;
1747
1748 tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1749 vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1750 last.vm_pageout_skipped_internal = tmp;
1751
1752 tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1753 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1754 last.vm_pageout_reactivation_limit_exceeded = tmp;
1755
1756 tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1757 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1758 last.vm_pageout_inactive_force_reclaim = tmp;
1759
1760 tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1761 vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1762 last.vm_pageout_freed_internal = tmp;
1763
1764 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1765 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1766 last.vm_pageout_considered_bq_internal = tmp;
1767
1768 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1769 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1770 last.vm_pageout_considered_bq_external = tmp;
1771
1772 tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1773 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1774 last.vm_pageout_filecache_min_reactivated = tmp;
1775
1776 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1777 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1778 last.vm_pageout_inactive_dirty_internal = tmp;
1779
1780 tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1781 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1782 last.vm_pageout_forcereclaimed_sharedcache = tmp;
1783
1784 tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1785 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1786 last.vm_pageout_forcereclaimed_realtime = tmp;
1787
1788 tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1789 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1790 last.vm_pageout_protected_sharedcache = tmp;
1791
1792 tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1793 vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1794 last.vm_pageout_protected_realtime = tmp;
1795 }
1796
1797 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
1798 vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1799 vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1800 vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1801 vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
1802 0);
1803
1804 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
1805 vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1806 vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1807 vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
1808 0,
1809 0);
1810
1811 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
1812 vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1813 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1814 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1815 vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
1816 0);
1817
1818 if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1819 vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1820 vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1821 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
1822 vm_pageout_stats[vm_pageout_stat_now].considered,
1823 vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1824 vm_pageout_stats[vm_pageout_stat_now].freed_external,
1825 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
1826 0);
1827
1828 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
1829 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1830 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1831 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1832 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
1833 0);
1834
1835 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
1836 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1837 vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1838 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1839 vm_pageout_stats[vm_pageout_stat_now].skipped_external,
1840 0);
1841
1842 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
1843 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1844 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1845 vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1846 vm_pageout_stats[vm_pageout_stat_now].freed_internal,
1847 0);
1848
1849 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
1850 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1851 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1852 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1853 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
1854 0);
1855
1856 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO10)) | DBG_FUNC_NONE,
1857 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1858 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1859 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1860 vm_pageout_stats[vm_pageout_stat_now].protected_realtime,
1861 0);
1862 }
1863 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
1864 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1865 vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1866 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1867 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
1868 0);
1869
1870 record_memory_pressure();
1871 }
1872
1873 extern boolean_t hibernation_vmqueues_inspection;
1874
1875 /*
1876 * Return values for functions called by vm_pageout_scan
1877 * that control its flow.
1878 *
1879 * PROCEED -- vm_pageout_scan will keep making forward progress.
1880 * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1881 * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1882 */
1883
1884 #define VM_PAGEOUT_SCAN_PROCEED (0)
1885 #define VM_PAGEOUT_SCAN_DONE_RETURN (1)
1886 #define VM_PAGEOUT_SCAN_NEXT_ITERATION (2)
1887
1888 /*
1889 * This function is called only from vm_pageout_scan and
1890 * it moves overflow secluded pages (one-at-a-time) to the
1891 * batched 'local' free Q or active Q.
1892 */
1893 static void
vps_deal_with_secluded_page_overflow(vm_page_t * local_freeq,int * local_freed)1894 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1895 {
1896 #if CONFIG_SECLUDED_MEMORY
1897 /*
1898 * Deal with secluded_q overflow.
1899 */
1900 if (vm_page_secluded_count > vm_page_secluded_target) {
1901 vm_page_t secluded_page;
1902
1903 /*
1904 * SECLUDED_AGING_BEFORE_ACTIVE:
1905 * Excess secluded pages go to the active queue and
1906 * will later go to the inactive queue.
1907 */
1908 assert((vm_page_secluded_count_free +
1909 vm_page_secluded_count_inuse) ==
1910 vm_page_secluded_count);
1911 secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1912 assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1913
1914 vm_page_queues_remove(secluded_page, FALSE);
1915 assert(!secluded_page->vmp_fictitious);
1916 assert(!VM_PAGE_WIRED(secluded_page));
1917
1918 if (secluded_page->vmp_object == 0) {
1919 /* transfer to free queue */
1920 assert(secluded_page->vmp_busy);
1921 secluded_page->vmp_snext = *local_freeq;
1922 *local_freeq = secluded_page;
1923 *local_freed += 1;
1924 } else {
1925 /* transfer to head of active queue */
1926 vm_page_enqueue_active(secluded_page, FALSE);
1927 secluded_page = VM_PAGE_NULL;
1928 }
1929 }
1930 #else /* CONFIG_SECLUDED_MEMORY */
1931
1932 #pragma unused(local_freeq)
1933 #pragma unused(local_freed)
1934
1935 return;
1936
1937 #endif /* CONFIG_SECLUDED_MEMORY */
1938 }
1939
1940 /*
1941 * This function is called only from vm_pageout_scan and
1942 * it initializes the loop targets for vm_pageout_scan().
1943 */
1944 static void
vps_init_page_targets(void)1945 vps_init_page_targets(void)
1946 {
1947 /*
1948 * LD TODO: Other page targets should be calculated here too.
1949 */
1950 vm_page_anonymous_min = vm_page_inactive_target / 20;
1951
1952 if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1953 vm_pageout_state.vm_page_speculative_percentage = 50;
1954 } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1955 vm_pageout_state.vm_page_speculative_percentage = 1;
1956 }
1957
1958 vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1959 vm_page_inactive_count);
1960 }
1961
1962 /*
1963 * This function is called only from vm_pageout_scan and
1964 * it purges a single VM object at-a-time and will either
1965 * make vm_pageout_scan() restart the loop or keeping moving forward.
1966 */
1967 static int
vps_purge_object()1968 vps_purge_object()
1969 {
1970 int force_purge;
1971
1972 assert(available_for_purge >= 0);
1973 force_purge = 0; /* no force-purging */
1974
1975 #if VM_PRESSURE_EVENTS
1976 vm_pressure_level_t pressure_level;
1977
1978 pressure_level = memorystatus_vm_pressure_level;
1979
1980 if (pressure_level > kVMPressureNormal) {
1981 if (pressure_level >= kVMPressureCritical) {
1982 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1983 } else if (pressure_level >= kVMPressureUrgent) {
1984 force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
1985 } else if (pressure_level >= kVMPressureWarning) {
1986 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1987 }
1988 }
1989 #endif /* VM_PRESSURE_EVENTS */
1990
1991 if (available_for_purge || force_purge) {
1992 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
1993
1994 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1995 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
1996 VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
1997 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1998 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1999
2000 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2001 }
2002 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2003 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2004 }
2005
2006 return VM_PAGEOUT_SCAN_PROCEED;
2007 }
2008
2009 /*
2010 * This function is called only from vm_pageout_scan and
2011 * it will try to age the next speculative Q if the oldest
2012 * one is empty.
2013 */
2014 static int
vps_age_speculative_queue(boolean_t force_speculative_aging)2015 vps_age_speculative_queue(boolean_t force_speculative_aging)
2016 {
2017 #define DELAY_SPECULATIVE_AGE 1000
2018
2019 /*
2020 * try to pull pages from the aging bins...
2021 * see vm_page.h for an explanation of how
2022 * this mechanism works
2023 */
2024 boolean_t can_steal = FALSE;
2025 int num_scanned_queues;
2026 static int delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/
2027 mach_timespec_t ts;
2028 struct vm_speculative_age_q *aq;
2029 struct vm_speculative_age_q *sq;
2030
2031 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2032
2033 aq = &vm_page_queue_speculative[speculative_steal_index];
2034
2035 num_scanned_queues = 0;
2036 while (vm_page_queue_empty(&aq->age_q) &&
2037 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2038 speculative_steal_index++;
2039
2040 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2041 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2042 }
2043
2044 aq = &vm_page_queue_speculative[speculative_steal_index];
2045 }
2046
2047 if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
2048 /*
2049 * XXX We've scanned all the speculative
2050 * queues but still haven't found one
2051 * that is not empty, even though
2052 * vm_page_speculative_count is not 0.
2053 */
2054 if (!vm_page_queue_empty(&sq->age_q)) {
2055 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2056 }
2057 #if DEVELOPMENT || DEBUG
2058 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2059 #endif
2060 /* readjust... */
2061 vm_page_speculative_count = 0;
2062 /* ... and continue */
2063 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2064 }
2065
2066 if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
2067 can_steal = TRUE;
2068 } else {
2069 if (!delay_speculative_age) {
2070 mach_timespec_t ts_fully_aged;
2071
2072 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
2073 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
2074 * 1000 * NSEC_PER_USEC;
2075
2076 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2077
2078 clock_sec_t sec;
2079 clock_nsec_t nsec;
2080 clock_get_system_nanotime(&sec, &nsec);
2081 ts.tv_sec = (unsigned int) sec;
2082 ts.tv_nsec = nsec;
2083
2084 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
2085 can_steal = TRUE;
2086 } else {
2087 delay_speculative_age++;
2088 }
2089 } else {
2090 delay_speculative_age++;
2091 if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
2092 delay_speculative_age = 0;
2093 }
2094 }
2095 }
2096 if (can_steal == TRUE) {
2097 vm_page_speculate_ageit(aq);
2098 }
2099
2100 return VM_PAGEOUT_SCAN_PROCEED;
2101 }
2102
2103 /*
2104 * This function is called only from vm_pageout_scan and
2105 * it evicts a single VM object from the cache.
2106 */
2107 static int inline
vps_object_cache_evict(vm_object_t * object_to_unlock)2108 vps_object_cache_evict(vm_object_t *object_to_unlock)
2109 {
2110 static int cache_evict_throttle = 0;
2111 struct vm_speculative_age_q *sq;
2112
2113 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2114
2115 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2116 int pages_evicted;
2117
2118 if (*object_to_unlock != NULL) {
2119 vm_object_unlock(*object_to_unlock);
2120 *object_to_unlock = NULL;
2121 }
2122 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
2123
2124 pages_evicted = vm_object_cache_evict(100, 10);
2125
2126 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
2127
2128 if (pages_evicted) {
2129 vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2130
2131 VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2132 vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2133 memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2134
2135 /*
2136 * we just freed up to 100 pages,
2137 * so go back to the top of the main loop
2138 * and re-evaulate the memory situation
2139 */
2140 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2141 } else {
2142 cache_evict_throttle = 1000;
2143 }
2144 }
2145 if (cache_evict_throttle) {
2146 cache_evict_throttle--;
2147 }
2148
2149 return VM_PAGEOUT_SCAN_PROCEED;
2150 }
2151
2152
2153 /*
2154 * This function is called only from vm_pageout_scan and
2155 * it calculates the filecache min. that needs to be maintained
2156 * as we start to steal pages.
2157 */
2158 static void
vps_calculate_filecache_min(void)2159 vps_calculate_filecache_min(void)
2160 {
2161 int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2162
2163 #if CONFIG_JETSAM
2164 /*
2165 * don't let the filecache_min fall below 15% of available memory
2166 * on systems with an active compressor that isn't nearing its
2167 * limits w/r to accepting new data
2168 *
2169 * on systems w/o the compressor/swapper, the filecache is always
2170 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2171 * since most (if not all) of the anonymous pages are in the
2172 * throttled queue (which isn't counted as available) which
2173 * effectively disables this filter
2174 */
2175 if (vm_compressor_low_on_space() || divisor == 0) {
2176 vm_pageout_state.vm_page_filecache_min = 0;
2177 } else {
2178 vm_pageout_state.vm_page_filecache_min =
2179 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2180 }
2181 #else
2182 if (vm_compressor_out_of_space() || divisor == 0) {
2183 vm_pageout_state.vm_page_filecache_min = 0;
2184 } else {
2185 /*
2186 * don't let the filecache_min fall below the specified critical level
2187 */
2188 vm_pageout_state.vm_page_filecache_min =
2189 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2190 }
2191 #endif
2192 if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2193 vm_pageout_state.vm_page_filecache_min = 0;
2194 }
2195 }
2196
2197 /*
2198 * This function is called only from vm_pageout_scan and
2199 * it updates the flow control time to detect if VM pageoutscan
2200 * isn't making progress.
2201 */
2202 static void
vps_flow_control_reset_deadlock_timer(struct flow_control * flow_control)2203 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2204 {
2205 mach_timespec_t ts;
2206 clock_sec_t sec;
2207 clock_nsec_t nsec;
2208
2209 ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2210 ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2211 clock_get_system_nanotime(&sec, &nsec);
2212 flow_control->ts.tv_sec = (unsigned int) sec;
2213 flow_control->ts.tv_nsec = nsec;
2214 ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2215
2216 flow_control->state = FCS_DELAYED;
2217
2218 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2219 }
2220
2221 /*
2222 * This function is called only from vm_pageout_scan and
2223 * it is the flow control logic of VM pageout scan which
2224 * controls if it should block and for how long.
2225 * Any blocking of vm_pageout_scan happens ONLY in this function.
2226 */
2227 static int
vps_flow_control(struct flow_control * flow_control,int * anons_grabbed,vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int * vm_pageout_deadlock_target,unsigned int inactive_burst_count)2228 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2229 vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2230 {
2231 boolean_t exceeded_burst_throttle = FALSE;
2232 unsigned int msecs = 0;
2233 uint32_t inactive_external_count;
2234 mach_timespec_t ts;
2235 struct vm_pageout_queue *iq;
2236 struct vm_pageout_queue *eq;
2237 struct vm_speculative_age_q *sq;
2238
2239 iq = &vm_pageout_queue_internal;
2240 eq = &vm_pageout_queue_external;
2241 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2242
2243 /*
2244 * Sometimes we have to pause:
2245 * 1) No inactive pages - nothing to do.
2246 * 2) Loop control - no acceptable pages found on the inactive queue
2247 * within the last vm_pageout_burst_inactive_throttle iterations
2248 * 3) Flow control - default pageout queue is full
2249 */
2250 if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2251 vm_page_queue_empty(&vm_page_queue_anonymous) &&
2252 vm_page_queue_empty(&vm_page_queue_cleaned) &&
2253 vm_page_queue_empty(&sq->age_q)) {
2254 VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2255 msecs = vm_pageout_state.vm_pageout_empty_wait;
2256 } else if (inactive_burst_count >=
2257 MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2258 (vm_page_inactive_count +
2259 vm_page_speculative_count))) {
2260 VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2261 msecs = vm_pageout_state.vm_pageout_burst_wait;
2262
2263 exceeded_burst_throttle = TRUE;
2264 } else if (VM_PAGE_Q_THROTTLED(iq) &&
2265 VM_DYNAMIC_PAGING_ENABLED()) {
2266 clock_sec_t sec;
2267 clock_nsec_t nsec;
2268
2269 switch (flow_control->state) {
2270 case FCS_IDLE:
2271 if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2272 vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2273 /*
2274 * since the compressor is running independently of vm_pageout_scan
2275 * let's not wait for it just yet... as long as we have a healthy supply
2276 * of filecache pages to work with, let's keep stealing those.
2277 */
2278 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2279
2280 if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2281 (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2282 *anons_grabbed = ANONS_GRABBED_LIMIT;
2283 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2284 return VM_PAGEOUT_SCAN_PROCEED;
2285 }
2286 }
2287
2288 vps_flow_control_reset_deadlock_timer(flow_control);
2289 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2290
2291 break;
2292
2293 case FCS_DELAYED:
2294 clock_get_system_nanotime(&sec, &nsec);
2295 ts.tv_sec = (unsigned int) sec;
2296 ts.tv_nsec = nsec;
2297
2298 if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2299 /*
2300 * the pageout thread for the default pager is potentially
2301 * deadlocked since the
2302 * default pager queue has been throttled for more than the
2303 * allowable time... we need to move some clean pages or dirty
2304 * pages belonging to the external pagers if they aren't throttled
2305 * vm_page_free_wanted represents the number of threads currently
2306 * blocked waiting for pages... we'll move one page for each of
2307 * these plus a fixed amount to break the logjam... once we're done
2308 * moving this number of pages, we'll re-enter the FSC_DELAYED state
2309 * with a new timeout target since we have no way of knowing
2310 * whether we've broken the deadlock except through observation
2311 * of the queue associated with the default pager... we need to
2312 * stop moving pages and allow the system to run to see what
2313 * state it settles into.
2314 */
2315
2316 *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2317 vm_page_free_wanted + vm_page_free_wanted_privileged;
2318 VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2319 flow_control->state = FCS_DEADLOCK_DETECTED;
2320 thread_wakeup(VM_PAGEOUT_GC_EVENT);
2321 return VM_PAGEOUT_SCAN_PROCEED;
2322 }
2323 /*
2324 * just resniff instead of trying
2325 * to compute a new delay time... we're going to be
2326 * awakened immediately upon a laundry completion,
2327 * so we won't wait any longer than necessary
2328 */
2329 msecs = vm_pageout_state.vm_pageout_idle_wait;
2330 break;
2331
2332 case FCS_DEADLOCK_DETECTED:
2333 if (*vm_pageout_deadlock_target) {
2334 return VM_PAGEOUT_SCAN_PROCEED;
2335 }
2336
2337 vps_flow_control_reset_deadlock_timer(flow_control);
2338 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2339
2340 break;
2341 }
2342 } else {
2343 /*
2344 * No need to pause...
2345 */
2346 return VM_PAGEOUT_SCAN_PROCEED;
2347 }
2348
2349 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2350
2351 vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2352 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2353
2354 if (vm_page_free_count >= vm_page_free_target) {
2355 /*
2356 * we're here because
2357 * 1) someone else freed up some pages while we had
2358 * the queues unlocked above
2359 * and we've hit one of the 3 conditions that
2360 * cause us to pause the pageout scan thread
2361 *
2362 * since we already have enough free pages,
2363 * let's avoid stalling and return normally
2364 *
2365 * before we return, make sure the pageout I/O threads
2366 * are running throttled in case there are still requests
2367 * in the laundry... since we have enough free pages
2368 * we don't need the laundry to be cleaned in a timely
2369 * fashion... so let's avoid interfering with foreground
2370 * activity
2371 *
2372 * we don't want to hold vm_page_queue_free_lock when
2373 * calling vm_pageout_adjust_eq_iothrottle (since it
2374 * may cause other locks to be taken), we do the intitial
2375 * check outside of the lock. Once we take the lock,
2376 * we recheck the condition since it may have changed.
2377 * if it has, no problem, we will make the threads
2378 * non-throttled before actually blocking
2379 */
2380 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
2381 }
2382 vm_free_page_lock();
2383
2384 if (vm_page_free_count >= vm_page_free_target &&
2385 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2386 return VM_PAGEOUT_SCAN_DONE_RETURN;
2387 }
2388 vm_free_page_unlock();
2389
2390 if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2391 /*
2392 * we're most likely about to block due to one of
2393 * the 3 conditions that cause vm_pageout_scan to
2394 * not be able to make forward progress w/r
2395 * to providing new pages to the free queue,
2396 * so unthrottle the I/O threads in case we
2397 * have laundry to be cleaned... it needs
2398 * to be completed ASAP.
2399 *
2400 * even if we don't block, we want the io threads
2401 * running unthrottled since the sum of free +
2402 * clean pages is still under our free target
2403 */
2404 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2405 }
2406 if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2407 /*
2408 * if we get here we're below our free target and
2409 * we're stalling due to a full laundry queue or
2410 * we don't have any inactive pages other then
2411 * those in the clean queue...
2412 * however, we have pages on the clean queue that
2413 * can be moved to the free queue, so let's not
2414 * stall the pageout scan
2415 */
2416 flow_control->state = FCS_IDLE;
2417 return VM_PAGEOUT_SCAN_PROCEED;
2418 }
2419 if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2420 flow_control->state = FCS_IDLE;
2421 return VM_PAGEOUT_SCAN_PROCEED;
2422 }
2423
2424 VM_CHECK_MEMORYSTATUS;
2425
2426 if (flow_control->state != FCS_IDLE) {
2427 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2428 }
2429
2430 iq->pgo_throttled = TRUE;
2431 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2432
2433 vm_page_unlock_queues();
2434
2435 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2436
2437 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2438 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2439 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2440
2441 thread_block(THREAD_CONTINUE_NULL);
2442
2443 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2444 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2445 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2446
2447 vm_page_lock_queues();
2448
2449 iq->pgo_throttled = FALSE;
2450
2451 vps_init_page_targets();
2452
2453 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2454 }
2455
2456 extern boolean_t vm_darkwake_mode;
2457 /*
2458 * This function is called only from vm_pageout_scan and
2459 * it will find and return the most appropriate page to be
2460 * reclaimed.
2461 */
2462 static int
vps_choose_victim_page(vm_page_t * victim_page,int * anons_grabbed,boolean_t * grab_anonymous,boolean_t force_anonymous,boolean_t * is_page_from_bg_q,unsigned int * reactivated_this_call)2463 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2464 boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2465 {
2466 vm_page_t m = NULL;
2467 vm_object_t m_object = VM_OBJECT_NULL;
2468 uint32_t inactive_external_count;
2469 struct vm_speculative_age_q *sq;
2470 struct vm_pageout_queue *iq;
2471 int retval = VM_PAGEOUT_SCAN_PROCEED;
2472
2473 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2474 iq = &vm_pageout_queue_internal;
2475
2476 *is_page_from_bg_q = FALSE;
2477
2478 m = NULL;
2479 m_object = VM_OBJECT_NULL;
2480
2481 if (VM_DYNAMIC_PAGING_ENABLED()) {
2482 assert(vm_page_throttled_count == 0);
2483 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2484 }
2485
2486 /*
2487 * Try for a clean-queue inactive page.
2488 * These are pages that vm_pageout_scan tried to steal earlier, but
2489 * were dirty and had to be cleaned. Pick them up now that they are clean.
2490 */
2491 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2492 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2493
2494 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2495
2496 goto found_page;
2497 }
2498
2499 /*
2500 * The next most eligible pages are ones we paged in speculatively,
2501 * but which have not yet been touched and have been aged out.
2502 */
2503 if (!vm_page_queue_empty(&sq->age_q)) {
2504 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2505
2506 assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2507
2508 if (!m->vmp_dirty || force_anonymous == FALSE) {
2509 goto found_page;
2510 } else {
2511 m = NULL;
2512 }
2513 }
2514
2515 #if !CONFIG_JETSAM
2516 if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2517 if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2518 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2519 assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2520 goto found_page;
2521 }
2522 }
2523 #endif /* !CONFIG_JETSAM */
2524
2525 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2526 vm_object_t bg_m_object = NULL;
2527
2528 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2529
2530 bg_m_object = VM_PAGE_OBJECT(m);
2531
2532 if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2533 /*
2534 * This page is on the background queue
2535 * but not on a pageable queue OR is busy during
2536 * darkwake mode when the target is artificially lowered.
2537 * If it is busy during darkwake mode, and we don't skip it,
2538 * we will just swing back around and try again with the same
2539 * queue and might hit the same page or its neighbor in a
2540 * similar state. Both of these are transient states and will
2541 * get resolved, but, at this point let's ignore this page.
2542 */
2543 if (vm_darkwake_mode && m->vmp_busy) {
2544 if (bg_m_object->internal) {
2545 vm_pageout_skipped_bq_internal++;
2546 } else {
2547 vm_pageout_skipped_bq_external++;
2548 }
2549 }
2550 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2551 if (bg_m_object->internal &&
2552 (VM_PAGE_Q_THROTTLED(iq) ||
2553 vm_compressor_out_of_space() == TRUE ||
2554 vm_page_free_count < (vm_page_free_reserved / 4))) {
2555 vm_pageout_skipped_bq_internal++;
2556 } else {
2557 *is_page_from_bg_q = TRUE;
2558
2559 if (bg_m_object->internal) {
2560 vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2561 } else {
2562 vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2563 }
2564 goto found_page;
2565 }
2566 }
2567 }
2568
2569 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2570
2571 if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2572 (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2573 *grab_anonymous = TRUE;
2574 *anons_grabbed = 0;
2575
2576 if (VM_CONFIG_SWAP_IS_ACTIVE) {
2577 vm_pageout_vminfo.vm_pageout_skipped_external++;
2578 } else {
2579 if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2580 /*
2581 * No swap and we are in dangerously low levels of free memory.
2582 * If we keep going ahead with anonymous pages, we are going to run into a situation
2583 * where the compressor will be stuck waiting for free pages (if it isn't already).
2584 *
2585 * So, pick a file backed page...
2586 */
2587 *grab_anonymous = FALSE;
2588 *anons_grabbed = ANONS_GRABBED_LIMIT;
2589 vm_pageout_vminfo.vm_pageout_skipped_internal++;
2590 }
2591 }
2592 goto want_anonymous;
2593 }
2594 *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2595
2596 #if CONFIG_JETSAM
2597 /* If the file-backed pool has accumulated
2598 * significantly more pages than the jetsam
2599 * threshold, prefer to reclaim those
2600 * inline to minimise compute overhead of reclaiming
2601 * anonymous pages.
2602 * This calculation does not account for the CPU local
2603 * external page queues, as those are expected to be
2604 * much smaller relative to the global pools.
2605 */
2606
2607 struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2608
2609 if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2610 if (vm_page_pageable_external_count >
2611 vm_pageout_state.vm_page_filecache_min) {
2612 if ((vm_page_pageable_external_count *
2613 vm_pageout_memorystatus_fb_factor_dr) >
2614 (memorystatus_available_pages_critical *
2615 vm_pageout_memorystatus_fb_factor_nr)) {
2616 *grab_anonymous = FALSE;
2617
2618 VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2619 }
2620 }
2621 if (*grab_anonymous) {
2622 VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2623 }
2624 }
2625 #endif /* CONFIG_JETSAM */
2626
2627 want_anonymous:
2628 if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2629 if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2630 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2631
2632 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2633 *anons_grabbed = 0;
2634
2635 if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2636 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2637 if ((++(*reactivated_this_call) % 100)) {
2638 vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2639
2640 vm_page_activate(m);
2641 counter_inc(&vm_statistics_reactivations);
2642 #if DEVELOPMENT || DEBUG
2643 if (*is_page_from_bg_q == TRUE) {
2644 if (m_object->internal) {
2645 vm_pageout_rejected_bq_internal++;
2646 } else {
2647 vm_pageout_rejected_bq_external++;
2648 }
2649 }
2650 #endif /* DEVELOPMENT || DEBUG */
2651 vm_pageout_state.vm_pageout_inactive_used++;
2652
2653 m = NULL;
2654 retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2655
2656 goto found_page;
2657 }
2658
2659 /*
2660 * steal 1 of the file backed pages even if
2661 * we are under the limit that has been set
2662 * for a healthy filecache
2663 */
2664 }
2665 }
2666 goto found_page;
2667 }
2668 }
2669 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2670 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2671
2672 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2673 *anons_grabbed += 1;
2674
2675 goto found_page;
2676 }
2677
2678 m = NULL;
2679
2680 found_page:
2681 *victim_page = m;
2682
2683 return retval;
2684 }
2685
2686 /*
2687 * This function is called only from vm_pageout_scan and
2688 * it will put a page back on the active/inactive queue
2689 * if we can't reclaim it for some reason.
2690 */
2691 static void
vps_requeue_page(vm_page_t m,int page_prev_q_state,__unused boolean_t page_from_bg_q)2692 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2693 {
2694 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2695 vm_page_enqueue_inactive(m, FALSE);
2696 } else {
2697 vm_page_activate(m);
2698 }
2699
2700 #if DEVELOPMENT || DEBUG
2701 vm_object_t m_object = VM_PAGE_OBJECT(m);
2702
2703 if (page_from_bg_q == TRUE) {
2704 if (m_object->internal) {
2705 vm_pageout_rejected_bq_internal++;
2706 } else {
2707 vm_pageout_rejected_bq_external++;
2708 }
2709 }
2710 #endif /* DEVELOPMENT || DEBUG */
2711 }
2712
2713 /*
2714 * This function is called only from vm_pageout_scan and
2715 * it will try to grab the victim page's VM object (m_object)
2716 * which differs from the previous victim page's object (object).
2717 */
2718 static int
vps_switch_object(vm_page_t m,vm_object_t m_object,vm_object_t * object,int page_prev_q_state,boolean_t avoid_anon_pages,boolean_t page_from_bg_q)2719 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2720 {
2721 struct vm_speculative_age_q *sq;
2722
2723 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2724
2725 /*
2726 * the object associated with candidate page is
2727 * different from the one we were just working
2728 * with... dump the lock if we still own it
2729 */
2730 if (*object != NULL) {
2731 vm_object_unlock(*object);
2732 *object = NULL;
2733 }
2734 /*
2735 * Try to lock object; since we've alread got the
2736 * page queues lock, we can only 'try' for this one.
2737 * if the 'try' fails, we need to do a mutex_pause
2738 * to allow the owner of the object lock a chance to
2739 * run... otherwise, we're likely to trip over this
2740 * object in the same state as we work our way through
2741 * the queue... clumps of pages associated with the same
2742 * object are fairly typical on the inactive and active queues
2743 */
2744 if (!vm_object_lock_try_scan(m_object)) {
2745 vm_page_t m_want = NULL;
2746
2747 vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2748
2749 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2750 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2751 }
2752
2753 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2754
2755 m->vmp_reference = FALSE;
2756
2757 if (!m_object->object_is_shared_cache) {
2758 /*
2759 * don't apply this optimization if this is the shared cache
2760 * object, it's too easy to get rid of very hot and important
2761 * pages...
2762 * m->vmp_object must be stable since we hold the page queues lock...
2763 * we can update the scan_collisions field sans the object lock
2764 * since it is a separate field and this is the only spot that does
2765 * a read-modify-write operation and it is never executed concurrently...
2766 * we can asynchronously set this field to 0 when creating a UPL, so it
2767 * is possible for the value to be a bit non-determistic, but that's ok
2768 * since it's only used as a hint
2769 */
2770 m_object->scan_collisions = 1;
2771 }
2772 if (page_from_bg_q) {
2773 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2774 } else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2775 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2776 } else if (!vm_page_queue_empty(&sq->age_q)) {
2777 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2778 } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2779 !vm_page_queue_empty(&vm_page_queue_inactive)) {
2780 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2781 } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2782 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2783 }
2784
2785 /*
2786 * this is the next object we're going to be interested in
2787 * try to make sure its available after the mutex_pause
2788 * returns control
2789 */
2790 if (m_want) {
2791 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2792 }
2793
2794 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2795
2796 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2797 } else {
2798 *object = m_object;
2799 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2800 }
2801
2802 return VM_PAGEOUT_SCAN_PROCEED;
2803 }
2804
2805 /*
2806 * This function is called only from vm_pageout_scan and
2807 * it notices that pageout scan may be rendered ineffective
2808 * due to a FS deadlock and will jetsam a process if possible.
2809 * If jetsam isn't supported, it'll move the page to the active
2810 * queue to try and get some different pages pushed onwards so
2811 * we can try to get out of this scenario.
2812 */
2813 static void
vps_deal_with_throttled_queues(vm_page_t m,vm_object_t * object,uint32_t * vm_pageout_inactive_external_forced_reactivate_limit,int * delayed_unlock,boolean_t * force_anonymous,__unused boolean_t is_page_from_bg_q)2814 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2815 int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2816 {
2817 struct vm_pageout_queue *eq;
2818 vm_object_t cur_object = VM_OBJECT_NULL;
2819
2820 cur_object = *object;
2821
2822 eq = &vm_pageout_queue_external;
2823
2824 if (cur_object->internal == FALSE) {
2825 /*
2826 * we need to break up the following potential deadlock case...
2827 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2828 * b) The thread doing the writing is waiting for pages while holding the truncate lock
2829 * c) Most of the pages in the inactive queue belong to this file.
2830 *
2831 * we are potentially in this deadlock because...
2832 * a) the external pageout queue is throttled
2833 * b) we're done with the active queue and moved on to the inactive queue
2834 * c) we've got a dirty external page
2835 *
2836 * since we don't know the reason for the external pageout queue being throttled we
2837 * must suspect that we are deadlocked, so move the current page onto the active queue
2838 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2839 *
2840 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2841 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2842 * pool the next time we select a victim page... if we can make enough new free pages,
2843 * the deadlock will break, the external pageout queue will empty and it will no longer
2844 * be throttled
2845 *
2846 * if we have jetsam configured, keep a count of the pages reactivated this way so
2847 * that we can try to find clean pages in the active/inactive queues before
2848 * deciding to jetsam a process
2849 */
2850 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2851
2852 vm_page_check_pageable_safe(m);
2853 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2854 vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2855 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2856 vm_page_active_count++;
2857 vm_page_pageable_external_count++;
2858
2859 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2860
2861 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2862
2863 #pragma unused(force_anonymous)
2864
2865 *vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2866
2867 if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2868 *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2869 /*
2870 * Possible deadlock scenario so request jetsam action
2871 */
2872
2873 assert(cur_object);
2874 vm_object_unlock(cur_object);
2875
2876 cur_object = VM_OBJECT_NULL;
2877
2878 /*
2879 * VM pageout scan needs to know we have dropped this lock and so set the
2880 * object variable we got passed in to NULL.
2881 */
2882 *object = VM_OBJECT_NULL;
2883
2884 vm_page_unlock_queues();
2885
2886 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
2887 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2888
2889 /* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
2890 if (memorystatus_kill_on_VM_page_shortage() == TRUE) {
2891 VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
2892 }
2893
2894 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
2895 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2896
2897 vm_page_lock_queues();
2898 *delayed_unlock = 1;
2899 }
2900 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2901
2902 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2903 #pragma unused(delayed_unlock)
2904
2905 *force_anonymous = TRUE;
2906 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2907 } else {
2908 vm_page_activate(m);
2909 counter_inc(&vm_statistics_reactivations);
2910
2911 #if DEVELOPMENT || DEBUG
2912 if (is_page_from_bg_q == TRUE) {
2913 if (cur_object->internal) {
2914 vm_pageout_rejected_bq_internal++;
2915 } else {
2916 vm_pageout_rejected_bq_external++;
2917 }
2918 }
2919 #endif /* DEVELOPMENT || DEBUG */
2920
2921 vm_pageout_state.vm_pageout_inactive_used++;
2922 }
2923 }
2924
2925
2926 void
vm_page_balance_inactive(int max_to_move)2927 vm_page_balance_inactive(int max_to_move)
2928 {
2929 vm_page_t m;
2930
2931 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2932
2933 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2934 /*
2935 * It is likely that the hibernation code path is
2936 * dealing with these very queues as we are about
2937 * to move pages around in/from them and completely
2938 * change the linkage of the pages.
2939 *
2940 * And so we skip the rebalancing of these queues.
2941 */
2942 return;
2943 }
2944 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2945 vm_page_inactive_count +
2946 vm_page_speculative_count);
2947
2948 while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2949 VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2950
2951 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2952
2953 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2954 assert(!m->vmp_laundry);
2955 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
2956 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2957
2958 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2959
2960 /*
2961 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2962 *
2963 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2964 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2965 * new reference happens. If no futher references happen on the page after that remote TLB flushes
2966 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2967 * by pageout_scan, which is just fine since the last reference would have happened quite far
2968 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2969 * have happened before we moved the page
2970 */
2971 if (m->vmp_pmapped == TRUE) {
2972 /*
2973 * We might be holding the page queue lock as a
2974 * spin lock and clearing the "referenced" bit could
2975 * take a while if there are lots of mappings of
2976 * that page, so make sure we acquire the lock as
2977 * as mutex to avoid a spinlock timeout.
2978 */
2979 vm_page_lockconvert_queues();
2980 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2981 }
2982
2983 /*
2984 * The page might be absent or busy,
2985 * but vm_page_deactivate can handle that.
2986 * FALSE indicates that we don't want a H/W clear reference
2987 */
2988 vm_page_deactivate_internal(m, FALSE);
2989 }
2990 }
2991
2992 /*
2993 * vm_pageout_scan does the dirty work for the pageout daemon.
2994 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2995 * held and vm_page_free_wanted == 0.
2996 */
2997 void
vm_pageout_scan(void)2998 vm_pageout_scan(void)
2999 {
3000 unsigned int loop_count = 0;
3001 unsigned int inactive_burst_count = 0;
3002 unsigned int reactivated_this_call;
3003 unsigned int reactivate_limit;
3004 vm_page_t local_freeq = NULL;
3005 int local_freed = 0;
3006 int delayed_unlock;
3007 int delayed_unlock_limit = 0;
3008 int refmod_state = 0;
3009 int vm_pageout_deadlock_target = 0;
3010 struct vm_pageout_queue *iq;
3011 struct vm_pageout_queue *eq;
3012 struct vm_speculative_age_q *sq;
3013 struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
3014 boolean_t inactive_throttled = FALSE;
3015 vm_object_t object = NULL;
3016 uint32_t inactive_reclaim_run;
3017 boolean_t grab_anonymous = FALSE;
3018 boolean_t force_anonymous = FALSE;
3019 boolean_t force_speculative_aging = FALSE;
3020 int anons_grabbed = 0;
3021 int page_prev_q_state = 0;
3022 boolean_t page_from_bg_q = FALSE;
3023 uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
3024 vm_object_t m_object = VM_OBJECT_NULL;
3025 int retval = 0;
3026 boolean_t lock_yield_check = FALSE;
3027
3028
3029 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
3030 vm_pageout_vminfo.vm_pageout_freed_speculative,
3031 vm_pageout_state.vm_pageout_inactive_clean,
3032 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3033 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3034
3035 flow_control.state = FCS_IDLE;
3036 iq = &vm_pageout_queue_internal;
3037 eq = &vm_pageout_queue_external;
3038 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3039
3040 /* Ask the pmap layer to return any pages it no longer needs. */
3041 pmap_release_pages_fast();
3042
3043 vm_page_lock_queues();
3044
3045 delayed_unlock = 1;
3046
3047 /*
3048 * Calculate the max number of referenced pages on the inactive
3049 * queue that we will reactivate.
3050 */
3051 reactivated_this_call = 0;
3052 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3053 vm_page_inactive_count);
3054 inactive_reclaim_run = 0;
3055
3056 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3057
3058 /*
3059 * We must limit the rate at which we send pages to the pagers
3060 * so that we don't tie up too many pages in the I/O queues.
3061 * We implement a throttling mechanism using the laundry count
3062 * to limit the number of pages outstanding to the default
3063 * and external pagers. We can bypass the throttles and look
3064 * for clean pages if the pageout queues don't drain in a timely
3065 * fashion since this may indicate that the pageout paths are
3066 * stalled waiting for memory, which only we can provide.
3067 */
3068
3069 vps_init_page_targets();
3070 assert(object == NULL);
3071 assert(delayed_unlock != 0);
3072
3073 for (;;) {
3074 vm_page_t m;
3075
3076 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3077
3078 if (lock_yield_check) {
3079 lock_yield_check = FALSE;
3080
3081 if (delayed_unlock++ > delayed_unlock_limit) {
3082 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3083 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3084 } else if (vm_pageout_scan_wants_object) {
3085 vm_page_unlock_queues();
3086 mutex_pause(0);
3087 vm_page_lock_queues();
3088 } else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3089 VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3090 }
3091 }
3092
3093 if (vm_upl_wait_for_pages < 0) {
3094 vm_upl_wait_for_pages = 0;
3095 }
3096
3097 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3098
3099 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3100 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3101 }
3102
3103 vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3104
3105 assert(delayed_unlock);
3106
3107 /*
3108 * maintain our balance
3109 */
3110 vm_page_balance_inactive(1);
3111
3112
3113 /**********************************************************************
3114 * above this point we're playing with the active and secluded queues
3115 * below this point we're playing with the throttling mechanisms
3116 * and the inactive queue
3117 **********************************************************************/
3118
3119 if (vm_page_free_count + local_freed >= vm_page_free_target) {
3120 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3121
3122 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3123 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3124 /*
3125 * make sure the pageout I/O threads are running
3126 * throttled in case there are still requests
3127 * in the laundry... since we have met our targets
3128 * we don't need the laundry to be cleaned in a timely
3129 * fashion... so let's avoid interfering with foreground
3130 * activity
3131 */
3132 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3133
3134 vm_free_page_lock();
3135
3136 if ((vm_page_free_count >= vm_page_free_target) &&
3137 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3138 /*
3139 * done - we have met our target *and*
3140 * there is no one waiting for a page.
3141 */
3142 return_from_scan:
3143 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3144
3145 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3146 vm_pageout_state.vm_pageout_inactive,
3147 vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3148 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
3149 vm_pageout_vminfo.vm_pageout_freed_speculative,
3150 vm_pageout_state.vm_pageout_inactive_clean,
3151 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3152 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3153
3154 return;
3155 }
3156 vm_free_page_unlock();
3157 }
3158
3159 /*
3160 * Before anything, we check if we have any ripe volatile
3161 * objects around. If so, try to purge the first object.
3162 * If the purge fails, fall through to reclaim a page instead.
		 * If the purge succeeds, go back to the top and reevaluate
3164 * the new memory situation.
3165 */
3166 retval = vps_purge_object();
3167
3168 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3169 /*
3170 * Success
3171 */
3172 if (object != NULL) {
3173 vm_object_unlock(object);
3174 object = NULL;
3175 }
3176
3177 lock_yield_check = FALSE;
3178 continue;
3179 }
3180
3181 /*
3182 * If our 'aged' queue is empty and we have some speculative pages
3183 * in the other queues, let's go through and see if we need to age
3184 * them.
3185 *
3186 * If we succeeded in aging a speculative Q or just that everything
3187 * looks normal w.r.t queue age and queue counts, we keep going onward.
3188 *
3189 * If, for some reason, we seem to have a mismatch between the spec.
3190 * page count and the page queues, we reset those variables and
3191 * restart the loop (LD TODO: Track this better?).
3192 */
3193 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3194 retval = vps_age_speculative_queue(force_speculative_aging);
3195
3196 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3197 lock_yield_check = FALSE;
3198 continue;
3199 }
3200 }
3201 force_speculative_aging = FALSE;
3202
3203 /*
3204 * Check to see if we need to evict objects from the cache.
3205 *
3206 * Note: 'object' here doesn't have anything to do with
3207 * the eviction part. We just need to make sure we have dropped
3208 * any object lock we might be holding if we need to go down
3209 * into the eviction logic.
3210 */
3211 retval = vps_object_cache_evict(&object);
3212
3213 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3214 lock_yield_check = FALSE;
3215 continue;
3216 }
3217
3218
3219 /*
3220 * Calculate our filecache_min that will affect the loop
3221 * going forward.
3222 */
3223 vps_calculate_filecache_min();
3224
3225 /*
3226 * LD TODO: Use a structure to hold all state variables for a single
3227 * vm_pageout_scan iteration and pass that structure to this function instead.
3228 */
3229 retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3230 &delayed_unlock, &local_freeq, &local_freed,
3231 &vm_pageout_deadlock_target, inactive_burst_count);
3232
3233 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3234 if (loop_count >= vm_page_inactive_count) {
3235 loop_count = 0;
3236 }
3237
3238 inactive_burst_count = 0;
3239
3240 assert(object == NULL);
3241 assert(delayed_unlock != 0);
3242
3243 lock_yield_check = FALSE;
3244 continue;
3245 } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3246 goto return_from_scan;
3247 }
3248
3249 flow_control.state = FCS_IDLE;
3250
3251 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3252 vm_pageout_inactive_external_forced_reactivate_limit);
3253 loop_count++;
3254 inactive_burst_count++;
3255 vm_pageout_state.vm_pageout_inactive++;
3256
3257 /*
3258 * Choose a victim.
3259 */
3260
3261 m = NULL;
3262 retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3263
3264 if (m == NULL) {
3265 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3266 inactive_burst_count = 0;
3267
3268 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3269 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3270 }
3271
3272 lock_yield_check = TRUE;
3273 continue;
3274 }
3275
3276 /*
3277 * if we've gotten here, we have no victim page.
3278 * check to see if we've not finished balancing the queues
3279 * or we have a page on the aged speculative queue that we
3280 * skipped due to force_anonymous == TRUE.. or we have
3281 * speculative pages that we can prematurely age... if
3282 * one of these cases we'll keep going, else panic
3283 */
3284 force_anonymous = FALSE;
3285 VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3286
3287 if (!vm_page_queue_empty(&sq->age_q)) {
3288 lock_yield_check = TRUE;
3289 continue;
3290 }
3291
3292 if (vm_page_speculative_count) {
3293 force_speculative_aging = TRUE;
3294 lock_yield_check = TRUE;
3295 continue;
3296 }
3297 panic("vm_pageout: no victim");
3298
3299 /* NOTREACHED */
3300 }
3301
3302 assert(VM_PAGE_PAGEABLE(m));
3303 m_object = VM_PAGE_OBJECT(m);
3304 force_anonymous = FALSE;
3305
3306 page_prev_q_state = m->vmp_q_state;
3307 /*
3308 * we just found this page on one of our queues...
3309 * it can't also be on the pageout queue, so safe
3310 * to call vm_page_queues_remove
3311 */
3312 bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3313 vm_page_queues_remove(m, TRUE);
3314 if (donate) {
3315 /*
3316 * The compressor needs to see this bit to know
3317 * where this page needs to land. Also if stolen,
3318 * this bit helps put the page back in the right
3319 * special queue where it belongs.
3320 */
3321 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3322 }
3323
3324 assert(!m->vmp_laundry);
3325 assert(!m->vmp_private);
3326 assert(!m->vmp_fictitious);
3327 assert(!is_kernel_object(m_object));
3328 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
3329
3330 vm_pageout_vminfo.vm_pageout_considered_page++;
3331
3332 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3333
3334 /*
3335 * check to see if we currently are working
3336 * with the same object... if so, we've
3337 * already got the lock
3338 */
3339 if (m_object != object) {
3340 boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3341
3342 /*
3343 * vps_switch_object() will always drop the 'object' lock first
3344 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3345 * either 'm_object' or NULL.
3346 */
3347 retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3348
3349 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3350 lock_yield_check = TRUE;
3351 continue;
3352 }
3353 }
3354 assert(m_object == object);
3355 assert(VM_PAGE_OBJECT(m) == m_object);
3356
3357 if (m->vmp_busy) {
3358 /*
3359 * Somebody is already playing with this page.
3360 * Put it back on the appropriate queue
3361 *
3362 */
3363 VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3364
3365 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3366 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3367 }
3368
3369 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3370
3371 lock_yield_check = TRUE;
3372 continue;
3373 }
3374
3375 /*
3376 * if (m->vmp_cleaning && !m->vmp_free_when_done)
3377 * If already cleaning this page in place
3378 * just leave if off the paging queues.
3379 * We can leave the page mapped, and upl_commit_range
3380 * will put it on the clean queue.
3381 *
3382 * if (m->vmp_free_when_done && !m->vmp_cleaning)
3383 * an msync INVALIDATE is in progress...
3384 * this page has been marked for destruction
3385 * after it has been cleaned,
3386 * but not yet gathered into a UPL
3387 * where 'cleaning' will be set...
3388 * just leave it off the paging queues
3389 *
		 * if (m->vmp_free_when_done && m->vmp_cleaning)
3391 * an msync INVALIDATE is in progress
3392 * and the UPL has already gathered this page...
3393 * just leave it off the paging queues
3394 */
3395 if (m->vmp_free_when_done || m->vmp_cleaning) {
3396 lock_yield_check = TRUE;
3397 continue;
3398 }
3399
3400
3401 /*
3402 * If it's absent, in error or the object is no longer alive,
3403 * we can reclaim the page... in the no longer alive case,
3404 * there are 2 states the page can be in that preclude us
3405 * from reclaiming it - busy or cleaning - that we've already
3406 * dealt with
3407 */
3408 if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3409 (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3410 if (m->vmp_absent) {
3411 VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3412 } else if (!object->alive ||
3413 (!object->internal &&
3414 object->pager == MEMORY_OBJECT_NULL)) {
3415 VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3416 } else {
3417 VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3418 }
3419 reclaim_page:
3420 if (vm_pageout_deadlock_target) {
3421 VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3422 vm_pageout_deadlock_target--;
3423 }
3424
3425 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3426
3427 if (object->internal) {
3428 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3429 } else {
3430 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3431 }
3432 assert(!m->vmp_cleaning);
3433 assert(!m->vmp_laundry);
3434
3435 if (!object->internal &&
3436 object->pager != NULL &&
3437 object->pager->mo_pager_ops == &shared_region_pager_ops) {
3438 shared_region_pager_reclaimed++;
3439 }
3440
3441 m->vmp_busy = TRUE;
3442
3443 /*
3444 * remove page from object here since we're already
3445 * behind the object lock... defer the rest of the work
3446 * we'd normally do in vm_page_free_prepare_object
3447 * until 'vm_page_free_list' is called
3448 */
3449 if (m->vmp_tabled) {
3450 vm_page_remove(m, TRUE);
3451 }
3452
3453 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3454 m->vmp_snext = local_freeq;
3455 local_freeq = m;
3456 local_freed++;
3457
3458 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3459 vm_pageout_vminfo.vm_pageout_freed_speculative++;
3460 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3461 vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3462 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3463 vm_pageout_vminfo.vm_pageout_freed_internal++;
3464 } else {
3465 vm_pageout_vminfo.vm_pageout_freed_external++;
3466 }
3467
3468 inactive_burst_count = 0;
3469
3470 lock_yield_check = TRUE;
3471 continue;
3472 }
3473 if (object->vo_copy == VM_OBJECT_NULL) {
3474 /*
3475 * No one else can have any interest in this page.
3476 * If this is an empty purgable object, the page can be
3477 * reclaimed even if dirty.
3478 * If the page belongs to a volatile purgable object, we
3479 * reactivate it if the compressor isn't active.
3480 */
3481 if (object->purgable == VM_PURGABLE_EMPTY) {
3482 if (m->vmp_pmapped == TRUE) {
3483 /* unmap the page */
3484 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3485 if (refmod_state & VM_MEM_MODIFIED) {
3486 SET_PAGE_DIRTY(m, FALSE);
3487 }
3488 }
3489 if (m->vmp_dirty || m->vmp_precious) {
3490 /* we saved the cost of cleaning this page ! */
3491 vm_page_purged_count++;
3492 }
3493 goto reclaim_page;
3494 }
3495
3496 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3497 /*
3498 * With the VM compressor, the cost of
3499 * reclaiming a page is much lower (no I/O),
3500 * so if we find a "volatile" page, it's better
3501 * to let it get compressed rather than letting
3502 * it occupy a full page until it gets purged.
3503 * So no need to check for "volatile" here.
3504 */
3505 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
3506 /*
3507 * Avoid cleaning a "volatile" page which might
3508 * be purged soon.
3509 */
3510
3511 /* if it's wired, we can't put it on our queue */
3512 assert(!VM_PAGE_WIRED(m));
3513
3514 /* just stick it back on! */
3515 reactivated_this_call++;
3516
3517 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3518 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3519 }
3520
3521 goto reactivate_page;
3522 }
3523 }
3524 /*
3525 * If it's being used, reactivate.
3526 * (Fictitious pages are either busy or absent.)
3527 * First, update the reference and dirty bits
3528 * to make sure the page is unreferenced.
3529 */
3530 refmod_state = -1;
3531
3532 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3533 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3534
3535 if (refmod_state & VM_MEM_REFERENCED) {
3536 m->vmp_reference = TRUE;
3537 }
3538 if (refmod_state & VM_MEM_MODIFIED) {
3539 SET_PAGE_DIRTY(m, FALSE);
3540 }
3541 }
3542
3543 if (m->vmp_reference || m->vmp_dirty) {
3544 /* deal with a rogue "reusable" page */
3545 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3546 }
3547
3548 if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3549 vm_pageout_state.vm_page_xpmapped_min = 0;
3550 } else {
3551 vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
3552 }
3553
3554 if (!m->vmp_no_cache &&
3555 page_from_bg_q == FALSE &&
3556 (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3557 (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3558 /*
3559 * The page we pulled off the inactive list has
3560 * been referenced. It is possible for other
3561 * processors to be touching pages faster than we
3562 * can clear the referenced bit and traverse the
3563 * inactive queue, so we limit the number of
3564 * reactivations.
3565 */
3566 if (++reactivated_this_call >= reactivate_limit &&
3567 !object->object_is_shared_cache &&
3568 !((m->vmp_realtime ||
3569 object->for_realtime) &&
3570 vm_pageout_protect_realtime)) {
3571 vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3572 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3573 vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3574 if (object->object_is_shared_cache) {
3575 vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3576 } else if (m->vmp_realtime ||
3577 object->for_realtime) {
3578 vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3579 }
3580 } else {
3581 uint32_t isinuse;
3582
3583 if (reactivated_this_call >= reactivate_limit) {
3584 if (object->object_is_shared_cache) {
3585 vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3586 } else if ((m->vmp_realtime ||
3587 object->for_realtime) &&
3588 vm_pageout_protect_realtime) {
3589 vm_pageout_vminfo.vm_pageout_protected_realtime++;
3590 }
3591 }
3592 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3593 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3594 }
3595
3596 vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3597 reactivate_page:
3598 if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3599 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3600 /*
					 * no explicit mappings of this object exist
3602 * and it's not open via the filesystem
3603 */
3604 vm_page_deactivate(m);
3605 VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3606 } else {
3607 /*
3608 * The page was/is being used, so put back on active list.
3609 */
3610 vm_page_activate(m);
3611 counter_inc(&vm_statistics_reactivations);
3612 inactive_burst_count = 0;
3613 }
3614 #if DEVELOPMENT || DEBUG
3615 if (page_from_bg_q == TRUE) {
3616 if (m_object->internal) {
3617 vm_pageout_rejected_bq_internal++;
3618 } else {
3619 vm_pageout_rejected_bq_external++;
3620 }
3621 }
3622 #endif /* DEVELOPMENT || DEBUG */
3623
3624 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3625 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3626 }
3627 vm_pageout_state.vm_pageout_inactive_used++;
3628
3629 lock_yield_check = TRUE;
3630 continue;
3631 }
3632 /*
3633 * Make sure we call pmap_get_refmod() if it
3634 * wasn't already called just above, to update
3635 * the dirty bit.
3636 */
3637 if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3638 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3639 if (refmod_state & VM_MEM_MODIFIED) {
3640 SET_PAGE_DIRTY(m, FALSE);
3641 }
3642 }
3643 }
3644
3645 /*
3646 * we've got a candidate page to steal...
3647 *
3648 * m->vmp_dirty is up to date courtesy of the
3649 * preceding check for m->vmp_reference... if
3650 * we get here, then m->vmp_reference had to be
3651 * FALSE (or possibly "reactivate_limit" was
3652 * exceeded), but in either case we called
3653 * pmap_get_refmod() and updated both
3654 * m->vmp_reference and m->vmp_dirty
3655 *
3656 * if it's dirty or precious we need to
		 * see if the target queue is throttled...
		 * if it is, we need to skip over it by moving it back
3659 * to the end of the inactive queue
3660 */
3661
3662 inactive_throttled = FALSE;
3663
3664 if (m->vmp_dirty || m->vmp_precious) {
3665 if (object->internal) {
3666 if (VM_PAGE_Q_THROTTLED(iq)) {
3667 inactive_throttled = TRUE;
3668 }
3669 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3670 inactive_throttled = TRUE;
3671 }
3672 }
3673 throttle_inactive:
3674 if (!VM_DYNAMIC_PAGING_ENABLED() &&
3675 object->internal && m->vmp_dirty &&
3676 (object->purgable == VM_PURGABLE_DENY ||
3677 object->purgable == VM_PURGABLE_NONVOLATILE ||
3678 object->purgable == VM_PURGABLE_VOLATILE)) {
3679 vm_page_check_pageable_safe(m);
3680 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3681 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3682 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3683 vm_page_throttled_count++;
3684
3685 VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3686
3687 inactive_burst_count = 0;
3688
3689 lock_yield_check = TRUE;
3690 continue;
3691 }
3692 if (inactive_throttled == TRUE) {
3693 vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3694 &delayed_unlock, &force_anonymous, page_from_bg_q);
3695
3696 inactive_burst_count = 0;
3697
3698 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3699 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3700 }
3701
3702 lock_yield_check = TRUE;
3703 continue;
3704 }
3705
3706 /*
3707 * we've got a page that we can steal...
3708 * eliminate all mappings and make sure
3709 * we have the up-to-date modified state
3710 *
3711 * if we need to do a pmap_disconnect then we
3712 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3713 * provides the true state atomically... the
3714 * page was still mapped up to the pmap_disconnect
3715 * and may have been dirtied at the last microsecond
3716 *
3717 * Note that if 'pmapped' is FALSE then the page is not
3718 * and has not been in any map, so there is no point calling
3719 * pmap_disconnect(). m->vmp_dirty could have been set in anticipation
3720 * of likely usage of the page.
3721 */
3722 if (m->vmp_pmapped == TRUE) {
3723 int pmap_options;
3724
3725 /*
3726 * Don't count this page as going into the compressor
3727 * if any of these are true:
3728 * 1) compressed pager isn't enabled
3729 * 2) Freezer enabled device with compressed pager
3730 * backend (exclusive use) i.e. most of the VM system
3731 * (including vm_pageout_scan) has no knowledge of
3732 * the compressor
3733 * 3) This page belongs to a file and hence will not be
3734 * sent into the compressor
3735 */
3736 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3737 object->internal == FALSE) {
3738 pmap_options = 0;
3739 } else if (m->vmp_dirty || m->vmp_precious) {
3740 /*
3741 * VM knows that this page is dirty (or
3742 * precious) and needs to be compressed
3743 * rather than freed.
3744 * Tell the pmap layer to count this page
3745 * as "compressed".
3746 */
3747 pmap_options = PMAP_OPTIONS_COMPRESSOR;
3748 } else {
3749 /*
3750 * VM does not know if the page needs to
3751 * be preserved but the pmap layer might tell
3752 * us if any mapping has "modified" it.
3753 * Let's the pmap layer to count this page
3754 * as compressed if and only if it has been
3755 * modified.
3756 */
3757 pmap_options =
3758 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3759 }
3760 refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3761 pmap_options,
3762 NULL);
3763 if (refmod_state & VM_MEM_MODIFIED) {
3764 SET_PAGE_DIRTY(m, FALSE);
3765 }
3766 }
3767
3768 /*
3769 * reset our count of pages that have been reclaimed
3770 * since the last page was 'stolen'
3771 */
3772 inactive_reclaim_run = 0;
3773
3774 /*
3775 * If it's clean and not precious, we can free the page.
3776 */
3777 if (!m->vmp_dirty && !m->vmp_precious) {
3778 vm_pageout_state.vm_pageout_inactive_clean++;
3779
3780 /*
3781 * OK, at this point we have found a page we are going to free.
3782 */
3783 #if CONFIG_PHANTOM_CACHE
3784 if (!object->internal) {
3785 vm_phantom_cache_add_ghost(m);
3786 }
3787 #endif
3788 goto reclaim_page;
3789 }
3790
3791 /*
3792 * The page may have been dirtied since the last check
3793 * for a throttled target queue (which may have been skipped
3794 * if the page was clean then). With the dirty page
3795 * disconnected here, we can make one final check.
3796 */
3797 if (object->internal) {
3798 if (VM_PAGE_Q_THROTTLED(iq)) {
3799 inactive_throttled = TRUE;
3800 }
3801 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3802 inactive_throttled = TRUE;
3803 }
3804
3805 if (inactive_throttled == TRUE) {
3806 goto throttle_inactive;
3807 }
3808
3809 #if VM_PRESSURE_EVENTS
3810 #if CONFIG_JETSAM
3811
3812 /*
3813 * If Jetsam is enabled, then the sending
3814 * of memory pressure notifications is handled
3815 * from the same thread that takes care of high-water
3816 * and other jetsams i.e. the memorystatus_thread.
3817 */
3818
3819 #else /* CONFIG_JETSAM */
3820
3821 vm_pressure_response();
3822
3823 #endif /* CONFIG_JETSAM */
3824 #endif /* VM_PRESSURE_EVENTS */
3825
3826 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3827 VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3828 }
3829
3830 if (object->internal) {
3831 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3832 } else {
3833 vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3834 }
3835
3836 /*
3837 * internal pages will go to the compressor...
3838 * external pages will go to the appropriate pager to be cleaned
3839 * and upon completion will end up on 'vm_page_queue_cleaned' which
3840 * is a preferred queue to steal from
3841 */
3842 vm_pageout_cluster(m);
3843 inactive_burst_count = 0;
3844
3845 /*
3846 * back to top of pageout scan loop
3847 */
3848 }
3849 }
3850
3851
3852 void
vm_page_free_reserve(int pages)3853 vm_page_free_reserve(
3854 int pages)
3855 {
3856 int free_after_reserve;
3857
3858 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3859 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3860 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3861 } else {
3862 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3863 }
3864 } else {
3865 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3866 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3867 } else {
3868 vm_page_free_reserved += pages;
3869 }
3870 }
3871 free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3872
3873 vm_page_free_min = vm_page_free_reserved +
3874 VM_PAGE_FREE_MIN(free_after_reserve);
3875
3876 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3877 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3878 }
3879
3880 vm_page_free_target = vm_page_free_reserved +
3881 VM_PAGE_FREE_TARGET(free_after_reserve);
3882
3883 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3884 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3885 }
3886
3887 if (vm_page_free_target < vm_page_free_min + 5) {
3888 vm_page_free_target = vm_page_free_min + 5;
3889 }
3890
3891 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3892 }
3893
3894 /*
3895 * vm_pageout is the high level pageout daemon.
3896 */
3897
3898 void
vm_pageout_continue(void)3899 vm_pageout_continue(void)
3900 {
3901 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3902 VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3903
3904 vm_free_page_lock();
3905 vm_pageout_running = TRUE;
3906 vm_free_page_unlock();
3907
3908 vm_pageout_scan();
3909 /*
3910 * we hold both the vm_page_queue_free_lock
3911 * and the vm_page_queues_lock at this point
3912 */
3913 assert(vm_page_free_wanted == 0);
3914 assert(vm_page_free_wanted_privileged == 0);
3915 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3916
3917 vm_pageout_running = FALSE;
3918 #if XNU_TARGET_OS_OSX
3919 if (vm_pageout_waiter) {
3920 vm_pageout_waiter = FALSE;
3921 thread_wakeup((event_t)&vm_pageout_waiter);
3922 }
3923 #endif /* XNU_TARGET_OS_OSX */
3924
3925 vm_free_page_unlock();
3926 vm_page_unlock_queues();
3927
3928 thread_block((thread_continue_t)vm_pageout_continue);
3929 /*NOTREACHED*/
3930 }
3931
#if XNU_TARGET_OS_OSX
/*
 *	vm_pageout_wait:
 *
 *	Sleep until vm_pageout_running is observed clear, or until a sleep
 *	on vm_pageout_waiter ends with anything other than THREAD_AWAKENED.
 *
 *	Returns KERN_SUCCESS in the first case and
 *	KERN_OPERATION_TIMED_OUT in the second.
 */
kern_return_t
vm_pageout_wait(uint64_t deadline)
{
	kern_return_t   kr = KERN_SUCCESS;

	vm_free_page_lock();

	while (vm_pageout_running && (kr == KERN_SUCCESS)) {
		wait_result_t   wres;

		/* ask vm_pageout_continue() to wake us when its scan completes */
		vm_pageout_waiter = TRUE;

		wres = lck_mtx_sleep_deadline(
			&vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
			(event_t) &vm_pageout_waiter, THREAD_UNINT, deadline);

		if (wres != THREAD_AWAKENED) {
			kr = KERN_OPERATION_TIMED_OUT;
		}
	}

	vm_free_page_unlock();

	return kr;
}
#endif /* XNU_TARGET_OS_OSX */
3952
3953 OS_NORETURN
3954 static void
vm_pageout_iothread_external_continue(struct pgo_iothread_state * ethr,__unused wait_result_t w)3955 vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
3956 {
3957 vm_page_t m = NULL;
3958 vm_object_t object;
3959 vm_object_offset_t offset;
3960 memory_object_t pager;
3961 struct vm_pageout_queue *q = ethr->q;
3962
3963 /* On systems with a compressor, the external IO thread clears its
3964 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3965 * creation)
3966 */
3967 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3968 current_thread()->options &= ~TH_OPT_VMPRIV;
3969 }
3970
3971 sched_cond_ack(&(ethr->pgo_wakeup));
3972
3973 while (true) {
3974 vm_page_lockspin_queues();
3975
3976 while (!vm_page_queue_empty(&q->pgo_pending)) {
3977 q->pgo_busy = TRUE;
3978 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3979
3980 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3981 VM_PAGE_CHECK(m);
3982 /*
3983 * grab a snapshot of the object and offset this
3984 * page is tabled in so that we can relookup this
3985 * page after we've taken the object lock - these
3986 * fields are stable while we hold the page queues lock
3987 * but as soon as we drop it, there is nothing to keep
3988 * this page in this object... we hold an activity_in_progress
3989 * on this object which will keep it from terminating
3990 */
3991 object = VM_PAGE_OBJECT(m);
3992 offset = m->vmp_offset;
3993
3994 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3995 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3996
3997 vm_page_unlock_queues();
3998
3999 vm_object_lock(object);
4000
4001 m = vm_page_lookup(object, offset);
4002
4003 if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
4004 !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
4005 /*
4006 * it's either the same page that someone else has
4007 * started cleaning (or it's finished cleaning or
4008 * been put back on the pageout queue), or
4009 * the page has been freed or we have found a
4010 * new page at this offset... in all of these cases
4011 * we merely need to release the activity_in_progress
4012 * we took when we put the page on the pageout queue
4013 */
4014 vm_object_activity_end(object);
4015 vm_object_unlock(object);
4016
4017 vm_page_lockspin_queues();
4018 continue;
4019 }
4020 pager = object->pager;
4021
4022 if (pager == MEMORY_OBJECT_NULL) {
4023 /*
4024 * This pager has been destroyed by either
4025 * memory_object_destroy or vm_object_destroy, and
4026 * so there is nowhere for the page to go.
4027 */
4028 if (m->vmp_free_when_done) {
4029 /*
4030 * Just free the page... VM_PAGE_FREE takes
4031 * care of cleaning up all the state...
4032 * including doing the vm_pageout_throttle_up
4033 */
4034 VM_PAGE_FREE(m);
4035 } else {
4036 vm_page_lockspin_queues();
4037
4038 vm_pageout_throttle_up(m);
4039 vm_page_activate(m);
4040
4041 vm_page_unlock_queues();
4042
4043 /*
4044 * And we are done with it.
4045 */
4046 }
4047 vm_object_activity_end(object);
4048 vm_object_unlock(object);
4049
4050 vm_page_lockspin_queues();
4051 continue;
4052 }
4053 #if 0
4054 /*
4055 * we don't hold the page queue lock
4056 * so this check isn't safe to make
4057 */
4058 VM_PAGE_CHECK(m);
4059 #endif
4060 /*
4061 * give back the activity_in_progress reference we
4062 * took when we queued up this page and replace it
4063 * it with a paging_in_progress reference that will
4064 * also hold the paging offset from changing and
4065 * prevent the object from terminating
4066 */
4067 vm_object_activity_end(object);
4068 vm_object_paging_begin(object);
4069 vm_object_unlock(object);
4070
4071 /*
4072 * Send the data to the pager.
4073 * any pageout clustering happens there
4074 */
4075 memory_object_data_return(pager,
4076 m->vmp_offset + object->paging_offset,
4077 PAGE_SIZE,
4078 NULL,
4079 NULL,
4080 FALSE,
4081 FALSE,
4082 0);
4083
4084 vm_object_lock(object);
4085 vm_object_paging_end(object);
4086 vm_object_unlock(object);
4087
4088 vm_pageout_io_throttle();
4089
4090 vm_page_lockspin_queues();
4091 }
4092 q->pgo_busy = FALSE;
4093
4094 vm_page_unlock_queues();
4095 sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
4096 }
4097 /*NOTREACHED*/
4098 }
4099
4100
/*
 * Number of successfully compressed pages a compressor thread collects on
 * its local free list before releasing them with vm_page_free_list().
 */
#define MAX_FREE_BATCH 32
uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
                                     * this thread.  When non-zero (DEVELOPMENT
                                     * or DEBUG kernels), each compressor thread
                                     * updates its vmct_stats entries per pass.
                                     */
4105
4106
4107 OS_NORETURN
4108 static void
vm_pageout_iothread_internal_continue(struct pgo_iothread_state * cq,__unused wait_result_t w)4109 vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
4110 {
4111 struct vm_pageout_queue *q;
4112 vm_page_t m = NULL;
4113 boolean_t pgo_draining;
4114 vm_page_t local_q;
4115 int local_cnt;
4116 vm_page_t local_freeq = NULL;
4117 int local_freed = 0;
4118 int local_batch_size;
4119 #if DEVELOPMENT || DEBUG
4120 int ncomps = 0;
4121 boolean_t marked_active = FALSE;
4122 int num_pages_processed = 0;
4123 #endif
4124 void *chead = NULL;
4125
4126 KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
4127
4128 sched_cond_ack(&(cq->pgo_wakeup));
4129
4130 q = cq->q;
4131
4132 while (true) {
4133 #if DEVELOPMENT || DEBUG
4134 bool benchmark_accounting = false;
4135 /*
4136 * If we're running the compressor perf test, only process the benchmark pages.
4137 * We'll get back to our regular queue once the benchmark is done
4138 */
4139 if (compressor_running_perf_test) {
4140 q = cq->benchmark_q;
4141 if (!vm_page_queue_empty(&q->pgo_pending)) {
4142 benchmark_accounting = true;
4143 } else {
4144 q = cq->q;
4145 benchmark_accounting = false;
4146 }
4147 }
4148 #endif /* DEVELOPMENT || DEBUG */
4149
4150 #if __AMP__
4151 if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
4152 local_batch_size = (q->pgo_maxlaundry >> 3);
4153 local_batch_size = MAX(local_batch_size, 16);
4154 } else {
4155 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4156 }
4157 #else
4158 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4159 #endif
4160
4161 #if RECORD_THE_COMPRESSED_DATA
4162 if (q->pgo_laundry) {
4163 c_compressed_record_init();
4164 }
4165 #endif
4166 while (true) {
4167 int pages_left_on_q = 0;
4168
4169 local_cnt = 0;
4170 local_q = NULL;
4171
4172 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
4173
4174 vm_page_lock_queues();
4175 #if DEVELOPMENT || DEBUG
4176 if (marked_active == FALSE) {
4177 vmct_active++;
4178 vmct_state[cq->id] = VMCT_ACTIVE;
4179 marked_active = TRUE;
4180 if (vmct_active == 1) {
4181 vm_compressor_epoch_start = mach_absolute_time();
4182 }
4183 }
4184 #endif
4185 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4186
4187 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
4188
4189 while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4190 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4191 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4192 VM_PAGE_CHECK(m);
4193
4194 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4195 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4196 m->vmp_laundry = FALSE;
4197
4198 m->vmp_snext = local_q;
4199 local_q = m;
4200 local_cnt++;
4201 }
4202 if (local_q == NULL) {
4203 break;
4204 }
4205
4206 q->pgo_busy = TRUE;
4207
4208 if ((pgo_draining = q->pgo_draining) == FALSE) {
4209 vm_pageout_throttle_up_batch(q, local_cnt);
4210 pages_left_on_q = q->pgo_laundry;
4211 } else {
4212 pages_left_on_q = q->pgo_laundry - local_cnt;
4213 }
4214
4215 vm_page_unlock_queues();
4216
4217 #if !RECORD_THE_COMPRESSED_DATA
4218 if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4219 // wake up the next compressor thread
4220 sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
4221 pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4222 }
4223 #endif
4224 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
4225
4226 while (local_q) {
4227 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
4228
4229 m = local_q;
4230 local_q = m->vmp_snext;
4231 m->vmp_snext = NULL;
4232
4233 /*
4234 * Technically we need the pageq locks to manipulate this field.
4235 * However, this page has been removed from all queues and is only
4236 * known to this compressor thread dealing with this local queue.
4237 *
4238 * TODO LIONEL: Add a second localq that is the early localq and
4239 * put special pages like this one on that queue in the block above
4240 * under the pageq lock to avoid this 'works but not clean' logic.
4241 */
4242 void *donate_queue_head;
4243 #if XNU_TARGET_OS_OSX
4244 donate_queue_head = &cq->current_early_swapout_chead;
4245 #else /* XNU_TARGET_OS_OSX */
4246 donate_queue_head = &cq->current_late_swapout_chead;
4247 #endif /* XNU_TARGET_OS_OSX */
4248 if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4249 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4250 chead = donate_queue_head;
4251 } else {
4252 chead = &cq->current_regular_swapout_chead;
4253 }
4254
4255 if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4256 #if DEVELOPMENT || DEBUG
4257 ncomps++;
4258 #endif
4259 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);
4260
4261 m->vmp_snext = local_freeq;
4262 local_freeq = m;
4263 local_freed++;
4264
4265 if (local_freed >= MAX_FREE_BATCH) {
4266 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4267
4268 vm_page_free_list(local_freeq, TRUE);
4269
4270 local_freeq = NULL;
4271 local_freed = 0;
4272 }
4273 }
4274 #if DEVELOPMENT || DEBUG
4275 num_pages_processed++;
4276 #endif /* DEVELOPMENT || DEBUG */
4277 #if !CONFIG_JETSAM
4278 while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4279 kern_return_t wait_result;
4280 int need_wakeup = 0;
4281
4282 if (local_freeq) {
4283 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4284
4285 vm_page_free_list(local_freeq, TRUE);
4286 local_freeq = NULL;
4287 local_freed = 0;
4288
4289 continue;
4290 }
4291 vm_free_page_lock_spin();
4292
4293 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4294 if (vm_page_free_wanted_privileged++ == 0) {
4295 need_wakeup = 1;
4296 }
4297 wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4298
4299 vm_free_page_unlock();
4300
4301 if (need_wakeup) {
4302 thread_wakeup((event_t)&vm_page_free_wanted);
4303 }
4304
4305 if (wait_result == THREAD_WAITING) {
4306 thread_block(THREAD_CONTINUE_NULL);
4307 }
4308 } else {
4309 vm_free_page_unlock();
4310 }
4311 }
4312 #endif
4313 }
4314 if (local_freeq) {
4315 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4316
4317 vm_page_free_list(local_freeq, TRUE);
4318 local_freeq = NULL;
4319 local_freed = 0;
4320 }
4321 if (pgo_draining == TRUE) {
4322 vm_page_lockspin_queues();
4323 vm_pageout_throttle_up_batch(q, local_cnt);
4324 vm_page_unlock_queues();
4325 }
4326 }
4327 KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
4328
4329 /*
4330 * queue lock is held and our q is empty
4331 */
4332 q->pgo_busy = FALSE;
4333 #if DEVELOPMENT || DEBUG
4334 if (marked_active == TRUE) {
4335 vmct_active--;
4336 vmct_state[cq->id] = VMCT_IDLE;
4337
4338 if (vmct_active == 0) {
4339 vm_compressor_epoch_stop = mach_absolute_time();
4340 assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4341 "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4342 vm_compressor_epoch_start, vm_compressor_epoch_stop);
4343 /* This interval includes intervals where one or more
4344 * compressor threads were pre-empted
4345 */
4346 vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4347 }
4348 }
4349 if (compressor_running_perf_test && benchmark_accounting) {
4350 /*
4351 * We could turn ON compressor_running_perf_test while still processing
4352 * regular non-benchmark pages. We shouldn't count them here else we
4353 * could overshoot. We might also still be populating that benchmark Q
4354 * and be under pressure. So we will go back to the regular queues. And
4355 * benchmark accounting will be off for that case too.
4356 */
4357 compressor_perf_test_pages_processed += num_pages_processed;
4358 thread_wakeup(&compressor_perf_test_pages_processed);
4359 }
4360 #endif
4361 vm_page_unlock_queues();
4362 #if DEVELOPMENT || DEBUG
4363 if (__improbable(vm_compressor_time_thread)) {
4364 vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
4365 vmct_stats.vmct_pages[cq->id] += ncomps;
4366 vmct_stats.vmct_iterations[cq->id]++;
4367 if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
4368 vmct_stats.vmct_maxpages[cq->id] = ncomps;
4369 }
4370 if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
4371 vmct_stats.vmct_minpages[cq->id] = ncomps;
4372 }
4373 }
4374 #endif
4375
4376 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4377 #if DEVELOPMENT || DEBUG
4378 if (compressor_running_perf_test && benchmark_accounting) {
4379 /*
4380 * We've been exclusively compressing pages from the benchmark queue,
4381 * do 1 pass over the internal queue before blocking.
4382 */
4383 continue;
4384 }
4385 #endif
4386
4387 sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4388 }
4389 /*NOTREACHED*/
4390 }
4391
4392
4393 kern_return_t
vm_pageout_compress_page(void ** current_chead,char * scratch_buf,vm_page_t m)4394 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4395 {
4396 vm_object_t object;
4397 memory_object_t pager;
4398 int compressed_count_delta;
4399 kern_return_t retval;
4400
4401 object = VM_PAGE_OBJECT(m);
4402
4403 assert(!m->vmp_free_when_done);
4404 assert(!m->vmp_laundry);
4405
4406 pager = object->pager;
4407
4408 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4409 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
4410
4411 vm_object_lock(object);
4412
4413 /*
4414 * If there is no memory object for the page, create
4415 * one and hand it to the compression pager.
4416 */
4417
4418 if (!object->pager_initialized) {
4419 vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4420 }
4421 if (!object->pager_initialized) {
4422 vm_object_compressor_pager_create(object);
4423 }
4424
4425 pager = object->pager;
4426
4427 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4428 /*
4429 * Still no pager for the object,
4430 * or the pager has been destroyed.
4431 * Reactivate the page.
4432 *
4433 * Should only happen if there is no
4434 * compression pager
4435 */
4436 PAGE_WAKEUP_DONE(m);
4437
4438 vm_page_lockspin_queues();
4439 vm_page_activate(m);
4440 VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4441 vm_page_unlock_queues();
4442
4443 /*
4444 * And we are done with it.
4445 */
4446 vm_object_activity_end(object);
4447 vm_object_unlock(object);
4448
4449 return KERN_FAILURE;
4450 }
4451 vm_object_unlock(object);
4452
4453 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
4454 }
4455 assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4456 assert(object->activity_in_progress > 0);
4457
4458 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4459 if (m->vmp_unmodified_ro == true) {
4460 os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed);
4461 }
4462 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4463
4464 retval = vm_compressor_pager_put(
4465 pager,
4466 m->vmp_offset + object->paging_offset,
4467 VM_PAGE_GET_PHYS_PAGE(m),
4468 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4469 m->vmp_unmodified_ro,
4470 #else /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4471 false,
4472 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4473 current_chead,
4474 scratch_buf,
4475 &compressed_count_delta);
4476
4477 vm_object_lock(object);
4478
4479 assert(object->activity_in_progress > 0);
4480 assert(VM_PAGE_OBJECT(m) == object);
4481 assert( !VM_PAGE_WIRED(m));
4482
4483 vm_compressor_pager_count(pager,
4484 compressed_count_delta,
4485 FALSE, /* shared_lock */
4486 object);
4487
4488 if (retval == KERN_SUCCESS) {
4489 /*
4490 * If the object is purgeable, its owner's
4491 * purgeable ledgers will be updated in
4492 * vm_page_remove() but the page still
4493 * contributes to the owner's memory footprint,
4494 * so account for it as such.
4495 */
4496 if ((object->purgable != VM_PURGABLE_DENY ||
4497 object->vo_ledger_tag) &&
4498 object->vo_owner != NULL) {
4499 /* one more compressed purgeable/tagged page */
4500 vm_object_owner_compressed_update(object,
4501 compressed_count_delta);
4502 }
4503 counter_inc(&vm_statistics_compressions);
4504
4505 if (m->vmp_tabled) {
4506 vm_page_remove(m, TRUE);
4507 }
4508 } else {
4509 PAGE_WAKEUP_DONE(m);
4510
4511 vm_page_lockspin_queues();
4512
4513 vm_page_activate(m);
4514 vm_pageout_vminfo.vm_compressor_failed++;
4515
4516 vm_page_unlock_queues();
4517 }
4518 vm_object_activity_end(object);
4519 vm_object_unlock(object);
4520
4521 return retval;
4522 }
4523
4524
4525 static void
vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state * ethr,boolean_t req_lowpriority)4526 vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4527 {
4528 uint32_t policy;
4529
4530 if (hibernate_cleaning_in_progress == TRUE) {
4531 req_lowpriority = FALSE;
4532 }
4533
4534 if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4535 vm_page_unlock_queues();
4536
4537 if (req_lowpriority == TRUE) {
4538 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4539 DTRACE_VM(laundrythrottle);
4540 } else {
4541 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4542 DTRACE_VM(laundryunthrottle);
4543 }
4544 proc_set_thread_policy(ethr->pgo_iothread,
4545 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4546
4547 vm_page_lock_queues();
4548 ethr->q->pgo_lowpriority = req_lowpriority;
4549 }
4550 }
4551
4552 OS_NORETURN
4553 static void
vm_pageout_iothread_external(struct pgo_iothread_state * ethr,__unused wait_result_t w)4554 vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
4555 {
4556 thread_t self = current_thread();
4557
4558 self->options |= TH_OPT_VMPRIV;
4559
4560 DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4561
4562 proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4563 TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4564
4565 vm_page_lock_queues();
4566
4567 vm_pageout_queue_external.pgo_lowpriority = TRUE;
4568 vm_pageout_queue_external.pgo_inited = TRUE;
4569
4570 vm_page_unlock_queues();
4571
4572 #if CONFIG_THREAD_GROUPS
4573 thread_group_vm_add();
4574 #endif /* CONFIG_THREAD_GROUPS */
4575
4576 vm_pageout_iothread_external_continue(ethr, 0);
4577 /*NOTREACHED*/
4578 }
4579
4580
4581 OS_NORETURN
4582 static void
vm_pageout_iothread_internal(struct pgo_iothread_state * cthr,__unused wait_result_t w)4583 vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
4584 {
4585 thread_t self = current_thread();
4586
4587 self->options |= TH_OPT_VMPRIV;
4588
4589 vm_page_lock_queues();
4590
4591 vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4592 vm_pageout_queue_internal.pgo_inited = TRUE;
4593
4594 #if DEVELOPMENT || DEBUG
4595 vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
4596 vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
4597 vm_pageout_queue_benchmark.pgo_busy = FALSE;
4598 #endif /* DEVELOPMENT || DEBUG */
4599
4600 vm_page_unlock_queues();
4601
4602 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4603 thread_vm_bind_group_add();
4604 }
4605
4606 #if CONFIG_THREAD_GROUPS
4607 thread_group_vm_add();
4608 #endif /* CONFIG_THREAD_GROUPS */
4609
4610 #if __AMP__
4611 if (vm_compressor_ebound) {
4612 /*
4613 * Use the soft bound option for vm_compressor to allow it to run on
4614 * P-cores if E-cluster is unavailable.
4615 */
4616 thread_bind_cluster_type(self, 'E', true);
4617 }
4618 #endif /* __AMP__ */
4619
4620 thread_set_thread_name(current_thread(), "VM_compressor");
4621 #if DEVELOPMENT || DEBUG
4622 vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
4623 #endif
4624 vm_pageout_iothread_internal_continue(cthr, 0);
4625
4626 /*NOTREACHED*/
4627 }
4628
4629 kern_return_t
vm_set_buffer_cleanup_callout(boolean_t (* func)(int))4630 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4631 {
4632 if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4633 return KERN_SUCCESS;
4634 } else {
4635 return KERN_FAILURE; /* Already set */
4636 }
4637 }
4638
4639 extern boolean_t memorystatus_manual_testing_on;
4640 extern unsigned int memorystatus_level;
4641
4642
4643 #if VM_PRESSURE_EVENTS
4644
4645 boolean_t vm_pressure_events_enabled = FALSE;
4646
4647 extern uint64_t next_warning_notification_sent_at_ts;
4648 extern uint64_t next_critical_notification_sent_at_ts;
4649
4650 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */
4651
4652 /*
4653 * The last time there was change in pressure level OR we forced a check
4654 * because the system is stuck in a non-normal pressure level.
4655 */
4656 uint64_t vm_pressure_last_level_transition_abs = 0;
4657
4658 /*
 * This is how long (in minutes) the system waits 'stuck' in an unchanged
 * non-normal pressure level before resending notifications for that level.
4661 */
4662 int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4663
4664 void
vm_pressure_response(void)4665 vm_pressure_response(void)
4666 {
4667 vm_pressure_level_t old_level = kVMPressureNormal;
4668 int new_level = -1;
4669 unsigned int total_pages;
4670 uint64_t available_memory = 0;
4671 uint64_t curr_ts, abs_time_since_level_transition, time_in_ns;
4672 bool force_check = false;
4673 int time_in_mins;
4674
4675
4676 if (vm_pressure_events_enabled == FALSE) {
4677 return;
4678 }
4679
4680 #if !XNU_TARGET_OS_OSX
4681
4682 available_memory = (uint64_t) memorystatus_available_pages;
4683
4684 #else /* !XNU_TARGET_OS_OSX */
4685
4686 available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4687 memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4688
4689 #endif /* !XNU_TARGET_OS_OSX */
4690
4691 total_pages = (unsigned int) atop_64(max_mem);
4692 #if CONFIG_SECLUDED_MEMORY
4693 total_pages -= vm_page_secluded_count;
4694 #endif /* CONFIG_SECLUDED_MEMORY */
4695 memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4696
4697 if (memorystatus_manual_testing_on) {
4698 return;
4699 }
4700
4701 curr_ts = mach_absolute_time();
4702 abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4703
4704 absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4705 time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4706 force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
4707
4708 old_level = memorystatus_vm_pressure_level;
4709
4710 switch (memorystatus_vm_pressure_level) {
4711 case kVMPressureNormal:
4712 {
4713 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4714 new_level = kVMPressureCritical;
4715 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4716 new_level = kVMPressureWarning;
4717 }
4718 break;
4719 }
4720
4721 case kVMPressureWarning:
4722 case kVMPressureUrgent:
4723 {
4724 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4725 new_level = kVMPressureNormal;
4726 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4727 new_level = kVMPressureCritical;
4728 } else if (force_check) {
4729 new_level = kVMPressureWarning;
4730 next_warning_notification_sent_at_ts = curr_ts;
4731 }
4732 break;
4733 }
4734
4735 case kVMPressureCritical:
4736 {
4737 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4738 new_level = kVMPressureNormal;
4739 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4740 new_level = kVMPressureWarning;
4741 } else if (force_check) {
4742 new_level = kVMPressureCritical;
4743 next_critical_notification_sent_at_ts = curr_ts;
4744 }
4745 break;
4746 }
4747
4748 default:
4749 return;
4750 }
4751
4752 if (new_level != -1 || force_check) {
4753 if (new_level != -1) {
4754 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4755
4756 if (new_level != (int) old_level) {
4757 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4758 new_level, old_level, 0, 0);
4759 }
4760 } else {
4761 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4762 new_level, old_level, force_check, 0);
4763 }
4764
4765 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
4766 /*
4767 * We don't want to schedule a wakeup while hibernation is in progress
4768 * because that could collide with checks for non-monotonicity in the scheduler.
4769 * We do however do all the updates to memorystatus_vm_pressure_level because
4770 * we _might_ want to use that for decisions regarding which pages or how
4771 * many pages we want to dump in hibernation.
4772 */
4773 return;
4774 }
4775
4776 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4777 if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4778 thread_wakeup(&vm_pressure_thread);
4779 }
4780
4781 if (old_level != memorystatus_vm_pressure_level) {
4782 thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4783 }
4784 vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4785 }
4786 }
4787 }
4788 #endif /* VM_PRESSURE_EVENTS */
4789
4790
4791 /**
4792 * Called by a kernel thread to ask if a number of pages may be wired.
4793 */
4794 kern_return_t
mach_vm_wire_level_monitor(int64_t requested_pages)4795 mach_vm_wire_level_monitor(int64_t requested_pages)
4796 {
4797 if (requested_pages <= 0) {
4798 return KERN_INVALID_ARGUMENT;
4799 }
4800
4801 const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4802 /**
4803 * Available pages can be negative in the case where more system memory is
4804 * wired than the threshold, so we must use a signed integer.
4805 */
4806 const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4807
4808 if (requested_pages > available_pages) {
4809 return KERN_RESOURCE_SHORTAGE;
4810 }
4811 return KERN_SUCCESS;
4812 }
4813
4814 /*
4815 * Function called by a kernel thread to either get the current pressure level or
4816 * wait until memory pressure changes from a given level.
4817 */
4818 kern_return_t
mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure,__unused unsigned int * pressure_level)4819 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
4820 {
4821 #if !VM_PRESSURE_EVENTS
4822
4823 return KERN_FAILURE;
4824
4825 #else /* VM_PRESSURE_EVENTS */
4826
4827 wait_result_t wr = 0;
4828 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4829
4830 if (pressure_level == NULL) {
4831 return KERN_INVALID_ARGUMENT;
4832 }
4833
4834 if (*pressure_level == kVMPressureJetsam) {
4835 if (!wait_for_pressure) {
4836 return KERN_INVALID_ARGUMENT;
4837 }
4838
4839 lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
4840 wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
4841 THREAD_INTERRUPTIBLE);
4842 if (wr == THREAD_WAITING) {
4843 ++memorystatus_jetsam_fg_band_waiters;
4844 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4845 wr = thread_block(THREAD_CONTINUE_NULL);
4846 } else {
4847 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4848 }
4849 if (wr != THREAD_AWAKENED) {
4850 return KERN_ABORTED;
4851 }
4852 *pressure_level = kVMPressureJetsam;
4853 return KERN_SUCCESS;
4854 }
4855
4856 if (wait_for_pressure == TRUE) {
4857 while (old_level == *pressure_level) {
4858 wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4859 THREAD_INTERRUPTIBLE);
4860 if (wr == THREAD_WAITING) {
4861 wr = thread_block(THREAD_CONTINUE_NULL);
4862 }
4863 if (wr == THREAD_INTERRUPTED) {
4864 return KERN_ABORTED;
4865 }
4866
4867 if (wr == THREAD_AWAKENED) {
4868 old_level = memorystatus_vm_pressure_level;
4869 }
4870 }
4871 }
4872
4873 *pressure_level = old_level;
4874 return KERN_SUCCESS;
4875 #endif /* VM_PRESSURE_EVENTS */
4876 }
4877
4878 #if VM_PRESSURE_EVENTS
4879 void
vm_pressure_thread(void)4880 vm_pressure_thread(void)
4881 {
4882 static boolean_t thread_initialized = FALSE;
4883
4884 if (thread_initialized == TRUE) {
4885 vm_pageout_state.vm_pressure_thread_running = TRUE;
4886 consider_vm_pressure_events();
4887 vm_pageout_state.vm_pressure_thread_running = FALSE;
4888 }
4889
4890 #if CONFIG_THREAD_GROUPS
4891 thread_group_vm_add();
4892 #endif /* CONFIG_THREAD_GROUPS */
4893
4894 thread_set_thread_name(current_thread(), "VM_pressure");
4895 thread_initialized = TRUE;
4896 assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4897 thread_block((thread_continue_t)vm_pressure_thread);
4898 }
4899 #endif /* VM_PRESSURE_EVENTS */
4900
4901
4902 /*
4903 * called once per-second via "compute_averages"
4904 */
4905 void
compute_pageout_gc_throttle(__unused void * arg)4906 compute_pageout_gc_throttle(__unused void *arg)
4907 {
4908 if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4909 vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4910
4911 thread_wakeup(VM_PAGEOUT_GC_EVENT);
4912 }
4913 }
4914
4915 /*
4916 * vm_pageout_garbage_collect can also be called when the zone allocator needs
4917 * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4918 * jetsams. We need to check if the zone map size is above its jetsam limit to
4919 * decide if this was indeed the case.
4920 *
4921 * We need to do this on a different thread because of the following reasons:
4922 *
4923 * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4924 * itself causing the system to hang. We perform synchronous jetsams if we're
4925 * leaking in the VM map entries zone, so the leaking process could be doing a
4926 * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4927 * jetsam itself. We also need the vm_map lock on the process termination path,
4928 * which would now lead the dying process to deadlock against itself.
4929 *
4930 * 2. The jetsam path might need to allocate zone memory itself. We could try
4931 * using the non-blocking variant of zalloc for this path, but we can still
4932 * end up trying to do a kmem_alloc when the zone maps are almost full.
4933 */
4934 __dead2
4935 void
vm_pageout_garbage_collect(void * step,wait_result_t wr __unused)4936 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4937 {
4938 assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4939
4940 if (step == VM_PAGEOUT_GC_INIT) {
4941 /* first time being called is not about GC */
4942 #if CONFIG_THREAD_GROUPS
4943 thread_group_vm_add();
4944 #endif /* CONFIG_THREAD_GROUPS */
4945 } else if (zone_map_nearing_exhaustion()) {
4946 /*
4947 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4948 *
4949 * Bail out after calling zone_gc (which triggers the
4950 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4951 * operations that clear out a bunch of caches might allocate zone
4952 * memory themselves (for eg. vm_map operations would need VM map
4953 * entries). Since the zone map is almost full at this point, we
4954 * could end up with a panic. We just need to quickly jetsam a
4955 * process and exit here.
4956 *
4957 * It could so happen that we were woken up to relieve memory
4958 * pressure and the zone map also happened to be near its limit at
4959 * the time, in which case we'll skip out early. But that should be
4960 * ok; if memory pressure persists, the thread will simply be woken
4961 * up again.
4962 */
4963 zone_gc(ZONE_GC_JETSAM);
4964 } else {
4965 /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4966 boolean_t buf_large_zfree = FALSE;
4967 boolean_t first_try = TRUE;
4968
4969 stack_collect();
4970
4971 consider_machine_collect();
4972 #if CONFIG_MBUF_MCACHE
4973 mbuf_drain(FALSE);
4974 #endif /* CONFIG_MBUF_MCACHE */
4975
4976 do {
4977 if (consider_buffer_cache_collect != NULL) {
4978 buf_large_zfree = (*consider_buffer_cache_collect)(0);
4979 }
4980 if (first_try == TRUE || buf_large_zfree == TRUE) {
4981 /*
4982 * zone_gc should be last, because the other operations
4983 * might return memory to zones.
4984 */
4985 zone_gc(ZONE_GC_TRIM);
4986 }
4987 first_try = FALSE;
4988 } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
4989
4990 consider_machine_adjust();
4991 }
4992
4993 assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);
4994
4995 thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
4996 __builtin_unreachable();
4997 }
4998
4999
5000 #if VM_PAGE_BUCKETS_CHECK
5001 #if VM_PAGE_FAKE_BUCKETS
5002 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
5003 #endif /* VM_PAGE_FAKE_BUCKETS */
5004 #endif /* VM_PAGE_BUCKETS_CHECK */
5005
5006
5007
5008 void
vm_set_restrictions(unsigned int num_cpus)5009 vm_set_restrictions(unsigned int num_cpus)
5010 {
5011 int vm_restricted_to_single_processor = 0;
5012
5013 if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
5014 kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
5015 vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
5016 } else {
5017 assert(num_cpus > 0);
5018
5019 if (num_cpus <= 3) {
5020 /*
5021 * on systems with a limited number of CPUS, bind the
5022 * 4 major threads that can free memory and that tend to use
5023 * a fair bit of CPU under pressured conditions to a single processor.
5024 * This insures that these threads don't hog all of the available CPUs
5025 * (important for camera launch), while allowing them to run independently
5026 * w/r to locks... the 4 threads are
5027 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
5028 * vm_compressor_swap_trigger_thread (minor and major compactions),
5029 * memorystatus_thread (jetsams).
5030 *
5031 * the first time the thread is run, it is responsible for checking the
5032 * state of vm_restricted_to_single_processor, and if TRUE it calls
5033 * thread_bind_master... someday this should be replaced with a group
5034 * scheduling mechanism and KPI.
5035 */
5036 vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5037 } else {
5038 vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5039 }
5040 }
5041 }
5042
5043 /*
5044 * Set up vm_config based on the vm_compressor_mode.
5045 * Must run BEFORE the pageout thread starts up.
5046 */
5047 __startup_func
5048 void
vm_config_init(void)5049 vm_config_init(void)
5050 {
5051 bzero(&vm_config, sizeof(vm_config));
5052
5053 switch (vm_compressor_mode) {
5054 case VM_PAGER_DEFAULT:
5055 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
5056 OS_FALLTHROUGH;
5057
5058 case VM_PAGER_COMPRESSOR_WITH_SWAP:
5059 vm_config.compressor_is_present = TRUE;
5060 vm_config.swap_is_present = TRUE;
5061 vm_config.compressor_is_active = TRUE;
5062 vm_config.swap_is_active = TRUE;
5063 break;
5064
5065 case VM_PAGER_COMPRESSOR_NO_SWAP:
5066 vm_config.compressor_is_present = TRUE;
5067 vm_config.swap_is_present = TRUE;
5068 vm_config.compressor_is_active = TRUE;
5069 break;
5070
5071 case VM_PAGER_FREEZER_DEFAULT:
5072 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5073 OS_FALLTHROUGH;
5074
5075 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5076 vm_config.compressor_is_present = TRUE;
5077 vm_config.swap_is_present = TRUE;
5078 break;
5079
5080 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5081 vm_config.compressor_is_present = TRUE;
5082 vm_config.swap_is_present = TRUE;
5083 vm_config.compressor_is_active = TRUE;
5084 vm_config.freezer_swap_is_active = TRUE;
5085 break;
5086
5087 case VM_PAGER_NOT_CONFIGURED:
5088 break;
5089
5090 default:
5091 printf("unknown compressor mode - %x\n", vm_compressor_mode);
5092 break;
5093 }
5094 }
5095
5096 __startup_func
5097 static void
vm_pageout_create_gc_thread(void)5098 vm_pageout_create_gc_thread(void)
5099 {
5100 thread_t thread;
5101
5102 if (kernel_thread_create(vm_pageout_garbage_collect,
5103 VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
5104 panic("vm_pageout_garbage_collect: create failed");
5105 }
5106 thread_set_thread_name(thread, "VM_pageout_garbage_collect");
5107 if (thread->reserved_stack == 0) {
5108 assert(thread->kernel_stack);
5109 thread->reserved_stack = thread->kernel_stack;
5110 }
5111
5112 /* thread is started in vm_pageout() */
5113 vm_pageout_gc_thread = thread;
5114 }
5115 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5116
/*
 * vm_pageout
 *
 * Body of the thread that names itself "VM_pageout_scan". It:
 *   - sets the calling thread's priority, VM privilege and reserved stack;
 *   - initializes the tunables kept in vm_pageout_state;
 *   - (re)computes the free-page reserve and targets;
 *   - initializes the external/internal (and, on DEVELOPMENT/DEBUG, the
 *     benchmark) pageout queues;
 *   - starts the external pageout I/O thread, the pre-created garbage
 *     collection thread and, with VM_PRESSURE_EVENTS, the pressure thread;
 *   - initializes the object reaper, the compressor (if present) and a few
 *     optional subsystems;
 *   - finally calls vm_pageout_continue(), which does not return.
 *
 * Any failure to create a thread is fatal (panic).
 */
void
vm_pageout(void)
{
	thread_t self = current_thread();
	thread_t thread;
	kern_return_t result;
	spl_t s;

	/*
	 * Set thread privileges.
	 */
	s = splsched();

#if CONFIG_VPS_DYNAMIC_PRIO
	if (vps_dynamic_priority_enabled) {
		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
		thread_set_eager_preempt(self);
	} else {
		sched_set_kernel_thread_priority(self, BASEPRI_VM);
	}
#else /* CONFIG_VPS_DYNAMIC_PRIO */
	sched_set_kernel_thread_priority(self, BASEPRI_VM);
#endif /* CONFIG_VPS_DYNAMIC_PRIO */

	/* mark this thread as VM-privileged; done under the thread lock */
	thread_lock(self);
	self->options |= TH_OPT_VMPRIV;
	thread_unlock(self);

	/* use the current kernel stack as the reserved stack if none is set */
	if (!self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	/* see vm_set_restrictions() for how the single-processor flag is chosen */
	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
	    !vps_dynamic_priority_enabled) {
		thread_vm_bind_group_add();
	}


#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
	if (vm_pgo_pbound) {
		/*
		 * Use the soft bound option for vm pageout to allow it to run on
		 * E-cores if P-cluster is unavailable.
		 */
		thread_bind_cluster_type(self, 'P', true);
	}
#endif /* __AMP__ */

	PE_parse_boot_argn("vmpgo_protect_realtime",
	    &vm_pageout_protect_realtime,
	    sizeof(vm_pageout_protect_realtime));
	splx(s);

	thread_set_thread_name(current_thread(), "VM_pageout_scan");

	/*
	 * Initialize some paging parameters.
	 */

	vm_pageout_state.vm_pressure_thread_running = FALSE;
	vm_pageout_state.vm_pressure_changed = FALSE;
	vm_pageout_state.memorystatus_purge_on_warning = 2;
	vm_pageout_state.memorystatus_purge_on_urgent = 5;
	vm_pageout_state.memorystatus_purge_on_critical = 8;
	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
	vm_pageout_state.vm_page_speculative_percentage = 5;
	vm_pageout_state.vm_page_speculative_target = 0;

	/* the wait/throttle values are zeroed here and given defaults below */
	vm_pageout_state.vm_pageout_swap_wait = 0;
	vm_pageout_state.vm_pageout_idle_wait = 0;
	vm_pageout_state.vm_pageout_empty_wait = 0;
	vm_pageout_state.vm_pageout_burst_wait = 0;
	vm_pageout_state.vm_pageout_deadlock_wait = 0;
	vm_pageout_state.vm_pageout_deadlock_relief = 0;
	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;

	vm_pageout_state.vm_pageout_inactive = 0;
	vm_pageout_state.vm_pageout_inactive_used = 0;
	vm_pageout_state.vm_pageout_inactive_clean = 0;

	vm_pageout_state.vm_memory_pressure = 0;
	vm_pageout_state.vm_page_filecache_min = 0;
#if CONFIG_JETSAM
	vm_pageout_state.vm_page_filecache_min_divisor = 70;
	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
#else
	vm_pageout_state.vm_page_filecache_min_divisor = 27;
	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
#endif
	/* snapshot of the free page count at the time pageout starts */
	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;

	vm_pageout_state.vm_pageout_considered_page_last = 0;

	/*
	 * Each field below was set to 0 a few lines up, so every one of these
	 * checks is true at this point and the compile-time default is applied.
	 */
	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
	}

	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
	}

	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
	}

	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
	}

	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
	}

	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
	}

	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
	}
	/*
	 * even if we've already called vm_page_free_reserve
	 * call it again here to ensure that the targets are
	 * accurately calculated (it uses vm_page_free_count_init)
	 * calling it with an arg of 0 will not change the reserve
	 * but will re-calculate free_min and free_target
	 */
	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
		/* top the reserve up to the per-processor-count minimum */
		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
	} else {
		vm_page_free_reserve(0);
	}

	/* start from zeroed queue structures, then set up their pending lists */
	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));

	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;

	/* the internal queue's pgo_maxlaundry is set in vm_pageout_internal_start() */
	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);

#if DEVELOPMENT || DEBUG
	bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
	vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
#endif /* DEVELOPMENT || DEBUG */


	/* internal pageout thread started when default pager registered first time */
	/* external pageout and garbage collection threads started here */
	struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
	ethr->id = 0;
	ethr->q = &vm_pageout_queue_external;
	ethr->current_early_swapout_chead = NULL;
	ethr->current_regular_swapout_chead = NULL;
	ethr->current_late_swapout_chead = NULL;
	ethr->scratch_buf = NULL;
#if DEVELOPMENT || DEBUG
	ethr->benchmark_q = NULL;
#endif /* DEVELOPMENT || DEBUG */
	sched_cond_init(&(ethr->pgo_wakeup));

	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
	    (void *)ethr, BASEPRI_VM,
	    &(ethr->pgo_iothread));
	if (result != KERN_SUCCESS) {
		panic("vm_pageout: Unable to create external thread (%d)\n", result);
	}
	thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");

	/* the GC thread was created earlier by vm_pageout_create_gc_thread(); start it now */
	thread_mtx_lock(vm_pageout_gc_thread );
	thread_start(vm_pageout_gc_thread );
	thread_mtx_unlock(vm_pageout_gc_thread);

#if VM_PRESSURE_EVENTS
	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
	    BASEPRI_DEFAULT,
	    &thread);

	if (result != KERN_SUCCESS) {
		panic("vm_pressure_thread: create failed");
	}

	/* the handle is not kept; presumably this drops the reference returned above -- TODO confirm */
	thread_deallocate(thread);
#endif

	vm_object_reaper_init();


	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
		vm_compressor_init();
	}

#if VM_PRESSURE_EVENTS
	vm_pressure_events_enabled = TRUE;
#endif /* VM_PRESSURE_EVENTS */

#if CONFIG_PHANTOM_CACHE
	vm_phantom_cache_init();
#endif
#if VM_PAGE_BUCKETS_CHECK
#if VM_PAGE_FAKE_BUCKETS
	/* make the fake bucket range read-only in the kernel pmap */
	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
	    (uint64_t) vm_page_fake_buckets_start,
	    (uint64_t) vm_page_fake_buckets_end);
	pmap_protect(kernel_pmap,
	    vm_page_fake_buckets_start,
	    vm_page_fake_buckets_end,
	    VM_PROT_READ);
//	*(char *) vm_page_fake_buckets_start = 'x';	/* panic! */
#endif /* VM_PAGE_FAKE_BUCKETS */
#endif /* VM_PAGE_BUCKETS_CHECK */

#if VM_OBJECT_TRACKING
	vm_object_tracking_init();
#endif /* VM_OBJECT_TRACKING */

#if __arm64__
//	vm_tests();
#endif /* __arm64__ */

	vm_pageout_continue();

	/*
	 * Unreached code!
	 *
	 * The vm_pageout_continue() call above never returns, so the code below is never
	 * executed.  We take advantage of this to declare several DTrace VM related probe
	 * points that our kernel doesn't have an analog for.  These are probe points that
	 * exist in Solaris and are in the DTrace documentation, so people may have written
	 * scripts that use them.  Declaring the probe points here means their scripts will
	 * compile and execute which we want for portability of the scripts, but since this
	 * section of code is never reached, the probe points will simply never fire.  Yes,
	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
	 * Solaris specific VM events in mind, not portability to different VM implementations.
	 */

	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
	/*NOTREACHED*/
}
5367
5368
5369
/*
 * vm_pageout_internal_start
 *
 * Starts the internal (compressor) pageout I/O threads. Requires the
 * compressor to be present (asserted).
 *
 * Steps:
 *   1. choose vm_compressor_thread_count from the platform defaults, let
 *      the "vmcomp_threads" boot-arg override it, then clamp it;
 *   2. derive the internal queue's pgo_maxlaundry from that count (the
 *      "vmpgoi_maxlaundry" boot-arg may override it);
 *   3. allocate one scratch buffer of COMPRESSOR_SCRATCH_BUF_SIZE per thread;
 *   4. initialize each per-thread state and start its thread at BASEPRI_VM.
 *
 * Returns the result of the last thread start. A failed start panics, so
 * in practice the value returned is KERN_SUCCESS.
 */
kern_return_t
vm_pageout_internal_start(void)
{
	kern_return_t result = KERN_SUCCESS;
	host_basic_info_data_t hinfo;
	vm_offset_t buf, bufsize;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
#define BSD_HOST 1
	/* NOTE(review): the return value of host_info() is not checked; hinfo is only validated by the assert below */
	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);

	assert(hinfo.max_cpus > 0);

	/* platform default: one thread, or two on macOS with more than 4 CPUs */
#if !XNU_TARGET_OS_OSX
	vm_pageout_state.vm_compressor_thread_count = 1;
#else /* !XNU_TARGET_OS_OSX */
	if (hinfo.max_cpus > 4) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	} else {
		vm_pageout_state.vm_compressor_thread_count = 1;
	}
#endif /* !XNU_TARGET_OS_OSX */
#if __AMP__
	if (vm_compressor_ebound) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	}
#endif
	/* boot-arg override; the value is sanitized by the clamps that follow */
	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
	    sizeof(vm_pageout_state.vm_compressor_thread_count));

	/*
	 * Clamp: keep the count below max_cpus, then force it into
	 * [1, MAX_COMPRESSOR_THREAD_COUNT]. With a single CPU the first clamp
	 * yields 0, which the second one raises back to 1.
	 */
	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
	}
	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
		vm_pageout_state.vm_compressor_thread_count = 1;
	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
	}

	/* laundry limit scales with the thread count: 4 * VM_PAGE_LAUNDRY_MAX per thread */
	vm_pageout_queue_internal.pgo_maxlaundry =
	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;

	PE_parse_boot_argn("vmpgoi_maxlaundry",
	    &vm_pageout_queue_internal.pgo_maxlaundry,
	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));

#if DEVELOPMENT || DEBUG
	// Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
#endif /* DEVELOPMENT || DEBUG */

	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;

	/*
	 * One contiguous allocation holding every thread's scratch buffer.
	 * NOTE(review): the return value is ignored; presumably acceptable
	 * because KMA_NOFAIL is passed -- confirm.
	 */
	kmem_alloc(kernel_map, &buf,
	    bufsize * vm_pageout_state.vm_compressor_thread_count,
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_COMPRESSOR);

	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
		struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
		iq->id = i;
		iq->q = &vm_pageout_queue_internal;
		iq->current_early_swapout_chead = NULL;
		iq->current_regular_swapout_chead = NULL;
		iq->current_late_swapout_chead = NULL;
		/* thread i gets the i-th bufsize-sized slice of the allocation */
		iq->scratch_buf = (char *)(buf + i * bufsize);
#if DEVELOPMENT || DEBUG
		iq->benchmark_q = &vm_pageout_queue_benchmark;
#endif /* DEVELOPMENT || DEBUG */
		sched_cond_init(&(iq->pgo_wakeup));
		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
		    (void *)iq, BASEPRI_VM,
		    &(iq->pgo_iothread));

		if (result != KERN_SUCCESS) {
			panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
		}
	}
	return result;
}
5452
5453 #if CONFIG_IOSCHED
/*
 * To support I/O Expedite for compressed files we mark the upls with special flags.
 * The way decmpfs works is that we create a big upl which marks all the pages needed to
 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
 * by the req upl lock (the reverse link doesn't need synchronization since we never inspect
 * this link unless the real I/O upl is being destroyed).
 */
5465
5466
5467 static void
upl_set_decmp_info(upl_t upl,upl_t src_upl)5468 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5469 {
5470 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5471
5472 upl_lock(src_upl);
5473 if (src_upl->decmp_io_upl) {
5474 /*
5475 * If there is already an alive real I/O UPL, ignore this new UPL.
5476 * This case should rarely happen and even if it does, it just means
5477 * that we might issue a spurious expedite which the driver is expected
5478 * to handle.
5479 */
5480 upl_unlock(src_upl);
5481 return;
5482 }
5483 src_upl->decmp_io_upl = (void *)upl;
5484 src_upl->ref_count++;
5485
5486 upl->flags |= UPL_DECMP_REAL_IO;
5487 upl->decmp_io_upl = (void *)src_upl;
5488 upl_unlock(src_upl);
5489 }
5490 #endif /* CONFIG_IOSCHED */
5491
/*
 * Global switch consulted by upl_create() and upl_destroy() (in builds with
 * CONFIG_IOSCHED or UPL_DEBUG) to decide whether a UPL is tracked by its
 * object. Its initial value mirrors the compile-time UPL_DEBUG setting.
 */
#if UPL_DEBUG
int     upl_debug_enabled = 1;
#else
int     upl_debug_enabled = 0;
#endif
5497
5498 static upl_t
upl_create(int type,int flags,upl_size_t size)5499 upl_create(int type, int flags, upl_size_t size)
5500 {
5501 uint32_t pages = (uint32_t)atop(round_page_32(size));
5502 upl_t upl;
5503
5504 assert(page_aligned(size));
5505
5506 /*
5507 * FIXME: this code assumes the allocation always succeeds,
5508 * however `pages` can be up to MAX_UPL_SIZE.
5509 *
5510 * The allocation size is above 32k (resp. 128k)
5511 * on 16k pages (resp. 4k), which kalloc might fail
5512 * to allocate.
5513 */
5514 upl = kalloc_type(struct upl, struct upl_page_info,
5515 (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
5516 if (type & UPL_CREATE_INTERNAL) {
5517 flags |= UPL_INTERNAL;
5518 }
5519
5520 if (type & UPL_CREATE_LITE) {
5521 flags |= UPL_LITE;
5522 if (pages) {
5523 upl->lite_list = bitmap_alloc(pages);
5524 }
5525 }
5526
5527 upl->flags = flags;
5528 upl->ref_count = 1;
5529 upl_lock_init(upl);
5530 #if CONFIG_IOSCHED
5531 if (type & UPL_CREATE_IO_TRACKING) {
5532 upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5533 }
5534
5535 if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5536 /* Only support expedite on internal UPLs */
5537 thread_t curthread = current_thread();
5538 upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
5539 Z_WAITOK | Z_ZERO);
5540 upl->flags |= UPL_EXPEDITE_SUPPORTED;
5541 if (curthread->decmp_upl != NULL) {
5542 upl_set_decmp_info(upl, curthread->decmp_upl);
5543 }
5544 }
5545 #endif
5546 #if CONFIG_IOSCHED || UPL_DEBUG
5547 if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5548 upl->upl_creator = current_thread();
5549 upl->flags |= UPL_TRACKED_BY_OBJECT;
5550 }
5551 #endif
5552
5553 #if UPL_DEBUG
5554 upl->uple_create_btref = btref_get(__builtin_frame_address(0), 0);
5555 #endif /* UPL_DEBUG */
5556
5557 return upl;
5558 }
5559
/*
 * Tear down and free a UPL. In this file it is called from upl_deallocate()
 * once the reference count has reached zero.
 *
 * Panics if the UPL still has external references (ext_ref_count != 0).
 * Otherwise it unlinks the UPL from a decmpfs request UPL and from its
 * object's UPL queue where applicable, drops the map_object reference for
 * shadowed UPLs, and releases the auxiliary allocations (reprio info,
 * debug backtrace refs, lite bitmap) before freeing the UPL itself.
 */
static void
upl_destroy(upl_t upl)
{
	uint32_t pages;

//	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);

	if (upl->ext_ref_count) {
		panic("upl(%p) ext_ref_count", upl);
	}

#if CONFIG_IOSCHED
	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
		/*
		 * This is a real-I/O UPL linked to a decmpfs request UPL:
		 * clear the request's forward link under its lock, then drop
		 * the reference that upl_set_decmp_info() took on it.
		 */
		upl_t src_upl;
		src_upl = upl->decmp_io_upl;
		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
		upl_lock(src_upl);
		src_upl->decmp_io_upl = NULL;
		upl_unlock(src_upl);
		upl_deallocate(src_upl);
	}
#endif /* CONFIG_IOSCHED */

#if CONFIG_IOSCHED || UPL_DEBUG
	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
	    !(upl->flags & UPL_VECTOR)) {
		vm_object_t     object;

		/* for a shadowed UPL the queue lives on the shadow of map_object */
		if (upl->flags & UPL_SHADOWED) {
			object = upl->map_object->shadow;
		} else {
			object = upl->map_object;
		}

		/* remove the UPL from the object's UPL queue, under the object lock */
		vm_object_lock(object);
		queue_remove(&object->uplq, upl, upl_t, uplq);
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
		vm_object_unlock(object);
	}
#endif
	/*
	 * drop a reference on the map_object whether or
	 * not a pageout object is inserted
	 */
	if (upl->flags & UPL_SHADOWED) {
		vm_object_deallocate(upl->map_object);
	}

	/*
	 * Page count used to size the frees below: 1 for device memory,
	 * otherwise derived from the UPL's adjusted size.
	 */
	if (upl->flags & UPL_DEVICE_MEMORY) {
		pages = 1;
	} else {
		pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
	}

	upl_lock_destroy(upl);

#if CONFIG_IOSCHED
	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
	}
#endif

#if UPL_DEBUG
	/* release the backtrace references recorded at commit and create time */
	for (int i = 0; i < upl->upl_commit_index; i++) {
		btref_put(upl->upl_commit_records[i].c_btref);
	}
	btref_put(upl->uple_create_btref);
#endif /* UPL_DEBUG */

	if ((upl->flags & UPL_LITE) && pages) {
		bitmap_free(upl->lite_list, pages);
	}
	/* internal UPLs carry a trailing upl_page_info array of `pages` entries */
	kfree_type(struct upl, struct upl_page_info,
	    (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
}
5636
5637 void
upl_deallocate(upl_t upl)5638 upl_deallocate(upl_t upl)
5639 {
5640 upl_lock(upl);
5641
5642 if (--upl->ref_count == 0) {
5643 if (vector_upl_is_valid(upl)) {
5644 vector_upl_deallocate(upl);
5645 }
5646 upl_unlock(upl);
5647
5648 if (upl->upl_iodone) {
5649 upl_callout_iodone(upl);
5650 }
5651
5652 upl_destroy(upl);
5653 } else {
5654 upl_unlock(upl);
5655 }
5656 }
5657
5658 #if CONFIG_IOSCHED
5659 void
upl_mark_decmp(upl_t upl)5660 upl_mark_decmp(upl_t upl)
5661 {
5662 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5663 upl->flags |= UPL_DECMP_REQ;
5664 upl->upl_creator->decmp_upl = (void *)upl;
5665 }
5666 }
5667
5668 void
upl_unmark_decmp(upl_t upl)5669 upl_unmark_decmp(upl_t upl)
5670 {
5671 if (upl && (upl->flags & UPL_DECMP_REQ)) {
5672 upl->upl_creator->decmp_upl = NULL;
5673 }
5674 }
5675
5676 #endif /* CONFIG_IOSCHED */
5677
5678 #define VM_PAGE_Q_BACKING_UP(q) \
5679 ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5680
5681 boolean_t must_throttle_writes(void);
5682
5683 boolean_t
must_throttle_writes()5684 must_throttle_writes()
5685 {
5686 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5687 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5688 return TRUE;
5689 }
5690
5691 return FALSE;
5692 }
5693
/*
 * Number of times vm_page_delayed_work_get_ctx() could not get a context
 * from dw_ctx_zone (incremented there on allocation failure).
 */
int vm_page_delayed_work_ctx_needed = 0;
/* Zone from which struct vm_page_delayed_work_ctx elements are allocated. */
KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5696
5697 __startup_func
5698 static void
vm_page_delayed_work_init_ctx(void)5699 vm_page_delayed_work_init_ctx(void)
5700 {
5701 uint16_t min_delayed_work_ctx_allocated = 16;
5702
5703 /*
5704 * try really hard to always keep NCPU elements around in the zone
5705 * in order for the UPL code to almost always get an element.
5706 */
5707 if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5708 min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5709 }
5710
5711 zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5712 }
5713 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5714
5715 struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)5716 vm_page_delayed_work_get_ctx(void)
5717 {
5718 struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5719
5720 dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5721
5722 if (__probable(dw_ctx)) {
5723 dw_ctx->delayed_owner = current_thread();
5724 } else {
5725 vm_page_delayed_work_ctx_needed++;
5726 }
5727 return dw_ctx ? dw_ctx->dwp : NULL;
5728 }
5729
5730 void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work * dwp)5731 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5732 {
5733 struct vm_page_delayed_work_ctx *ldw_ctx;
5734
5735 ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5736 ldw_ctx->delayed_owner = NULL;
5737
5738 zfree(dw_ctx_zone, ldw_ctx);
5739 }
5740
5741 /*
5742 * Routine: vm_object_upl_request
5743 * Purpose:
5744 * Cause the population of a portion of a vm_object.
5745 * Depending on the nature of the request, the pages
5746 * returned may be contain valid data or be uninitialized.
5747 * A page list structure, listing the physical pages
5748 * will be returned upon request.
5749 * This function is called by the file system or any other
5750 * supplier of backing store to a pager.
5751 * IMPORTANT NOTE: The caller must still respect the relationship
5752 * between the vm_object and its backing memory object. The
5753 * caller MUST NOT substitute changes in the backing file
5754 * without first doing a memory_object_lock_request on the
5755 * target range unless it is know that the pages are not
5756 * shared with another entity at the pager level.
5757 * Copy_in_to:
5758 * if a page list structure is present
5759 * return the mapped physical pages, where a
5760 * page is not present, return a non-initialized
5761 * one. If the no_sync bit is turned on, don't
5762 * call the pager unlock to synchronize with other
5763 * possible copies of the page. Leave pages busy
5764 * in the original object, if a page list structure
5765 * was specified. When a commit of the page list
5766 * pages is done, the dirty bit will be set for each one.
5767 * Copy_out_from:
5768 * If a page list structure is present, return
5769 * all mapped pages. Where a page does not exist
5770 * map a zero filled one. Leave pages busy in
5771 * the original object. If a page list structure
5772 * is not specified, this call is a no-op.
5773 *
5774 * Note: access of default pager objects has a rather interesting
5775 * twist. The caller of this routine, presumably the file system
5776 * page cache handling code, will never actually make a request
5777 * against a default pager backed object. Only the default
5778 * pager will make requests on backing store related vm_objects
5779 * In this way the default pager can maintain the relationship
5780 * between backing store files (abstract memory objects) and
5781 * the vm_objects (cache objects), they support.
5782 *
5783 */
5784
5785 __private_extern__ kern_return_t
vm_object_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)5786 vm_object_upl_request(
5787 vm_object_t object,
5788 vm_object_offset_t offset,
5789 upl_size_t size,
5790 upl_t *upl_ptr,
5791 upl_page_info_array_t user_page_list,
5792 unsigned int *page_list_count,
5793 upl_control_flags_t cntrl_flags,
5794 vm_tag_t tag)
5795 {
5796 vm_page_t dst_page = VM_PAGE_NULL;
5797 vm_object_offset_t dst_offset;
5798 upl_size_t xfer_size;
5799 unsigned int size_in_pages;
5800 boolean_t dirty;
5801 boolean_t hw_dirty;
5802 upl_t upl = NULL;
5803 unsigned int entry;
5804 vm_page_t alias_page = NULL;
5805 int refmod_state = 0;
5806 vm_object_t last_copy_object;
5807 struct vm_page_delayed_work dw_array;
5808 struct vm_page_delayed_work *dwp, *dwp_start;
5809 bool dwp_finish_ctx = TRUE;
5810 int dw_count;
5811 int dw_limit;
5812 int io_tracking_flag = 0;
5813 int grab_options;
5814 int page_grab_count = 0;
5815 ppnum_t phys_page;
5816 pmap_flush_context pmap_flush_context_storage;
5817 boolean_t pmap_flushes_delayed = FALSE;
5818 #if DEVELOPMENT || DEBUG
5819 task_t task = current_task();
5820 #endif /* DEVELOPMENT || DEBUG */
5821
5822 dwp_start = dwp = NULL;
5823
5824 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5825 /*
5826 * For forward compatibility's sake,
5827 * reject any unknown flag.
5828 */
5829 return KERN_INVALID_VALUE;
5830 }
5831 if ((!object->internal) && (object->paging_offset != 0)) {
5832 panic("vm_object_upl_request: external object with non-zero paging offset");
5833 }
5834 if (object->phys_contiguous) {
5835 panic("vm_object_upl_request: contiguous object specified");
5836 }
5837
5838 assertf(page_aligned(offset) && page_aligned(size),
5839 "offset 0x%llx size 0x%x",
5840 offset, size);
5841
5842 VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5843
5844 dw_count = 0;
5845 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5846 dwp_start = vm_page_delayed_work_get_ctx();
5847 if (dwp_start == NULL) {
5848 dwp_start = &dw_array;
5849 dw_limit = 1;
5850 dwp_finish_ctx = FALSE;
5851 }
5852
5853 dwp = dwp_start;
5854
5855 if (size > MAX_UPL_SIZE_BYTES) {
5856 size = MAX_UPL_SIZE_BYTES;
5857 }
5858
5859 if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5860 *page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5861 }
5862
5863 #if CONFIG_IOSCHED || UPL_DEBUG
5864 if (object->io_tracking || upl_debug_enabled) {
5865 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5866 }
5867 #endif
5868 #if CONFIG_IOSCHED
5869 if (object->io_tracking) {
5870 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5871 }
5872 #endif
5873
5874 if (cntrl_flags & UPL_SET_INTERNAL) {
5875 if (cntrl_flags & UPL_SET_LITE) {
5876 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5877 } else {
5878 upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5879 }
5880 user_page_list = size ? upl->page_list : NULL;
5881 } else {
5882 if (cntrl_flags & UPL_SET_LITE) {
5883 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5884 } else {
5885 upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5886 }
5887 }
5888 *upl_ptr = upl;
5889
5890 if (user_page_list) {
5891 user_page_list[0].device = FALSE;
5892 }
5893
5894 if (cntrl_flags & UPL_SET_LITE) {
5895 upl->map_object = object;
5896 } else {
5897 upl->map_object = vm_object_allocate(size);
5898 /*
5899 * No neeed to lock the new object: nobody else knows
5900 * about it yet, so it's all ours so far.
5901 */
5902 upl->map_object->shadow = object;
5903 upl->map_object->pageout = TRUE;
5904 upl->map_object->can_persist = FALSE;
5905 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5906 upl->map_object->vo_shadow_offset = offset;
5907 upl->map_object->wimg_bits = object->wimg_bits;
5908 assertf(page_aligned(upl->map_object->vo_shadow_offset),
5909 "object %p shadow_offset 0x%llx",
5910 upl->map_object, upl->map_object->vo_shadow_offset);
5911
5912 alias_page = vm_page_grab_fictitious(TRUE);
5913
5914 upl->flags |= UPL_SHADOWED;
5915 }
5916 if (cntrl_flags & UPL_FOR_PAGEOUT) {
5917 upl->flags |= UPL_PAGEOUT;
5918 }
5919
5920 vm_object_lock(object);
5921 vm_object_activity_begin(object);
5922
5923 grab_options = 0;
5924 #if CONFIG_SECLUDED_MEMORY
5925 if (object->can_grab_secluded) {
5926 grab_options |= VM_PAGE_GRAB_SECLUDED;
5927 }
5928 #endif /* CONFIG_SECLUDED_MEMORY */
5929
5930 /*
5931 * we can lock in the paging_offset once paging_in_progress is set
5932 */
5933 upl->u_size = size;
5934 upl->u_offset = offset + object->paging_offset;
5935
5936 #if CONFIG_IOSCHED || UPL_DEBUG
5937 if (object->io_tracking || upl_debug_enabled) {
5938 vm_object_activity_begin(object);
5939 queue_enter(&object->uplq, upl, upl_t, uplq);
5940 }
5941 #endif
5942 if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
5943 /*
5944 * Honor copy-on-write obligations
5945 *
5946 * The caller is gathering these pages and
5947 * might modify their contents. We need to
5948 * make sure that the copy object has its own
5949 * private copies of these pages before we let
5950 * the caller modify them.
5951 */
5952 vm_object_update(object,
5953 offset,
5954 size,
5955 NULL,
5956 NULL,
5957 FALSE, /* should_return */
5958 MEMORY_OBJECT_COPY_SYNC,
5959 VM_PROT_NO_CHANGE);
5960
5961 VM_PAGEOUT_DEBUG(upl_cow, 1);
5962 VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
5963 }
5964 /*
5965 * remember which copy object we synchronized with
5966 */
5967 last_copy_object = object->vo_copy;
5968 entry = 0;
5969
5970 xfer_size = size;
5971 dst_offset = offset;
5972 size_in_pages = size / PAGE_SIZE;
5973
5974 if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5975 object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
5976 object->scan_collisions = 0;
5977 }
5978
5979 if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5980 boolean_t isSSD = FALSE;
5981
5982 #if !XNU_TARGET_OS_OSX
5983 isSSD = TRUE;
5984 #else /* !XNU_TARGET_OS_OSX */
5985 vnode_pager_get_isSSD(object->pager, &isSSD);
5986 #endif /* !XNU_TARGET_OS_OSX */
5987 vm_object_unlock(object);
5988
5989 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5990
5991 if (isSSD == TRUE) {
5992 delay(1000 * size_in_pages);
5993 } else {
5994 delay(5000 * size_in_pages);
5995 }
5996 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5997
5998 vm_object_lock(object);
5999 }
6000
6001 while (xfer_size) {
6002 dwp->dw_mask = 0;
6003
6004 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
6005 vm_object_unlock(object);
6006 alias_page = vm_page_grab_fictitious(TRUE);
6007 vm_object_lock(object);
6008 }
6009 if (cntrl_flags & UPL_COPYOUT_FROM) {
6010 upl->flags |= UPL_PAGE_SYNC_DONE;
6011
6012 if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
6013 dst_page->vmp_fictitious ||
6014 dst_page->vmp_absent ||
6015 VMP_ERROR_GET(dst_page) ||
6016 dst_page->vmp_cleaning ||
6017 (VM_PAGE_WIRED(dst_page))) {
6018 if (user_page_list) {
6019 user_page_list[entry].phys_addr = 0;
6020 }
6021
6022 goto try_next_page;
6023 }
6024 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6025
6026 /*
6027 * grab this up front...
6028 * a high percentange of the time we're going to
6029 * need the hardware modification state a bit later
6030 * anyway... so we can eliminate an extra call into
6031 * the pmap layer by grabbing it here and recording it
6032 */
6033 if (dst_page->vmp_pmapped) {
6034 refmod_state = pmap_get_refmod(phys_page);
6035 } else {
6036 refmod_state = 0;
6037 }
6038
6039 if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
6040 /*
6041 * page is on inactive list and referenced...
6042 * reactivate it now... this gets it out of the
6043 * way of vm_pageout_scan which would have to
6044 * reactivate it upon tripping over it
6045 */
6046 dwp->dw_mask |= DW_vm_page_activate;
6047 }
6048 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
6049 /*
6050 * we're only asking for DIRTY pages to be returned
6051 */
6052 if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
6053 /*
6054 * if we were the page stolen by vm_pageout_scan to be
6055 * cleaned (as opposed to a buddy being clustered in
6056 * or this request is not being driven by a PAGEOUT cluster
6057 * then we only need to check for the page being dirty or
6058 * precious to decide whether to return it
6059 */
6060 if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
6061 goto check_busy;
6062 }
6063 goto dont_return;
6064 }
6065 /*
6066 * this is a request for a PAGEOUT cluster and this page
6067 * is merely along for the ride as a 'buddy'... not only
6068 * does it have to be dirty to be returned, but it also
6069 * can't have been referenced recently...
6070 */
6071 if ((hibernate_cleaning_in_progress == TRUE ||
6072 (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
6073 (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
6074 ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
6075 goto check_busy;
6076 }
6077 dont_return:
6078 /*
6079 * if we reach here, we're not to return
6080 * the page... go on to the next one
6081 */
6082 if (dst_page->vmp_laundry == TRUE) {
6083 /*
6084 * if we get here, the page is not 'cleaning' (filtered out above).
6085 * since it has been referenced, remove it from the laundry
6086 * so we don't pay the cost of an I/O to clean a page
6087 * we're just going to take back
6088 */
6089 vm_page_lockspin_queues();
6090
6091 vm_pageout_steal_laundry(dst_page, TRUE);
6092 vm_page_activate(dst_page);
6093
6094 vm_page_unlock_queues();
6095 }
6096 if (user_page_list) {
6097 user_page_list[entry].phys_addr = 0;
6098 }
6099
6100 goto try_next_page;
6101 }
6102 check_busy:
6103 if (dst_page->vmp_busy) {
6104 if (cntrl_flags & UPL_NOBLOCK) {
6105 if (user_page_list) {
6106 user_page_list[entry].phys_addr = 0;
6107 }
6108 dwp->dw_mask = 0;
6109
6110 goto try_next_page;
6111 }
6112 /*
6113 * someone else is playing with the
6114 * page. We will have to wait.
6115 */
6116 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6117
6118 continue;
6119 }
6120 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6121 vm_page_lockspin_queues();
6122
6123 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6124 /*
6125 * we've buddied up a page for a clustered pageout
6126 * that has already been moved to the pageout
6127 * queue by pageout_scan... we need to remove
6128 * it from the queue and drop the laundry count
6129 * on that queue
6130 */
6131 vm_pageout_throttle_up(dst_page);
6132 }
6133 vm_page_unlock_queues();
6134 }
6135 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6136 dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6137
6138 if (phys_page > upl->highest_page) {
6139 upl->highest_page = phys_page;
6140 }
6141
6142 assert(!pmap_is_noencrypt(phys_page));
6143
6144 if (cntrl_flags & UPL_SET_LITE) {
6145 unsigned int pg_num;
6146
6147 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6148 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6149 bitmap_set(upl->lite_list, pg_num);
6150
6151 if (hw_dirty) {
6152 if (pmap_flushes_delayed == FALSE) {
6153 pmap_flush_context_init(&pmap_flush_context_storage);
6154 pmap_flushes_delayed = TRUE;
6155 }
6156 pmap_clear_refmod_options(phys_page,
6157 VM_MEM_MODIFIED,
6158 PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6159 &pmap_flush_context_storage);
6160 }
6161
6162 /*
6163 * Mark original page as cleaning
6164 * in place.
6165 */
6166 dst_page->vmp_cleaning = TRUE;
6167 dst_page->vmp_precious = FALSE;
6168 } else {
6169 /*
6170 * use pageclean setup, it is more
6171 * convenient even for the pageout
6172 * cases here
6173 */
6174 vm_object_lock(upl->map_object);
6175 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6176 vm_object_unlock(upl->map_object);
6177
6178 alias_page->vmp_absent = FALSE;
6179 alias_page = NULL;
6180 }
6181 if (dirty) {
6182 SET_PAGE_DIRTY(dst_page, FALSE);
6183 } else {
6184 dst_page->vmp_dirty = FALSE;
6185 }
6186
6187 if (!dirty) {
6188 dst_page->vmp_precious = TRUE;
6189 }
6190
6191 if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6192 if (!VM_PAGE_WIRED(dst_page)) {
6193 dst_page->vmp_free_when_done = TRUE;
6194 }
6195 }
6196 } else {
6197 if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != last_copy_object) {
6198 /*
6199 * Honor copy-on-write obligations
6200 *
6201 * The copy object has changed since we
6202 * last synchronized for copy-on-write.
6203 * Another copy object might have been
6204 * inserted while we released the object's
6205 * lock. Since someone could have seen the
6206 * original contents of the remaining pages
6207 * through that new object, we have to
6208 * synchronize with it again for the remaining
6209 * pages only. The previous pages are "busy"
6210 * so they can not be seen through the new
6211 * mapping. The new mapping will see our
6212 * upcoming changes for those previous pages,
6213 * but that's OK since they couldn't see what
6214 * was there before. It's just a race anyway
6215 * and there's no guarantee of consistency or
6216 * atomicity. We just don't want new mappings
6217 * to see both the *before* and *after* pages.
6218 */
6219 if (object->vo_copy != VM_OBJECT_NULL) {
6220 vm_object_update(
6221 object,
6222 dst_offset,/* current offset */
6223 xfer_size, /* remaining size */
6224 NULL,
6225 NULL,
6226 FALSE, /* should_return */
6227 MEMORY_OBJECT_COPY_SYNC,
6228 VM_PROT_NO_CHANGE);
6229
6230 VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6231 VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6232 }
6233 /*
6234 * remember the copy object we synced with
6235 */
6236 last_copy_object = object->vo_copy;
6237 }
6238 dst_page = vm_page_lookup(object, dst_offset);
6239
6240 if (dst_page != VM_PAGE_NULL) {
6241 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6242 /*
6243 * skip over pages already present in the cache
6244 */
6245 if (user_page_list) {
6246 user_page_list[entry].phys_addr = 0;
6247 }
6248
6249 goto try_next_page;
6250 }
6251 if (dst_page->vmp_fictitious) {
6252 panic("need corner case for fictitious page");
6253 }
6254
6255 if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6256 /*
6257 * someone else is playing with the
6258 * page. We will have to wait.
6259 */
6260 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6261
6262 continue;
6263 }
6264 if (dst_page->vmp_laundry) {
6265 vm_pageout_steal_laundry(dst_page, FALSE);
6266 }
6267 } else {
6268 if (object->private) {
6269 /*
6270 * This is a nasty wrinkle for users
6271 * of upl who encounter device or
6272 * private memory however, it is
6273 * unavoidable, only a fault can
6274 * resolve the actual backing
6275 * physical page by asking the
6276 * backing device.
6277 */
6278 if (user_page_list) {
6279 user_page_list[entry].phys_addr = 0;
6280 }
6281
6282 goto try_next_page;
6283 }
6284 if (object->scan_collisions) {
6285 /*
6286 * the pageout_scan thread is trying to steal
6287 * pages from this object, but has run into our
6288 * lock... grab 2 pages from the head of the object...
6289 * the first is freed on behalf of pageout_scan, the
6290 * 2nd is for our own use... we use vm_object_page_grab
6291 * in both cases to avoid taking pages from the free
6292 * list since we are under memory pressure and our
6293 * lock on this object is getting in the way of
6294 * relieving it
6295 */
6296 dst_page = vm_object_page_grab(object);
6297
6298 if (dst_page != VM_PAGE_NULL) {
6299 vm_page_release(dst_page,
6300 FALSE);
6301 }
6302
6303 dst_page = vm_object_page_grab(object);
6304 }
6305 if (dst_page == VM_PAGE_NULL) {
6306 /*
6307 * need to allocate a page
6308 */
6309 dst_page = vm_page_grab_options(grab_options);
6310 if (dst_page != VM_PAGE_NULL) {
6311 page_grab_count++;
6312 }
6313 }
6314 if (dst_page == VM_PAGE_NULL) {
6315 if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6316 /*
6317 * we don't want to stall waiting for pages to come onto the free list
6318 * while we're already holding absent pages in this UPL
6319 * the caller will deal with the empty slots
6320 */
6321 if (user_page_list) {
6322 user_page_list[entry].phys_addr = 0;
6323 }
6324
6325 goto try_next_page;
6326 }
6327 /*
6328 * no pages available... wait
6329 * then try again for the same
6330 * offset...
6331 */
6332 vm_object_unlock(object);
6333
6334 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6335
6336 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6337
6338 VM_PAGE_WAIT();
6339 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6340
6341 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6342
6343 vm_object_lock(object);
6344
6345 continue;
6346 }
6347 vm_page_insert(dst_page, object, dst_offset);
6348
6349 dst_page->vmp_absent = TRUE;
6350 dst_page->vmp_busy = FALSE;
6351
6352 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6353 /*
6354 * if UPL_RET_ONLY_ABSENT was specified,
6355 * than we're definitely setting up a
6356 * upl for a clustered read/pagein
6357 * operation... mark the pages as clustered
6358 * so upl_commit_range can put them on the
6359 * speculative list
6360 */
6361 dst_page->vmp_clustered = TRUE;
6362
6363 if (!(cntrl_flags & UPL_FILE_IO)) {
6364 counter_inc(&vm_statistics_pageins);
6365 }
6366 }
6367 }
6368 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6369
6370 dst_page->vmp_overwriting = TRUE;
6371
6372 if (dst_page->vmp_pmapped) {
6373 if (!(cntrl_flags & UPL_FILE_IO)) {
6374 /*
6375 * eliminate all mappings from the
6376 * original object and its prodigy
6377 */
6378 refmod_state = pmap_disconnect(phys_page);
6379 } else {
6380 refmod_state = pmap_get_refmod(phys_page);
6381 }
6382 } else {
6383 refmod_state = 0;
6384 }
6385
6386 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6387 dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6388
6389 if (cntrl_flags & UPL_SET_LITE) {
6390 unsigned int pg_num;
6391
6392 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6393 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6394 bitmap_set(upl->lite_list, pg_num);
6395
6396 if (hw_dirty) {
6397 pmap_clear_modify(phys_page);
6398 }
6399
6400 /*
6401 * Mark original page as cleaning
6402 * in place.
6403 */
6404 dst_page->vmp_cleaning = TRUE;
6405 dst_page->vmp_precious = FALSE;
6406 } else {
6407 /*
6408 * use pageclean setup, it is more
6409 * convenient even for the pageout
6410 * cases here
6411 */
6412 vm_object_lock(upl->map_object);
6413 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6414 vm_object_unlock(upl->map_object);
6415
6416 alias_page->vmp_absent = FALSE;
6417 alias_page = NULL;
6418 }
6419
6420 if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6421 upl->flags &= ~UPL_CLEAR_DIRTY;
6422 upl->flags |= UPL_SET_DIRTY;
6423 dirty = TRUE;
6424 /*
6425 * Page belonging to a code-signed object is about to
6426 * be written. Mark it tainted and disconnect it from
6427 * all pmaps so processes have to fault it back in and
6428 * deal with the tainted bit.
6429 */
6430 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6431 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6432 vm_page_upl_tainted++;
6433 if (dst_page->vmp_pmapped) {
6434 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6435 if (refmod_state & VM_MEM_REFERENCED) {
6436 dst_page->vmp_reference = TRUE;
6437 }
6438 }
6439 }
6440 } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6441 /*
6442 * clean in place for read implies
6443 * that a write will be done on all
6444 * the pages that are dirty before
6445 * a upl commit is done. The caller
6446 * is obligated to preserve the
6447 * contents of all pages marked dirty
6448 */
6449 upl->flags |= UPL_CLEAR_DIRTY;
6450 }
6451 dst_page->vmp_dirty = dirty;
6452
6453 if (!dirty) {
6454 dst_page->vmp_precious = TRUE;
6455 }
6456
6457 if (!VM_PAGE_WIRED(dst_page)) {
6458 /*
6459 * deny access to the target page while
6460 * it is being worked on
6461 */
6462 dst_page->vmp_busy = TRUE;
6463 } else {
6464 dwp->dw_mask |= DW_vm_page_wire;
6465 }
6466
6467 /*
6468 * We might be about to satisfy a fault which has been
6469 * requested. So no need for the "restart" bit.
6470 */
6471 dst_page->vmp_restart = FALSE;
6472 if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6473 /*
6474 * expect the page to be used
6475 */
6476 dwp->dw_mask |= DW_set_reference;
6477 }
6478 if (cntrl_flags & UPL_PRECIOUS) {
6479 if (object->internal) {
6480 SET_PAGE_DIRTY(dst_page, FALSE);
6481 dst_page->vmp_precious = FALSE;
6482 } else {
6483 dst_page->vmp_precious = TRUE;
6484 }
6485 } else {
6486 dst_page->vmp_precious = FALSE;
6487 }
6488 }
6489 if (dst_page->vmp_busy) {
6490 upl->flags |= UPL_HAS_BUSY;
6491 }
6492
6493 if (phys_page > upl->highest_page) {
6494 upl->highest_page = phys_page;
6495 }
6496 assert(!pmap_is_noencrypt(phys_page));
6497 if (user_page_list) {
6498 user_page_list[entry].phys_addr = phys_page;
6499 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
6500 user_page_list[entry].absent = dst_page->vmp_absent;
6501 user_page_list[entry].dirty = dst_page->vmp_dirty;
6502 user_page_list[entry].precious = dst_page->vmp_precious;
6503 user_page_list[entry].device = FALSE;
6504 user_page_list[entry].needed = FALSE;
6505 if (dst_page->vmp_clustered == TRUE) {
6506 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6507 } else {
6508 user_page_list[entry].speculative = FALSE;
6509 }
6510 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6511 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6512 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6513 user_page_list[entry].mark = FALSE;
6514 }
6515 /*
6516 * if UPL_RET_ONLY_ABSENT is set, then
6517 * we are working with a fresh page and we've
6518 * just set the clustered flag on it to
6519 * indicate that it was drug in as part of a
6520 * speculative cluster... so leave it alone
6521 */
6522 if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6523 /*
6524 * someone is explicitly grabbing this page...
6525 * update clustered and speculative state
6526 *
6527 */
6528 if (dst_page->vmp_clustered) {
6529 VM_PAGE_CONSUME_CLUSTERED(dst_page);
6530 }
6531 }
6532 try_next_page:
6533 if (dwp->dw_mask) {
6534 if (dwp->dw_mask & DW_vm_page_activate) {
6535 counter_inc(&vm_statistics_reactivations);
6536 }
6537
6538 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6539
6540 if (dw_count >= dw_limit) {
6541 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6542
6543 dwp = dwp_start;
6544 dw_count = 0;
6545 }
6546 }
6547 entry++;
6548 dst_offset += PAGE_SIZE_64;
6549 xfer_size -= PAGE_SIZE;
6550 }
6551 if (dw_count) {
6552 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6553 dwp = dwp_start;
6554 dw_count = 0;
6555 }
6556
6557 if (alias_page != NULL) {
6558 VM_PAGE_FREE(alias_page);
6559 }
6560 if (pmap_flushes_delayed == TRUE) {
6561 pmap_flush(&pmap_flush_context_storage);
6562 }
6563
6564 if (page_list_count != NULL) {
6565 if (upl->flags & UPL_INTERNAL) {
6566 *page_list_count = 0;
6567 } else if (*page_list_count > entry) {
6568 *page_list_count = entry;
6569 }
6570 }
6571 #if UPL_DEBUG
6572 upl->upl_state = 1;
6573 #endif
6574 vm_object_unlock(object);
6575
6576 VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6577 #if DEVELOPMENT || DEBUG
6578 if (task != NULL) {
6579 ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
6580 }
6581 #endif /* DEVELOPMENT || DEBUG */
6582
6583 if (dwp_start && dwp_finish_ctx) {
6584 vm_page_delayed_work_finish_ctx(dwp_start);
6585 dwp_start = dwp = NULL;
6586 }
6587
6588 return KERN_SUCCESS;
6589 }
6590
6591 /*
6592 * Routine: vm_object_super_upl_request
6593 * Purpose:
6594 * Cause the population of a portion of a vm_object
6595 * in much the same way as memory_object_upl_request.
6596 * Depending on the nature of the request, the pages
6597 * returned may be contain valid data or be uninitialized.
6598 * However, the region may be expanded up to the super
6599 * cluster size provided.
6600 */
6601
6602 __private_extern__ kern_return_t
vm_object_super_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_size_t super_cluster,upl_t * upl,upl_page_info_t * user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)6603 vm_object_super_upl_request(
6604 vm_object_t object,
6605 vm_object_offset_t offset,
6606 upl_size_t size,
6607 upl_size_t super_cluster,
6608 upl_t *upl,
6609 upl_page_info_t *user_page_list,
6610 unsigned int *page_list_count,
6611 upl_control_flags_t cntrl_flags,
6612 vm_tag_t tag)
6613 {
6614 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
6615 return KERN_FAILURE;
6616 }
6617
6618 assert(object->paging_in_progress);
6619 offset = offset - object->paging_offset;
6620
6621 if (super_cluster > size) {
6622 vm_object_offset_t base_offset;
6623 upl_size_t super_size;
6624 vm_object_size_t super_size_64;
6625
6626 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6627 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
6628 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6629 super_size = (upl_size_t) super_size_64;
6630 assert(super_size == super_size_64);
6631
6632 if (offset > (base_offset + super_size)) {
6633 panic("vm_object_super_upl_request: Missed target pageout"
6634 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6635 offset, base_offset, super_size, super_cluster,
6636 size, object->paging_offset);
6637 }
6638 /*
6639 * apparently there is a case where the vm requests a
6640 * page to be written out who's offset is beyond the
6641 * object size
6642 */
6643 if ((offset + size) > (base_offset + super_size)) {
6644 super_size_64 = (offset + size) - base_offset;
6645 super_size = (upl_size_t) super_size_64;
6646 assert(super_size == super_size_64);
6647 }
6648
6649 offset = base_offset;
6650 size = super_size;
6651 }
6652 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6653 }
6654
6655 int cs_executable_create_upl = 0;
6656 extern int proc_selfpid(void);
6657 extern char *proc_name_address(void *p);
6658
6659 kern_return_t
vm_map_create_upl(vm_map_t map,vm_map_address_t offset,upl_size_t * upl_size,upl_t * upl,upl_page_info_array_t page_list,unsigned int * count,upl_control_flags_t * flags,vm_tag_t tag)6660 vm_map_create_upl(
6661 vm_map_t map,
6662 vm_map_address_t offset,
6663 upl_size_t *upl_size,
6664 upl_t *upl,
6665 upl_page_info_array_t page_list,
6666 unsigned int *count,
6667 upl_control_flags_t *flags,
6668 vm_tag_t tag)
6669 {
6670 vm_map_entry_t entry;
6671 upl_control_flags_t caller_flags;
6672 int force_data_sync;
6673 int sync_cow_data;
6674 vm_object_t local_object;
6675 vm_map_offset_t local_offset;
6676 vm_map_offset_t local_start;
6677 kern_return_t ret;
6678 vm_map_address_t original_offset;
6679 vm_map_size_t original_size, adjusted_size;
6680 vm_map_offset_t local_entry_start;
6681 vm_object_offset_t local_entry_offset;
6682 vm_object_offset_t offset_in_mapped_page;
6683 boolean_t release_map = FALSE;
6684
6685 start_with_map:
6686
6687 original_offset = offset;
6688 original_size = *upl_size;
6689 adjusted_size = original_size;
6690
6691 caller_flags = *flags;
6692
6693 if (caller_flags & ~UPL_VALID_FLAGS) {
6694 /*
6695 * For forward compatibility's sake,
6696 * reject any unknown flag.
6697 */
6698 ret = KERN_INVALID_VALUE;
6699 goto done;
6700 }
6701 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6702 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6703
6704 if (upl == NULL) {
6705 ret = KERN_INVALID_ARGUMENT;
6706 goto done;
6707 }
6708
6709 REDISCOVER_ENTRY:
6710 vm_map_lock_read(map);
6711
6712 if (!vm_map_lookup_entry(map, offset, &entry)) {
6713 vm_map_unlock_read(map);
6714 ret = KERN_FAILURE;
6715 goto done;
6716 }
6717
6718 local_entry_start = entry->vme_start;
6719 local_entry_offset = VME_OFFSET(entry);
6720
6721 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6722 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6723 }
6724
6725 if (entry->vme_end - original_offset < adjusted_size) {
6726 adjusted_size = entry->vme_end - original_offset;
6727 assert(adjusted_size > 0);
6728 *upl_size = (upl_size_t) adjusted_size;
6729 assert(*upl_size == adjusted_size);
6730 }
6731
6732 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6733 *flags = 0;
6734
6735 if (!entry->is_sub_map &&
6736 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6737 if (VME_OBJECT(entry)->private) {
6738 *flags = UPL_DEV_MEMORY;
6739 }
6740
6741 if (VME_OBJECT(entry)->phys_contiguous) {
6742 *flags |= UPL_PHYS_CONTIG;
6743 }
6744 }
6745 vm_map_unlock_read(map);
6746 ret = KERN_SUCCESS;
6747 goto done;
6748 }
6749
6750 offset_in_mapped_page = 0;
6751 if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6752 offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6753 *upl_size = (upl_size_t)
6754 (vm_map_round_page(original_offset + adjusted_size,
6755 VM_MAP_PAGE_MASK(map))
6756 - offset);
6757
6758 offset_in_mapped_page = original_offset - offset;
6759 assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6760
6761 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6762 }
6763
6764 if (!entry->is_sub_map) {
6765 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6766 !VME_OBJECT(entry)->phys_contiguous) {
6767 if (*upl_size > MAX_UPL_SIZE_BYTES) {
6768 *upl_size = MAX_UPL_SIZE_BYTES;
6769 }
6770 }
6771
6772 /*
6773 * Create an object if necessary.
6774 */
6775 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6776 if (vm_map_lock_read_to_write(map)) {
6777 goto REDISCOVER_ENTRY;
6778 }
6779
6780 VME_OBJECT_SET(entry,
6781 vm_object_allocate((vm_size_t)
6782 vm_object_round_page((entry->vme_end - entry->vme_start))),
6783 false, 0);
6784 VME_OFFSET_SET(entry, 0);
6785 assert(entry->use_pmap);
6786
6787 vm_map_lock_write_to_read(map);
6788 }
6789
6790 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6791 !(entry->protection & VM_PROT_WRITE)) {
6792 vm_map_unlock_read(map);
6793 ret = KERN_PROTECTION_FAILURE;
6794 goto done;
6795 }
6796 }
6797
6798 #if !XNU_TARGET_OS_OSX
6799 if (map->pmap != kernel_pmap &&
6800 (caller_flags & UPL_COPYOUT_FROM) &&
6801 (entry->protection & VM_PROT_EXECUTE) &&
6802 !(entry->protection & VM_PROT_WRITE)) {
6803 vm_offset_t kaddr;
6804 vm_size_t ksize;
6805
6806 /*
6807 * We're about to create a read-only UPL backed by
6808 * memory from an executable mapping.
6809 * Wiring the pages would result in the pages being copied
6810 * (due to the "MAP_PRIVATE" mapping) and no longer
6811 * code-signed, so no longer eligible for execution.
6812 * Instead, let's copy the data into a kernel buffer and
6813 * create the UPL from this kernel buffer.
6814 * The kernel buffer is then freed, leaving the UPL holding
6815 * the last reference on the VM object, so the memory will
6816 * be released when the UPL is committed.
6817 */
6818
6819 vm_map_unlock_read(map);
6820 entry = VM_MAP_ENTRY_NULL;
6821 /* allocate kernel buffer */
6822 ksize = round_page(*upl_size);
6823 kaddr = 0;
6824 ret = kmem_alloc(kernel_map, &kaddr, ksize,
6825 KMA_PAGEABLE | KMA_DATA, tag);
6826 if (ret == KERN_SUCCESS) {
6827 /* copyin the user data */
6828 ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6829 }
6830 if (ret == KERN_SUCCESS) {
6831 if (ksize > *upl_size) {
6832 /* zero out the extra space in kernel buffer */
6833 memset((void *)(kaddr + *upl_size),
6834 0,
6835 ksize - *upl_size);
6836 }
6837 /* create the UPL from the kernel buffer */
6838 vm_object_offset_t offset_in_object;
6839 vm_object_offset_t offset_in_object_page;
6840
6841 offset_in_object = offset - local_entry_start + local_entry_offset;
6842 offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6843 assert(offset_in_object_page < PAGE_SIZE);
6844 assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6845 *upl_size -= offset_in_object_page + offset_in_mapped_page;
6846 ret = vm_map_create_upl(kernel_map,
6847 (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6848 upl_size, upl, page_list, count, flags, tag);
6849 }
6850 if (kaddr != 0) {
6851 /* free the kernel buffer */
6852 kmem_free(kernel_map, kaddr, ksize);
6853 kaddr = 0;
6854 ksize = 0;
6855 }
6856 #if DEVELOPMENT || DEBUG
6857 DTRACE_VM4(create_upl_from_executable,
6858 vm_map_t, map,
6859 vm_map_address_t, offset,
6860 upl_size_t, *upl_size,
6861 kern_return_t, ret);
6862 #endif /* DEVELOPMENT || DEBUG */
6863 goto done;
6864 }
6865 #endif /* !XNU_TARGET_OS_OSX */
6866
6867 if (!entry->is_sub_map) {
6868 local_object = VME_OBJECT(entry);
6869 assert(local_object != VM_OBJECT_NULL);
6870 }
6871
6872 if (!entry->is_sub_map &&
6873 !entry->needs_copy &&
6874 *upl_size != 0 &&
6875 local_object->vo_size > *upl_size && /* partial UPL */
6876 entry->wired_count == 0 && /* No COW for entries that are wired */
6877 (map->pmap != kernel_pmap) && /* alias checks */
6878 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6879 ||
6880 ( /* case 2 */
6881 local_object->internal &&
6882 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6883 local_object->ref_count > 1))) {
6884 vm_prot_t prot;
6885
6886 /*
6887 * Case 1:
6888 * Set up the targeted range for copy-on-write to avoid
6889 * applying true_share/copy_delay to the entire object.
6890 *
6891 * Case 2:
6892 * This map entry covers only part of an internal
6893 * object. There could be other map entries covering
6894 * other areas of this object and some of these map
6895 * entries could be marked as "needs_copy", which
6896 * assumes that the object is COPY_SYMMETRIC.
6897 * To avoid marking this object as COPY_DELAY and
6898 * "true_share", let's shadow it and mark the new
6899 * (smaller) object as "true_share" and COPY_DELAY.
6900 */
6901
6902 if (vm_map_lock_read_to_write(map)) {
6903 goto REDISCOVER_ENTRY;
6904 }
6905 vm_map_lock_assert_exclusive(map);
6906 assert(VME_OBJECT(entry) == local_object);
6907
6908 vm_map_clip_start(map,
6909 entry,
6910 vm_map_trunc_page(offset,
6911 VM_MAP_PAGE_MASK(map)));
6912 vm_map_clip_end(map,
6913 entry,
6914 vm_map_round_page(offset + *upl_size,
6915 VM_MAP_PAGE_MASK(map)));
6916 if ((entry->vme_end - offset) < *upl_size) {
6917 *upl_size = (upl_size_t) (entry->vme_end - offset);
6918 assert(*upl_size == entry->vme_end - offset);
6919 }
6920
6921 prot = entry->protection & ~VM_PROT_WRITE;
6922 if (override_nx(map, VME_ALIAS(entry)) && prot) {
6923 prot |= VM_PROT_EXECUTE;
6924 }
6925 vm_object_pmap_protect(local_object,
6926 VME_OFFSET(entry),
6927 entry->vme_end - entry->vme_start,
6928 ((entry->is_shared ||
6929 map->mapped_in_other_pmaps)
6930 ? PMAP_NULL
6931 : map->pmap),
6932 VM_MAP_PAGE_SIZE(map),
6933 entry->vme_start,
6934 prot);
6935
6936 assert(entry->wired_count == 0);
6937
6938 /*
6939 * Lock the VM object and re-check its status: if it's mapped
6940 * in another address space, we could still be racing with
6941 * another thread holding that other VM map exclusively.
6942 */
6943 vm_object_lock(local_object);
6944 if (local_object->true_share) {
6945 /* object is already in proper state: no COW needed */
6946 assert(local_object->copy_strategy !=
6947 MEMORY_OBJECT_COPY_SYMMETRIC);
6948 } else {
6949 /* not true_share: ask for copy-on-write below */
6950 assert(local_object->copy_strategy ==
6951 MEMORY_OBJECT_COPY_SYMMETRIC);
6952 entry->needs_copy = TRUE;
6953 }
6954 vm_object_unlock(local_object);
6955
6956 vm_map_lock_write_to_read(map);
6957 }
6958
6959 if (entry->needs_copy) {
6960 /*
6961 * Honor copy-on-write for COPY_SYMMETRIC
6962 * strategy.
6963 */
6964 vm_map_t local_map;
6965 vm_object_t object;
6966 vm_object_offset_t new_offset;
6967 vm_prot_t prot;
6968 boolean_t wired;
6969 vm_map_version_t version;
6970 vm_map_t real_map;
6971 vm_prot_t fault_type;
6972
6973 local_map = map;
6974
6975 if (caller_flags & UPL_COPYOUT_FROM) {
6976 fault_type = VM_PROT_READ | VM_PROT_COPY;
6977 vm_counters.create_upl_extra_cow++;
6978 vm_counters.create_upl_extra_cow_pages +=
6979 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6980 } else {
6981 fault_type = VM_PROT_WRITE;
6982 }
6983 if (vm_map_lookup_and_lock_object(&local_map,
6984 offset, fault_type,
6985 OBJECT_LOCK_EXCLUSIVE,
6986 &version, &object,
6987 &new_offset, &prot, &wired,
6988 NULL,
6989 &real_map, NULL) != KERN_SUCCESS) {
6990 if (fault_type == VM_PROT_WRITE) {
6991 vm_counters.create_upl_lookup_failure_write++;
6992 } else {
6993 vm_counters.create_upl_lookup_failure_copy++;
6994 }
6995 vm_map_unlock_read(local_map);
6996 ret = KERN_FAILURE;
6997 goto done;
6998 }
6999 if (real_map != local_map) {
7000 vm_map_unlock(real_map);
7001 }
7002 vm_map_unlock_read(local_map);
7003
7004 vm_object_unlock(object);
7005
7006 goto REDISCOVER_ENTRY;
7007 }
7008
7009 if (entry->is_sub_map) {
7010 vm_map_t submap;
7011
7012 submap = VME_SUBMAP(entry);
7013 local_start = entry->vme_start;
7014 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7015
7016 vm_map_reference(submap);
7017 vm_map_unlock_read(map);
7018
7019 DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
7020 offset += offset_in_mapped_page;
7021 *upl_size -= offset_in_mapped_page;
7022
7023 if (release_map) {
7024 vm_map_deallocate(map);
7025 }
7026 map = submap;
7027 release_map = TRUE;
7028 offset = local_offset + (offset - local_start);
7029 goto start_with_map;
7030 }
7031
7032 if (sync_cow_data &&
7033 (VME_OBJECT(entry)->shadow ||
7034 VME_OBJECT(entry)->vo_copy)) {
7035 local_object = VME_OBJECT(entry);
7036 local_start = entry->vme_start;
7037 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7038
7039 vm_object_reference(local_object);
7040 vm_map_unlock_read(map);
7041
7042 if (local_object->shadow && local_object->vo_copy) {
7043 vm_object_lock_request(local_object->shadow,
7044 ((vm_object_offset_t)
7045 ((offset - local_start) +
7046 local_offset) +
7047 local_object->vo_shadow_offset),
7048 *upl_size, FALSE,
7049 MEMORY_OBJECT_DATA_SYNC,
7050 VM_PROT_NO_CHANGE);
7051 }
7052 sync_cow_data = FALSE;
7053 vm_object_deallocate(local_object);
7054
7055 goto REDISCOVER_ENTRY;
7056 }
7057 if (force_data_sync) {
7058 local_object = VME_OBJECT(entry);
7059 local_start = entry->vme_start;
7060 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7061
7062 vm_object_reference(local_object);
7063 vm_map_unlock_read(map);
7064
7065 vm_object_lock_request(local_object,
7066 ((vm_object_offset_t)
7067 ((offset - local_start) +
7068 local_offset)),
7069 (vm_object_size_t)*upl_size,
7070 FALSE,
7071 MEMORY_OBJECT_DATA_SYNC,
7072 VM_PROT_NO_CHANGE);
7073
7074 force_data_sync = FALSE;
7075 vm_object_deallocate(local_object);
7076
7077 goto REDISCOVER_ENTRY;
7078 }
7079 if (VME_OBJECT(entry)->private) {
7080 *flags = UPL_DEV_MEMORY;
7081 } else {
7082 *flags = 0;
7083 }
7084
7085 if (VME_OBJECT(entry)->phys_contiguous) {
7086 *flags |= UPL_PHYS_CONTIG;
7087 }
7088
7089 local_object = VME_OBJECT(entry);
7090 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7091 local_start = entry->vme_start;
7092
7093 /*
7094 * Wiring will copy the pages to the shadow object.
7095 * The shadow object will not be code-signed so
7096 * attempting to execute code from these copied pages
7097 * would trigger a code-signing violation.
7098 */
7099 if (entry->protection & VM_PROT_EXECUTE) {
7100 #if MACH_ASSERT
7101 printf("pid %d[%s] create_upl out of executable range from "
7102 "0x%llx to 0x%llx: side effects may include "
7103 "code-signing violations later on\n",
7104 proc_selfpid(),
7105 (get_bsdtask_info(current_task())
7106 ? proc_name_address(get_bsdtask_info(current_task()))
7107 : "?"),
7108 (uint64_t) entry->vme_start,
7109 (uint64_t) entry->vme_end);
7110 #endif /* MACH_ASSERT */
7111 DTRACE_VM2(cs_executable_create_upl,
7112 uint64_t, (uint64_t)entry->vme_start,
7113 uint64_t, (uint64_t)entry->vme_end);
7114 cs_executable_create_upl++;
7115 }
7116
7117 vm_object_lock(local_object);
7118
7119 /*
7120 * Ensure that this object is "true_share" and "copy_delay" now,
7121 * while we're still holding the VM map lock. After we unlock the map,
7122 * anything could happen to that mapping, including some copy-on-write
7123 * activity. We need to make sure that the IOPL will point at the
7124 * same memory as the mapping.
7125 */
7126 if (local_object->true_share) {
7127 assert(local_object->copy_strategy !=
7128 MEMORY_OBJECT_COPY_SYMMETRIC);
7129 } else if (!is_kernel_object(local_object) &&
7130 local_object != compressor_object &&
7131 !local_object->phys_contiguous) {
7132 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7133 if (!local_object->true_share &&
7134 vm_object_tracking_btlog) {
7135 btlog_record(vm_object_tracking_btlog, local_object,
7136 VM_OBJECT_TRACKING_OP_TRUESHARE,
7137 btref_get(__builtin_frame_address(0), 0));
7138 }
7139 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7140 local_object->true_share = TRUE;
7141 if (local_object->copy_strategy ==
7142 MEMORY_OBJECT_COPY_SYMMETRIC) {
7143 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7144 }
7145 }
7146
7147 vm_object_reference_locked(local_object);
7148 vm_object_unlock(local_object);
7149
7150 vm_map_unlock_read(map);
7151
7152 offset += offset_in_mapped_page;
7153 assert(*upl_size > offset_in_mapped_page);
7154 *upl_size -= offset_in_mapped_page;
7155
7156 ret = vm_object_iopl_request(local_object,
7157 ((vm_object_offset_t)
7158 ((offset - local_start) + local_offset)),
7159 *upl_size,
7160 upl,
7161 page_list,
7162 count,
7163 caller_flags,
7164 tag);
7165 vm_object_deallocate(local_object);
7166
7167 done:
7168 if (release_map) {
7169 vm_map_deallocate(map);
7170 }
7171
7172 return ret;
7173 }
7174
7175 /*
7176 * Internal routine to enter a UPL into a VM map.
7177 *
7178 * JMM - This should just be doable through the standard
7179 * vm_map_enter() API.
7180 */
7181 kern_return_t
vm_map_enter_upl_range(vm_map_t map,upl_t upl,vm_object_offset_t offset_to_map,upl_size_t size_to_map,vm_prot_t prot_to_map,vm_map_offset_t * dst_addr)7182 vm_map_enter_upl_range(
7183 vm_map_t map,
7184 upl_t upl,
7185 vm_object_offset_t offset_to_map,
7186 upl_size_t size_to_map,
7187 vm_prot_t prot_to_map,
7188 vm_map_offset_t *dst_addr)
7189 {
7190 vm_map_size_t size;
7191 vm_object_offset_t offset;
7192 vm_map_offset_t addr;
7193 vm_page_t m;
7194 kern_return_t kr;
7195 int isVectorUPL = 0, curr_upl = 0;
7196 upl_t vector_upl = NULL;
7197 mach_vm_offset_t vector_upl_dst_addr = 0;
7198 vm_map_t vector_upl_submap = NULL;
7199 upl_offset_t subupl_offset = 0;
7200 upl_size_t subupl_size = 0;
7201
7202 if (upl == UPL_NULL) {
7203 return KERN_INVALID_ARGUMENT;
7204 }
7205
7206 DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7207 assert(map == kernel_map);
7208
7209 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7210 int mapped = 0, valid_upls = 0;
7211 vector_upl = upl;
7212
7213 upl_lock(vector_upl);
7214 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7215 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7216 if (upl == NULL) {
7217 continue;
7218 }
7219 valid_upls++;
7220 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7221 mapped++;
7222 }
7223 }
7224
7225 if (mapped) {
7226 if (mapped != valid_upls) {
7227 panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7228 } else {
7229 upl_unlock(vector_upl);
7230 return KERN_FAILURE;
7231 }
7232 }
7233
7234 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7235 panic("TODO4K: vector UPL not implemented");
7236 }
7237
7238 vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7239 vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7240 VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7241 VM_KERN_MEMORY_NONE).kmr_submap;
7242 map = vector_upl_submap;
7243 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7244 curr_upl = 0;
7245 } else {
7246 upl_lock(upl);
7247 }
7248
7249 process_upl_to_enter:
7250 if (isVectorUPL) {
7251 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7252 *dst_addr = vector_upl_dst_addr;
7253 upl_unlock(vector_upl);
7254 return KERN_SUCCESS;
7255 }
7256 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7257 if (upl == NULL) {
7258 goto process_upl_to_enter;
7259 }
7260
7261 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7262 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7263 } else {
7264 /*
7265 * check to see if already mapped
7266 */
7267 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7268 upl_unlock(upl);
7269 return KERN_FAILURE;
7270 }
7271 }
7272
7273 if ((!(upl->flags & UPL_SHADOWED)) &&
7274 ((upl->flags & UPL_HAS_BUSY) ||
7275 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7276 vm_object_t object;
7277 vm_page_t alias_page;
7278 vm_object_offset_t new_offset;
7279 unsigned int pg_num;
7280
7281 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7282 object = upl->map_object;
7283 upl->map_object = vm_object_allocate(vm_object_round_page(size));
7284
7285 vm_object_lock(upl->map_object);
7286
7287 upl->map_object->shadow = object;
7288 upl->map_object->pageout = TRUE;
7289 upl->map_object->can_persist = FALSE;
7290 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7291 upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7292 assertf(page_aligned(upl->map_object->vo_shadow_offset),
7293 "object %p shadow_offset 0x%llx",
7294 upl->map_object,
7295 (uint64_t)upl->map_object->vo_shadow_offset);
7296 upl->map_object->wimg_bits = object->wimg_bits;
7297 offset = upl->map_object->vo_shadow_offset;
7298 new_offset = 0;
7299
7300 upl->flags |= UPL_SHADOWED;
7301
7302 while (size) {
7303 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7304 assert(pg_num == new_offset / PAGE_SIZE);
7305
7306 if (bitmap_test(upl->lite_list, pg_num)) {
7307 alias_page = vm_page_grab_fictitious(TRUE);
7308
7309 vm_object_lock(object);
7310
7311 m = vm_page_lookup(object, offset);
7312 if (m == VM_PAGE_NULL) {
7313 panic("vm_upl_map: page missing");
7314 }
7315
7316 /*
7317 * Convert the fictitious page to a private
7318 * shadow of the real page.
7319 */
7320 assert(alias_page->vmp_fictitious);
7321 alias_page->vmp_fictitious = FALSE;
7322 alias_page->vmp_private = TRUE;
7323 alias_page->vmp_free_when_done = TRUE;
7324 /*
7325 * since m is a page in the upl it must
7326 * already be wired or BUSY, so it's
7327 * safe to assign the underlying physical
7328 * page to the alias
7329 */
7330 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7331
7332 vm_object_unlock(object);
7333
7334 vm_page_lockspin_queues();
7335 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7336 vm_page_unlock_queues();
7337
7338 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7339
7340 assert(!alias_page->vmp_wanted);
7341 alias_page->vmp_busy = FALSE;
7342 alias_page->vmp_absent = FALSE;
7343 }
7344 size -= PAGE_SIZE;
7345 offset += PAGE_SIZE_64;
7346 new_offset += PAGE_SIZE_64;
7347 }
7348 vm_object_unlock(upl->map_object);
7349 }
7350 if (upl->flags & UPL_SHADOWED) {
7351 if (isVectorUPL) {
7352 offset = 0;
7353 } else {
7354 offset = offset_to_map;
7355 }
7356 } else {
7357 offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7358 if (!isVectorUPL) {
7359 offset += offset_to_map;
7360 }
7361 }
7362
7363 if (isVectorUPL) {
7364 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7365 } else {
7366 size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7367 }
7368
7369 vm_object_reference(upl->map_object);
7370
7371 if (!isVectorUPL) {
7372 *dst_addr = 0;
7373 /*
7374 * NEED A UPL_MAP ALIAS
7375 */
7376 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7377 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7378 upl->map_object, offset, FALSE,
7379 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7380
7381 if (kr != KERN_SUCCESS) {
7382 vm_object_deallocate(upl->map_object);
7383 upl_unlock(upl);
7384 return kr;
7385 }
7386 } else {
7387 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7388 VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7389 upl->map_object, offset, FALSE,
7390 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7391 if (kr) {
7392 panic("vm_map_enter failed for a Vector UPL");
7393 }
7394 }
7395 upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7396 /* this will have to be an increment rather than */
7397 /* an assignment. */
7398 vm_object_lock(upl->map_object);
7399
7400 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7401 m = vm_page_lookup(upl->map_object, offset);
7402
7403 if (m) {
7404 m->vmp_pmapped = TRUE;
7405
7406 /*
7407 * CODE SIGNING ENFORCEMENT: page has been wpmapped,
7408 * but only in kernel space. If this was on a user map,
7409 * we'd have to set the wpmapped bit.
7410 */
7411 /* m->vmp_wpmapped = TRUE; */
7412 assert(map->pmap == kernel_pmap);
7413
7414 kr = pmap_enter_check(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE);
7415
7416 assert(kr == KERN_SUCCESS);
7417 #if KASAN
7418 kasan_notify_address(addr, PAGE_SIZE_64);
7419 #endif
7420 }
7421 offset += PAGE_SIZE_64;
7422 }
7423 vm_object_unlock(upl->map_object);
7424
7425 /*
7426 * hold a reference for the mapping
7427 */
7428 upl->ref_count++;
7429 upl->flags |= UPL_PAGE_LIST_MAPPED;
7430 upl->kaddr = (vm_offset_t) *dst_addr;
7431 assert(upl->kaddr == *dst_addr);
7432
7433 if (isVectorUPL) {
7434 goto process_upl_to_enter;
7435 }
7436
7437 if (!isVectorUPL) {
7438 vm_map_offset_t addr_adjustment;
7439
7440 addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7441 if (addr_adjustment) {
7442 assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7443 DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7444 *dst_addr += addr_adjustment;
7445 }
7446 }
7447
7448 upl_unlock(upl);
7449
7450 return KERN_SUCCESS;
7451 }
7452
7453 kern_return_t
vm_map_enter_upl(vm_map_t map,upl_t upl,vm_map_offset_t * dst_addr)7454 vm_map_enter_upl(
7455 vm_map_t map,
7456 upl_t upl,
7457 vm_map_offset_t *dst_addr)
7458 {
7459 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7460 return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7461 }
7462
7463 /*
7464 * Internal routine to remove a UPL mapping from a VM map.
7465 *
7466 * XXX - This should just be doable through a standard
7467 * vm_map_remove() operation. Otherwise, implicit clean-up
7468 * of the target map won't be able to correctly remove
7469 * these (and release the reference on the UPL). Having
7470 * to do this means we can't map these into user-space
7471 * maps yet.
7472 */
7473 kern_return_t
vm_map_remove_upl_range(vm_map_t map,upl_t upl,__unused vm_object_offset_t offset_to_unmap,__unused upl_size_t size_to_unmap)7474 vm_map_remove_upl_range(
7475 vm_map_t map,
7476 upl_t upl,
7477 __unused vm_object_offset_t offset_to_unmap,
7478 __unused upl_size_t size_to_unmap)
7479 {
7480 vm_address_t addr;
7481 upl_size_t size;
7482 int isVectorUPL = 0, curr_upl = 0;
7483 upl_t vector_upl = NULL;
7484
7485 if (upl == UPL_NULL) {
7486 return KERN_INVALID_ARGUMENT;
7487 }
7488
7489 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7490 int unmapped = 0, valid_upls = 0;
7491 vector_upl = upl;
7492 upl_lock(vector_upl);
7493 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7494 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7495 if (upl == NULL) {
7496 continue;
7497 }
7498 valid_upls++;
7499 if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7500 unmapped++;
7501 }
7502 }
7503
7504 if (unmapped) {
7505 if (unmapped != valid_upls) {
7506 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
7507 } else {
7508 upl_unlock(vector_upl);
7509 return KERN_FAILURE;
7510 }
7511 }
7512 curr_upl = 0;
7513 } else {
7514 upl_lock(upl);
7515 }
7516
7517 process_upl_to_remove:
7518 if (isVectorUPL) {
7519 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7520 vm_map_t v_upl_submap;
7521 vm_offset_t v_upl_submap_dst_addr;
7522 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7523
7524 kmem_free_guard(map, v_upl_submap_dst_addr,
7525 vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
7526 vm_map_deallocate(v_upl_submap);
7527 upl_unlock(vector_upl);
7528 return KERN_SUCCESS;
7529 }
7530
7531 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7532 if (upl == NULL) {
7533 goto process_upl_to_remove;
7534 }
7535 }
7536
7537 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7538 addr = upl->kaddr;
7539 size = upl->u_mapped_size;
7540
7541 assert(upl->ref_count > 1);
7542 upl->ref_count--; /* removing mapping ref */
7543
7544 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7545 upl->kaddr = (vm_offset_t) 0;
7546 upl->u_mapped_size = 0;
7547
7548 if (isVectorUPL) {
7549 /*
7550 * If it's a Vectored UPL, we'll be removing the entire
7551 * submap anyways, so no need to remove individual UPL
7552 * element mappings from within the submap
7553 */
7554 goto process_upl_to_remove;
7555 }
7556
7557 upl_unlock(upl);
7558
7559 vm_map_remove(map,
7560 vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
7561 vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
7562 return KERN_SUCCESS;
7563 }
7564 upl_unlock(upl);
7565
7566 return KERN_FAILURE;
7567 }
7568
7569 kern_return_t
vm_map_remove_upl(vm_map_t map,upl_t upl)7570 vm_map_remove_upl(
7571 vm_map_t map,
7572 upl_t upl)
7573 {
7574 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7575 return vm_map_remove_upl_range(map, upl, 0, upl_size);
7576 }
7577
7578 kern_return_t
upl_commit_range(upl_t upl,upl_offset_t offset,upl_size_t size,int flags,upl_page_info_t * page_list,mach_msg_type_number_t count,boolean_t * empty)7579 upl_commit_range(
7580 upl_t upl,
7581 upl_offset_t offset,
7582 upl_size_t size,
7583 int flags,
7584 upl_page_info_t *page_list,
7585 mach_msg_type_number_t count,
7586 boolean_t *empty)
7587 {
7588 upl_size_t xfer_size, subupl_size;
7589 vm_object_t shadow_object;
7590 vm_object_t object;
7591 vm_object_t m_object;
7592 vm_object_offset_t target_offset;
7593 upl_offset_t subupl_offset = offset;
7594 int entry;
7595 int occupied;
7596 int clear_refmod = 0;
7597 int pgpgout_count = 0;
7598 struct vm_page_delayed_work dw_array;
7599 struct vm_page_delayed_work *dwp, *dwp_start;
7600 bool dwp_finish_ctx = TRUE;
7601 int dw_count;
7602 int dw_limit;
7603 int isVectorUPL = 0;
7604 upl_t vector_upl = NULL;
7605 boolean_t should_be_throttled = FALSE;
7606
7607 vm_page_t nxt_page = VM_PAGE_NULL;
7608 int fast_path_possible = 0;
7609 int fast_path_full_commit = 0;
7610 int throttle_page = 0;
7611 int unwired_count = 0;
7612 int local_queue_count = 0;
7613 vm_page_t first_local, last_local;
7614 vm_object_offset_t obj_start, obj_end, obj_offset;
7615 kern_return_t kr = KERN_SUCCESS;
7616
7617 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7618
7619 dwp_start = dwp = NULL;
7620
7621 subupl_size = size;
7622 *empty = FALSE;
7623
7624 if (upl == UPL_NULL) {
7625 return KERN_INVALID_ARGUMENT;
7626 }
7627
7628 dw_count = 0;
7629 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7630 dwp_start = vm_page_delayed_work_get_ctx();
7631 if (dwp_start == NULL) {
7632 dwp_start = &dw_array;
7633 dw_limit = 1;
7634 dwp_finish_ctx = FALSE;
7635 }
7636
7637 dwp = dwp_start;
7638
7639 if (count == 0) {
7640 page_list = NULL;
7641 }
7642
7643 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7644 vector_upl = upl;
7645 upl_lock(vector_upl);
7646 } else {
7647 upl_lock(upl);
7648 }
7649
7650 process_upl_to_commit:
7651
7652 if (isVectorUPL) {
7653 size = subupl_size;
7654 offset = subupl_offset;
7655 if (size == 0) {
7656 upl_unlock(vector_upl);
7657 kr = KERN_SUCCESS;
7658 goto done;
7659 }
7660 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7661 if (upl == NULL) {
7662 upl_unlock(vector_upl);
7663 kr = KERN_FAILURE;
7664 goto done;
7665 }
7666 page_list = upl->page_list;
7667 subupl_size -= size;
7668 subupl_offset += size;
7669 }
7670
7671 #if UPL_DEBUG
7672 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7673 upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0);
7674 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7675 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7676
7677 upl->upl_commit_index++;
7678 }
7679 #endif
7680 if (upl->flags & UPL_DEVICE_MEMORY) {
7681 xfer_size = 0;
7682 } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7683 xfer_size = size;
7684 } else {
7685 if (!isVectorUPL) {
7686 upl_unlock(upl);
7687 } else {
7688 upl_unlock(vector_upl);
7689 }
7690 DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7691 kr = KERN_FAILURE;
7692 goto done;
7693 }
7694 if (upl->flags & UPL_SET_DIRTY) {
7695 flags |= UPL_COMMIT_SET_DIRTY;
7696 }
7697 if (upl->flags & UPL_CLEAR_DIRTY) {
7698 flags |= UPL_COMMIT_CLEAR_DIRTY;
7699 }
7700
7701 object = upl->map_object;
7702
7703 if (upl->flags & UPL_SHADOWED) {
7704 vm_object_lock(object);
7705 shadow_object = object->shadow;
7706 } else {
7707 shadow_object = object;
7708 }
7709 entry = offset / PAGE_SIZE;
7710 target_offset = (vm_object_offset_t)offset;
7711
7712 if (upl->flags & UPL_KERNEL_OBJECT) {
7713 vm_object_lock_shared(shadow_object);
7714 } else {
7715 vm_object_lock(shadow_object);
7716 }
7717
7718 VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7719
7720 if (upl->flags & UPL_ACCESS_BLOCKED) {
7721 assert(shadow_object->blocked_access);
7722 shadow_object->blocked_access = FALSE;
7723 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7724 }
7725
7726 if (shadow_object->code_signed) {
7727 /*
7728 * CODE SIGNING:
7729 * If the object is code-signed, do not let this UPL tell
7730 * us if the pages are valid or not. Let the pages be
7731 * validated by VM the normal way (when they get mapped or
7732 * copied).
7733 */
7734 flags &= ~UPL_COMMIT_CS_VALIDATED;
7735 }
7736 if (!page_list) {
7737 /*
7738 * No page list to get the code-signing info from !?
7739 */
7740 flags &= ~UPL_COMMIT_CS_VALIDATED;
7741 }
7742 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7743 should_be_throttled = TRUE;
7744 }
7745
7746 if ((upl->flags & UPL_IO_WIRE) &&
7747 !(flags & UPL_COMMIT_FREE_ABSENT) &&
7748 !isVectorUPL &&
7749 shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7750 shadow_object->purgable != VM_PURGABLE_EMPTY) {
7751 if (!vm_page_queue_empty(&shadow_object->memq)) {
7752 if (shadow_object->internal && size == shadow_object->vo_size) {
7753 nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7754 fast_path_full_commit = 1;
7755 }
7756 fast_path_possible = 1;
7757
7758 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7759 (shadow_object->purgable == VM_PURGABLE_DENY ||
7760 shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7761 shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7762 throttle_page = 1;
7763 }
7764 }
7765 }
7766 first_local = VM_PAGE_NULL;
7767 last_local = VM_PAGE_NULL;
7768
7769 obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7770 obj_end = obj_start + xfer_size;
7771 obj_start = vm_object_trunc_page(obj_start);
7772 obj_end = vm_object_round_page(obj_end);
7773 for (obj_offset = obj_start;
7774 obj_offset < obj_end;
7775 obj_offset += PAGE_SIZE) {
7776 vm_page_t t, m;
7777
7778 dwp->dw_mask = 0;
7779 clear_refmod = 0;
7780
7781 m = VM_PAGE_NULL;
7782
7783 if (upl->flags & UPL_LITE) {
7784 unsigned int pg_num;
7785
7786 if (nxt_page != VM_PAGE_NULL) {
7787 m = nxt_page;
7788 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7789 target_offset = m->vmp_offset;
7790 }
7791 pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7792 assert(pg_num == target_offset / PAGE_SIZE);
7793
7794 if (bitmap_test(upl->lite_list, pg_num)) {
7795 bitmap_clear(upl->lite_list, pg_num);
7796
7797 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7798 m = vm_page_lookup(shadow_object, obj_offset);
7799 }
7800 } else {
7801 m = NULL;
7802 }
7803 }
7804 if (upl->flags & UPL_SHADOWED) {
7805 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7806 t->vmp_free_when_done = FALSE;
7807
7808 VM_PAGE_FREE(t);
7809
7810 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7811 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7812 }
7813 }
7814 }
7815 if (m == VM_PAGE_NULL) {
7816 goto commit_next_page;
7817 }
7818
7819 m_object = VM_PAGE_OBJECT(m);
7820
7821 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7822 assert(m->vmp_busy);
7823
7824 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7825 goto commit_next_page;
7826 }
7827
7828 if (flags & UPL_COMMIT_CS_VALIDATED) {
7829 /*
7830 * CODE SIGNING:
7831 * Set the code signing bits according to
7832 * what the UPL says they should be.
7833 */
7834 m->vmp_cs_validated |= page_list[entry].cs_validated;
7835 m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7836 m->vmp_cs_nx |= page_list[entry].cs_nx;
7837 }
7838 if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7839 m->vmp_written_by_kernel = TRUE;
7840 }
7841
7842 if (upl->flags & UPL_IO_WIRE) {
7843 if (page_list) {
7844 page_list[entry].phys_addr = 0;
7845 }
7846
7847 if (flags & UPL_COMMIT_SET_DIRTY) {
7848 SET_PAGE_DIRTY(m, FALSE);
7849 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7850 m->vmp_dirty = FALSE;
7851
7852 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7853 m->vmp_cs_validated &&
7854 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7855 /*
7856 * CODE SIGNING:
7857 * This page is no longer dirty
7858 * but could have been modified,
7859 * so it will need to be
7860 * re-validated.
7861 */
7862 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7863
7864 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7865
7866 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7867 }
7868 clear_refmod |= VM_MEM_MODIFIED;
7869 }
7870 if (upl->flags & UPL_ACCESS_BLOCKED) {
7871 /*
7872 * We blocked access to the pages in this UPL.
7873 * Clear the "busy" bit and wake up any waiter
7874 * for this page.
7875 */
7876 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7877 }
7878 if (fast_path_possible) {
7879 assert(m_object->purgable != VM_PURGABLE_EMPTY);
7880 assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7881 if (m->vmp_absent) {
7882 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7883 assert(m->vmp_wire_count == 0);
7884 assert(m->vmp_busy);
7885
7886 m->vmp_absent = FALSE;
7887 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7888 } else {
7889 if (m->vmp_wire_count == 0) {
7890 panic("wire_count == 0, m = %p, obj = %p", m, shadow_object);
7891 }
7892 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7893
7894 /*
7895 * XXX FBDP need to update some other
7896 * counters here (purgeable_wired_count)
7897 * (ledgers), ...
7898 */
7899 assert(m->vmp_wire_count > 0);
7900 m->vmp_wire_count--;
7901
7902 if (m->vmp_wire_count == 0) {
7903 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7904 unwired_count++;
7905 }
7906 }
7907 if (m->vmp_wire_count == 0) {
7908 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7909
7910 if (last_local == VM_PAGE_NULL) {
7911 assert(first_local == VM_PAGE_NULL);
7912
7913 last_local = m;
7914 first_local = m;
7915 } else {
7916 assert(first_local != VM_PAGE_NULL);
7917
7918 m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7919 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7920 first_local = m;
7921 }
7922 local_queue_count++;
7923
7924 if (throttle_page) {
7925 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7926 } else {
7927 if (flags & UPL_COMMIT_INACTIVATE) {
7928 if (shadow_object->internal) {
7929 m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7930 } else {
7931 m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7932 }
7933 } else {
7934 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7935 }
7936 }
7937 }
7938 } else {
7939 if (flags & UPL_COMMIT_INACTIVATE) {
7940 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7941 clear_refmod |= VM_MEM_REFERENCED;
7942 }
7943 if (m->vmp_absent) {
7944 if (flags & UPL_COMMIT_FREE_ABSENT) {
7945 dwp->dw_mask |= DW_vm_page_free;
7946 } else {
7947 m->vmp_absent = FALSE;
7948 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7949
7950 if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7951 dwp->dw_mask |= DW_vm_page_activate;
7952 }
7953 }
7954 } else {
7955 dwp->dw_mask |= DW_vm_page_unwire;
7956 }
7957 }
7958 goto commit_next_page;
7959 }
7960 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7961
7962 if (page_list) {
7963 page_list[entry].phys_addr = 0;
7964 }
7965
7966 /*
7967 * make sure to clear the hardware
7968 * modify or reference bits before
7969 * releasing the BUSY bit on this page
7970 * otherwise we risk losing a legitimate
7971 * change of state
7972 */
7973 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7974 m->vmp_dirty = FALSE;
7975
7976 clear_refmod |= VM_MEM_MODIFIED;
7977 }
7978 if (m->vmp_laundry) {
7979 dwp->dw_mask |= DW_vm_pageout_throttle_up;
7980 }
7981
7982 if (VM_PAGE_WIRED(m)) {
7983 m->vmp_free_when_done = FALSE;
7984 }
7985
7986 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7987 m->vmp_cs_validated &&
7988 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7989 /*
7990 * CODE SIGNING:
7991 * This page is no longer dirty
7992 * but could have been modified,
7993 * so it will need to be
7994 * re-validated.
7995 */
7996 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7997
7998 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7999
8000 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8001 }
8002 if (m->vmp_overwriting) {
8003 /*
8004 * the (COPY_OUT_FROM == FALSE) request_page_list case
8005 */
8006 if (m->vmp_busy) {
8007 #if CONFIG_PHANTOM_CACHE
8008 if (m->vmp_absent && !m_object->internal) {
8009 dwp->dw_mask |= DW_vm_phantom_cache_update;
8010 }
8011 #endif
8012 m->vmp_absent = FALSE;
8013
8014 dwp->dw_mask |= DW_clear_busy;
8015 } else {
8016 /*
8017 * alternate (COPY_OUT_FROM == FALSE) page_list case
8018 * Occurs when the original page was wired
8019 * at the time of the list request
8020 */
8021 assert(VM_PAGE_WIRED(m));
8022
8023 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
8024 }
8025 m->vmp_overwriting = FALSE;
8026 }
8027 m->vmp_cleaning = FALSE;
8028
8029 if (m->vmp_free_when_done) {
8030 /*
8031 * With the clean queue enabled, UPL_PAGEOUT should
8032 * no longer set the pageout bit. Its pages now go
8033 * to the clean queue.
8034 *
8035 * We don't use the cleaned Q anymore and so this
8036 * assert isn't correct. The code for the clean Q
8037 * still exists and might be used in the future. If we
8038 * go back to the cleaned Q, we will re-enable this
8039 * assert.
8040 *
8041 * assert(!(upl->flags & UPL_PAGEOUT));
8042 */
8043 assert(!m_object->internal);
8044
8045 m->vmp_free_when_done = FALSE;
8046
8047 if ((flags & UPL_COMMIT_SET_DIRTY) ||
8048 (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
8049 /*
8050 * page was re-dirtied after we started
8051 * the pageout... reactivate it since
8052 * we don't know whether the on-disk
8053 * copy matches what is now in memory
8054 */
8055 SET_PAGE_DIRTY(m, FALSE);
8056
8057 dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
8058
8059 if (upl->flags & UPL_PAGEOUT) {
8060 counter_inc(&vm_statistics_reactivations);
8061 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
8062 }
8063 } else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) {
8064 /*
8065 * Someone else might still be handling this
8066 * page (vm_fault() for example), so let's not
8067 * free it or "un-busy" it!
8068 * Put that page in the "speculative" queue
8069 * for now (since we would otherwise have freed
8070 * it) and let whoever is keeping the page
8071 * "busy" move it if needed when they're done
8072 * with it.
8073 */
8074 dwp->dw_mask |= DW_vm_page_speculate;
8075 } else {
8076 /*
8077 * page has been successfully cleaned
8078 * go ahead and free it for other use
8079 */
8080 if (m_object->internal) {
8081 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
8082 } else {
8083 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
8084 }
8085 m->vmp_dirty = FALSE;
8086 if (!(upl->flags & UPL_HAS_BUSY)) {
8087 assert(!m->vmp_busy);
8088 }
8089 m->vmp_busy = TRUE;
8090
8091 dwp->dw_mask |= DW_vm_page_free;
8092 }
8093 goto commit_next_page;
8094 }
8095 /*
8096 * It is a part of the semantic of COPYOUT_FROM
8097 * UPLs that a commit implies cache sync
8098 * between the vm page and the backing store
8099 * this can be used to strip the precious bit
8100 * as well as clean
8101 */
8102 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
8103 m->vmp_precious = FALSE;
8104 }
8105
8106 if (flags & UPL_COMMIT_SET_DIRTY) {
8107 SET_PAGE_DIRTY(m, FALSE);
8108 } else {
8109 m->vmp_dirty = FALSE;
8110 }
8111
8112 /* with the clean queue on, move *all* cleaned pages to the clean queue */
8113 if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
8114 pgpgout_count++;
8115
8116 counter_inc(&vm_statistics_pageouts);
8117 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
8118
8119 dwp->dw_mask |= DW_enqueue_cleaned;
8120 } else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
8121 /*
8122 * page coming back in from being 'frozen'...
8123 * it was dirty before it was frozen, so keep it so
8124 * the vm_page_activate will notice that it really belongs
8125 * on the throttle queue and put it there
8126 */
8127 SET_PAGE_DIRTY(m, FALSE);
8128 dwp->dw_mask |= DW_vm_page_activate;
8129 } else {
8130 if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
8131 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8132 clear_refmod |= VM_MEM_REFERENCED;
8133 } else if (!VM_PAGE_PAGEABLE(m)) {
8134 if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
8135 dwp->dw_mask |= DW_vm_page_speculate;
8136 } else if (m->vmp_reference) {
8137 dwp->dw_mask |= DW_vm_page_activate;
8138 } else {
8139 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8140 clear_refmod |= VM_MEM_REFERENCED;
8141 }
8142 }
8143 }
8144 if (upl->flags & UPL_ACCESS_BLOCKED) {
8145 /*
8146 * We blocked access to the pages in this UPL.
8147 * Clear the "busy" bit on this page before we
8148 * wake up any waiter.
8149 */
8150 dwp->dw_mask |= DW_clear_busy;
8151 }
8152 /*
8153 * Wakeup any thread waiting for the page to be un-cleaning.
8154 */
8155 dwp->dw_mask |= DW_PAGE_WAKEUP;
8156
8157 commit_next_page:
8158 if (clear_refmod) {
8159 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
8160 }
8161
8162 target_offset += PAGE_SIZE_64;
8163 xfer_size -= PAGE_SIZE;
8164 entry++;
8165
8166 if (dwp->dw_mask) {
8167 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8168 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8169
8170 if (dw_count >= dw_limit) {
8171 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8172
8173 dwp = dwp_start;
8174 dw_count = 0;
8175 }
8176 } else {
8177 if (dwp->dw_mask & DW_clear_busy) {
8178 m->vmp_busy = FALSE;
8179 }
8180
8181 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8182 PAGE_WAKEUP(m);
8183 }
8184 }
8185 }
8186 }
8187 if (dw_count) {
8188 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8189 dwp = dwp_start;
8190 dw_count = 0;
8191 }
8192
8193 if (fast_path_possible) {
8194 assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
8195 assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
8196
8197 if (local_queue_count || unwired_count) {
8198 if (local_queue_count) {
8199 vm_page_t first_target;
8200 vm_page_queue_head_t *target_queue;
8201
8202 if (throttle_page) {
8203 target_queue = &vm_page_queue_throttled;
8204 } else {
8205 if (flags & UPL_COMMIT_INACTIVATE) {
8206 if (shadow_object->internal) {
8207 target_queue = &vm_page_queue_anonymous;
8208 } else {
8209 target_queue = &vm_page_queue_inactive;
8210 }
8211 } else {
8212 target_queue = &vm_page_queue_active;
8213 }
8214 }
8215 /*
8216 * Transfer the entire local queue to one of the regular LRU page queues.
8217 */
8218 vm_page_lockspin_queues();
8219
8220 first_target = (vm_page_t) vm_page_queue_first(target_queue);
8221
8222 if (vm_page_queue_empty(target_queue)) {
8223 target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8224 } else {
8225 first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8226 }
8227
8228 target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
8229 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
8230 last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
8231
8232 /*
8233 * Adjust the global page counts.
8234 */
8235 if (throttle_page) {
8236 vm_page_throttled_count += local_queue_count;
8237 } else {
8238 if (flags & UPL_COMMIT_INACTIVATE) {
8239 if (shadow_object->internal) {
8240 vm_page_anonymous_count += local_queue_count;
8241 }
8242 vm_page_inactive_count += local_queue_count;
8243
8244 token_new_pagecount += local_queue_count;
8245 } else {
8246 vm_page_active_count += local_queue_count;
8247 }
8248
8249 if (shadow_object->internal) {
8250 vm_page_pageable_internal_count += local_queue_count;
8251 } else {
8252 vm_page_pageable_external_count += local_queue_count;
8253 }
8254 }
8255 } else {
8256 vm_page_lockspin_queues();
8257 }
8258 if (unwired_count) {
8259 vm_page_wire_count -= unwired_count;
8260 VM_CHECK_MEMORYSTATUS;
8261 }
8262 vm_page_unlock_queues();
8263
8264 VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
8265 }
8266 }
8267
8268 if (upl->flags & UPL_DEVICE_MEMORY) {
8269 occupied = 0;
8270 } else if (upl->flags & UPL_LITE) {
8271 uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
8272
8273 occupied = !fast_path_full_commit &&
8274 !bitmap_is_empty(upl->lite_list, pages);
8275 } else {
8276 occupied = !vm_page_queue_empty(&upl->map_object->memq);
8277 }
8278 if (occupied == 0) {
8279 /*
8280 * If this UPL element belongs to a Vector UPL and is
8281 * empty, then this is the right function to deallocate
8282 * it. So go ahead set the *empty variable. The flag
8283 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8284 * should be considered relevant for the Vector UPL and not
8285 * the internal UPLs.
8286 */
8287 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8288 *empty = TRUE;
8289 }
8290
8291 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8292 /*
8293 * this is not a paging object
8294 * so we need to drop the paging reference
8295 * that was taken when we created the UPL
8296 * against this object
8297 */
8298 vm_object_activity_end(shadow_object);
8299 vm_object_collapse(shadow_object, 0, TRUE);
8300 } else {
8301 /*
8302 * we donated the paging reference to
8303 * the map object... vm_pageout_object_terminate
8304 * will drop this reference
8305 */
8306 }
8307 }
8308 VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
8309 vm_object_unlock(shadow_object);
8310 if (object != shadow_object) {
8311 vm_object_unlock(object);
8312 }
8313
8314 if (!isVectorUPL) {
8315 upl_unlock(upl);
8316 } else {
8317 /*
8318 * If we completed our operations on an UPL that is
8319 * part of a Vectored UPL and if empty is TRUE, then
8320 * we should go ahead and deallocate this UPL element.
8321 * Then we check if this was the last of the UPL elements
8322 * within that Vectored UPL. If so, set empty to TRUE
8323 * so that in ubc_upl_commit_range or ubc_upl_commit, we
8324 * can go ahead and deallocate the Vector UPL too.
8325 */
8326 if (*empty == TRUE) {
8327 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
8328 upl_deallocate(upl);
8329 }
8330 goto process_upl_to_commit;
8331 }
8332 if (pgpgout_count) {
8333 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
8334 }
8335
8336 kr = KERN_SUCCESS;
8337 done:
8338 if (dwp_start && dwp_finish_ctx) {
8339 vm_page_delayed_work_finish_ctx(dwp_start);
8340 dwp_start = dwp = NULL;
8341 }
8342
8343 return kr;
8344 }
8345
8346 kern_return_t
upl_abort_range(upl_t upl,upl_offset_t offset,upl_size_t size,int error,boolean_t * empty)8347 upl_abort_range(
8348 upl_t upl,
8349 upl_offset_t offset,
8350 upl_size_t size,
8351 int error,
8352 boolean_t *empty)
8353 {
8354 upl_size_t xfer_size, subupl_size;
8355 vm_object_t shadow_object;
8356 vm_object_t object;
8357 vm_object_offset_t target_offset;
8358 upl_offset_t subupl_offset = offset;
8359 int occupied;
8360 struct vm_page_delayed_work dw_array;
8361 struct vm_page_delayed_work *dwp, *dwp_start;
8362 bool dwp_finish_ctx = TRUE;
8363 int dw_count;
8364 int dw_limit;
8365 int isVectorUPL = 0;
8366 upl_t vector_upl = NULL;
8367 vm_object_offset_t obj_start, obj_end, obj_offset;
8368 kern_return_t kr = KERN_SUCCESS;
8369
8370 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);
8371
8372 dwp_start = dwp = NULL;
8373
8374 subupl_size = size;
8375 *empty = FALSE;
8376
8377 if (upl == UPL_NULL) {
8378 return KERN_INVALID_ARGUMENT;
8379 }
8380
8381 if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
8382 return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
8383 }
8384
8385 dw_count = 0;
8386 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8387 dwp_start = vm_page_delayed_work_get_ctx();
8388 if (dwp_start == NULL) {
8389 dwp_start = &dw_array;
8390 dw_limit = 1;
8391 dwp_finish_ctx = FALSE;
8392 }
8393
8394 dwp = dwp_start;
8395
8396 if ((isVectorUPL = vector_upl_is_valid(upl))) {
8397 vector_upl = upl;
8398 upl_lock(vector_upl);
8399 } else {
8400 upl_lock(upl);
8401 }
8402
8403 process_upl_to_abort:
8404 if (isVectorUPL) {
8405 size = subupl_size;
8406 offset = subupl_offset;
8407 if (size == 0) {
8408 upl_unlock(vector_upl);
8409 kr = KERN_SUCCESS;
8410 goto done;
8411 }
8412 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
8413 if (upl == NULL) {
8414 upl_unlock(vector_upl);
8415 kr = KERN_FAILURE;
8416 goto done;
8417 }
8418 subupl_size -= size;
8419 subupl_offset += size;
8420 }
8421
8422 *empty = FALSE;
8423
8424 #if UPL_DEBUG
8425 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
8426 upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0);
8427 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
8428 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
8429 upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
8430
8431 upl->upl_commit_index++;
8432 }
8433 #endif
8434 if (upl->flags & UPL_DEVICE_MEMORY) {
8435 xfer_size = 0;
8436 } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
8437 xfer_size = size;
8438 } else {
8439 if (!isVectorUPL) {
8440 upl_unlock(upl);
8441 } else {
8442 upl_unlock(vector_upl);
8443 }
8444 DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
8445 kr = KERN_FAILURE;
8446 goto done;
8447 }
8448 object = upl->map_object;
8449
8450 if (upl->flags & UPL_SHADOWED) {
8451 vm_object_lock(object);
8452 shadow_object = object->shadow;
8453 } else {
8454 shadow_object = object;
8455 }
8456
8457 target_offset = (vm_object_offset_t)offset;
8458
8459 if (upl->flags & UPL_KERNEL_OBJECT) {
8460 vm_object_lock_shared(shadow_object);
8461 } else {
8462 vm_object_lock(shadow_object);
8463 }
8464
8465 if (upl->flags & UPL_ACCESS_BLOCKED) {
8466 assert(shadow_object->blocked_access);
8467 shadow_object->blocked_access = FALSE;
8468 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
8469 }
8470
8471 if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
8472 panic("upl_abort_range: kernel_object being DUMPED");
8473 }
8474
8475 obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
8476 obj_end = obj_start + xfer_size;
8477 obj_start = vm_object_trunc_page(obj_start);
8478 obj_end = vm_object_round_page(obj_end);
8479 for (obj_offset = obj_start;
8480 obj_offset < obj_end;
8481 obj_offset += PAGE_SIZE) {
8482 vm_page_t t, m;
8483 unsigned int pg_num;
8484 boolean_t needed;
8485
8486 pg_num = (unsigned int) (target_offset / PAGE_SIZE);
8487 assert(pg_num == target_offset / PAGE_SIZE);
8488
8489 needed = FALSE;
8490
8491 if (upl->flags & UPL_INTERNAL) {
8492 needed = upl->page_list[pg_num].needed;
8493 }
8494
8495 dwp->dw_mask = 0;
8496 m = VM_PAGE_NULL;
8497
8498 if (upl->flags & UPL_LITE) {
8499 if (bitmap_test(upl->lite_list, pg_num)) {
8500 bitmap_clear(upl->lite_list, pg_num);
8501
8502 if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8503 m = vm_page_lookup(shadow_object, obj_offset);
8504 }
8505 }
8506 }
8507 if (upl->flags & UPL_SHADOWED) {
8508 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
8509 t->vmp_free_when_done = FALSE;
8510
8511 VM_PAGE_FREE(t);
8512
8513 if (m == VM_PAGE_NULL) {
8514 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
8515 }
8516 }
8517 }
8518 if ((upl->flags & UPL_KERNEL_OBJECT)) {
8519 goto abort_next_page;
8520 }
8521
8522 if (m != VM_PAGE_NULL) {
8523 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
8524
8525 if (m->vmp_absent) {
8526 boolean_t must_free = TRUE;
8527
8528 /*
8529 * COPYOUT = FALSE case
8530 * check for error conditions which must
8531 * be passed back to the pages customer
8532 */
8533 if (error & UPL_ABORT_RESTART) {
8534 m->vmp_restart = TRUE;
8535 m->vmp_absent = FALSE;
8536 m->vmp_unusual = TRUE;
8537 must_free = FALSE;
8538 } else if (error & UPL_ABORT_UNAVAILABLE) {
8539 m->vmp_restart = FALSE;
8540 m->vmp_unusual = TRUE;
8541 must_free = FALSE;
8542 } else if (error & UPL_ABORT_ERROR) {
8543 m->vmp_restart = FALSE;
8544 m->vmp_absent = FALSE;
8545 m->vmp_error = TRUE;
8546 m->vmp_unusual = TRUE;
8547 must_free = FALSE;
8548 }
8549 if (m->vmp_clustered && needed == FALSE) {
8550 /*
8551 * This page was a part of a speculative
8552 * read-ahead initiated by the kernel
8553 * itself. No one is expecting this
8554 * page and no one will clean up its
8555 * error state if it ever becomes valid
8556 * in the future.
8557 * We have to free it here.
8558 */
8559 must_free = TRUE;
8560 }
8561 m->vmp_cleaning = FALSE;
8562
8563 if (m->vmp_overwriting && !m->vmp_busy) {
8564 /*
8565 * this shouldn't happen since
8566 * this is an 'absent' page, but
8567 * it doesn't hurt to check for
8568 * the 'alternate' method of
8569 * stabilizing the page...
8570 * we will mark 'busy' to be cleared
8571 * in the following code which will
8572 * take care of the primary stabilzation
8573 * method (i.e. setting 'busy' to TRUE)
8574 */
8575 dwp->dw_mask |= DW_vm_page_unwire;
8576 }
8577 m->vmp_overwriting = FALSE;
8578
8579 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
8580
8581 if (must_free == TRUE) {
8582 dwp->dw_mask |= DW_vm_page_free;
8583 } else {
8584 dwp->dw_mask |= DW_vm_page_activate;
8585 }
8586 } else {
8587 /*
8588 * Handle the trusted pager throttle.
8589 */
8590 if (m->vmp_laundry) {
8591 dwp->dw_mask |= DW_vm_pageout_throttle_up;
8592 }
8593
8594 if (upl->flags & UPL_ACCESS_BLOCKED) {
8595 /*
8596 * We blocked access to the pages in this UPL.
8597 * Clear the "busy" bit and wake up any waiter
8598 * for this page.
8599 */
8600 dwp->dw_mask |= DW_clear_busy;
8601 }
8602 if (m->vmp_overwriting) {
8603 if (m->vmp_busy) {
8604 dwp->dw_mask |= DW_clear_busy;
8605 } else {
8606 /*
8607 * deal with the 'alternate' method
8608 * of stabilizing the page...
8609 * we will either free the page
8610 * or mark 'busy' to be cleared
8611 * in the following code which will
8612 * take care of the primary stabilzation
8613 * method (i.e. setting 'busy' to TRUE)
8614 */
8615 dwp->dw_mask |= DW_vm_page_unwire;
8616 }
8617 m->vmp_overwriting = FALSE;
8618 }
8619 m->vmp_free_when_done = FALSE;
8620 m->vmp_cleaning = FALSE;
8621
8622 if (error & UPL_ABORT_DUMP_PAGES) {
8623 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8624
8625 dwp->dw_mask |= DW_vm_page_free;
8626 } else {
8627 if (!(dwp->dw_mask & DW_vm_page_unwire)) {
8628 if (error & UPL_ABORT_REFERENCE) {
8629 /*
8630 * we've been told to explictly
8631 * reference this page... for
8632 * file I/O, this is done by
8633 * implementing an LRU on the inactive q
8634 */
8635 dwp->dw_mask |= DW_vm_page_lru;
8636 } else if (!VM_PAGE_PAGEABLE(m)) {
8637 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8638 }
8639 }
8640 dwp->dw_mask |= DW_PAGE_WAKEUP;
8641 }
8642 }
8643 }
8644 abort_next_page:
8645 target_offset += PAGE_SIZE_64;
8646 xfer_size -= PAGE_SIZE;
8647
8648 if (dwp->dw_mask) {
8649 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8650 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8651
8652 if (dw_count >= dw_limit) {
8653 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8654
8655 dwp = dwp_start;
8656 dw_count = 0;
8657 }
8658 } else {
8659 if (dwp->dw_mask & DW_clear_busy) {
8660 m->vmp_busy = FALSE;
8661 }
8662
8663 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8664 PAGE_WAKEUP(m);
8665 }
8666 }
8667 }
8668 }
8669 if (dw_count) {
8670 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8671 dwp = dwp_start;
8672 dw_count = 0;
8673 }
8674
8675 if (upl->flags & UPL_DEVICE_MEMORY) {
8676 occupied = 0;
8677 } else if (upl->flags & UPL_LITE) {
8678 uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
8679
8680 occupied = !bitmap_is_empty(upl->lite_list, pages);
8681 } else {
8682 occupied = !vm_page_queue_empty(&upl->map_object->memq);
8683 }
8684 if (occupied == 0) {
8685 /*
8686 * If this UPL element belongs to a Vector UPL and is
8687 * empty, then this is the right function to deallocate
8688 * it. So go ahead set the *empty variable. The flag
8689 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8690 * should be considered relevant for the Vector UPL and
8691 * not the internal UPLs.
8692 */
8693 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8694 *empty = TRUE;
8695 }
8696
8697 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8698 /*
8699 * this is not a paging object
8700 * so we need to drop the paging reference
8701 * that was taken when we created the UPL
8702 * against this object
8703 */
8704 vm_object_activity_end(shadow_object);
8705 vm_object_collapse(shadow_object, 0, TRUE);
8706 } else {
8707 /*
8708 * we dontated the paging reference to
8709 * the map object... vm_pageout_object_terminate
8710 * will drop this reference
8711 */
8712 }
8713 }
8714 vm_object_unlock(shadow_object);
8715 if (object != shadow_object) {
8716 vm_object_unlock(object);
8717 }
8718
8719 if (!isVectorUPL) {
8720 upl_unlock(upl);
8721 } else {
8722 /*
8723 * If we completed our operations on an UPL that is
8724 * part of a Vectored UPL and if empty is TRUE, then
8725 * we should go ahead and deallocate this UPL element.
8726 * Then we check if this was the last of the UPL elements
8727 * within that Vectored UPL. If so, set empty to TRUE
8728 * so that in ubc_upl_abort_range or ubc_upl_abort, we
8729 * can go ahead and deallocate the Vector UPL too.
8730 */
8731 if (*empty == TRUE) {
8732 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
8733 upl_deallocate(upl);
8734 }
8735 goto process_upl_to_abort;
8736 }
8737
8738 kr = KERN_SUCCESS;
8739
8740 done:
8741 if (dwp_start && dwp_finish_ctx) {
8742 vm_page_delayed_work_finish_ctx(dwp_start);
8743 dwp_start = dwp = NULL;
8744 }
8745
8746 return kr;
8747 }
8748
8749
8750 kern_return_t
upl_abort(upl_t upl,int error)8751 upl_abort(
8752 upl_t upl,
8753 int error)
8754 {
8755 boolean_t empty;
8756
8757 if (upl == UPL_NULL) {
8758 return KERN_INVALID_ARGUMENT;
8759 }
8760
8761 return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8762 }
8763
8764
8765 /* an option on commit should be wire */
8766 kern_return_t
upl_commit(upl_t upl,upl_page_info_t * page_list,mach_msg_type_number_t count)8767 upl_commit(
8768 upl_t upl,
8769 upl_page_info_t *page_list,
8770 mach_msg_type_number_t count)
8771 {
8772 boolean_t empty;
8773
8774 if (upl == UPL_NULL) {
8775 return KERN_INVALID_ARGUMENT;
8776 }
8777
8778 return upl_commit_range(upl, 0, upl->u_size, 0,
8779 page_list, count, &empty);
8780 }
8781
8782
8783 void
iopl_valid_data(upl_t upl,vm_tag_t tag)8784 iopl_valid_data(
8785 upl_t upl,
8786 vm_tag_t tag)
8787 {
8788 vm_object_t object;
8789 vm_offset_t offset;
8790 vm_page_t m, nxt_page = VM_PAGE_NULL;
8791 upl_size_t size;
8792 int wired_count = 0;
8793
8794 if (upl == NULL) {
8795 panic("iopl_valid_data: NULL upl");
8796 }
8797 if (vector_upl_is_valid(upl)) {
8798 panic("iopl_valid_data: vector upl");
8799 }
8800 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
8801 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
8802 }
8803
8804 object = upl->map_object;
8805
8806 if (is_kernel_object(object) || object == compressor_object) {
8807 panic("iopl_valid_data: object == kernel or compressor");
8808 }
8809
8810 if (object->purgable == VM_PURGABLE_VOLATILE ||
8811 object->purgable == VM_PURGABLE_EMPTY) {
8812 panic("iopl_valid_data: object %p purgable %d",
8813 object, object->purgable);
8814 }
8815
8816 size = upl_adjusted_size(upl, PAGE_MASK);
8817
8818 vm_object_lock(object);
8819 VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
8820
8821 bool whole_object;
8822
8823 if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
8824 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
8825 whole_object = true;
8826 } else {
8827 offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
8828 whole_object = false;
8829 }
8830
8831 while (size) {
8832 if (whole_object) {
8833 if (nxt_page != VM_PAGE_NULL) {
8834 m = nxt_page;
8835 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
8836 }
8837 } else {
8838 m = vm_page_lookup(object, offset);
8839 offset += PAGE_SIZE;
8840
8841 if (m == VM_PAGE_NULL) {
8842 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
8843 }
8844 }
8845 if (m->vmp_busy) {
8846 if (!m->vmp_absent) {
8847 panic("iopl_valid_data: busy page w/o absent");
8848 }
8849
8850 if (m->vmp_pageq.next || m->vmp_pageq.prev) {
8851 panic("iopl_valid_data: busy+absent page on page queue");
8852 }
8853 if (m->vmp_reusable) {
8854 panic("iopl_valid_data: %p is reusable", m);
8855 }
8856
8857 m->vmp_absent = FALSE;
8858 m->vmp_dirty = TRUE;
8859 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
8860 assert(m->vmp_wire_count == 0);
8861 m->vmp_wire_count++;
8862 assert(m->vmp_wire_count);
8863 if (m->vmp_wire_count == 1) {
8864 m->vmp_q_state = VM_PAGE_IS_WIRED;
8865 wired_count++;
8866 } else {
8867 panic("iopl_valid_data: %p already wired", m);
8868 }
8869
8870 PAGE_WAKEUP_DONE(m);
8871 }
8872 size -= PAGE_SIZE;
8873 }
8874 if (wired_count) {
8875 VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
8876 assert(object->resident_page_count >= object->wired_page_count);
8877
8878 /* no need to adjust purgeable accounting for this object: */
8879 assert(object->purgable != VM_PURGABLE_VOLATILE);
8880 assert(object->purgable != VM_PURGABLE_EMPTY);
8881
8882 vm_page_lockspin_queues();
8883 vm_page_wire_count += wired_count;
8884 vm_page_unlock_queues();
8885 }
8886 VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
8887 vm_object_unlock(object);
8888 }
8889
8890
8891 void
vm_object_set_pmap_cache_attr(vm_object_t object,upl_page_info_array_t user_page_list,unsigned int num_pages,boolean_t batch_pmap_op)8892 vm_object_set_pmap_cache_attr(
8893 vm_object_t object,
8894 upl_page_info_array_t user_page_list,
8895 unsigned int num_pages,
8896 boolean_t batch_pmap_op)
8897 {
8898 unsigned int cache_attr = 0;
8899
8900 cache_attr = object->wimg_bits & VM_WIMG_MASK;
8901 assert(user_page_list);
8902 if (cache_attr != VM_WIMG_USE_DEFAULT) {
8903 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8904 }
8905 }
8906
8907
8908 static bool
vm_object_iopl_wire_full(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,upl_control_flags_t cntrl_flags,vm_tag_t tag)8909 vm_object_iopl_wire_full(
8910 vm_object_t object,
8911 upl_t upl,
8912 upl_page_info_array_t user_page_list,
8913 upl_control_flags_t cntrl_flags,
8914 vm_tag_t tag)
8915 {
8916 vm_page_t dst_page;
8917 unsigned int entry;
8918 int page_count;
8919 int delayed_unlock = 0;
8920 boolean_t retval = TRUE;
8921 ppnum_t phys_page;
8922
8923 vm_object_lock_assert_exclusive(object);
8924 assert(object->purgable != VM_PURGABLE_VOLATILE);
8925 assert(object->purgable != VM_PURGABLE_EMPTY);
8926 assert(object->pager == NULL);
8927 assert(object->vo_copy == NULL);
8928 assert(object->shadow == NULL);
8929
8930 page_count = object->resident_page_count;
8931 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
8932
8933 vm_page_lock_queues();
8934
8935 while (page_count--) {
8936 if (dst_page->vmp_busy ||
8937 dst_page->vmp_fictitious ||
8938 dst_page->vmp_absent ||
8939 VMP_ERROR_GET(dst_page) ||
8940 dst_page->vmp_cleaning ||
8941 dst_page->vmp_restart ||
8942 dst_page->vmp_laundry) {
8943 retval = FALSE;
8944 goto done;
8945 }
8946 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8947 retval = FALSE;
8948 goto done;
8949 }
8950 dst_page->vmp_reference = TRUE;
8951
8952 vm_page_wire(dst_page, tag, FALSE);
8953
8954 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8955 SET_PAGE_DIRTY(dst_page, FALSE);
8956 }
8957 entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
8958 assert(entry >= 0 && entry < object->resident_page_count);
8959 bitmap_set(upl->lite_list, entry);
8960
8961 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8962
8963 if (phys_page > upl->highest_page) {
8964 upl->highest_page = phys_page;
8965 }
8966
8967 if (user_page_list) {
8968 user_page_list[entry].phys_addr = phys_page;
8969 user_page_list[entry].absent = dst_page->vmp_absent;
8970 user_page_list[entry].dirty = dst_page->vmp_dirty;
8971 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
8972 user_page_list[entry].precious = dst_page->vmp_precious;
8973 user_page_list[entry].device = FALSE;
8974 user_page_list[entry].speculative = FALSE;
8975 user_page_list[entry].cs_validated = FALSE;
8976 user_page_list[entry].cs_tainted = FALSE;
8977 user_page_list[entry].cs_nx = FALSE;
8978 user_page_list[entry].needed = FALSE;
8979 user_page_list[entry].mark = FALSE;
8980 }
8981 if (delayed_unlock++ > 256) {
8982 delayed_unlock = 0;
8983 lck_mtx_yield(&vm_page_queue_lock);
8984
8985 VM_CHECK_MEMORYSTATUS;
8986 }
8987 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
8988 }
8989 done:
8990 vm_page_unlock_queues();
8991
8992 VM_CHECK_MEMORYSTATUS;
8993
8994 return retval;
8995 }
8996
8997
8998 static kern_return_t
vm_object_iopl_wire_empty(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,upl_control_flags_t cntrl_flags,vm_tag_t tag,vm_object_offset_t * dst_offset,int page_count,int * page_grab_count)8999 vm_object_iopl_wire_empty(
9000 vm_object_t object,
9001 upl_t upl,
9002 upl_page_info_array_t user_page_list,
9003 upl_control_flags_t cntrl_flags,
9004 vm_tag_t tag,
9005 vm_object_offset_t *dst_offset,
9006 int page_count,
9007 int *page_grab_count)
9008 {
9009 vm_page_t dst_page;
9010 boolean_t no_zero_fill = FALSE;
9011 int interruptible;
9012 int pages_wired = 0;
9013 int pages_inserted = 0;
9014 int entry = 0;
9015 uint64_t delayed_ledger_update = 0;
9016 kern_return_t ret = KERN_SUCCESS;
9017 int grab_options;
9018 ppnum_t phys_page;
9019
9020 vm_object_lock_assert_exclusive(object);
9021 assert(object->purgable != VM_PURGABLE_VOLATILE);
9022 assert(object->purgable != VM_PURGABLE_EMPTY);
9023 assert(object->pager == NULL);
9024 assert(object->vo_copy == NULL);
9025 assert(object->shadow == NULL);
9026
9027 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9028 interruptible = THREAD_ABORTSAFE;
9029 } else {
9030 interruptible = THREAD_UNINT;
9031 }
9032
9033 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9034 no_zero_fill = TRUE;
9035 }
9036
9037 grab_options = 0;
9038 #if CONFIG_SECLUDED_MEMORY
9039 if (object->can_grab_secluded) {
9040 grab_options |= VM_PAGE_GRAB_SECLUDED;
9041 }
9042 #endif /* CONFIG_SECLUDED_MEMORY */
9043
9044 while (page_count--) {
9045 while ((dst_page = vm_page_grab_options(grab_options))
9046 == VM_PAGE_NULL) {
9047 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
9048
9049 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9050
9051 if (vm_page_wait(interruptible) == FALSE) {
9052 /*
9053 * interrupted case
9054 */
9055 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9056
9057 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9058
9059 ret = MACH_SEND_INTERRUPTED;
9060 goto done;
9061 }
9062 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9063
9064 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9065 }
9066 if (no_zero_fill == FALSE) {
9067 vm_page_zero_fill(dst_page);
9068 } else {
9069 dst_page->vmp_absent = TRUE;
9070 }
9071
9072 dst_page->vmp_reference = TRUE;
9073
9074 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9075 SET_PAGE_DIRTY(dst_page, FALSE);
9076 }
9077 if (dst_page->vmp_absent == FALSE) {
9078 assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
9079 assert(dst_page->vmp_wire_count == 0);
9080 dst_page->vmp_wire_count++;
9081 dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
9082 assert(dst_page->vmp_wire_count);
9083 pages_wired++;
9084 PAGE_WAKEUP_DONE(dst_page);
9085 }
9086 pages_inserted++;
9087
9088 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
9089
9090 bitmap_set(upl->lite_list, entry);
9091
9092 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9093
9094 if (phys_page > upl->highest_page) {
9095 upl->highest_page = phys_page;
9096 }
9097
9098 if (user_page_list) {
9099 user_page_list[entry].phys_addr = phys_page;
9100 user_page_list[entry].absent = dst_page->vmp_absent;
9101 user_page_list[entry].dirty = dst_page->vmp_dirty;
9102 user_page_list[entry].free_when_done = FALSE;
9103 user_page_list[entry].precious = FALSE;
9104 user_page_list[entry].device = FALSE;
9105 user_page_list[entry].speculative = FALSE;
9106 user_page_list[entry].cs_validated = FALSE;
9107 user_page_list[entry].cs_tainted = FALSE;
9108 user_page_list[entry].cs_nx = FALSE;
9109 user_page_list[entry].needed = FALSE;
9110 user_page_list[entry].mark = FALSE;
9111 }
9112 entry++;
9113 *dst_offset += PAGE_SIZE_64;
9114 }
9115 done:
9116 if (pages_wired) {
9117 vm_page_lockspin_queues();
9118 vm_page_wire_count += pages_wired;
9119 vm_page_unlock_queues();
9120 }
9121 if (pages_inserted) {
9122 if (object->internal) {
9123 OSAddAtomic(pages_inserted, &vm_page_internal_count);
9124 } else {
9125 OSAddAtomic(pages_inserted, &vm_page_external_count);
9126 }
9127 }
9128 if (delayed_ledger_update) {
9129 task_t owner;
9130 int ledger_idx_volatile;
9131 int ledger_idx_nonvolatile;
9132 int ledger_idx_volatile_compressed;
9133 int ledger_idx_nonvolatile_compressed;
9134 boolean_t do_footprint;
9135
9136 owner = VM_OBJECT_OWNER(object);
9137 assert(owner);
9138
9139 vm_object_ledger_tag_ledgers(object,
9140 &ledger_idx_volatile,
9141 &ledger_idx_nonvolatile,
9142 &ledger_idx_volatile_compressed,
9143 &ledger_idx_nonvolatile_compressed,
9144 &do_footprint);
9145
9146 /* more non-volatile bytes */
9147 ledger_credit(owner->ledger,
9148 ledger_idx_nonvolatile,
9149 delayed_ledger_update);
9150 if (do_footprint) {
9151 /* more footprint */
9152 ledger_credit(owner->ledger,
9153 task_ledgers.phys_footprint,
9154 delayed_ledger_update);
9155 }
9156 }
9157
9158 assert(page_grab_count);
9159 *page_grab_count = pages_inserted;
9160
9161 return ret;
9162 }
9163
9164
9165
/*
 * vm_object_iopl_request:
 *
 * Create an I/O-wire UPL (upl_create() with UPL_IO_WIRE) for the pages of
 * "object" in the page-aligned range that covers [offset, offset + size).
 * For an ordinary object each page in the range is looked up, faulted in
 * through vm_fault_page() when it is missing or unusable, queued for
 * wiring unless it is busy, and recorded in the UPL's lite list and, when
 * a page list is available, in that list.  A physically contiguous
 * object is handled separately: the UPL is marked UPL_DEVICE_MEMORY and
 * only the first page-list entry is filled in.
 *
 * Parameters:
 *   object           object whose pages are requested
 *   offset, size     requested byte range; rounded out to page boundaries
 *   upl_ptr          out: the newly created UPL
 *   user_page_list   optional page-info array; replaced by the UPL's own
 *                    page_list when UPL_SET_INTERNAL is set
 *   page_list_count  optional in/out count: set to 0 for an internal UPL,
 *                    set to 1 for a physically contiguous object,
 *                    otherwise clamped to the number of pages in the range
 *   cntrl_flags      UPL_* control flags; bits outside UPL_VALID_FLAGS
 *                    are rejected
 *   tag              VM tag passed to the wiring routines
 *
 * Returns KERN_SUCCESS, or KERN_INVALID_VALUE / KERN_INVALID_ADDRESS from
 * the argument checks, or, after undoing the work done so far and
 * destroying the UPL, one of KERN_MEMORY_ERROR, KERN_PROTECTION_FAILURE,
 * KERN_RESOURCE_SHORTAGE, MACH_SEND_INTERRUPTED or the error code
 * reported by vm_fault_page().
 *
 * NOTE(review): *upl_ptr is assigned before any page is processed and is
 * not cleared on the return_err path, where the UPL is passed to
 * upl_destroy() -- presumably callers ignore it on failure; confirm.
 */
9166 kern_return_t
vm_object_iopl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)9167 vm_object_iopl_request(
9168 vm_object_t object,
9169 vm_object_offset_t offset,
9170 upl_size_t size,
9171 upl_t *upl_ptr,
9172 upl_page_info_array_t user_page_list,
9173 unsigned int *page_list_count,
9174 upl_control_flags_t cntrl_flags,
9175 vm_tag_t tag)
9176 {
9177 vm_page_t dst_page;
9178 vm_object_offset_t dst_offset;
9179 upl_size_t xfer_size;
9180 upl_t upl = NULL;
9181 unsigned int entry;
9182 int no_zero_fill = FALSE;
9183 unsigned int size_in_pages;
9184 int page_grab_count = 0;
9185 u_int32_t psize;
9186 kern_return_t ret;
9187 vm_prot_t prot;
9188 struct vm_object_fault_info fault_info = {};
9189 struct vm_page_delayed_work dw_array;
9190 struct vm_page_delayed_work *dwp, *dwp_start;
9191 bool dwp_finish_ctx = TRUE;
9192 int dw_count;
9193 int dw_limit;
9194 int dw_index;
9195 boolean_t caller_lookup;
9196 int io_tracking_flag = 0;
9197 int interruptible;
9198 ppnum_t phys_page;
9199
9200 boolean_t set_cache_attr_needed = FALSE;
9201 boolean_t free_wired_pages = FALSE;
9202 boolean_t fast_path_empty_req = FALSE;
9203 boolean_t fast_path_full_req = FALSE;
9204
9205 #if DEVELOPMENT || DEBUG
9206 task_t task = current_task();
9207 #endif /* DEVELOPMENT || DEBUG */
9208
9209 dwp_start = dwp = NULL;
9210
9211 vm_object_offset_t original_offset = offset;
9212 upl_size_t original_size = size;
9213
9214 // DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
9215
/*
 * Round the request out to page boundaries; the caller's original
 * offset and size are kept for the UPL's u_offset / u_size.
 */
9216 size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
9217 offset = vm_object_trunc_page(offset);
9218 if (size != original_size || offset != original_offset) {
9219 DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
9220 }
9221
9222 if (cntrl_flags & ~UPL_VALID_FLAGS) {
9223 /*
9224 * For forward compatibility's sake,
9225 * reject any unknown flag.
9226 */
9227 return KERN_INVALID_VALUE;
9228 }
/* UPL_NEED_32BIT_ADDR is ignored unless vm_lopage_needed is set */
9229 if (vm_lopage_needed == FALSE) {
9230 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
9231 }
9232
9233 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
9234 if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
9235 return KERN_INVALID_VALUE;
9236 }
9237
9238 if (object->phys_contiguous) {
9239 if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
9240 return KERN_INVALID_ADDRESS;
9241 }
9242
9243 if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
9244 return KERN_INVALID_ADDRESS;
9245 }
9246 }
9247 }
9248 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9249 no_zero_fill = TRUE;
9250 }
9251
9252 if (cntrl_flags & UPL_COPYOUT_FROM) {
9253 prot = VM_PROT_READ;
9254 } else {
9255 prot = VM_PROT_READ | VM_PROT_WRITE;
9256 }
9257
9258 if ((!object->internal) && (object->paging_offset != 0)) {
9259 panic("vm_object_iopl_request: external object with non-zero paging offset");
9260 }
9261
9262
9263 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
9264
9265 #if CONFIG_IOSCHED || UPL_DEBUG
9266 if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) {
9267 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
9268 }
9269 #endif
9270
9271 #if CONFIG_IOSCHED
9272 if (object->io_tracking) {
9273 /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
9274 if (!is_kernel_object(object)) {
9275 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
9276 }
9277 }
9278 #endif
9279
/*
 * A physically contiguous object gets a UPL sized for a single page.
 * Any other object gets a UPL sized for the whole range plus a
 * delayed-work context; if no context can be obtained, the single
 * on-stack entry "dw_array" is used with a batch limit of 1.
 */
9280 if (object->phys_contiguous) {
9281 psize = PAGE_SIZE;
9282 } else {
9283 psize = size;
9284
9285 dw_count = 0;
9286 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
9287 dwp_start = vm_page_delayed_work_get_ctx();
9288 if (dwp_start == NULL) {
9289 dwp_start = &dw_array;
9290 dw_limit = 1;
9291 dwp_finish_ctx = FALSE;
9292 }
9293
9294 dwp = dwp_start;
9295 }
9296
9297 if (cntrl_flags & UPL_SET_INTERNAL) {
9298 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9299 user_page_list = size ? upl->page_list : NULL;
9300 } else {
9301 upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9302 }
9303 if (user_page_list) {
9304 user_page_list[0].device = FALSE;
9305 }
9306 *upl_ptr = upl;
9307
9308 if (cntrl_flags & UPL_NOZEROFILLIO) {
9309 DTRACE_VM4(upl_nozerofillio,
9310 vm_object_t, object,
9311 vm_object_offset_t, offset,
9312 upl_size_t, size,
9313 upl_t, upl);
9314 }
9315
9316 upl->map_object = object;
9317 upl->u_offset = original_offset;
9318 upl->u_size = original_size;
9319
9320 size_in_pages = size / PAGE_SIZE;
9321
/*
 * The kernel object (when neither UPL_NEED_32BIT_ADDR nor
 * UPL_BLOCK_ACCESS is requested) is locked shared unless UPL_DEBUG is
 * configured; every other case takes the lock exclusively and begins
 * an activity on the object.
 */
9322 if (is_kernel_object(object) &&
9323 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
9324 upl->flags |= UPL_KERNEL_OBJECT;
9325 #if UPL_DEBUG
9326 vm_object_lock(object);
9327 #else
9328 vm_object_lock_shared(object);
9329 #endif
9330 } else {
9331 vm_object_lock(object);
9332 vm_object_activity_begin(object);
9333 }
9334 /*
9335 * paging in progress also protects the paging_offset
9336 */
9337 upl->u_offset = original_offset + object->paging_offset;
9338
9339 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9340 /*
9341 * The user requested that access to the pages in this UPL
9342 * be blocked until the UPL is commited or aborted.
9343 */
9344 upl->flags |= UPL_ACCESS_BLOCKED;
9345 }
9346
9347 #if CONFIG_IOSCHED || UPL_DEBUG
9348 if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9349 vm_object_activity_begin(object);
9350 queue_enter(&object->uplq, upl, upl_t, uplq);
9351 }
9352 #endif
9353
/*
 * Physically contiguous object: no per-page work.  Describe the range
 * through highest_page and the first page-list entry, then return.
 */
9354 if (object->phys_contiguous) {
9355 if (upl->flags & UPL_ACCESS_BLOCKED) {
9356 assert(!object->blocked_access);
9357 object->blocked_access = TRUE;
9358 }
9359
9360 vm_object_unlock(object);
9361
9362 /*
9363 * don't need any shadow mappings for this one
9364 * since it is already I/O memory
9365 */
9366 upl->flags |= UPL_DEVICE_MEMORY;
9367
9368 upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
9369
9370 if (user_page_list) {
9371 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
9372 user_page_list[0].device = TRUE;
9373 }
9374 if (page_list_count != NULL) {
9375 if (upl->flags & UPL_INTERNAL) {
9376 *page_list_count = 0;
9377 } else {
9378 *page_list_count = 1;
9379 }
9380 }
9381
9382 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9383 #if DEVELOPMENT || DEBUG
9384 if (task != NULL) {
9385 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9386 }
9387 #endif /* DEVELOPMENT || DEBUG */
9388 return KERN_SUCCESS;
9389 }
9390 if (!is_kernel_object(object) && object != compressor_object) {
9391 /*
9392 * Protect user space from future COW operations
9393 */
9394 #if VM_OBJECT_TRACKING_OP_TRUESHARE
9395 if (!object->true_share &&
9396 vm_object_tracking_btlog) {
9397 btlog_record(vm_object_tracking_btlog, object,
9398 VM_OBJECT_TRACKING_OP_TRUESHARE,
9399 btref_get(__builtin_frame_address(0), 0));
9400 }
9401 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
9402
9403 vm_object_lock_assert_exclusive(object);
9404 object->true_share = TRUE;
9405
9406 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
9407 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
9408 }
9409 }
9410
9411 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
9412 object->vo_copy != VM_OBJECT_NULL) {
9413 /*
9414 * Honor copy-on-write obligations
9415 *
9416 * The caller is gathering these pages and
9417 * might modify their contents. We need to
9418 * make sure that the copy object has its own
9419 * private copies of these pages before we let
9420 * the caller modify them.
9421 *
9422 * NOTE: someone else could map the original object
9423 * after we've done this copy-on-write here, and they
9424 * could then see an inconsistent picture of the memory
9425 * while it's being modified via the UPL. To prevent this,
9426 * we would have to block access to these pages until the
9427 * UPL is released. We could use the UPL_BLOCK_ACCESS
9428 * code path for that...
9429 */
9430 vm_object_update(object,
9431 offset,
9432 size,
9433 NULL,
9434 NULL,
9435 FALSE, /* should_return */
9436 MEMORY_OBJECT_COPY_SYNC,
9437 VM_PROT_NO_CHANGE);
9438 VM_PAGEOUT_DEBUG(iopl_cow, 1);
9439 VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
9440 }
/*
 * Fast-path selection: the request must cover the whole object
 * (offset 0, size == vo_size), and the object must have no copy
 * object, shadow or pager and must not be volatile/empty purgeable.
 * If every page is resident the "full" fast path is tried; if none is
 * resident the "empty" fast path is used.
 */
9441 if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
9442 object->purgable != VM_PURGABLE_VOLATILE &&
9443 object->purgable != VM_PURGABLE_EMPTY &&
9444 object->vo_copy == NULL &&
9445 size == object->vo_size &&
9446 offset == 0 &&
9447 object->shadow == NULL &&
9448 object->pager == NULL) {
9449 if (object->resident_page_count == size_in_pages) {
9450 assert(object != compressor_object);
9451 assert(!is_kernel_object(object));
9452 fast_path_full_req = TRUE;
9453 } else if (object->resident_page_count == 0) {
9454 assert(object != compressor_object);
9455 assert(!is_kernel_object(object));
9456 fast_path_empty_req = TRUE;
9457 set_cache_attr_needed = TRUE;
9458 }
9459 }
9460
9461 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9462 interruptible = THREAD_ABORTSAFE;
9463 } else {
9464 interruptible = THREAD_UNINT;
9465 }
9466
9467 entry = 0;
9468
9469 xfer_size = size;
9470 dst_offset = offset;
9471
9472 if (fast_path_full_req) {
9473 if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
9474 goto finish;
9475 }
9476 /*
9477 * we couldn't complete the processing of this request on the fast path
9478 * so fall through to the slow path and finish up
9479 */
9480 } else if (fast_path_empty_req) {
9481 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9482 ret = KERN_MEMORY_ERROR;
9483 goto return_err;
9484 }
9485 ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
9486 cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
9487
9488 if (ret) {
/* have return_err free the pages that were inserted */
9489 free_wired_pages = TRUE;
9490 goto return_err;
9491 }
9492 goto finish;
9493 }
9494
/* Slow path: handle the range one page at a time. */
9495 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
9496 fault_info.lo_offset = offset;
9497 fault_info.hi_offset = offset + xfer_size;
9498 fault_info.mark_zf_absent = TRUE;
9499 fault_info.interruptible = interruptible;
9500 fault_info.batch_pmap_op = TRUE;
9501
9502 while (xfer_size) {
9503 vm_fault_return_t result;
9504
9505 dwp->dw_mask = 0;
9506
9507 if (fast_path_full_req) {
9508 /*
9509 * if we get here, it means that we ran into a page
9510 * state we couldn't handle in the fast path and
9511 * bailed out to the slow path... since the order
9512 * we look at pages is different between the 2 paths,
9513 * the following check is needed to determine whether
9514 * this page was already processed in the fast path
9515 */
9516 if (bitmap_test(upl->lite_list, entry)) {
9517 goto skip_page;
9518 }
9519 }
9520 dst_page = vm_page_lookup(object, dst_offset);
9521
9522 if (dst_page == VM_PAGE_NULL ||
9523 dst_page->vmp_busy ||
9524 VMP_ERROR_GET(dst_page) ||
9525 dst_page->vmp_restart ||
9526 dst_page->vmp_absent ||
9527 dst_page->vmp_fictitious) {
9528 if (is_kernel_object(object)) {
9529 panic("vm_object_iopl_request: missing/bad page in kernel object");
9530 }
9531 if (object == compressor_object) {
9532 panic("vm_object_iopl_request: missing/bad page in compressor object");
9533 }
9534
9535 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9536 ret = KERN_MEMORY_ERROR;
9537 goto return_err;
9538 }
9539 set_cache_attr_needed = TRUE;
9540
9541 /*
9542 * We just looked up the page and the result remains valid
9543 * until the object lock is release, so send it to
9544 * vm_fault_page() (as "dst_page"), to avoid having to
9545 * look it up again there.
9546 */
9547 caller_lookup = TRUE;
9548
/* retry vm_fault_page() until it reports VM_FAULT_SUCCESS */
9549 do {
9550 vm_page_t top_page;
9551 kern_return_t error_code;
9552
9553 fault_info.cluster_size = xfer_size;
9554
9555 vm_object_paging_begin(object);
9556
9557 result = vm_fault_page(object, dst_offset,
9558 prot | VM_PROT_WRITE, FALSE,
9559 caller_lookup,
9560 &prot, &dst_page, &top_page,
9561 (int *)0,
9562 &error_code, no_zero_fill,
9563 &fault_info);
9564
9565 /* our lookup is no longer valid at this point */
9566 caller_lookup = FALSE;
9567
9568 switch (result) {
9569 case VM_FAULT_SUCCESS:
9570 page_grab_count++;
9571
9572 if (!dst_page->vmp_absent) {
9573 PAGE_WAKEUP_DONE(dst_page);
9574 } else {
9575 /*
9576 * we only get back an absent page if we
9577 * requested that it not be zero-filled
9578 * because we are about to fill it via I/O
9579 *
9580 * absent pages should be left BUSY
9581 * to prevent them from being faulted
9582 * into an address space before we've
9583 * had a chance to complete the I/O on
9584 * them since they may contain info that
9585 * shouldn't be seen by the faulting task
9586 */
9587 }
9588 /*
9589 * Release paging references and
9590 * top-level placeholder page, if any.
9591 */
9592 if (top_page != VM_PAGE_NULL) {
9593 vm_object_t local_object;
9594
9595 local_object = VM_PAGE_OBJECT(top_page);
9596
9597 /*
9598 * comparing 2 packed pointers
9599 */
9600 if (top_page->vmp_object != dst_page->vmp_object) {
9601 vm_object_lock(local_object);
9602 VM_PAGE_FREE(top_page);
9603 vm_object_paging_end(local_object);
9604 vm_object_unlock(local_object);
9605 } else {
9606 VM_PAGE_FREE(top_page);
9607 vm_object_paging_end(local_object);
9608 }
9609 }
9610 vm_object_paging_end(object);
9611 break;
9612
9613 case VM_FAULT_RETRY:
9614 vm_object_lock(object);
9615 break;
9616
9617 case VM_FAULT_MEMORY_SHORTAGE:
9618 OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
9619
9620 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9621
9622 if (vm_page_wait(interruptible)) {
9623 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9624
9625 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9626 vm_object_lock(object);
9627
9628 break;
9629 }
9630 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9631
9632 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9633 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
9634 OS_FALLTHROUGH;
9635
9636 case VM_FAULT_INTERRUPTED:
9637 error_code = MACH_SEND_INTERRUPTED;
9638 OS_FALLTHROUGH;
9639 case VM_FAULT_MEMORY_ERROR:
9640 memory_error:
9641 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
9642
9643 vm_object_lock(object);
9644 goto return_err;
9645
9646 case VM_FAULT_SUCCESS_NO_VM_PAGE:
9647 /* success but no page: fail */
9648 vm_object_paging_end(object);
9649 vm_object_unlock(object);
9650 goto memory_error;
9651
9652 default:
9653 panic("vm_object_iopl_request: unexpected error"
9654 " 0x%x from vm_fault_page()\n", result);
9655 }
9656 } while (result != VM_FAULT_SUCCESS);
9657 }
9658 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9659
/* for a UPL_KERNEL_OBJECT UPL the page is only recorded, not wired */
9660 if (upl->flags & UPL_KERNEL_OBJECT) {
9661 goto record_phys_addr;
9662 }
9663
9664 if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
9665 dst_page->vmp_busy = TRUE;
9666 goto record_phys_addr;
9667 }
9668
9669 if (dst_page->vmp_cleaning) {
9670 /*
9671 * Someone else is cleaning this page in place.
9672 * In theory, we should be able to proceed and use this
9673 * page but they'll probably end up clearing the "busy"
9674 * bit on it in upl_commit_range() but they didn't set
9675 * it, so they would clear our "busy" bit and open
9676 * us to race conditions.
9677 * We'd better wait for the cleaning to complete and
9678 * then try again.
9679 */
9680 VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
9681 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
9682 continue;
9683 }
9684 if (dst_page->vmp_laundry) {
9685 vm_pageout_steal_laundry(dst_page, FALSE);
9686 }
9687
9688 if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
9689 phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
9690 vm_page_t low_page;
9691 int refmod;
9692
9693 /*
9694 * support devices that can't DMA above 32 bits
9695 * by substituting pages from a pool of low address
9696 * memory for any pages we find above the 4G mark
9697 * can't substitute if the page is already wired because
9698 * we don't know whether that physical address has been
9699 * handed out to some other 64 bit capable DMA device to use
9700 */
9701 if (VM_PAGE_WIRED(dst_page)) {
9702 ret = KERN_PROTECTION_FAILURE;
9703 goto return_err;
9704 }
9705 low_page = vm_page_grablo();
9706
9707 if (low_page == VM_PAGE_NULL) {
9708 ret = KERN_RESOURCE_SHORTAGE;
9709 goto return_err;
9710 }
9711 /*
9712 * from here until the vm_page_replace completes
9713 * we musn't drop the object lock... we don't
9714 * want anyone refaulting this page in and using
9715 * it after we disconnect it... we want the fault
9716 * to find the new page being substituted.
9717 */
9718 if (dst_page->vmp_pmapped) {
9719 refmod = pmap_disconnect(phys_page);
9720 } else {
9721 refmod = 0;
9722 }
9723
9724 if (!dst_page->vmp_absent) {
9725 vm_page_copy(dst_page, low_page);
9726 }
9727
9728 low_page->vmp_reference = dst_page->vmp_reference;
9729 low_page->vmp_dirty = dst_page->vmp_dirty;
9730 low_page->vmp_absent = dst_page->vmp_absent;
9731
9732 if (refmod & VM_MEM_REFERENCED) {
9733 low_page->vmp_reference = TRUE;
9734 }
9735 if (refmod & VM_MEM_MODIFIED) {
9736 SET_PAGE_DIRTY(low_page, FALSE);
9737 }
9738
9739 vm_page_replace(low_page, object, dst_offset);
9740
9741 dst_page = low_page;
9742 /*
9743 * vm_page_grablo returned the page marked
9744 * BUSY... we don't need a PAGE_WAKEUP_DONE
9745 * here, because we've never dropped the object lock
9746 */
9747 if (!dst_page->vmp_absent) {
9748 dst_page->vmp_busy = FALSE;
9749 }
9750
9751 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9752 }
/* busy pages are not queued for wiring */
9753 if (!dst_page->vmp_busy) {
9754 dwp->dw_mask |= DW_vm_page_wire;
9755 }
9756
9757 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9758 /*
9759 * Mark the page "busy" to block any future page fault
9760 * on this page in addition to wiring it.
9761 * We'll also remove the mapping
9762 * of all these pages before leaving this routine.
9763 */
9764 assert(!dst_page->vmp_fictitious);
9765 dst_page->vmp_busy = TRUE;
9766 }
9767 /*
9768 * expect the page to be used
9769 * page queues lock must be held to set 'reference'
9770 */
9771 dwp->dw_mask |= DW_set_reference;
9772
9773 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9774 SET_PAGE_DIRTY(dst_page, TRUE);
9775 /*
9776 * Page belonging to a code-signed object is about to
9777 * be written. Mark it tainted and disconnect it from
9778 * all pmaps so processes have to fault it back in and
9779 * deal with the tainted bit.
9780 */
9781 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
9782 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
9783 vm_page_iopl_tainted++;
9784 if (dst_page->vmp_pmapped) {
9785 int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
9786 if (refmod & VM_MEM_REFERENCED) {
9787 dst_page->vmp_reference = TRUE;
9788 }
9789 }
9790 }
9791 }
9792 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
9793 pmap_sync_page_attributes_phys(phys_page);
9794 dst_page->vmp_written_by_kernel = FALSE;
9795 }
9796
/* record the page in the lite list and, if present, the page list */
9797 record_phys_addr:
9798 if (dst_page->vmp_busy) {
9799 upl->flags |= UPL_HAS_BUSY;
9800 }
9801
9802 bitmap_set(upl->lite_list, entry);
9803
9804 if (phys_page > upl->highest_page) {
9805 upl->highest_page = phys_page;
9806 }
9807
9808 if (user_page_list) {
9809 user_page_list[entry].phys_addr = phys_page;
9810 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
9811 user_page_list[entry].absent = dst_page->vmp_absent;
9812 user_page_list[entry].dirty = dst_page->vmp_dirty;
9813 user_page_list[entry].precious = dst_page->vmp_precious;
9814 user_page_list[entry].device = FALSE;
9815 user_page_list[entry].needed = FALSE;
9816 if (dst_page->vmp_clustered == TRUE) {
9817 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
9818 } else {
9819 user_page_list[entry].speculative = FALSE;
9820 }
9821 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
9822 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
9823 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
9824 user_page_list[entry].mark = FALSE;
9825 }
9826 if (!is_kernel_object(object) && object != compressor_object) {
9827 /*
9828 * someone is explicitly grabbing this page...
9829 * update clustered and speculative state
9830 *
9831 */
9832 if (dst_page->vmp_clustered) {
9833 VM_PAGE_CONSUME_CLUSTERED(dst_page);
9834 }
9835 }
9836 skip_page:
9837 entry++;
9838 dst_offset += PAGE_SIZE_64;
9839 xfer_size -= PAGE_SIZE;
9840
/*
 * Queue this page's delayed work and flush the batch once it reaches
 * dw_limit entries.  dw_mask is still 0 for a page that was skipped
 * above, so nothing is queued for it.
 */
9841 if (dwp->dw_mask) {
9842 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
9843
9844 if (dw_count >= dw_limit) {
9845 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9846
9847 dwp = dwp_start;
9848 dw_count = 0;
9849 }
9850 }
9851 }
9852 assert(entry == size_in_pages);
9853
9854 if (dw_count) {
9855 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9856 dwp = dwp_start;
9857 dw_count = 0;
9858 }
/* Success exit, shared by the fast paths and the slow path. */
9859 finish:
9860 if (user_page_list && set_cache_attr_needed == TRUE) {
9861 vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
9862 }
9863
9864 if (page_list_count != NULL) {
9865 if (upl->flags & UPL_INTERNAL) {
9866 *page_list_count = 0;
9867 } else if (*page_list_count > size_in_pages) {
9868 *page_list_count = size_in_pages;
9869 }
9870 }
9871 vm_object_unlock(object);
9872
/*
 * NOTE(review): blocked_access is updated below after the object lock
 * was dropped just above, whereas the phys_contiguous path sets it
 * while the lock is still held -- confirm this is intentional.
 */
9873 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9874 /*
9875 * We've marked all the pages "busy" so that future
9876 * page faults will block.
9877 * Now remove the mapping for these pages, so that they
9878 * can't be accessed without causing a page fault.
9879 */
9880 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
9881 PMAP_NULL,
9882 PAGE_SIZE,
9883 0, VM_PROT_NONE);
9884 assert(!object->blocked_access);
9885 object->blocked_access = TRUE;
9886 }
9887
9888 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9889 #if DEVELOPMENT || DEBUG
9890 if (task != NULL) {
9891 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9892 }
9893 #endif /* DEVELOPMENT || DEBUG */
9894
/* release the delayed-work context unless the on-stack entry was used */
9895 if (dwp_start && dwp_finish_ctx) {
9896 vm_page_delayed_work_finish_ctx(dwp_start);
9897 dwp_start = dwp = NULL;
9898 }
9899
9900 return KERN_SUCCESS;
9901
/*
 * Error exit.  Walk the offsets handled so far, [offset, dst_offset),
 * and undo the work: a page that is absent (or any page when
 * free_wired_pages is set) is freed; any other page is unwired, unless
 * it is still in the pending delayed-work array, and then woken up.
 * Afterwards the UPL is destroyed and "ret" is returned.
 *
 * NOTE(review): when the full fast path bailed out part-way, pages it
 * already wired at offsets >= dst_offset do not appear to be visited
 * by this loop -- confirm whether they can be left wired.
 */
9902 return_err:
9903 dw_index = 0;
9904
9905 for (; offset < dst_offset; offset += PAGE_SIZE) {
9906 boolean_t need_unwire;
9907
9908 dst_page = vm_page_lookup(object, offset);
9909
9910 if (dst_page == VM_PAGE_NULL) {
9911 panic("vm_object_iopl_request: Wired page missing.");
9912 }
9913
9914 /*
9915 * if we've already processed this page in an earlier
9916 * dw_do_work, we need to undo the wiring... we will
9917 * leave the dirty and reference bits on if they
9918 * were set, since we don't have a good way of knowing
9919 * what the previous state was and we won't get here
9920 * under any normal circumstances... we will always
9921 * clear BUSY and wakeup any waiters via vm_page_free
9922 * or PAGE_WAKEUP_DONE
9923 */
9924 need_unwire = TRUE;
9925
9926 if (dw_count) {
9927 if ((dwp_start)[dw_index].dw_m == dst_page) {
9928 /*
9929 * still in the deferred work list
9930 * which means we haven't yet called
9931 * vm_page_wire on this page
9932 */
9933 need_unwire = FALSE;
9934
9935 dw_index++;
9936 dw_count--;
9937 }
9938 }
9939 vm_page_lock_queues();
9940
9941 if (dst_page->vmp_absent || free_wired_pages == TRUE) {
9942 vm_page_free(dst_page);
9943
9944 need_unwire = FALSE;
9945 } else {
9946 if (need_unwire == TRUE) {
9947 vm_page_unwire(dst_page, TRUE);
9948 }
9949
9950 PAGE_WAKEUP_DONE(dst_page);
9951 }
9952 vm_page_unlock_queues();
9953
9954 if (need_unwire == TRUE) {
9955 counter_inc(&vm_statistics_reactivations);
9956 }
9957 }
9958 #if UPL_DEBUG
9959 upl->upl_state = 2;
9960 #endif
9961 if (!(upl->flags & UPL_KERNEL_OBJECT)) {
9962 vm_object_activity_end(object);
9963 vm_object_collapse(object, 0, TRUE);
9964 }
9965 vm_object_unlock(object);
9966 upl_destroy(upl);
9967
9968 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
9969 #if DEVELOPMENT || DEBUG
9970 if (task != NULL) {
9971 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9972 }
9973 #endif /* DEVELOPMENT || DEBUG */
9974
9975 if (dwp_start && dwp_finish_ctx) {
9976 vm_page_delayed_work_finish_ctx(dwp_start);
9977 dwp_start = dwp = NULL;
9978 }
9979 return ret;
9980 }
9981
9982 kern_return_t
upl_transpose(upl_t upl1,upl_t upl2)9983 upl_transpose(
9984 upl_t upl1,
9985 upl_t upl2)
9986 {
9987 kern_return_t retval;
9988 boolean_t upls_locked;
9989 vm_object_t object1, object2;
9990
9991 /* LD: Should mapped UPLs be eligible for a transpose? */
9992 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
9993 return KERN_INVALID_ARGUMENT;
9994 }
9995
9996 upls_locked = FALSE;
9997
9998 /*
9999 * Since we need to lock both UPLs at the same time,
10000 * avoid deadlocks by always taking locks in the same order.
10001 */
10002 if (upl1 < upl2) {
10003 upl_lock(upl1);
10004 upl_lock(upl2);
10005 } else {
10006 upl_lock(upl2);
10007 upl_lock(upl1);
10008 }
10009 upls_locked = TRUE; /* the UPLs will need to be unlocked */
10010
10011 object1 = upl1->map_object;
10012 object2 = upl2->map_object;
10013
10014 if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
10015 upl1->u_size != upl2->u_size) {
10016 /*
10017 * We deal only with full objects, not subsets.
10018 * That's because we exchange the entire backing store info
10019 * for the objects: pager, resident pages, etc... We can't do
10020 * only part of it.
10021 */
10022 retval = KERN_INVALID_VALUE;
10023 goto done;
10024 }
10025
10026 /*
10027 * Tranpose the VM objects' backing store.
10028 */
10029 retval = vm_object_transpose(object1, object2,
10030 upl_adjusted_size(upl1, PAGE_MASK));
10031
10032 if (retval == KERN_SUCCESS) {
10033 /*
10034 * Make each UPL point to the correct VM object, i.e. the
10035 * object holding the pages that the UPL refers to...
10036 */
10037 #if CONFIG_IOSCHED || UPL_DEBUG
10038 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10039 vm_object_lock(object1);
10040 vm_object_lock(object2);
10041 }
10042 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10043 queue_remove(&object1->uplq, upl1, upl_t, uplq);
10044 }
10045 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10046 queue_remove(&object2->uplq, upl2, upl_t, uplq);
10047 }
10048 #endif
10049 upl1->map_object = object2;
10050 upl2->map_object = object1;
10051
10052 #if CONFIG_IOSCHED || UPL_DEBUG
10053 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10054 queue_enter(&object2->uplq, upl1, upl_t, uplq);
10055 }
10056 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10057 queue_enter(&object1->uplq, upl2, upl_t, uplq);
10058 }
10059 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10060 vm_object_unlock(object2);
10061 vm_object_unlock(object1);
10062 }
10063 #endif
10064 }
10065
10066 done:
10067 /*
10068 * Cleanup.
10069 */
10070 if (upls_locked) {
10071 upl_unlock(upl1);
10072 upl_unlock(upl2);
10073 upls_locked = FALSE;
10074 }
10075
10076 return retval;
10077 }
10078
10079 void
upl_range_needed(upl_t upl,int index,int count)10080 upl_range_needed(
10081 upl_t upl,
10082 int index,
10083 int count)
10084 {
10085 int size_in_pages;
10086
10087 if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
10088 return;
10089 }
10090
10091 size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
10092
10093 while (count-- && index < size_in_pages) {
10094 upl->page_list[index++].needed = TRUE;
10095 }
10096 }
10097
10098
/*
 * Reserve of virtual addresses in the kernel address space.
 * We need to map the physical pages in the kernel, so that we
 * can call the code-signing or slide routines with a kernel
 * virtual address.  We keep this pool of pre-allocated kernel
 * virtual addresses so that we don't have to scan the kernel's
 * virtual address space each time we need to work with
 * a physical page.
 */
/* lock taken around updates to the pool's in-use flags and waiter count */
SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
/* number of page-sized slots in the pool */
#define VM_PAGING_NUM_PAGES     64
/* base kernel virtual address of the pool; 0 until vm_paging_map_init() has run */
SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
/* one flag per pool slot: TRUE while the slot is handed out */
bool            vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
/* highest pool slot index handed out so far */
int             vm_paging_max_index = 0;
/* number of threads currently waiting for a pool slot */
int             vm_paging_page_waiter = 0;
/* running total of waits for a pool slot */
int             vm_paging_page_waiter_total = 0;

/* statistics: pool exhausted, and objects/pages mapped via the pool vs. via the kernel_map ("slow") */
unsigned long   vm_paging_no_kernel_page = 0;
unsigned long   vm_paging_objects_mapped = 0;
unsigned long   vm_paging_pages_mapped = 0;
unsigned long   vm_paging_objects_mapped_slow = 0;
unsigned long   vm_paging_pages_mapped_slow = 0;
10121
/*
 * vm_paging_map_init:
 *	Startup hook that allocates VM_PAGING_NUM_PAGES pages' worth of
 *	address range from the kernel_map with kmem_alloc() and stores
 *	its base in vm_paging_base_address.  KMA_NOFAIL is passed, so the
 *	return value is not checked here.
 */
__startup_func
static void
vm_paging_map_init(void)
{
	kmem_alloc(kernel_map, &vm_paging_base_address,
	    ptoa(VM_PAGING_NUM_PAGES),
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
	    VM_KERN_MEMORY_NONE);
}
/* registered to run in the ZALLOC startup phase, at STARTUP_RANK_LAST */
STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
10132
10133 /*
10134 * vm_paging_map_object:
10135 * Maps part of a VM object's pages in the kernel
10136 * virtual address space, using the pre-allocated
10137 * kernel virtual addresses, if possible.
10138 * Context:
10139 * The VM object is locked. This lock will get
10140 * dropped and re-acquired though, so the caller
10141 * must make sure the VM object is kept alive
10142 * (by holding a VM map that has a reference
10143 * on it, for example, or taking an extra reference).
10144 * The page should also be kept busy to prevent
10145 * it from being reclaimed.
10146 */
10147 kern_return_t
vm_paging_map_object(vm_page_t page,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection,boolean_t can_unlock_object,vm_map_size_t * size,vm_map_offset_t * address,boolean_t * need_unmap)10148 vm_paging_map_object(
10149 vm_page_t page,
10150 vm_object_t object,
10151 vm_object_offset_t offset,
10152 vm_prot_t protection,
10153 boolean_t can_unlock_object,
10154 vm_map_size_t *size, /* IN/OUT */
10155 vm_map_offset_t *address, /* OUT */
10156 boolean_t *need_unmap) /* OUT */
10157 {
10158 kern_return_t kr;
10159 vm_map_offset_t page_map_offset;
10160 vm_map_size_t map_size;
10161 vm_object_offset_t object_offset;
10162 int i;
10163
10164 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
10165 /* use permanent 1-to-1 kernel mapping of physical memory ? */
10166 *address = (vm_map_offset_t)
10167 phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
10168 *need_unmap = FALSE;
10169 return KERN_SUCCESS;
10170
10171 assert(page->vmp_busy);
10172 /*
10173 * Use one of the pre-allocated kernel virtual addresses
10174 * and just enter the VM page in the kernel address space
10175 * at that virtual address.
10176 */
10177 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10178
10179 /*
10180 * Try and find an available kernel virtual address
10181 * from our pre-allocated pool.
10182 */
10183 page_map_offset = 0;
10184 for (;;) {
10185 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
10186 if (vm_paging_page_inuse[i] == FALSE) {
10187 page_map_offset =
10188 vm_paging_base_address +
10189 (i * PAGE_SIZE);
10190 break;
10191 }
10192 }
10193 if (page_map_offset != 0) {
10194 /* found a space to map our page ! */
10195 break;
10196 }
10197
10198 if (can_unlock_object) {
10199 /*
10200 * If we can afford to unlock the VM object,
10201 * let's take the slow path now...
10202 */
10203 break;
10204 }
10205 /*
10206 * We can't afford to unlock the VM object, so
10207 * let's wait for a space to become available...
10208 */
10209 vm_paging_page_waiter_total++;
10210 vm_paging_page_waiter++;
10211 kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
10212 if (kr == THREAD_WAITING) {
10213 simple_unlock(&vm_paging_lock);
10214 kr = thread_block(THREAD_CONTINUE_NULL);
10215 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10216 }
10217 vm_paging_page_waiter--;
10218 /* ... and try again */
10219 }
10220
10221 if (page_map_offset != 0) {
10222 /*
10223 * We found a kernel virtual address;
10224 * map the physical page to that virtual address.
10225 */
10226 if (i > vm_paging_max_index) {
10227 vm_paging_max_index = i;
10228 }
10229 vm_paging_page_inuse[i] = TRUE;
10230 simple_unlock(&vm_paging_lock);
10231
10232 page->vmp_pmapped = TRUE;
10233
10234 /*
10235 * Keep the VM object locked over the PMAP_ENTER
10236 * and the actual use of the page by the kernel,
10237 * or this pmap mapping might get undone by a
10238 * vm_object_pmap_protect() call...
10239 */
10240 kr = pmap_enter_check(kernel_pmap,
10241 page_map_offset,
10242 page,
10243 protection,
10244 VM_PROT_NONE,
10245 0,
10246 TRUE);
10247 assert(kr == KERN_SUCCESS);
10248 vm_paging_objects_mapped++;
10249 vm_paging_pages_mapped++;
10250 *address = page_map_offset;
10251 *need_unmap = TRUE;
10252
10253 #if KASAN
10254 kasan_notify_address(page_map_offset, PAGE_SIZE);
10255 #endif
10256
10257 /* all done and mapped, ready to use ! */
10258 return KERN_SUCCESS;
10259 }
10260
10261 /*
10262 * We ran out of pre-allocated kernel virtual
10263 * addresses. Just map the page in the kernel
10264 * the slow and regular way.
10265 */
10266 vm_paging_no_kernel_page++;
10267 simple_unlock(&vm_paging_lock);
10268 }
10269
10270 if (!can_unlock_object) {
10271 *address = 0;
10272 *size = 0;
10273 *need_unmap = FALSE;
10274 return KERN_NOT_SUPPORTED;
10275 }
10276
10277 object_offset = vm_object_trunc_page(offset);
10278 map_size = vm_map_round_page(*size,
10279 VM_MAP_PAGE_MASK(kernel_map));
10280
10281 /*
10282 * Try and map the required range of the object
10283 * in the kernel_map. Given that allocation is
10284 * for pageable memory, it shouldn't contain
10285 * pointers and is mapped into the data range.
10286 */
10287
10288 vm_object_reference_locked(object); /* for the map entry */
10289 vm_object_unlock(object);
10290
10291 kr = vm_map_enter(kernel_map,
10292 address,
10293 map_size,
10294 0,
10295 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
10296 object,
10297 object_offset,
10298 FALSE,
10299 protection,
10300 VM_PROT_ALL,
10301 VM_INHERIT_NONE);
10302 if (kr != KERN_SUCCESS) {
10303 *address = 0;
10304 *size = 0;
10305 *need_unmap = FALSE;
10306 vm_object_deallocate(object); /* for the map entry */
10307 vm_object_lock(object);
10308 return kr;
10309 }
10310
10311 *size = map_size;
10312
10313 /*
10314 * Enter the mapped pages in the page table now.
10315 */
10316 vm_object_lock(object);
10317 /*
10318 * VM object must be kept locked from before PMAP_ENTER()
10319 * until after the kernel is done accessing the page(s).
10320 * Otherwise, the pmap mappings in the kernel could be
10321 * undone by a call to vm_object_pmap_protect().
10322 */
10323
10324 for (page_map_offset = 0;
10325 map_size != 0;
10326 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
10327 page = vm_page_lookup(object, offset + page_map_offset);
10328 if (page == VM_PAGE_NULL) {
10329 printf("vm_paging_map_object: no page !?");
10330 vm_object_unlock(object);
10331 vm_map_remove(kernel_map, *address, *size);
10332 *address = 0;
10333 *size = 0;
10334 *need_unmap = FALSE;
10335 vm_object_lock(object);
10336 return KERN_MEMORY_ERROR;
10337 }
10338 page->vmp_pmapped = TRUE;
10339
10340 kr = pmap_enter_check(kernel_pmap,
10341 *address + page_map_offset,
10342 page,
10343 protection,
10344 VM_PROT_NONE,
10345 0,
10346 TRUE);
10347 assert(kr == KERN_SUCCESS);
10348 #if KASAN
10349 kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
10350 #endif
10351 }
10352
10353 vm_paging_objects_mapped_slow++;
10354 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
10355
10356 *need_unmap = TRUE;
10357
10358 return KERN_SUCCESS;
10359 }
10360
10361 /*
10362 * vm_paging_unmap_object:
10363 * Unmaps part of a VM object's pages from the kernel
10364 * virtual address space.
10365 * Context:
10366 * The VM object is locked. This lock will get
10367 * dropped and re-acquired though.
10368 */
10369 void
vm_paging_unmap_object(vm_object_t object,vm_map_offset_t start,vm_map_offset_t end)10370 vm_paging_unmap_object(
10371 vm_object_t object,
10372 vm_map_offset_t start,
10373 vm_map_offset_t end)
10374 {
10375 int i;
10376
10377 if ((vm_paging_base_address == 0) ||
10378 (start < vm_paging_base_address) ||
10379 (end > (vm_paging_base_address
10380 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
10381 /*
10382 * We didn't use our pre-allocated pool of
10383 * kernel virtual address. Deallocate the
10384 * virtual memory.
10385 */
10386 if (object != VM_OBJECT_NULL) {
10387 vm_object_unlock(object);
10388 }
10389 vm_map_remove(kernel_map, start, end);
10390 if (object != VM_OBJECT_NULL) {
10391 vm_object_lock(object);
10392 }
10393 } else {
10394 /*
10395 * We used a kernel virtual address from our
10396 * pre-allocated pool. Put it back in the pool
10397 * for next time.
10398 */
10399 assert(end - start == PAGE_SIZE);
10400 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
10401 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
10402
10403 /* undo the pmap mapping */
10404 pmap_remove(kernel_pmap, start, end);
10405
10406 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10407 vm_paging_page_inuse[i] = FALSE;
10408 if (vm_paging_page_waiter) {
10409 thread_wakeup(&vm_paging_page_waiter);
10410 }
10411 simple_unlock(&vm_paging_lock);
10412 }
10413 }
10414
10415
10416 /*
10417 * page->vmp_object must be locked
10418 */
10419 void
vm_pageout_steal_laundry(vm_page_t page,boolean_t queues_locked)10420 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10421 {
10422 if (!queues_locked) {
10423 vm_page_lockspin_queues();
10424 }
10425
10426 page->vmp_free_when_done = FALSE;
10427 /*
10428 * need to drop the laundry count...
10429 * we may also need to remove it
10430 * from the I/O paging queue...
10431 * vm_pageout_throttle_up handles both cases
10432 *
10433 * the laundry and pageout_queue flags are cleared...
10434 */
10435 vm_pageout_throttle_up(page);
10436
10437 if (!queues_locked) {
10438 vm_page_unlock_queues();
10439 }
10440 }
10441
10442 #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
10443
10444 upl_t
vector_upl_create(vm_offset_t upl_offset,uint32_t max_upls)10445 vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
10446 {
10447 int i = 0;
10448 upl_t upl;
10449
10450 assert(max_upls > 0);
10451 if (max_upls == 0) {
10452 return NULL;
10453 }
10454
10455 if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
10456 max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
10457 }
10458 vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
10459
10460 upl = upl_create(0, UPL_VECTOR, 0);
10461 upl->vector_upl = vector_upl;
10462 upl->u_offset = upl_offset;
10463 vector_upl->size = 0;
10464 vector_upl->offset = upl_offset;
10465 vector_upl->invalid_upls = 0;
10466 vector_upl->num_upls = 0;
10467 vector_upl->pagelist = NULL;
10468 vector_upl->max_upls = max_upls;
10469
10470 for (i = 0; i < max_upls; i++) {
10471 vector_upl->upls[i].iostate.size = 0;
10472 vector_upl->upls[i].iostate.offset = 0;
10473 }
10474 return upl;
10475 }
10476
10477 uint32_t
vector_upl_max_upls(const upl_t upl)10478 vector_upl_max_upls(const upl_t upl)
10479 {
10480 if (!vector_upl_is_valid(upl)) {
10481 return 0;
10482 }
10483 return ((vector_upl_t)(upl->vector_upl))->max_upls;
10484 }
10485
10486 void
vector_upl_deallocate(upl_t upl)10487 vector_upl_deallocate(upl_t upl)
10488 {
10489 vector_upl_t vector_upl = upl->vector_upl;
10490
10491 assert(vector_upl_is_valid(upl));
10492
10493 if (vector_upl->invalid_upls != vector_upl->num_upls) {
10494 panic("Deallocating non-empty Vectored UPL");
10495 }
10496 uint32_t max_upls = vector_upl->max_upls;
10497 kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
10498 kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
10499 upl->vector_upl = NULL;
10500 }
10501
10502 boolean_t
vector_upl_is_valid(upl_t upl)10503 vector_upl_is_valid(upl_t upl)
10504 {
10505 return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
10506 }
10507
10508 boolean_t
vector_upl_set_subupl(upl_t upl,upl_t subupl,uint32_t io_size)10509 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
10510 {
10511 if (vector_upl_is_valid(upl)) {
10512 vector_upl_t vector_upl = upl->vector_upl;
10513
10514 if (vector_upl) {
10515 if (subupl) {
10516 if (io_size) {
10517 if (io_size < PAGE_SIZE) {
10518 io_size = PAGE_SIZE;
10519 }
10520 subupl->vector_upl = (void*)vector_upl;
10521 vector_upl->upls[vector_upl->num_upls++].elem = subupl;
10522 vector_upl->size += io_size;
10523 upl->u_size += io_size;
10524 } else {
10525 uint32_t i = 0, invalid_upls = 0;
10526 for (i = 0; i < vector_upl->num_upls; i++) {
10527 if (vector_upl->upls[i].elem == subupl) {
10528 break;
10529 }
10530 }
10531 if (i == vector_upl->num_upls) {
10532 panic("Trying to remove sub-upl when none exists");
10533 }
10534
10535 vector_upl->upls[i].elem = NULL;
10536 invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
10537 relaxed);
10538 if (invalid_upls == vector_upl->num_upls) {
10539 return TRUE;
10540 } else {
10541 return FALSE;
10542 }
10543 }
10544 } else {
10545 panic("vector_upl_set_subupl was passed a NULL upl element");
10546 }
10547 } else {
10548 panic("vector_upl_set_subupl was passed a non-vectored upl");
10549 }
10550 } else {
10551 panic("vector_upl_set_subupl was passed a NULL upl");
10552 }
10553
10554 return FALSE;
10555 }
10556
10557 void
vector_upl_set_pagelist(upl_t upl)10558 vector_upl_set_pagelist(upl_t upl)
10559 {
10560 if (vector_upl_is_valid(upl)) {
10561 uint32_t i = 0;
10562 vector_upl_t vector_upl = upl->vector_upl;
10563
10564 if (vector_upl) {
10565 vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
10566
10567 vector_upl->pagelist = kalloc_type(struct upl_page_info,
10568 atop(vector_upl->size), Z_WAITOK);
10569
10570 for (i = 0; i < vector_upl->num_upls; i++) {
10571 cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
10572 bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10573 pagelist_size += cur_upl_pagelist_size;
10574 if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
10575 upl->highest_page = vector_upl->upls[i].elem->highest_page;
10576 }
10577 }
10578 assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10579 } else {
10580 panic("vector_upl_set_pagelist was passed a non-vectored upl");
10581 }
10582 } else {
10583 panic("vector_upl_set_pagelist was passed a NULL upl");
10584 }
10585 }
10586
10587 upl_t
vector_upl_subupl_byindex(upl_t upl,uint32_t index)10588 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10589 {
10590 if (vector_upl_is_valid(upl)) {
10591 vector_upl_t vector_upl = upl->vector_upl;
10592 if (vector_upl) {
10593 if (index < vector_upl->num_upls) {
10594 return vector_upl->upls[index].elem;
10595 }
10596 } else {
10597 panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10598 }
10599 }
10600 return NULL;
10601 }
10602
10603 upl_t
vector_upl_subupl_byoffset(upl_t upl,upl_offset_t * upl_offset,upl_size_t * upl_size)10604 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
10605 {
10606 if (vector_upl_is_valid(upl)) {
10607 uint32_t i = 0;
10608 vector_upl_t vector_upl = upl->vector_upl;
10609
10610 if (vector_upl) {
10611 upl_t subupl = NULL;
10612 vector_upl_iostates_t subupl_state;
10613
10614 for (i = 0; i < vector_upl->num_upls; i++) {
10615 subupl = vector_upl->upls[i].elem;
10616 subupl_state = vector_upl->upls[i].iostate;
10617 if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
10618 /* We could have been passed an offset/size pair that belongs
10619 * to an UPL element that has already been committed/aborted.
10620 * If so, return NULL.
10621 */
10622 if (subupl == NULL) {
10623 return NULL;
10624 }
10625 if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
10626 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
10627 if (*upl_size > subupl_state.size) {
10628 *upl_size = subupl_state.size;
10629 }
10630 }
10631 if (*upl_offset >= subupl_state.offset) {
10632 *upl_offset -= subupl_state.offset;
10633 } else if (i) {
10634 panic("Vector UPL offset miscalculation");
10635 }
10636 return subupl;
10637 }
10638 }
10639 } else {
10640 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
10641 }
10642 }
10643 return NULL;
10644 }
10645
10646 void
vector_upl_get_submap(upl_t upl,vm_map_t * v_upl_submap,vm_offset_t * submap_dst_addr)10647 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10648 {
10649 *v_upl_submap = NULL;
10650
10651 if (vector_upl_is_valid(upl)) {
10652 vector_upl_t vector_upl = upl->vector_upl;
10653 if (vector_upl) {
10654 *v_upl_submap = vector_upl->submap;
10655 *submap_dst_addr = vector_upl->submap_dst_addr;
10656 } else {
10657 panic("vector_upl_get_submap was passed a non-vectored UPL");
10658 }
10659 } else {
10660 panic("vector_upl_get_submap was passed a null UPL");
10661 }
10662 }
10663
10664 void
vector_upl_set_submap(upl_t upl,vm_map_t submap,vm_offset_t submap_dst_addr)10665 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10666 {
10667 if (vector_upl_is_valid(upl)) {
10668 vector_upl_t vector_upl = upl->vector_upl;
10669 if (vector_upl) {
10670 vector_upl->submap = submap;
10671 vector_upl->submap_dst_addr = submap_dst_addr;
10672 } else {
10673 panic("vector_upl_get_submap was passed a non-vectored UPL");
10674 }
10675 } else {
10676 panic("vector_upl_get_submap was passed a NULL UPL");
10677 }
10678 }
10679
10680 void
vector_upl_set_iostate(upl_t upl,upl_t subupl,upl_offset_t offset,upl_size_t size)10681 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10682 {
10683 if (vector_upl_is_valid(upl)) {
10684 uint32_t i = 0;
10685 vector_upl_t vector_upl = upl->vector_upl;
10686
10687 if (vector_upl) {
10688 for (i = 0; i < vector_upl->num_upls; i++) {
10689 if (vector_upl->upls[i].elem == subupl) {
10690 break;
10691 }
10692 }
10693
10694 if (i == vector_upl->num_upls) {
10695 panic("setting sub-upl iostate when none exists");
10696 }
10697
10698 vector_upl->upls[i].iostate.offset = offset;
10699 if (size < PAGE_SIZE) {
10700 size = PAGE_SIZE;
10701 }
10702 vector_upl->upls[i].iostate.size = size;
10703 } else {
10704 panic("vector_upl_set_iostate was passed a non-vectored UPL");
10705 }
10706 } else {
10707 panic("vector_upl_set_iostate was passed a NULL UPL");
10708 }
10709 }
10710
10711 void
vector_upl_get_iostate(upl_t upl,upl_t subupl,upl_offset_t * offset,upl_size_t * size)10712 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10713 {
10714 if (vector_upl_is_valid(upl)) {
10715 uint32_t i = 0;
10716 vector_upl_t vector_upl = upl->vector_upl;
10717
10718 if (vector_upl) {
10719 for (i = 0; i < vector_upl->num_upls; i++) {
10720 if (vector_upl->upls[i].elem == subupl) {
10721 break;
10722 }
10723 }
10724
10725 if (i == vector_upl->num_upls) {
10726 panic("getting sub-upl iostate when none exists");
10727 }
10728
10729 *offset = vector_upl->upls[i].iostate.offset;
10730 *size = vector_upl->upls[i].iostate.size;
10731 } else {
10732 panic("vector_upl_get_iostate was passed a non-vectored UPL");
10733 }
10734 } else {
10735 panic("vector_upl_get_iostate was passed a NULL UPL");
10736 }
10737 }
10738
10739 void
vector_upl_get_iostate_byindex(upl_t upl,uint32_t index,upl_offset_t * offset,upl_size_t * size)10740 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10741 {
10742 if (vector_upl_is_valid(upl)) {
10743 vector_upl_t vector_upl = upl->vector_upl;
10744 if (vector_upl) {
10745 if (index < vector_upl->num_upls) {
10746 *offset = vector_upl->upls[index].iostate.offset;
10747 *size = vector_upl->upls[index].iostate.size;
10748 } else {
10749 *offset = *size = 0;
10750 }
10751 } else {
10752 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
10753 }
10754 } else {
10755 panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10756 }
10757 }
10758
/*
 * Return the UPL's vector_upl pointer as an untyped pointer
 * (NULL when none is attached).
 */
void *
upl_get_internal_vectorupl(upl_t upl)
{
	return upl->vector_upl;
}
10764
10765 upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)10766 upl_get_internal_vectorupl_pagelist(upl_t upl)
10767 {
10768 return upl->vector_upl->pagelist;
10769 }
10770
10771 upl_page_info_t *
upl_get_internal_page_list(upl_t upl)10772 upl_get_internal_page_list(upl_t upl)
10773 {
10774 return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
10775 }
10776
10777 void
upl_clear_dirty(upl_t upl,boolean_t value)10778 upl_clear_dirty(
10779 upl_t upl,
10780 boolean_t value)
10781 {
10782 if (value) {
10783 upl->flags |= UPL_CLEAR_DIRTY;
10784 } else {
10785 upl->flags &= ~UPL_CLEAR_DIRTY;
10786 }
10787 }
10788
10789 void
upl_set_referenced(upl_t upl,boolean_t value)10790 upl_set_referenced(
10791 upl_t upl,
10792 boolean_t value)
10793 {
10794 upl_lock(upl);
10795 if (value) {
10796 upl->ext_ref_count++;
10797 } else {
10798 if (!upl->ext_ref_count) {
10799 panic("upl_set_referenced not %p", upl);
10800 }
10801 upl->ext_ref_count--;
10802 }
10803 upl_unlock(upl);
10804 }
10805
10806 #if CONFIG_IOSCHED
10807 void
upl_set_blkno(upl_t upl,vm_offset_t upl_offset,int io_size,int64_t blkno)10808 upl_set_blkno(
10809 upl_t upl,
10810 vm_offset_t upl_offset,
10811 int io_size,
10812 int64_t blkno)
10813 {
10814 int i, j;
10815 if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
10816 return;
10817 }
10818
10819 assert(upl->upl_reprio_info != 0);
10820 for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
10821 UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
10822 }
10823 }
10824 #endif
10825
10826 void inline
memoryshot(unsigned int event,unsigned int control)10827 memoryshot(unsigned int event, unsigned int control)
10828 {
10829 if (vm_debug_events) {
10830 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10831 vm_page_active_count, vm_page_inactive_count,
10832 vm_page_free_count, vm_page_speculative_count,
10833 vm_page_throttled_count);
10834 } else {
10835 (void) event;
10836 (void) control;
10837 }
10838 }
10839
10840 #ifdef MACH_BSD
10841
/* Function form of the UPL_DEVICE_PAGE() macro for a page-info array. */
boolean_t
upl_device_page(upl_page_info_t *upl)
{
	return UPL_DEVICE_PAGE(upl);
}
/* Function form of the UPL_PAGE_PRESENT() macro for entry "index". */
boolean_t
upl_page_present(upl_page_info_t *upl, int index)
{
	return UPL_PAGE_PRESENT(upl, index);
}
/* Function form of the UPL_SPECULATIVE_PAGE() macro for entry "index". */
boolean_t
upl_speculative_page(upl_page_info_t *upl, int index)
{
	return UPL_SPECULATIVE_PAGE(upl, index);
}
/* Function form of the UPL_DIRTY_PAGE() macro for entry "index". */
boolean_t
upl_dirty_page(upl_page_info_t *upl, int index)
{
	return UPL_DIRTY_PAGE(upl, index);
}
/* Function form of the UPL_VALID_PAGE() macro for entry "index". */
boolean_t
upl_valid_page(upl_page_info_t *upl, int index)
{
	return UPL_VALID_PAGE(upl, index);
}
/* Function form of the UPL_PHYS_PAGE() macro for entry "index". */
ppnum_t
upl_phys_page(upl_page_info_t *upl, int index)
{
	return UPL_PHYS_PAGE(upl, index);
}
10872
/* Store "v" in the "mark" field of page-info entry "index" (no bounds check). */
void
upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
{
	upl[index].mark = v;
}
10878
/* Return the "mark" field of page-info entry "index" (no bounds check). */
boolean_t
upl_page_get_mark(upl_page_info_t *upl, int index)
{
	return upl[index].mark;
}
10884
10885 void
vm_countdirtypages(void)10886 vm_countdirtypages(void)
10887 {
10888 vm_page_t m;
10889 int dpages;
10890 int pgopages;
10891 int precpages;
10892
10893
10894 dpages = 0;
10895 pgopages = 0;
10896 precpages = 0;
10897
10898 vm_page_lock_queues();
10899 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10900 do {
10901 if (m == (vm_page_t)0) {
10902 break;
10903 }
10904
10905 if (m->vmp_dirty) {
10906 dpages++;
10907 }
10908 if (m->vmp_free_when_done) {
10909 pgopages++;
10910 }
10911 if (m->vmp_precious) {
10912 precpages++;
10913 }
10914
10915 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10916 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10917 if (m == (vm_page_t)0) {
10918 break;
10919 }
10920 } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
10921 vm_page_unlock_queues();
10922
10923 vm_page_lock_queues();
10924 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
10925 do {
10926 if (m == (vm_page_t)0) {
10927 break;
10928 }
10929
10930 dpages++;
10931 assert(m->vmp_dirty);
10932 assert(!m->vmp_free_when_done);
10933 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10934 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10935 if (m == (vm_page_t)0) {
10936 break;
10937 }
10938 } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
10939 vm_page_unlock_queues();
10940
10941 vm_page_lock_queues();
10942 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10943 do {
10944 if (m == (vm_page_t)0) {
10945 break;
10946 }
10947
10948 if (m->vmp_dirty) {
10949 dpages++;
10950 }
10951 if (m->vmp_free_when_done) {
10952 pgopages++;
10953 }
10954 if (m->vmp_precious) {
10955 precpages++;
10956 }
10957
10958 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10959 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10960 if (m == (vm_page_t)0) {
10961 break;
10962 }
10963 } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
10964 vm_page_unlock_queues();
10965
10966 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10967
10968 dpages = 0;
10969 pgopages = 0;
10970 precpages = 0;
10971
10972 vm_page_lock_queues();
10973 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10974
10975 do {
10976 if (m == (vm_page_t)0) {
10977 break;
10978 }
10979 if (m->vmp_dirty) {
10980 dpages++;
10981 }
10982 if (m->vmp_free_when_done) {
10983 pgopages++;
10984 }
10985 if (m->vmp_precious) {
10986 precpages++;
10987 }
10988
10989 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10990 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10991 if (m == (vm_page_t)0) {
10992 break;
10993 }
10994 } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
10995 vm_page_unlock_queues();
10996
10997 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
10998 }
10999 #endif /* MACH_BSD */
11000
11001
11002 #if CONFIG_IOSCHED
11003 int
upl_get_cached_tier(upl_t upl)11004 upl_get_cached_tier(upl_t upl)
11005 {
11006 assert(upl);
11007 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
11008 return upl->upl_priority;
11009 }
11010 return -1;
11011 }
11012 #endif /* CONFIG_IOSCHED */
11013
11014
11015 void
upl_callout_iodone(upl_t upl)11016 upl_callout_iodone(upl_t upl)
11017 {
11018 struct upl_io_completion *upl_ctx = upl->upl_iodone;
11019
11020 if (upl_ctx) {
11021 void (*iodone_func)(void *, int) = upl_ctx->io_done;
11022
11023 assert(upl_ctx->io_done);
11024
11025 (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
11026 }
11027 }
11028
11029 void
upl_set_iodone(upl_t upl,void * upl_iodone)11030 upl_set_iodone(upl_t upl, void *upl_iodone)
11031 {
11032 upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
11033 }
11034
11035 void
upl_set_iodone_error(upl_t upl,int error)11036 upl_set_iodone_error(upl_t upl, int error)
11037 {
11038 struct upl_io_completion *upl_ctx = upl->upl_iodone;
11039
11040 if (upl_ctx) {
11041 upl_ctx->io_error = error;
11042 }
11043 }
11044
11045
11046 ppnum_t
upl_get_highest_page(upl_t upl)11047 upl_get_highest_page(
11048 upl_t upl)
11049 {
11050 return upl->highest_page;
11051 }
11052
11053 upl_size_t
upl_get_size(upl_t upl)11054 upl_get_size(
11055 upl_t upl)
11056 {
11057 return upl_adjusted_size(upl, PAGE_MASK);
11058 }
11059
11060 upl_size_t
upl_adjusted_size(upl_t upl,vm_map_offset_t pgmask)11061 upl_adjusted_size(
11062 upl_t upl,
11063 vm_map_offset_t pgmask)
11064 {
11065 vm_object_offset_t start_offset, end_offset;
11066
11067 start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
11068 end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
11069
11070 return (upl_size_t)(end_offset - start_offset);
11071 }
11072
11073 vm_object_offset_t
upl_adjusted_offset(upl_t upl,vm_map_offset_t pgmask)11074 upl_adjusted_offset(
11075 upl_t upl,
11076 vm_map_offset_t pgmask)
11077 {
11078 return trunc_page_mask_64(upl->u_offset, pgmask);
11079 }
11080
11081 vm_object_offset_t
upl_get_data_offset(upl_t upl)11082 upl_get_data_offset(
11083 upl_t upl)
11084 {
11085 return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
11086 }
11087
11088 upl_t
upl_associated_upl(upl_t upl)11089 upl_associated_upl(upl_t upl)
11090 {
11091 return upl->associated_upl;
11092 }
11093
11094 void
upl_set_associated_upl(upl_t upl,upl_t associated_upl)11095 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
11096 {
11097 upl->associated_upl = associated_upl;
11098 }
11099
11100 struct vnode *
upl_lookup_vnode(upl_t upl)11101 upl_lookup_vnode(upl_t upl)
11102 {
11103 if (!upl->map_object->internal) {
11104 return vnode_pager_lookup_vnode(upl->map_object->pager);
11105 } else {
11106 return NULL;
11107 }
11108 }
11109
11110 #if UPL_DEBUG
11111 kern_return_t
upl_ubc_alias_set(upl_t upl,uintptr_t alias1,uintptr_t alias2)11112 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
11113 {
11114 upl->ubc_alias1 = alias1;
11115 upl->ubc_alias2 = alias2;
11116 return KERN_SUCCESS;
11117 }
11118 int
upl_ubc_alias_get(upl_t upl,uintptr_t * al,uintptr_t * al2)11119 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
11120 {
11121 if (al) {
11122 *al = upl->ubc_alias1;
11123 }
11124 if (al2) {
11125 *al2 = upl->ubc_alias2;
11126 }
11127 return KERN_SUCCESS;
11128 }
11129 #endif /* UPL_DEBUG */
11130
11131 #if VM_PRESSURE_EVENTS
11132 /*
11133 * Upward trajectory.
11134 */
11135 extern boolean_t vm_compressor_low_on_space(void);
11136
11137 boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)11138 VM_PRESSURE_NORMAL_TO_WARNING(void)
11139 {
11140 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11141 /* Available pages below our threshold */
11142 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11143 /* No frozen processes to kill */
11144 if (memorystatus_frozen_count == 0) {
11145 /* Not enough suspended processes available. */
11146 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11147 return TRUE;
11148 }
11149 }
11150 }
11151 return FALSE;
11152 } else {
11153 return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
11154 }
11155 }
11156
11157 boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)11158 VM_PRESSURE_WARNING_TO_CRITICAL(void)
11159 {
11160 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11161 /* Available pages below our threshold */
11162 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11163 return TRUE;
11164 }
11165 return FALSE;
11166 } else {
11167 return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11168 }
11169 }
11170
11171 /*
11172 * Downward trajectory.
11173 */
11174 boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)11175 VM_PRESSURE_WARNING_TO_NORMAL(void)
11176 {
11177 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11178 /* Available pages above our threshold */
11179 unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
11180 if (memorystatus_available_pages > target_threshold) {
11181 return TRUE;
11182 }
11183 return FALSE;
11184 } else {
11185 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
11186 }
11187 }
11188
11189 boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)11190 VM_PRESSURE_CRITICAL_TO_WARNING(void)
11191 {
11192 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11193 /* Available pages above our threshold */
11194 unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
11195 if (memorystatus_available_pages > target_threshold) {
11196 return TRUE;
11197 }
11198 return FALSE;
11199 } else {
11200 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11201 }
11202 }
11203 #endif /* VM_PRESSURE_EVENTS */
11204
11205 #if DEVELOPMENT || DEBUG
/*
 * True while run_compressor_perf_test() has a benchmark in flight.
 * That function reads and writes it with the page queues lock held.
 */
bool compressor_running_perf_test;
/*
 * Progress counter for the benchmark. run_compressor_perf_test() zeroes
 * it before starting and sleeps on its address until it reaches the
 * number of pages queued.
 * NOTE(review): the code that increments it and issues the wakeup is
 * outside this section -- presumably the compressor thread(s); confirm.
 */
uint64_t compressor_perf_test_pages_processed;

/* Forward declaration; the definition follows move_pages_to_queue(). */
kern_return_t
run_compressor_perf_test(
	user_addr_t buf,
	size_t buffer_size,
	uint64_t *time,
	uint64_t *bytes_compressed,
	uint64_t *compressor_growth);
11216
11217 static kern_return_t
move_pages_to_queue(vm_map_t map,user_addr_t start_addr,size_t buffer_size,vm_page_queue_head_t * queue,size_t * pages_moved)11218 move_pages_to_queue(
11219 vm_map_t map,
11220 user_addr_t start_addr,
11221 size_t buffer_size,
11222 vm_page_queue_head_t *queue,
11223 size_t *pages_moved)
11224 {
11225 kern_return_t err = KERN_SUCCESS;
11226 vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
11227 boolean_t addr_in_map = FALSE;
11228 user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
11229 vm_object_t curr_object = VM_OBJECT_NULL;
11230 *pages_moved = 0;
11231
11232
11233 if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
11234 /*
11235 * We don't currently support benchmarking maps with a different page size
11236 * than the kernel.
11237 */
11238 return KERN_INVALID_ARGUMENT;
11239 }
11240
11241 if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
11242 return KERN_INVALID_ARGUMENT;
11243 }
11244
11245 vm_map_lock_read(map);
11246 curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
11247 end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
11248
11249
11250 while (curr_addr < end_addr) {
11251 addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
11252 if (!addr_in_map) {
11253 err = KERN_INVALID_ARGUMENT;
11254 break;
11255 }
11256 curr_object = VME_OBJECT(curr_entry);
11257 if (curr_object) {
11258 vm_object_lock(curr_object);
11259 /* We really only want anonymous memory that's in the top level map and object here. */
11260 if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
11261 curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
11262 err = KERN_INVALID_ARGUMENT;
11263 vm_object_unlock(curr_object);
11264 break;
11265 }
11266 vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
11267 vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
11268 (curr_entry->vme_start + VME_OFFSET(curr_entry));
11269 vm_map_offset_t curr_offset = start_offset;
11270 vm_page_t curr_page;
11271 while (curr_offset < end_offset) {
11272 curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
11273 if (curr_page != VM_PAGE_NULL) {
11274 vm_page_lock_queues();
11275 if (curr_page->vmp_laundry) {
11276 vm_pageout_steal_laundry(curr_page, TRUE);
11277 }
11278 /*
11279 * we've already factored out pages in the laundry which
11280 * means this page can't be on the pageout queue so it's
11281 * safe to do the vm_page_queues_remove
11282 */
11283 bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
11284 vm_page_queues_remove(curr_page, TRUE);
11285 if (donate) {
11286 /*
11287 * The compressor needs to see this bit to know
11288 * where this page needs to land. Also if stolen,
11289 * this bit helps put the page back in the right
11290 * special queue where it belongs.
11291 */
11292 curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
11293 }
11294 // Clear the referenced bit so we ensure this gets paged out
11295 curr_page->vmp_reference = false;
11296 if (curr_page->vmp_pmapped) {
11297 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
11298 VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
11299 }
11300 vm_page_queue_enter(queue, curr_page, vmp_pageq);
11301 vm_page_unlock_queues();
11302 *pages_moved += 1;
11303 }
11304 curr_offset += PAGE_SIZE_64;
11305 curr_addr += PAGE_SIZE_64;
11306 }
11307 }
11308 vm_object_unlock(curr_object);
11309 }
11310 vm_map_unlock_read(map);
11311 return err;
11312 }
11313
/*
 * Local queue for processing benchmark pages.
 * Can't be allocated on the stack because the pointer has to
 * be packable.
 * run_compressor_perf_test() re-initializes it at the start of every
 * run and drains whatever is left on it before returning.
 */
vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
/*
 * run_compressor_perf_test:
 *
 * Benchmark the compressor on the calling task's buffer
 * [buf, buf + buffer_size). The pages backing the buffer are moved onto
 * compressor_perf_test_queue, handed to vm_pageout_page_queue(), and the
 * caller then sleeps until compressor_perf_test_pages_processed reaches
 * the number of pages queued.
 *
 * Parameters:
 *	buf			start of the user buffer in the current task's map
 *	buffer_size		length of the buffer in bytes
 *	time			out: elapsed nanoseconds between waking the
 *				compressor and seeing all pages processed
 *				(0 when the run fails before that point)
 *	bytes_compressed	out: page count * PAGE_SIZE_64 on success, else 0
 *	compressor_growth	out: change in c_segment_compressed_bytes across
 *				the run on success, else 0
 *
 * Returns:
 *	KERN_NOT_SUPPORTED	the compressor is not active
 *	KERN_INVALID_ARGUMENT	called from the kernel task, or
 *				move_pages_to_queue() rejected the buffer
 *	KERN_RESOURCE_SHORTAGE	another benchmark run is already in progress
 *	KERN_SUCCESS		otherwise
 *
 * NOTE(review): the "already running" check below and the later store of
 * compressor_running_perf_test = true happen under two separate holds of
 * the page queues lock, and the "out" path clears the flag
 * unconditionally. It looks like two concurrent callers could both pass
 * the check and share compressor_perf_test_queue -- confirm whether
 * callers are serialized elsewhere.
 */
kern_return_t
run_compressor_perf_test(
	user_addr_t buf,
	size_t buffer_size,
	uint64_t *time,
	uint64_t *bytes_compressed,
	uint64_t *compressor_growth)
{
	kern_return_t err = KERN_SUCCESS;
	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		return KERN_NOT_SUPPORTED;
	}
	if (current_task() == kernel_task) {
		return KERN_INVALID_ARGUMENT;
	}
	vm_page_lock_queues();
	if (compressor_running_perf_test) {
		/* Only run one instance of the benchmark at a time. */
		vm_page_unlock_queues();
		return KERN_RESOURCE_SHORTAGE;
	}
	vm_page_unlock_queues();
	size_t page_count = 0;
	vm_map_t map;
	vm_page_t p, next;
	uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
	uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
	/* Outputs default to 0 so the error paths report nothing compressed. */
	*bytes_compressed = *compressor_growth = 0;

	vm_page_queue_init(&compressor_perf_test_queue);
	map = current_task()->map;
	/* Pull the buffer's pages off their current queues onto the benchmark queue. */
	err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
	if (err != KERN_SUCCESS) {
		/* Pages moved before the failure are handed back at "out". */
		goto out;
	}

	vm_page_lock_queues();
	compressor_running_perf_test = true;
	compressor_perf_test_pages_processed = 0;
	/*
	 * At this point the compressor threads should only process the benchmark queue
	 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
	 * to determine how many compressed bytes we ended up using.
	 */
	compressed_bytes_start = c_segment_compressed_bytes;
	vm_page_unlock_queues();

	/*
	 * page_count is replaced by the value vm_pageout_page_queue() returns.
	 * NOTE(review): presumably the number of pages it actually queued for
	 * the compressor -- confirm against its definition.
	 */
	page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);

	vm_page_lock_queues();
	compressor_perf_test_start = mach_absolute_time();

	// Wake up the compressor thread(s)
	sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
	    pgo_iothread_internal_state[0].pgo_iothread);

	/*
	 * Depending on when this test is run we could overshoot or be right on the mark
	 * with our page_count. So the comparison is of the _less than_ variety.
	 */
	while (compressor_perf_test_pages_processed < page_count) {
		/*
		 * Register for the wakeup while still holding the page queues
		 * lock, then drop the lock and block; the counter is re-checked
		 * with the lock held again.
		 */
		assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
		vm_page_unlock_queues();
		thread_block(THREAD_CONTINUE_NULL);
		vm_page_lock_queues();
	}
	compressor_perf_test_end = mach_absolute_time();
	compressed_bytes_end = c_segment_compressed_bytes;
	vm_page_unlock_queues();


out:
	/*
	 * If we errored out above, then we could still have some pages
	 * on the local queue. Make sure to put them back on the active queue before
	 * returning so they're not orphaned.
	 */
	vm_page_lock_queues();
	/* On the error path both timestamps are still 0, so *time becomes 0. */
	absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
	p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
	while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
		/* Fetch the successor first: p's queue linkage is reused by the enqueue below. */
		next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);

		vm_page_enqueue_active(p, FALSE);
		p = next;
	}

	compressor_running_perf_test = false;
	vm_page_unlock_queues();
	if (err == KERN_SUCCESS) {
		*bytes_compressed = page_count * PAGE_SIZE_64;
		*compressor_growth = compressed_bytes_end - compressed_bytes_start;
	}

	/*
	 * pageout_scan will consider waking the compactor swapper
	 * before it blocks. Do the same thing here before we return
	 * to ensure that back to back benchmark runs can't overly fragment the
	 * compressor pool.
	 */
	vm_consider_waking_compactor_swapper();
	return err;
}
11423 #endif /* DEVELOPMENT || DEBUG */
11424