1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67 #include <ptrauth.h>
68
69 #include <debug.h>
70
71 #include <mach/mach_types.h>
72 #include <mach/memory_object.h>
73 #include <mach/mach_host_server.h>
74 #include <mach/upl.h>
75 #include <mach/vm_map.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/sdt.h>
79
80 #include <kern/kern_types.h>
81 #include <kern/counter.h>
82 #include <kern/host_statistics.h>
83 #include <kern/machine.h>
84 #include <kern/misc_protos.h>
85 #include <kern/sched.h>
86 #include <kern/thread.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/policy_internal.h>
90 #include <kern/thread_group.h>
91
92 #include <sys/kdebug_triage.h>
93
94 #include <machine/vm_tuning.h>
95 #include <machine/commpage.h>
96
97 #include <vm/pmap.h>
98 #include <vm/vm_compressor_pager.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map_internal.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_page.h>
103 #include <vm/vm_pageout.h>
104 #include <vm/vm_protos.h> /* must be last */
105 #include <vm/memory_object.h>
106 #include <vm/vm_purgeable_internal.h>
107 #include <vm/vm_shared_region.h>
108 #include <vm/vm_compressor.h>
109
110 #include <san/kasan.h>
111
112 #if CONFIG_PHANTOM_CACHE
113 #include <vm/vm_phantom_cache.h>
114 #endif
115
116 #if UPL_DEBUG
117 #include <libkern/OSDebug.h>
118 #endif
119
120 extern int cs_debug;
121
122 extern void mbuf_drain(boolean_t);
123
124 #if VM_PRESSURE_EVENTS
125 #if CONFIG_JETSAM
126 extern unsigned int memorystatus_available_pages;
127 extern unsigned int memorystatus_available_pages_pressure;
128 extern unsigned int memorystatus_available_pages_critical;
129 #else /* CONFIG_JETSAM */
130 extern uint64_t memorystatus_available_pages;
131 extern uint64_t memorystatus_available_pages_pressure;
132 extern uint64_t memorystatus_available_pages_critical;
133 #endif /* CONFIG_JETSAM */
134
135 extern unsigned int memorystatus_frozen_count;
136 extern unsigned int memorystatus_suspended_count;
137 extern vm_pressure_level_t memorystatus_vm_pressure_level;
138
139 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
140 extern uint32_t memorystatus_jetsam_fg_band_waiters;
141
142 void vm_pressure_response(void);
143 extern void consider_vm_pressure_events(void);
144
145 #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
146 #endif /* VM_PRESSURE_EVENTS */
147
148 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
149 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
150 boolean_t vps_dynamic_priority_enabled = FALSE;
151 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
152
153 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
154 #if !XNU_TARGET_OS_OSX
155 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
156 #else /* !XNU_TARGET_OS_OSX */
157 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
158 #endif /* !XNU_TARGET_OS_OSX */
159 #endif
160
161 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
162 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
163 #endif
164
165 #ifndef VM_PAGE_LAUNDRY_MAX
166 #define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */
#endif /* VM_PAGE_LAUNDRY_MAX */
168
169 #ifndef VM_PAGEOUT_BURST_WAIT
170 #define VM_PAGEOUT_BURST_WAIT 1 /* milliseconds */
171 #endif /* VM_PAGEOUT_BURST_WAIT */
172
173 #ifndef VM_PAGEOUT_EMPTY_WAIT
174 #define VM_PAGEOUT_EMPTY_WAIT 50 /* milliseconds */
175 #endif /* VM_PAGEOUT_EMPTY_WAIT */
176
177 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
178 #define VM_PAGEOUT_DEADLOCK_WAIT 100 /* milliseconds */
179 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
180
181 #ifndef VM_PAGEOUT_IDLE_WAIT
182 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
183 #endif /* VM_PAGEOUT_IDLE_WAIT */
184
185 #ifndef VM_PAGEOUT_SWAP_WAIT
186 #define VM_PAGEOUT_SWAP_WAIT 10 /* milliseconds */
187 #endif /* VM_PAGEOUT_SWAP_WAIT */
188
189
190 #ifndef VM_PAGE_SPECULATIVE_TARGET
191 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
192 #endif /* VM_PAGE_SPECULATIVE_TARGET */
193
194
195 /*
196 * To obtain a reasonable LRU approximation, the inactive queue
197 * needs to be large enough to give pages on it a chance to be
198 * referenced a second time. This macro defines the fraction
199 * of active+inactive pages that should be inactive.
200 * The pageout daemon uses it to update vm_page_inactive_target.
201 *
202 * If vm_page_free_count falls below vm_page_free_target and
203 * vm_page_inactive_count is below vm_page_inactive_target,
204 * then the pageout daemon starts running.
205 */
206
207 #ifndef VM_PAGE_INACTIVE_TARGET
208 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
209 #endif /* VM_PAGE_INACTIVE_TARGET */
210
211 /*
212 * Once the pageout daemon starts running, it keeps going
213 * until vm_page_free_count meets or exceeds vm_page_free_target.
214 */
215
216 #ifndef VM_PAGE_FREE_TARGET
217 #if !XNU_TARGET_OS_OSX
218 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
219 #else /* !XNU_TARGET_OS_OSX */
220 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
221 #endif /* !XNU_TARGET_OS_OSX */
222 #endif /* VM_PAGE_FREE_TARGET */
223
224
225 /*
226 * The pageout daemon always starts running once vm_page_free_count
227 * falls below vm_page_free_min.
228 */
229
230 #ifndef VM_PAGE_FREE_MIN
231 #if !XNU_TARGET_OS_OSX
232 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
233 #else /* !XNU_TARGET_OS_OSX */
234 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
235 #endif /* !XNU_TARGET_OS_OSX */
236 #endif /* VM_PAGE_FREE_MIN */
237
238 #if !XNU_TARGET_OS_OSX
239 #define VM_PAGE_FREE_RESERVED_LIMIT 100
240 #define VM_PAGE_FREE_MIN_LIMIT 1500
241 #define VM_PAGE_FREE_TARGET_LIMIT 2000
242 #else /* !XNU_TARGET_OS_OSX */
243 #define VM_PAGE_FREE_RESERVED_LIMIT 1700
244 #define VM_PAGE_FREE_MIN_LIMIT 3500
245 #define VM_PAGE_FREE_TARGET_LIMIT 4000
246 #endif /* !XNU_TARGET_OS_OSX */
247
248 /*
249 * When vm_page_free_count falls below vm_page_free_reserved,
250 * only vm-privileged threads can allocate pages. vm-privilege
251 * allows the pageout daemon and default pager (and any other
252 * associated threads needed for default pageout) to continue
253 * operation by dipping into the reserved pool of pages.
254 */
255
256 #ifndef VM_PAGE_FREE_RESERVED
257 #define VM_PAGE_FREE_RESERVED(n) \
258 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
259 #endif /* VM_PAGE_FREE_RESERVED */
260
261 /*
262 * When we dequeue pages from the inactive list, they are
263 * reactivated (ie, put back on the active queue) if referenced.
264 * However, it is possible to starve the free list if other
265 * processors are referencing pages faster than we can turn off
266 * the referenced bit. So we limit the number of reactivations
267 * we will make per call of vm_pageout_scan().
268 */
269 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
270
271 #ifndef VM_PAGE_REACTIVATE_LIMIT
272 #if !XNU_TARGET_OS_OSX
273 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
274 #else /* !XNU_TARGET_OS_OSX */
275 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
276 #endif /* !XNU_TARGET_OS_OSX */
277 #endif /* VM_PAGE_REACTIVATE_LIMIT */
278 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
279
280 int vm_pageout_protect_realtime = true;
281
282 extern boolean_t hibernate_cleaning_in_progress;
283
284 struct cq ciq[MAX_COMPRESSOR_THREAD_COUNT];
285
286 #if VM_PRESSURE_EVENTS
287 void vm_pressure_thread(void);
288
289 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
290 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
291
292 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
293 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
294 #endif
295
296 static void vm_pageout_iothread_external(struct cq *, wait_result_t);
297 static void vm_pageout_iothread_internal(struct cq *, wait_result_t);
298 static void vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *, boolean_t);
299
300 extern void vm_pageout_continue(void);
301 extern void vm_pageout_scan(void);
302
303 boolean_t vm_pageout_running = FALSE;
304
305 uint32_t vm_page_upl_tainted = 0;
306 uint32_t vm_page_iopl_tainted = 0;
307
308 #if XNU_TARGET_OS_OSX
309 static boolean_t vm_pageout_waiter = FALSE;
310 #endif /* XNU_TARGET_OS_OSX */
311
312
313 #if DEVELOPMENT || DEBUG
314 struct vm_pageout_debug vm_pageout_debug;
315 #endif
316 struct vm_pageout_vminfo vm_pageout_vminfo;
317 struct vm_pageout_state vm_pageout_state;
318 struct vm_config vm_config;
319
320 struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
321 struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
322 #if DEVELOPMENT || DEBUG
323 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
324 #endif /* DEVELOPMENT || DEBUG */
325
326 int vm_upl_wait_for_pages = 0;
327 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
328
329 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
330
331 int vm_debug_events = 0;
332
333 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
334
335 #if CONFIG_MEMORYSTATUS
336 extern boolean_t memorystatus_kill_on_VM_page_shortage(void);
337
338 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
339 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
340
341 #endif
342
343 #if __AMP__
344 int vm_compressor_ebound = 1;
345 int vm_pgo_pbound = 0;
346 extern void thread_bind_cluster_type(thread_t, char, bool);
347 #endif /* __AMP__ */
348
349
350 /*
351 * Routine: vm_pageout_object_terminate
352 * Purpose:
353 * Destroy the pageout_object, and perform all of the
354 * required cleanup actions.
355 *
356 * In/Out conditions:
357 * The object must be locked, and will be returned locked.
358 */
void
vm_pageout_object_terminate(
	vm_object_t object)
{
	vm_object_t shadow_object;

	/*
	 * Deal with the deallocation (last reference) of a pageout object
	 * (used for cleaning-in-place) by dropping the paging references/
	 * freeing pages in the original object.
	 *
	 * The pageout object's pages are private shadows of pages in the
	 * shadow (original) object; each private page is freed here and the
	 * corresponding real page in the shadow object is cleaned up.
	 */

	assert(object->pageout);
	shadow_object = object->shadow;
	vm_object_lock(shadow_object);

	while (!vm_page_queue_empty(&object->memq)) {
		vm_page_t p, m;
		vm_object_offset_t offset;

		/* p: private page in the pageout object; m: real page in shadow */
		p = (vm_page_t) vm_page_queue_first(&object->memq);

		assert(p->vmp_private);
		assert(p->vmp_free_when_done);
		p->vmp_free_when_done = FALSE;
		assert(!p->vmp_cleaning);
		assert(!p->vmp_laundry);

		/* remember where p was before freeing it, so we can find m */
		offset = p->vmp_offset;
		VM_PAGE_FREE(p);
		p = VM_PAGE_NULL;

		m = vm_page_lookup(shadow_object,
		    offset + object->vo_shadow_offset);

		if (m == VM_PAGE_NULL) {
			continue;
		}

		assert((m->vmp_dirty) || (m->vmp_precious) ||
		    (m->vmp_busy && m->vmp_cleaning));

		/*
		 * Handle the trusted pager throttle.
		 * Also decrement the burst throttle (if external).
		 */
		vm_page_lock_queues();
		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
			vm_pageout_throttle_up(m);
		}

		/*
		 * Handle the "target" page(s). These pages are to be freed if
		 * successfully cleaned. Target pages are always busy, and are
		 * wired exactly once. The initial target pages are not mapped,
		 * (so cannot be referenced or modified) but converted target
		 * pages may have been modified between the selection as an
		 * adjacent page and conversion to a target.
		 */
		if (m->vmp_free_when_done) {
			assert(m->vmp_busy);
			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
			assert(m->vmp_wire_count == 1);
			m->vmp_cleaning = FALSE;
			m->vmp_free_when_done = FALSE;
			/*
			 * Revoke all access to the page. Since the object is
			 * locked, and the page is busy, this prevents the page
			 * from being dirtied after the pmap_disconnect() call
			 * returns.
			 *
			 * Since the page is left "dirty" but "not modified", we
			 * can detect whether the page was redirtied during
			 * pageout by checking the modify state.
			 */
			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			} else {
				m->vmp_dirty = FALSE;
			}

			if (m->vmp_dirty) {
				/* redirtied while being paged out: keep it */
				vm_page_unwire(m, TRUE);        /* reactivates */
				counter_inc(&vm_statistics_reactivations);
				PAGE_WAKEUP_DONE(m);
			} else {
				/* cleaned successfully: release the page */
				vm_page_free(m);        /* clears busy, etc. */
			}
			vm_page_unlock_queues();
			continue;
		}
		/*
		 * Handle the "adjacent" pages. These pages were cleaned in
		 * place, and should be left alone.
		 * If prep_pin_count is nonzero, then someone is using the
		 * page, so make it active.
		 */
		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
			if (m->vmp_reference) {
				vm_page_activate(m);
			} else {
				vm_page_deactivate(m);
			}
		}
		if (m->vmp_overwriting) {
			/*
			 * the (COPY_OUT_FROM == FALSE) request_page_list case
			 */
			if (m->vmp_busy) {
				/*
				 * We do not re-set m->vmp_dirty !
				 * The page was busy so no extraneous activity
				 * could have occurred. COPY_INTO is a read into the
				 * new pages. CLEAN_IN_PLACE does actually write
				 * out the pages but handling outside of this code
				 * will take care of resetting dirty. We clear the
				 * modify however for the Programmed I/O case.
				 */
				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

				m->vmp_busy = FALSE;
				m->vmp_absent = FALSE;
			} else {
				/*
				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
				 * Occurs when the original page was wired
				 * at the time of the list request
				 */
				assert(VM_PAGE_WIRED(m));
				vm_page_unwire(m, TRUE);        /* reactivates */
			}
			m->vmp_overwriting = FALSE;
		} else {
			m->vmp_dirty = FALSE;
		}
		m->vmp_cleaning = FALSE;

		/*
		 * Wakeup any thread waiting for the page to be un-cleaning.
		 */
		PAGE_WAKEUP(m);
		vm_page_unlock_queues();
	}
	/*
	 * Account for the paging reference taken in vm_paging_object_allocate.
	 */
	vm_object_activity_end(shadow_object);
	vm_object_unlock(shadow_object);

	/* the pageout object itself must now be fully drained */
	assert(object->ref_count == 0);
	assert(object->paging_in_progress == 0);
	assert(object->activity_in_progress == 0);
	assert(object->resident_page_count == 0);
	return;
}
514
515 /*
516 * Routine: vm_pageclean_setup
517 *
518 * Purpose: setup a page to be cleaned (made non-dirty), but not
519 * necessarily flushed from the VM page cache.
520 * This is accomplished by cleaning in place.
521 *
522 * The page must not be busy, and new_object
523 * must be locked.
524 *
525 */
static void
vm_pageclean_setup(
	vm_page_t m,
	vm_page_t new_m,
	vm_object_t new_object,
	vm_object_offset_t new_offset)
{
	assert(!m->vmp_busy);
#if 0
	assert(!m->vmp_cleaning);
#endif

	/*
	 * Clear the hardware modify bit so that a later check can tell
	 * whether the page was redirtied while the clean was in flight.
	 */
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

	/*
	 * Mark original page as cleaning in place.
	 */
	m->vmp_cleaning = TRUE;
	SET_PAGE_DIRTY(m, FALSE);
	m->vmp_precious = FALSE;

	/*
	 * Convert the fictitious page to a private shadow of
	 * the real page.  new_m ends up aliasing m's physical page,
	 * and is marked free_when_done so the alias is torn down when
	 * the pageout completes.
	 */
	assert(new_m->vmp_fictitious);
	assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
	new_m->vmp_fictitious = FALSE;
	new_m->vmp_private = TRUE;
	new_m->vmp_free_when_done = TRUE;
	VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));

	/* wire the shadow page so it stays resident for the duration of the I/O */
	vm_page_lockspin_queues();
	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
	vm_page_unlock_queues();

	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
	assert(!new_m->vmp_wanted);
	new_m->vmp_busy = FALSE;
}
566
567 /*
568 * Routine: vm_pageout_initialize_page
569 * Purpose:
570 * Causes the specified page to be initialized in
571 * the appropriate memory object. This routine is used to push
572 * pages into a copy-object when they are modified in the
573 * permanent object.
574 *
575 * The page is moved to a temporary object and paged out.
576 *
577 * In/out conditions:
578 * The page in question must not be on any pageout queues.
579 * The object to which it belongs must be locked.
580 * The page must be busy, but not hold a paging reference.
581 *
582 * Implementation:
583 * Move this page to a completely new object.
584 */
void
vm_pageout_initialize_page(
	vm_page_t m)
{
	vm_object_t object;
	vm_object_offset_t paging_offset;
	memory_object_t pager;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	object = VM_PAGE_OBJECT(m);

	assert(m->vmp_busy);
	assert(object->internal);

	/*
	 * Verify that we really want to clean this page
	 */
	assert(!m->vmp_absent);
	assert(m->vmp_dirty);

	/*
	 * Create a paging reference to let us play with the object.
	 */
	paging_offset = m->vmp_offset + object->paging_offset;

	if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
		panic("reservation without pageout?"); /* alan */

		/* NOTE(review): unreachable after panic(); kept as defensive cleanup */
		VM_PAGE_FREE(m);
		vm_object_unlock(object);

		return;
	}

	/*
	 * If there's no pager, then we can't clean the page. This should
	 * never happen since this should be a copy object and therefore not
	 * an external object, so the pager should always be there.
	 */

	pager = object->pager;

	if (pager == MEMORY_OBJECT_NULL) {
		panic("missing pager for copy object");

		/* NOTE(review): unreachable after panic(); kept as defensive cleanup */
		VM_PAGE_FREE(m);
		return;
	}

	/*
	 * set the page for future call to vm_fault_list_request
	 */
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
	SET_PAGE_DIRTY(m, FALSE);

	/*
	 * keep the object from collapsing or terminating
	 */
	vm_object_paging_begin(object);
	vm_object_unlock(object);

	/*
	 * Write the data to its pager.
	 * Note that the data is passed by naming the new object,
	 * not a virtual address; the pager interface has been
	 * manipulated to use the "internal memory" data type.
	 * [The object reference from its allocation is donated
	 * to the eventual recipient.]
	 */
	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);

	/* drop the paging reference taken above */
	vm_object_lock(object);
	vm_object_paging_end(object);
}
660
661
662 /*
663 * vm_pageout_cluster:
664 *
665 * Given a page, queue it to the appropriate I/O thread,
666 * which will page it out and attempt to clean adjacent pages
667 * in the same operation.
668 *
669 * The object and queues must be locked. We will take a
670 * paging reference to prevent deallocation or collapse when we
671 * release the object lock back at the call site. The I/O thread
672 * is responsible for consuming this reference
673 *
674 * The page must not be on any pageout queue.
675 */
676 #if DEVELOPMENT || DEBUG
677 vmct_stats_t vmct_stats;
678
679 int32_t vmct_active = 0;
680 uint64_t vm_compressor_epoch_start = 0;
681 uint64_t vm_compressor_epoch_stop = 0;
682
683 typedef enum vmct_state_t {
684 VMCT_IDLE,
685 VMCT_AWAKENED,
686 VMCT_ACTIVE,
687 } vmct_state_t;
688 vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
689 #endif
690
691
692
/*
 * Enqueue page "m" on pageout queue "q" and wake the matching I/O
 * thread if it is idle.  Takes an object activity reference that the
 * I/O thread is responsible for releasing.
 *
 * Caller holds the page-queues lock and the object lock (exclusive).
 * The page must be dirty or precious, unwired, and not already on a
 * pageout queue or in the laundry.
 */
static void
vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
{
	event_t wakeup_event;
	vm_object_t object = VM_PAGE_OBJECT(m);

	VM_PAGE_CHECK(m);
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	/*
	 * Make sure it's OK to page this out.
	 */
	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
	assert(!m->vmp_cleaning && !m->vmp_laundry);
	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);

	/*
	 * protect the object from collapse or termination
	 */
	vm_object_activity_begin(object);

	/*
	 * NOTE(review): the wakeup event is taken from the global
	 * internal/external queue matching the object, not from "q";
	 * this only differs when "q" is a non-standard (e.g. benchmark)
	 * queue — confirm that is intended.
	 */
	if (object->internal == TRUE) {
		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

		/* compressor pages are marked busy until the I/O thread is done */
		m->vmp_busy = TRUE;
		wakeup_event = (event_t) &(vm_pageout_queue_internal.pgo_pending);
	} else {
		wakeup_event = (event_t) &(vm_pageout_queue_external.pgo_pending);
	}

	/*
	 * pgo_laundry count is tied to the laundry bit
	 */
	m->vmp_laundry = TRUE;
	q->pgo_laundry++;

	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);

	if (q->pgo_idle == TRUE) {
		q->pgo_idle = FALSE;
		thread_wakeup(wakeup_event);
	}
	VM_PAGE_CHECK(m);
}
739
740 void
vm_pageout_cluster(vm_page_t m)741 vm_pageout_cluster(vm_page_t m)
742 {
743 struct vm_pageout_queue *q;
744 vm_object_t object = VM_PAGE_OBJECT(m);
745 if (object->internal) {
746 q = &vm_pageout_queue_internal;
747 } else {
748 q = &vm_pageout_queue_external;
749 }
750 vm_pageout_cluster_to_queue(m, q);
751 }
752
753
754 /*
755 * A page is back from laundry or we are stealing it back from
756 * the laundering state. See if there are some pages waiting to
757 * go to laundry and if we can let some of them go now.
758 *
759 * Object and page queues must be locked.
760 */
void
vm_pageout_throttle_up(
	vm_page_t m)
{
	struct vm_pageout_queue *q;
	vm_object_t m_object;

	m_object = VM_PAGE_OBJECT(m);

	assert(m_object != VM_OBJECT_NULL);
	assert(m_object != kernel_object);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(m_object);

	/* pick the queue the page was (or would have been) laundered on */
	if (m_object->internal == TRUE) {
		q = &vm_pageout_queue_internal;
	} else {
		q = &vm_pageout_queue_external;
	}

	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
		/*
		 * Stealing the page back before the I/O thread picked it up:
		 * take it off the pending queue and drop the activity
		 * reference taken when it was enqueued.
		 */
		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
		m->vmp_q_state = VM_PAGE_NOT_ON_Q;

		VM_PAGE_ZERO_PAGEQ_ENTRY(m);

		vm_object_activity_end(m_object);

		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
	}
	if (m->vmp_laundry == TRUE) {
		/* pgo_laundry count is tied to the laundry bit */
		m->vmp_laundry = FALSE;
		q->pgo_laundry--;

		if (q->pgo_throttled == TRUE) {
			q->pgo_throttled = FALSE;
			thread_wakeup((event_t) &q->pgo_laundry);
		}
		/*
		 * "&q->pgo_laundry + 1" is a distinct event address used by
		 * threads draining this queue (waiting for laundry == 0).
		 */
		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
			q->pgo_draining = FALSE;
			thread_wakeup((event_t) (&q->pgo_laundry + 1));
		}
		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
	}
}
807
808
/*
 * Batched form of the laundry-accounting half of vm_pageout_throttle_up():
 * retire "batch_cnt" laundry pages from queue "q" at once and wake any
 * throttled or draining waiters.  Page-queues lock must be held.
 */
static void
vm_pageout_throttle_up_batch(
	struct vm_pageout_queue *q,
	int batch_cnt)
{
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);

	q->pgo_laundry -= batch_cnt;

	if (q->pgo_throttled == TRUE) {
		q->pgo_throttled = FALSE;
		thread_wakeup((event_t) &q->pgo_laundry);
	}
	/* "&q->pgo_laundry + 1" is the separate drain-complete event address */
	if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
		q->pgo_draining = FALSE;
		thread_wakeup((event_t) (&q->pgo_laundry + 1));
	}
}
829
830
831
832 /*
833 * VM memory pressure monitoring.
834 *
835 * vm_pageout_scan() keeps track of the number of pages it considers and
836 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
837 *
838 * compute_memory_pressure() is called every second from compute_averages()
839 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
 * of reclaimed pages in a new vm_pageout_stat[] bucket.
841 *
842 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
843 * The caller provides the number of seconds ("nsecs") worth of statistics
844 * it wants, up to 30 seconds.
845 * It computes the number of pages reclaimed in the past "nsecs" seconds and
846 * also returns the number of pages the system still needs to reclaim at this
847 * moment in time.
848 */
/*
 * Ring buffer of per-interval pageout statistics; one bucket covers
 * 1/8 second.  Keep 30 seconds of history on DEVELOPMENT/DEBUG kernels,
 * 1 second otherwise, plus one extra bucket that is currently filling.
 *
 * The size macro is fully parenthesized so it expands safely inside a
 * larger expression (the previous "(30 * 8) + 1" form would bind
 * incorrectly under, e.g., a leading multiplication or division).
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif
struct vm_pageout_stat {
	/* snapshot of the major page counts at the end of the interval */
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	/* per-interval event counters accumulated by vm_pageout_scan() */
	unsigned int pages_grabbed;
	unsigned int pages_freed;

	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;

	unsigned int vm_page_realtime_count;
	unsigned int forcereclaimed_sharedcache;
	unsigned int forcereclaimed_realtime;
	unsigned int protected_sharedcache;
	unsigned int protected_realtime;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];

/* index of the bucket currently being filled */
unsigned int vm_pageout_stat_now = 0;

/* ring-buffer neighbors of bucket "i", with wrap-around */
#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
919
920 #if VM_PAGE_BUCKETS_CHECK
921 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
922 #endif /* VM_PAGE_BUCKETS_CHECK */
923
924
925 void
926 record_memory_pressure(void);
927 void
record_memory_pressure(void)928 record_memory_pressure(void)
929 {
930 unsigned int vm_pageout_next;
931
932 #if VM_PAGE_BUCKETS_CHECK
933 /* check the consistency of VM page buckets at regular interval */
934 static int counter = 0;
935 if ((++counter % vm_page_buckets_check_interval) == 0) {
936 vm_page_buckets_check();
937 }
938 #endif /* VM_PAGE_BUCKETS_CHECK */
939
940 vm_pageout_state.vm_memory_pressure =
941 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
942 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
943 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
944 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
945
946 commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
947
948 /* move "now" forward */
949 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
950
951 bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
952
953 vm_pageout_stat_now = vm_pageout_next;
954 }
955
956
957 /*
958 * IMPORTANT
959 * mach_vm_ctl_page_free_wanted() is called indirectly, via
960 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
961 * it must be safe in the restricted stackshot context. Locks and/or
962 * blocking are not allowable.
963 */
964 unsigned int
mach_vm_ctl_page_free_wanted(void)965 mach_vm_ctl_page_free_wanted(void)
966 {
967 unsigned int page_free_target, page_free_count, page_free_wanted;
968
969 page_free_target = vm_page_free_target;
970 page_free_count = vm_page_free_count;
971 if (page_free_target > page_free_count) {
972 page_free_wanted = page_free_target - page_free_count;
973 } else {
974 page_free_wanted = 0;
975 }
976
977 return page_free_wanted;
978 }
979
980
981 /*
982 * IMPORTANT:
983 * mach_vm_pressure_monitor() is called when taking a stackshot, with
984 * wait_for_pressure FALSE, so that code path must remain safe in the
985 * restricted stackshot context. No blocking or locks are allowable.
986 * on that code path.
987 */
988
kern_return_t
mach_vm_pressure_monitor(
	boolean_t wait_for_pressure,
	unsigned int nsecs_monitored,
	unsigned int *pages_reclaimed_p,
	unsigned int *pages_wanted_p)
{
	wait_result_t wr;
	unsigned int vm_pageout_then, vm_pageout_now;
	unsigned int pages_reclaimed;
	unsigned int units_of_monitor;

	/* stats buckets are 1/8 second each: 8 buckets per monitored second */
	units_of_monitor = 8 * nsecs_monitored;
	/*
	 * We don't take the vm_page_queue_lock here because we don't want
	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
	 * thread when it's trying to reclaim memory. We don't need fully
	 * accurate monitoring anyway...
	 */

	if (wait_for_pressure) {
		/* wait until there's memory pressure */
		while (vm_page_free_count >= vm_page_free_target) {
			wr = assert_wait((event_t) &vm_page_free_wanted,
			    THREAD_INTERRUPTIBLE);
			if (wr == THREAD_WAITING) {
				wr = thread_block(THREAD_CONTINUE_NULL);
			}
			if (wr == THREAD_INTERRUPTED) {
				return KERN_ABORTED;
			}
			if (wr == THREAD_AWAKENED) {
				/*
				 * The memory pressure might have already
				 * been relieved but let's not block again
				 * and let's report that there was memory
				 * pressure at some point.
				 */
				break;
			}
		}
	}

	/* provide the number of pages the system wants to reclaim */
	if (pages_wanted_p != NULL) {
		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
	}

	if (pages_reclaimed_p == NULL) {
		return KERN_SUCCESS;
	}

	/*
	 * provide number of pages reclaimed in the last "nsecs_monitored":
	 * walk the stats ring backwards from the current bucket, summing
	 * the freed counters, bounded by units_of_monitor buckets (and by
	 * a full lap of the ring).  NOTE(review): nsecs_monitored is not
	 * clamped here; the 30-second cap mentioned in the header comment
	 * comes from the ring size itself — confirm.
	 */
	vm_pageout_now = vm_pageout_stat_now;
	pages_reclaimed = 0;
	for (vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
	    vm_pageout_then != vm_pageout_now &&
	    units_of_monitor-- != 0;
	    vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
	}
	*pages_reclaimed_p = pages_reclaimed;

	return KERN_SUCCESS;
}
1059
1060
1061
1062 #if DEVELOPMENT || DEBUG
1063
1064 static void
1065 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1066
1067 /*
1068 * condition variable used to make sure there is
1069 * only a single sweep going on at a time
1070 */
1071 boolean_t vm_pageout_disconnect_all_pages_active = FALSE;
1072
1073
1074 void
vm_pageout_disconnect_all_pages()1075 vm_pageout_disconnect_all_pages()
1076 {
1077 vm_page_lock_queues();
1078
1079 if (vm_pageout_disconnect_all_pages_active == TRUE) {
1080 vm_page_unlock_queues();
1081 return;
1082 }
1083 vm_pageout_disconnect_all_pages_active = TRUE;
1084 vm_page_unlock_queues();
1085
1086 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1087 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1088 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);
1089
1090 vm_pageout_disconnect_all_pages_active = FALSE;
1091 }
1092
1093
/*
 * Walk up to qcount pages of queue q, disconnecting each pmapped page's
 * physical mappings (pmap_disconnect).  Pages are rotated to the tail of
 * the queue as they are processed, so the loop terminates after qcount
 * iterations even though pages stay on the queue.
 *
 * Called with no locks held; takes the page queues lock and per-object
 * locks internally.  DEVELOPMENT/DEBUG only.
 */
void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t m;
	vm_object_t t_object = NULL;    /* last object whose try-lock failed */
	vm_object_t l_object = NULL;    /* object we currently hold locked */
	vm_object_t m_object = NULL;    /* object of the candidate page */
	int delayed_unlock = 0;
	int try_failed_count = 0;
	int disconnected_count = 0;
	int paused_count = 0;
	int object_locked_count = 0;

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
	    q, qcount, 0, 0, 0);

	vm_page_lock_queues();

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				/* new object: reset the try-lock retry counter */
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					/* give up on this page; rotate it to the tail */
					goto reenter_pg_on_q;
				}
				/* drop queues lock and back off before retrying */
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				paused_count++;

				t_object = m_object;
				continue;
			}
			object_locked_count++;

			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			/* sever all physical mappings of this page */
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

			disconnected_count++;
		}
reenter_pg_on_q:
		/* rotate page to the tail so the next iteration sees a fresh page */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);

		qcount--;
		try_failed_count = 0;

		if (delayed_unlock++ > 128) {
			/* periodically yield the queues lock to avoid starving others */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
	    q, disconnected_count, object_locked_count, paused_count, 0);
}
1198
1199 extern char* proc_best_name(struct proc* proc);
1200
1201 int
vm_toggle_task_selfdonate_pages(task_t task)1202 vm_toggle_task_selfdonate_pages(task_t task)
1203 {
1204 int state = 0;
1205 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1206 printf("VM Donation mode is OFF on the system\n");
1207 return state;
1208 }
1209 if (task != kernel_task) {
1210 task_lock(task);
1211 if (!task->donates_own_pages) {
1212 printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1213 task->donates_own_pages = true;
1214 state = 1;
1215 } else if (task->donates_own_pages) {
1216 printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1217 task->donates_own_pages = false;
1218 state = 0;
1219 }
1220 task_unlock(task);
1221 }
1222 return state;
1223 }
1224 #endif /* DEVELOPMENT || DEBUG */
1225
/*
 * Set a task's "donates its own pages" flag to the given value.
 *
 * Callers must ensure donation is enabled system-wide and must never
 * pass the kernel task (both enforced by assert here).
 */
void
vm_task_set_selfdonate_pages(task_t task, bool donate)
{
	assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
	assert(task != kernel_task);

	/* flag is protected by the task lock */
	task_lock(task);
	task->donates_own_pages = donate;
	task_unlock(task);
}
1236
1237
1238
1239 static size_t
1240 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1241
1242 /*
1243 * condition variable used to make sure there is
1244 * only a single sweep going on at a time
1245 */
1246 boolean_t vm_pageout_anonymous_pages_active = FALSE;
1247
1248
1249 void
vm_pageout_anonymous_pages()1250 vm_pageout_anonymous_pages()
1251 {
1252 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1253 vm_page_lock_queues();
1254
1255 if (vm_pageout_anonymous_pages_active == TRUE) {
1256 vm_page_unlock_queues();
1257 return;
1258 }
1259 vm_pageout_anonymous_pages_active = TRUE;
1260 vm_page_unlock_queues();
1261
1262 vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1263 vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1264 vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1265
1266 if (VM_CONFIG_SWAP_IS_PRESENT) {
1267 vm_consider_swapping();
1268 }
1269
1270 vm_page_lock_queues();
1271 vm_pageout_anonymous_pages_active = FALSE;
1272 vm_page_unlock_queues();
1273 }
1274 }
1275
1276
/*
 * Walk up to qcount pages of queue q, pushing eligible internal
 * (anonymous) pages to the pageout queue iq for compression.  Clean,
 * non-precious pages are freed outright.  Pages that cannot be handled
 * are rotated to the tail of the queue.
 *
 * perf_test: when true (DEVELOPMENT/DEBUG), target the benchmark queue
 * and use a simple queue-remove instead of the full vm_page_queues_remove.
 *
 * Returns the number of pages handed to the pageout queue.
 */
size_t
vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
{
	vm_page_t m;
	vm_object_t t_object = NULL;    /* last object whose try-lock failed */
	vm_object_t l_object = NULL;    /* object we currently hold locked */
	vm_object_t m_object = NULL;    /* object of the candidate page */
	int delayed_unlock = 0;
	int try_failed_count = 0;
	int refmod_state;
	int pmap_options;
	struct vm_pageout_queue *iq;
	ppnum_t phys_page;
	size_t pages_moved = 0;


	iq = &vm_pageout_queue_internal;

	vm_page_lock_queues();

#if DEVELOPMENT || DEBUG
	if (perf_test) {
		iq = &vm_pageout_queue_benchmark;
	}
#endif /* DEVELOPMENT || DEBUG */

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		if (VM_PAGE_Q_THROTTLED(iq)) {
			/* target queue is full: drop locks and wait for it to drain */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			iq->pgo_draining = TRUE;

			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
			vm_page_unlock_queues();

			thread_block(THREAD_CONTINUE_NULL);

			vm_page_lock_queues();
			delayed_unlock = 0;
			continue;
		}
		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			if (!m_object->internal) {
				/* only anonymous (internal) pages are compressed */
				goto reenter_pg_on_q;
			}

			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				/* new object: reset the try-lock retry counter */
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				t_object = m_object;
				continue;
			}
			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(m);

		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			/* pull reference/modify state up from the pmap layer */
			refmod_state = pmap_get_refmod(phys_page);

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}
		if (m->vmp_reference == TRUE) {
			/* recently referenced: clear the bit and give the page another pass */
			m->vmp_reference = FALSE;
			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			if (m->vmp_dirty || m->vmp_precious) {
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			/* disconnect mappings, noting any last-moment modification */
			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		if (!m->vmp_dirty && !m->vmp_precious) {
			/* clean and not precious: free it instead of compressing */
			vm_page_unlock_queues();
			VM_PAGE_FREE(m);
			vm_page_lock_queues();
			delayed_unlock = 0;

			goto next_pg;
		}
		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
			if (!m_object->pager_initialized) {
				vm_page_unlock_queues();

				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);

				if (!m_object->pager_initialized) {
					vm_object_compressor_pager_create(m_object);
				}

				vm_page_lock_queues();
				delayed_unlock = 0;
			}
			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
				goto reenter_pg_on_q;
			}
			/*
			 * vm_object_compressor_pager_create will drop the object lock
			 * which means 'm' may no longer be valid to use
			 */
			continue;
		}

		if (!perf_test) {
			/*
			 * we've already factored out pages in the laundry which
			 * means this page can't be on the pageout queue so it's
			 * safe to do the vm_page_queues_remove
			 */
			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			vm_page_queues_remove(m, TRUE);
			if (donate) {
				/*
				 * The compressor needs to see this bit to know
				 * where this page needs to land. Also if stolen,
				 * this bit helps put the page back in the right
				 * special queue where it belongs.
				 */
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
			}
		} else {
			vm_page_queue_remove(q, m, vmp_pageq);
		}

		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		/* hand the dirty/precious page to the target pageout queue */
		vm_pageout_cluster_to_queue(m, iq);

		pages_moved++;
		goto next_pg;

reenter_pg_on_q:
		/* rotate page to the tail so the next iteration sees a fresh page */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);
next_pg:
		qcount--;
		try_failed_count = 0;

		if (delayed_unlock++ > 128) {
			/* periodically yield the queues lock to avoid starving others */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();
	return pages_moved;
}
1487
1488
1489
1490 /*
1491 * function in BSD to apply I/O throttle to the pageout thread
1492 */
1493 extern void vm_pageout_io_throttle(void);
1494
/*
 * Un-mark a "reusable" page (or, for an all_reusable object, the whole
 * object's reusable accounting) when the pageout scan finds the page on
 * a paging queue — being there means it has effectively been re-used.
 */
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \
	MACRO_BEGIN \
	/* \
	 * If a "reusable" page somehow made it back into \
	 * the active queue, it's been re-used and is not \
	 * quite re-usable. \
	 * If the VM object was "all_reusable", consider it \
	 * as "all re-used" instead of converting it to \
	 * "partially re-used", which could be expensive. \
	 */ \
	assert(VM_PAGE_OBJECT((m)) == (obj)); \
	if ((m)->vmp_reusable || \
	    (obj)->all_reusable) { \
	        vm_object_reuse_pages((obj), \
	            (m)->vmp_offset, \
	            (m)->vmp_offset + PAGE_SIZE_64, \
	            FALSE); \
	} \
	MACRO_END
1514
1515
1516 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
1517 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024
1518
1519 #define FCS_IDLE 0
1520 #define FCS_DELAYED 1
1521 #define FCS_DEADLOCK_DETECTED 2
1522
1523 struct flow_control {
1524 int state;
1525 mach_timespec_t ts;
1526 };
1527
1528
1529 uint64_t vm_pageout_rejected_bq_internal = 0;
1530 uint64_t vm_pageout_rejected_bq_external = 0;
1531 uint64_t vm_pageout_skipped_bq_internal = 0;
1532 uint64_t vm_pageout_skipped_bq_external = 0;
1533
1534 #define ANONS_GRABBED_LIMIT 2
1535
1536
1537 #if 0
1538 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1539 #endif
1540 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1541
1542 #define VM_PAGEOUT_PB_NO_ACTION 0
1543 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1544 #define VM_PAGEOUT_PB_THREAD_YIELD 2
1545
1546
#if 0
/*
 * NOTE(review): compiled out (#if 0) — retained for reference only.
 * Appears superseded by vm_pageout_prepare_to_block() below, which
 * performs a similar local-freeq flush / lock-yield sequence.
 */
static void
vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
{
	if (*local_freeq) {
		vm_page_unlock_queues();

		VM_DEBUG_CONSTANT_EVENT(
			vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
			vm_page_free_count, 0, 0, 1);

		vm_page_free_list(*local_freeq, TRUE);

		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
		    vm_page_free_count, *local_freed, 0, 1);

		*local_freeq = NULL;
		*local_freed = 0;

		vm_page_lock_queues();
	} else {
		lck_mtx_yield(&vm_page_queue_lock);
	}
	*delayed_unlock = 1;
}
#endif
1573
1574
1575 static void
vm_pageout_prepare_to_block(vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int action)1576 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1577 vm_page_t *local_freeq, int *local_freed, int action)
1578 {
1579 vm_page_unlock_queues();
1580
1581 if (*object != NULL) {
1582 vm_object_unlock(*object);
1583 *object = NULL;
1584 }
1585 if (*local_freeq) {
1586 vm_page_free_list(*local_freeq, TRUE);
1587
1588 *local_freeq = NULL;
1589 *local_freed = 0;
1590 }
1591 *delayed_unlock = 1;
1592
1593 switch (action) {
1594 case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1595 vm_consider_waking_compactor_swapper();
1596 break;
1597 case VM_PAGEOUT_PB_THREAD_YIELD:
1598 thread_yield_internal(1);
1599 break;
1600 case VM_PAGEOUT_PB_NO_ACTION:
1601 default:
1602 break;
1603 }
1604 vm_page_lock_queues();
1605 }
1606
1607
1608 static struct vm_pageout_vminfo last;
1609
1610 uint64_t last_vm_page_pages_grabbed = 0;
1611
1612 extern uint32_t c_segment_pages_compressed;
1613
1614 extern uint64_t shared_region_pager_reclaimed;
1615 extern struct memory_object_pager_ops shared_region_pager_ops;
1616
/*
 * Snapshot VM counters into the current vm_pageout_stats[] slot and emit
 * the VM_INFO* kdebug tracepoints.  Absolute counts are copied directly;
 * per-interval fields are deltas against the running totals cached in the
 * file-static "last" (and last_vm_page_pages_grabbed).  Ends by feeding
 * the sample into record_memory_pressure().
 */
void
update_vm_info(void)
{
	unsigned long tmp;
	uint64_t tmp64;

	/* absolute queue/counter snapshots */
	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;

	/* per-interval deltas: new_total - last_total, then cache new_total */
	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
	last.vm_pageout_considered_page = tmp;

	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
	last.vm_pageout_compressions = tmp64;

	tmp = vm_pageout_vminfo.vm_compressor_failed;
	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
	last.vm_compressor_failed = tmp;

	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
	last.vm_compressor_pages_grabbed = tmp64;

	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
	last.vm_phantom_cache_found_ghost = tmp;

	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
	last.vm_phantom_cache_added_ghost = tmp;

	tmp64 = counter_load(&vm_page_grab_count);
	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
	last_vm_page_pages_grabbed = tmp64;

	tmp = vm_pageout_vminfo.vm_page_pages_freed;
	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
	last.vm_page_pages_freed = tmp;

	/* detailed scan deltas are only refreshed when pages were considered */
	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
		last.vm_pageout_pages_evicted = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
		last.vm_pageout_pages_purged = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
		last.vm_pageout_freed_speculative = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
		last.vm_pageout_freed_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
		last.vm_pageout_inactive_referenced = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
		last.vm_pageout_scan_inactive_throttled_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
		last.vm_pageout_inactive_dirty_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
		last.vm_pageout_freed_cleaned = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
		last.vm_pageout_inactive_nolock = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
		last.vm_pageout_scan_inactive_throttled_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
		last.vm_pageout_skipped_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
		last.vm_pageout_skipped_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
		last.vm_pageout_reactivation_limit_exceeded = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
		last.vm_pageout_inactive_force_reclaim = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
		last.vm_pageout_freed_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
		last.vm_pageout_considered_bq_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
		last.vm_pageout_considered_bq_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
		last.vm_pageout_filecache_min_reactivated = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
		last.vm_pageout_inactive_dirty_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
		last.vm_pageout_forcereclaimed_sharedcache = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
		last.vm_pageout_forcereclaimed_realtime = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
		vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
		last.vm_pageout_protected_sharedcache = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
		vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
		last.vm_pageout_protected_realtime = tmp;
	}

	/* unconditional tracepoints: queue counts and memory totals */
	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
	    0);

	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
	    0,
	    0);

	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
	    0);

	/* detailed tracepoints only when the interval saw pageout activity */
	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].considered,
		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
		    vm_pageout_stats[vm_pageout_stat_now].skipped_external,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
		    vm_pageout_stats[vm_pageout_stat_now].freed_internal,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO10)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
		    vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
		    vm_pageout_stats[vm_pageout_stat_now].protected_realtime,
		    0);
	}
	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
	    0);

	record_memory_pressure();
}
1839
1840 extern boolean_t hibernation_vmqueues_inspection;
1841
1842 /*
1843 * Return values for functions called by vm_pageout_scan
1844 * that control its flow.
1845 *
1846 * PROCEED -- vm_pageout_scan will keep making forward progress.
1847 * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1848 * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1849 */
1850
1851 #define VM_PAGEOUT_SCAN_PROCEED (0)
1852 #define VM_PAGEOUT_SCAN_DONE_RETURN (1)
1853 #define VM_PAGEOUT_SCAN_NEXT_ITERATION (2)
1854
1855 /*
1856 * This function is called only from vm_pageout_scan and
1857 * it moves overflow secluded pages (one-at-a-time) to the
1858 * batched 'local' free Q or active Q.
1859 */
static void
vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
{
#if CONFIG_SECLUDED_MEMORY
	/*
	 * Deal with secluded_q overflow: move at most ONE page per call
	 * off the secluded queue, either onto the caller's local free
	 * list (if it has no object) or onto the active queue.
	 */
	if (vm_page_secluded_count > vm_page_secluded_target) {
		vm_page_t secluded_page;

		/*
		 * SECLUDED_AGING_BEFORE_ACTIVE:
		 * Excess secluded pages go to the active queue and
		 * will later go to the inactive queue.
		 */
		assert((vm_page_secluded_count_free +
		    vm_page_secluded_count_inuse) ==
		    vm_page_secluded_count);
		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);

		vm_page_queues_remove(secluded_page, FALSE);
		assert(!secluded_page->vmp_fictitious);
		assert(!VM_PAGE_WIRED(secluded_page));

		if (secluded_page->vmp_object == 0) {
			/* transfer to free queue */
			assert(secluded_page->vmp_busy);
			/* chain onto the caller's batched local free list */
			secluded_page->vmp_snext = *local_freeq;
			*local_freeq = secluded_page;
			*local_freed += 1;
		} else {
			/* transfer to head of active queue */
			vm_page_enqueue_active(secluded_page, FALSE);
			secluded_page = VM_PAGE_NULL;
		}
	}
#else /* CONFIG_SECLUDED_MEMORY */

#pragma unused(local_freeq)
#pragma unused(local_freed)

	return;

#endif /* CONFIG_SECLUDED_MEMORY */
}
1906
1907 /*
1908 * This function is called only from vm_pageout_scan and
1909 * it initializes the loop targets for vm_pageout_scan().
1910 */
1911 static void
vps_init_page_targets(void)1912 vps_init_page_targets(void)
1913 {
1914 /*
1915 * LD TODO: Other page targets should be calculated here too.
1916 */
1917 vm_page_anonymous_min = vm_page_inactive_target / 20;
1918
1919 if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1920 vm_pageout_state.vm_page_speculative_percentage = 50;
1921 } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1922 vm_pageout_state.vm_page_speculative_percentage = 1;
1923 }
1924
1925 vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1926 vm_page_inactive_count);
1927 }
1928
1929 /*
1930 * This function is called only from vm_pageout_scan and
1931 * it purges a single VM object at-a-time and will either
1932 * make vm_pageout_scan() restart the loop or keeping moving forward.
1933 */
1934 static int
vps_purge_object()1935 vps_purge_object()
1936 {
1937 int force_purge;
1938
1939 assert(available_for_purge >= 0);
1940 force_purge = 0; /* no force-purging */
1941
1942 #if VM_PRESSURE_EVENTS
1943 vm_pressure_level_t pressure_level;
1944
1945 pressure_level = memorystatus_vm_pressure_level;
1946
1947 if (pressure_level > kVMPressureNormal) {
1948 if (pressure_level >= kVMPressureCritical) {
1949 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1950 } else if (pressure_level >= kVMPressureUrgent) {
1951 force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
1952 } else if (pressure_level >= kVMPressureWarning) {
1953 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1954 }
1955 }
1956 #endif /* VM_PRESSURE_EVENTS */
1957
1958 if (available_for_purge || force_purge) {
1959 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
1960
1961 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1962 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
1963 VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
1964 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1965 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1966
1967 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1968 }
1969 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
1970 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1971 }
1972
1973 return VM_PAGEOUT_SCAN_PROCEED;
1974 }
1975
/*
 * This function is called only from vm_pageout_scan and
 * it will try to age the next speculative Q if the oldest
 * one is empty.
 *
 * force_speculative_aging: when TRUE, age the current bin regardless
 * of the speculative target and timing checks.
 *
 * Returns VM_PAGEOUT_SCAN_NEXT_ITERATION when the caller should restart
 * its main loop (inconsistent speculative accounting was detected),
 * VM_PAGEOUT_SCAN_PROCEED otherwise.
 */
static int
vps_age_speculative_queue(boolean_t force_speculative_aging)
{
#define DELAY_SPECULATIVE_AGE 1000

	/*
	 * try to pull pages from the aging bins...
	 * see vm_page.h for an explanation of how
	 * this mechanism works
	 */
	boolean_t can_steal = FALSE;
	int num_scanned_queues;
	static int delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop.*/
	mach_timespec_t ts;
	struct vm_speculative_age_q *aq;
	struct vm_speculative_age_q *sq;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	aq = &vm_page_queue_speculative[speculative_steal_index];

	/*
	 * advance speculative_steal_index to the first non-empty aging
	 * bin, wrapping around; give up after scanning every bin once
	 */
	num_scanned_queues = 0;
	while (vm_page_queue_empty(&aq->age_q) &&
	    num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
		speculative_steal_index++;

		if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
		}

		aq = &vm_page_queue_speculative[speculative_steal_index];
	}

	if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
		/*
		 * XXX We've scanned all the speculative
		 * queues but still haven't found one
		 * that is not empty, even though
		 * vm_page_speculative_count is not 0.
		 */
		if (!vm_page_queue_empty(&sq->age_q)) {
			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
		}
#if DEVELOPMENT || DEBUG
		panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
#endif
		/* readjust... */
		vm_page_speculative_count = 0;
		/* ... and continue */
		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
	}

	if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
		can_steal = TRUE;
	} else {
		if (!delay_speculative_age) {
			/*
			 * only allow stealing once the bin has been resident
			 * for the full aging interval; otherwise start the
			 * DELAY_SPECULATIVE_AGE countdown before re-checking
			 */
			mach_timespec_t ts_fully_aged;

			ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
			ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
			    * 1000 * NSEC_PER_USEC;

			ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);

			clock_sec_t sec;
			clock_nsec_t nsec;
			clock_get_system_nanotime(&sec, &nsec);
			ts.tv_sec = (unsigned int) sec;
			ts.tv_nsec = nsec;

			if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
				can_steal = TRUE;
			} else {
				delay_speculative_age++;
			}
		} else {
			/* countdown in progress; reset it once it expires */
			delay_speculative_age++;
			if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
				delay_speculative_age = 0;
			}
		}
	}
	if (can_steal == TRUE) {
		/* move the current aging bin onto the aged (steal-from) queue */
		vm_page_speculate_ageit(aq);
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
2069
2070 /*
2071 * This function is called only from vm_pageout_scan and
2072 * it evicts a single VM object from the cache.
2073 */
2074 static int inline
vps_object_cache_evict(vm_object_t * object_to_unlock)2075 vps_object_cache_evict(vm_object_t *object_to_unlock)
2076 {
2077 static int cache_evict_throttle = 0;
2078 struct vm_speculative_age_q *sq;
2079
2080 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2081
2082 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2083 int pages_evicted;
2084
2085 if (*object_to_unlock != NULL) {
2086 vm_object_unlock(*object_to_unlock);
2087 *object_to_unlock = NULL;
2088 }
2089 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
2090
2091 pages_evicted = vm_object_cache_evict(100, 10);
2092
2093 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
2094
2095 if (pages_evicted) {
2096 vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2097
2098 VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2099 vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2100 memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2101
2102 /*
2103 * we just freed up to 100 pages,
2104 * so go back to the top of the main loop
2105 * and re-evaulate the memory situation
2106 */
2107 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2108 } else {
2109 cache_evict_throttle = 1000;
2110 }
2111 }
2112 if (cache_evict_throttle) {
2113 cache_evict_throttle--;
2114 }
2115
2116 return VM_PAGEOUT_SCAN_PROCEED;
2117 }
2118
2119
2120 /*
2121 * This function is called only from vm_pageout_scan and
2122 * it calculates the filecache min. that needs to be maintained
2123 * as we start to steal pages.
2124 */
2125 static void
vps_calculate_filecache_min(void)2126 vps_calculate_filecache_min(void)
2127 {
2128 int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2129
2130 #if CONFIG_JETSAM
2131 /*
2132 * don't let the filecache_min fall below 15% of available memory
2133 * on systems with an active compressor that isn't nearing its
2134 * limits w/r to accepting new data
2135 *
2136 * on systems w/o the compressor/swapper, the filecache is always
2137 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2138 * since most (if not all) of the anonymous pages are in the
2139 * throttled queue (which isn't counted as available) which
2140 * effectively disables this filter
2141 */
2142 if (vm_compressor_low_on_space() || divisor == 0) {
2143 vm_pageout_state.vm_page_filecache_min = 0;
2144 } else {
2145 vm_pageout_state.vm_page_filecache_min =
2146 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2147 }
2148 #else
2149 if (vm_compressor_out_of_space() || divisor == 0) {
2150 vm_pageout_state.vm_page_filecache_min = 0;
2151 } else {
2152 /*
2153 * don't let the filecache_min fall below the specified critical level
2154 */
2155 vm_pageout_state.vm_page_filecache_min =
2156 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2157 }
2158 #endif
2159 if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2160 vm_pageout_state.vm_page_filecache_min = 0;
2161 }
2162 }
2163
2164 /*
2165 * This function is called only from vm_pageout_scan and
2166 * it updates the flow control time to detect if VM pageoutscan
2167 * isn't making progress.
2168 */
2169 static void
vps_flow_control_reset_deadlock_timer(struct flow_control * flow_control)2170 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2171 {
2172 mach_timespec_t ts;
2173 clock_sec_t sec;
2174 clock_nsec_t nsec;
2175
2176 ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2177 ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2178 clock_get_system_nanotime(&sec, &nsec);
2179 flow_control->ts.tv_sec = (unsigned int) sec;
2180 flow_control->ts.tv_nsec = nsec;
2181 ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2182
2183 flow_control->state = FCS_DELAYED;
2184
2185 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2186 }
2187
/*
 * This function is called only from vm_pageout_scan and
 * it is the flow control logic of VM pageout scan which
 * controls if it should block and for how long.
 * Any blocking of vm_pageout_scan happens ONLY in this function.
 *
 * Returns:
 *   VM_PAGEOUT_SCAN_PROCEED        - no pause needed, keep scanning.
 *   VM_PAGEOUT_SCAN_NEXT_ITERATION - we blocked and were woken; the
 *                                    caller should restart its loop.
 *   VM_PAGEOUT_SCAN_DONE_RETURN    - free target met; NOTE(review): on
 *                                    this path we return while still
 *                                    holding the free-page lock — the
 *                                    caller is presumably expected to
 *                                    release it; confirm against
 *                                    vm_pageout_scan.
 */
static int
vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
    vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
{
	boolean_t exceeded_burst_throttle = FALSE;
	unsigned int msecs = 0;
	uint32_t inactive_external_count;
	mach_timespec_t ts;
	struct vm_pageout_queue *iq;
	struct vm_pageout_queue *eq;
	struct vm_speculative_age_q *sq;

	iq = &vm_pageout_queue_internal;
	eq = &vm_pageout_queue_external;
	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/*
	 * Sometimes we have to pause:
	 * 1) No inactive pages - nothing to do.
	 * 2) Loop control - no acceptable pages found on the inactive queue
	 *    within the last vm_pageout_burst_inactive_throttle iterations
	 * 3) Flow control - default pageout queue is full
	 */
	if (vm_page_queue_empty(&vm_page_queue_inactive) &&
	    vm_page_queue_empty(&vm_page_queue_anonymous) &&
	    vm_page_queue_empty(&vm_page_queue_cleaned) &&
	    vm_page_queue_empty(&sq->age_q)) {
		/* case 1: nothing reclaimable anywhere */
		VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
		msecs = vm_pageout_state.vm_pageout_empty_wait;
	} else if (inactive_burst_count >=
	    MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
	    (vm_page_inactive_count +
	    vm_page_speculative_count))) {
		/* case 2: too many unproductive iterations in a row */
		VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
		msecs = vm_pageout_state.vm_pageout_burst_wait;

		exceeded_burst_throttle = TRUE;
	} else if (VM_PAGE_Q_THROTTLED(iq) &&
	    VM_DYNAMIC_PAGING_ENABLED()) {
		/* case 3: internal (compressor) pageout queue is full */
		clock_sec_t sec;
		clock_nsec_t nsec;

		switch (flow_control->state) {
		case FCS_IDLE:
			if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
			    vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
				/*
				 * since the compressor is running independently of vm_pageout_scan
				 * let's not wait for it just yet... as long as we have a healthy supply
				 * of filecache pages to work with, let's keep stealing those.
				 */
				inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;

				if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
				    (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
					/* force victim selection away from anonymous pages */
					*anons_grabbed = ANONS_GRABBED_LIMIT;
					VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
					return VM_PAGEOUT_SCAN_PROCEED;
				}
			}

			vps_flow_control_reset_deadlock_timer(flow_control);
			msecs = vm_pageout_state.vm_pageout_deadlock_wait;

			break;

		case FCS_DELAYED:
			clock_get_system_nanotime(&sec, &nsec);
			ts.tv_sec = (unsigned int) sec;
			ts.tv_nsec = nsec;

			if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
				/*
				 * the pageout thread for the default pager is potentially
				 * deadlocked since the
				 * default pager queue has been throttled for more than the
				 * allowable time... we need to move some clean pages or dirty
				 * pages belonging to the external pagers if they aren't throttled
				 * vm_page_free_wanted represents the number of threads currently
				 * blocked waiting for pages... we'll move one page for each of
				 * these plus a fixed amount to break the logjam... once we're done
				 * moving this number of pages, we'll re-enter the FSC_DELAYED state
				 * with a new timeout target since we have no way of knowing
				 * whether we've broken the deadlock except through observation
				 * of the queue associated with the default pager... we need to
				 * stop moving pages and allow the system to run to see what
				 * state it settles into.
				 */

				*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
				    vm_page_free_wanted + vm_page_free_wanted_privileged;
				VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
				flow_control->state = FCS_DEADLOCK_DETECTED;
				thread_wakeup(VM_PAGEOUT_GC_EVENT);
				return VM_PAGEOUT_SCAN_PROCEED;
			}
			/*
			 * just resniff instead of trying
			 * to compute a new delay time... we're going to be
			 * awakened immediately upon a laundry completion,
			 * so we won't wait any longer than necessary
			 */
			msecs = vm_pageout_state.vm_pageout_idle_wait;
			break;

		case FCS_DEADLOCK_DETECTED:
			if (*vm_pageout_deadlock_target) {
				/* still working off the deadlock-relief quota */
				return VM_PAGEOUT_SCAN_PROCEED;
			}

			vps_flow_control_reset_deadlock_timer(flow_control);
			msecs = vm_pageout_state.vm_pageout_deadlock_wait;

			break;
		}
	} else {
		/*
		 * No need to pause...
		 */
		return VM_PAGEOUT_SCAN_PROCEED;
	}

	vm_pageout_scan_wants_object = VM_OBJECT_NULL;

	vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
	    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);

	if (vm_page_free_count >= vm_page_free_target) {
		/*
		 * we're here because
		 *  1) someone else freed up some pages while we had
		 *     the queues unlocked above
		 * and we've hit one of the 3 conditions that
		 * cause us to pause the pageout scan thread
		 *
		 * since we already have enough free pages,
		 * let's avoid stalling and return normally
		 *
		 * before we return, make sure the pageout I/O threads
		 * are running throttled in case there are still requests
		 * in the laundry... since we have enough free pages
		 * we don't need the laundry to be cleaned in a timely
		 * fashion... so let's avoid interfering with foreground
		 * activity
		 *
		 * we don't want to hold vm_page_queue_free_lock when
		 * calling vm_pageout_adjust_eq_iothrottle (since it
		 * may cause other locks to be taken), we do the intitial
		 * check outside of the lock.  Once we take the lock,
		 * we recheck the condition since it may have changed.
		 * if it has, no problem, we will make the threads
		 * non-throttled before actually blocking
		 */
		vm_pageout_adjust_eq_iothrottle(eq, TRUE);
	}
	vm_free_page_lock();

	if (vm_page_free_count >= vm_page_free_target &&
	    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
		/* free-page lock intentionally left held on this path (see header) */
		return VM_PAGEOUT_SCAN_DONE_RETURN;
	}
	vm_free_page_unlock();

	if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
		/*
		 * we're most likely about to block due to one of
		 * the 3 conditions that cause vm_pageout_scan to
		 * not be able to make forward progress w/r
		 * to providing new pages to the free queue,
		 * so unthrottle the I/O threads in case we
		 * have laundry to be cleaned... it needs
		 * to be completed ASAP.
		 *
		 * even if we don't block, we want the io threads
		 * running unthrottled since the sum of free +
		 * clean pages is still under our free target
		 */
		vm_pageout_adjust_eq_iothrottle(eq, FALSE);
	}
	if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
		/*
		 * if we get here we're below our free target and
		 * we're stalling due to a full laundry queue or
		 * we don't have any inactive pages other then
		 * those in the clean queue...
		 * however, we have pages on the clean queue that
		 * can be moved to the free queue, so let's not
		 * stall the pageout scan
		 */
		flow_control->state = FCS_IDLE;
		return VM_PAGEOUT_SCAN_PROCEED;
	}
	if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
		/* the internal queue drained while we were preparing to block */
		flow_control->state = FCS_IDLE;
		return VM_PAGEOUT_SCAN_PROCEED;
	}

	VM_CHECK_MEMORYSTATUS;

	if (flow_control->state != FCS_IDLE) {
		VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
	}

	/* block (interruptibly) for up to 'msecs' awaiting a laundry completion */
	iq->pgo_throttled = TRUE;
	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);

	vm_page_unlock_queues();

	assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);

	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);

	thread_block(THREAD_CONTINUE_NULL);

	VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
	memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);

	vm_page_lock_queues();

	iq->pgo_throttled = FALSE;

	/* conditions may have shifted while we slept: recompute the loop targets */
	vps_init_page_targets();

	return VM_PAGEOUT_SCAN_NEXT_ITERATION;
}
2422
2423 extern boolean_t vm_darkwake_mode;
/*
 * This function is called only from vm_pageout_scan and
 * it will find and return the most appropriate page to be
 * reclaimed.
 *
 * *victim_page receives the chosen page, or NULL when no page was
 * selected (including the reactivation path, which returns
 * VM_PAGEOUT_SCAN_NEXT_ITERATION with *victim_page == NULL).
 * Preference order: cleaned queue, aged speculative queue, donate
 * queue (non-jetsam), background queue, then inactive/anonymous
 * queues per the grab_anonymous policy.
 */
static int
vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
    boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
{
	vm_page_t m = NULL;
	vm_object_t m_object = VM_OBJECT_NULL;
	uint32_t inactive_external_count;
	struct vm_speculative_age_q *sq;
	struct vm_pageout_queue *iq;
	int retval = VM_PAGEOUT_SCAN_PROCEED;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
	iq = &vm_pageout_queue_internal;

	*is_page_from_bg_q = FALSE;

	m = NULL;
	m_object = VM_OBJECT_NULL;

	if (VM_DYNAMIC_PAGING_ENABLED()) {
		assert(vm_page_throttled_count == 0);
		assert(vm_page_queue_empty(&vm_page_queue_throttled));
	}

	/*
	 * Try for a clean-queue inactive page.
	 * These are pages that vm_pageout_scan tried to steal earlier, but
	 * were dirty and had to be cleaned.  Pick them up now that they are clean.
	 */
	if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);

		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);

		goto found_page;
	}

	/*
	 * The next most eligible pages are ones we paged in speculatively,
	 * but which have not yet been touched and have been aged out.
	 */
	if (!vm_page_queue_empty(&sq->age_q)) {
		m = (vm_page_t) vm_page_queue_first(&sq->age_q);

		assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);

		if (!m->vmp_dirty || force_anonymous == FALSE) {
			goto found_page;
		} else {
			/* dirty speculative page while anonymous pages are being forced: skip it */
			m = NULL;
		}
	}

#if !CONFIG_JETSAM
	if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
		if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
			assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			goto found_page;
		}
	}
#endif /* !CONFIG_JETSAM */

	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
		vm_object_t bg_m_object = NULL;

		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);

		bg_m_object = VM_PAGE_OBJECT(m);

		if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
			/*
			 * This page is on the background queue
			 * but not on a pageable queue OR is busy during
			 * darkwake mode when the target is artificially lowered.
			 * If it is busy during darkwake mode, and we don't skip it,
			 * we will just swing back around and try again with the same
			 * queue and might hit the same page or its neighbor in a
			 * similar state. Both of these are transient states and will
			 * get resolved, but, at this point let's ignore this page.
			 */
			if (vm_darkwake_mode && m->vmp_busy) {
				if (bg_m_object->internal) {
					vm_pageout_skipped_bq_internal++;
				} else {
					vm_pageout_skipped_bq_external++;
				}
			}
		} else if (force_anonymous == FALSE || bg_m_object->internal) {
			if (bg_m_object->internal &&
			    (VM_PAGE_Q_THROTTLED(iq) ||
			    vm_compressor_out_of_space() == TRUE ||
			    vm_page_free_count < (vm_page_free_reserved / 4))) {
				/* compressor can't take it right now: skip */
				vm_pageout_skipped_bq_internal++;
			} else {
				*is_page_from_bg_q = TRUE;

				if (bg_m_object->internal) {
					vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
				} else {
					vm_pageout_vminfo.vm_pageout_considered_bq_external++;
				}
				goto found_page;
			}
		}
	}

	inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;

	/*
	 * Decide whether the next steal should come from the anonymous
	 * (internal) queue or the file-backed (external) inactive queue.
	 */
	if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
	    (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
		*grab_anonymous = TRUE;
		*anons_grabbed = 0;

		if (VM_CONFIG_SWAP_IS_ACTIVE) {
			vm_pageout_vminfo.vm_pageout_skipped_external++;
		} else {
			if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
				/*
				 * No swap and we are in dangerously low levels of free memory.
				 * If we keep going ahead with anonymous pages, we are going to run into a situation
				 * where the compressor will be stuck waiting for free pages (if it isn't already).
				 *
				 * So, pick a file backed page...
				 */
				*grab_anonymous = FALSE;
				*anons_grabbed = ANONS_GRABBED_LIMIT;
				vm_pageout_vminfo.vm_pageout_skipped_internal++;
			}
		}
		goto want_anonymous;
	}
	*grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);

#if CONFIG_JETSAM
	/* If the file-backed pool has accumulated
	 * significantly more pages than the jetsam
	 * threshold, prefer to reclaim those
	 * inline to minimise compute overhead of reclaiming
	 * anonymous pages.
	 * This calculation does not account for the CPU local
	 * external page queues, as those are expected to be
	 * much smaller relative to the global pools.
	 */

	struct vm_pageout_queue *eq = &vm_pageout_queue_external;

	if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
		if (vm_page_pageable_external_count >
		    vm_pageout_state.vm_page_filecache_min) {
			if ((vm_page_pageable_external_count *
			    vm_pageout_memorystatus_fb_factor_dr) >
			    (memorystatus_available_pages_critical *
			    vm_pageout_memorystatus_fb_factor_nr)) {
				*grab_anonymous = FALSE;

				VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
			}
		}
		if (*grab_anonymous) {
			VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
		}
	}
#endif /* CONFIG_JETSAM */

want_anonymous:
	if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
		if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);

			assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
			*anons_grabbed = 0;

			if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
				if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
					/* reactivate 99 out of every 100 such pages */
					if ((++(*reactivated_this_call) % 100)) {
						vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;

						vm_page_activate(m);
						counter_inc(&vm_statistics_reactivations);
#if DEVELOPMENT || DEBUG
						/*
						 * NOTE(review): *is_page_from_bg_q can only be TRUE via
						 * a path that jumps straight to found_page, so it is
						 * always FALSE here and this branch looks unreachable;
						 * if it were reached, m_object is still VM_OBJECT_NULL
						 * at this point — confirm before relying on it.
						 */
						if (*is_page_from_bg_q == TRUE) {
							if (m_object->internal) {
								vm_pageout_rejected_bq_internal++;
							} else {
								vm_pageout_rejected_bq_external++;
							}
						}
#endif /* DEVELOPMENT || DEBUG */
						vm_pageout_state.vm_pageout_inactive_used++;

						/* no victim this time: tell the caller to loop again */
						m = NULL;
						retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;

						goto found_page;
					}

					/*
					 * steal 1 of the file backed pages even if
					 * we are under the limit that has been set
					 * for a healthy filecache
					 */
				}
			}
			goto found_page;
		}
	}
	if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);

		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
		*anons_grabbed += 1;

		goto found_page;
	}

	m = NULL;

found_page:
	*victim_page = m;

	return retval;
}
2652
2653 /*
2654 * This function is called only from vm_pageout_scan and
2655 * it will put a page back on the active/inactive queue
2656 * if we can't reclaim it for some reason.
2657 */
2658 static void
vps_requeue_page(vm_page_t m,int page_prev_q_state,__unused boolean_t page_from_bg_q)2659 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2660 {
2661 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2662 vm_page_enqueue_inactive(m, FALSE);
2663 } else {
2664 vm_page_activate(m);
2665 }
2666
2667 #if DEVELOPMENT || DEBUG
2668 vm_object_t m_object = VM_PAGE_OBJECT(m);
2669
2670 if (page_from_bg_q == TRUE) {
2671 if (m_object->internal) {
2672 vm_pageout_rejected_bq_internal++;
2673 } else {
2674 vm_pageout_rejected_bq_external++;
2675 }
2676 }
2677 #endif /* DEVELOPMENT || DEBUG */
2678 }
2679
/*
 * This function is called only from vm_pageout_scan and
 * it will try to grab the victim page's VM object (m_object)
 * which differs from the previous victim page's object (object).
 *
 * On success returns VM_PAGEOUT_SCAN_PROCEED with *object = m_object
 * (locked).  If the try-lock fails, the page is requeued, the next
 * interesting object is advertised via vm_pageout_scan_wants_object,
 * and VM_PAGEOUT_SCAN_NEXT_ITERATION is returned with *object = NULL.
 */
static int
vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
{
	struct vm_speculative_age_q *sq;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/*
	 * the object associated with candidate page is
	 * different from the one we were just working
	 * with... dump the lock if we still own it
	 */
	if (*object != NULL) {
		vm_object_unlock(*object);
		*object = NULL;
	}
	/*
	 * Try to lock object; since we've already got the
	 * page queues lock, we can only 'try' for this one.
	 * if the 'try' fails, we need to do a mutex_pause
	 * to allow the owner of the object lock a chance to
	 * run... otherwise, we're likely to trip over this
	 * object in the same state as we work our way through
	 * the queue... clumps of pages associated with the same
	 * object are fairly typical on the inactive and active queues
	 */
	if (!vm_object_lock_try_scan(m_object)) {
		vm_page_t m_want = NULL;

		vm_pageout_vminfo.vm_pageout_inactive_nolock++;

		if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
			VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
		}

		/* drop the hardware reference bit so the page ages normally */
		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));

		m->vmp_reference = FALSE;

		if (!m_object->object_is_shared_cache) {
			/*
			 * don't apply this optimization if this is the shared cache
			 * object, it's too easy to get rid of very hot and important
			 * pages...
			 * m->vmp_object must be stable since we hold the page queues lock...
			 * we can update the scan_collisions field sans the object lock
			 * since it is a separate field and this is the only spot that does
			 * a read-modify-write operation and it is never executed concurrently...
			 * we can asynchronously set this field to 0 when creating a UPL, so it
			 * is possible for the value to be a bit non-deterministic, but that's ok
			 * since it's only used as a hint
			 */
			m_object->scan_collisions = 1;
		}
		/*
		 * peek at the next victim candidate, in the same preference
		 * order vps_choose_victim_page uses, so we can pre-announce
		 * which object we'll want next
		 */
		if (page_from_bg_q) {
			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
		} else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
		} else if (!vm_page_queue_empty(&sq->age_q)) {
			m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
		} else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
		    !vm_page_queue_empty(&vm_page_queue_inactive)) {
			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
		} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
		}

		/*
		 * this is the next object we're going to be interested in
		 * try to make sure its available after the mutex_pause
		 * returns control
		 */
		if (m_want) {
			vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
		}

		vps_requeue_page(m, page_prev_q_state, page_from_bg_q);

		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
	} else {
		*object = m_object;
		vm_pageout_scan_wants_object = VM_OBJECT_NULL;
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
2771
2772 /*
2773 * This function is called only from vm_pageout_scan and
2774 * it notices that pageout scan may be rendered ineffective
2775 * due to a FS deadlock and will jetsam a process if possible.
2776 * If jetsam isn't supported, it'll move the page to the active
2777 * queue to try and get some different pages pushed onwards so
2778 * we can try to get out of this scenario.
2779 */
static void
vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
    int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
{
	struct vm_pageout_queue *eq;
	vm_object_t cur_object = VM_OBJECT_NULL;

	/*
	 * On entry, *object is the (locked) object that page 'm' belongs to,
	 * and the page queues lock is held (it is dropped and re-taken around
	 * the jetsam call below).
	 */
	cur_object = *object;

	eq = &vm_pageout_queue_external;

	if (cur_object->internal == FALSE) {
		/*
		 * we need to break up the following potential deadlock case...
		 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
		 * b) The thread doing the writing is waiting for pages while holding the truncate lock
		 * c) Most of the pages in the inactive queue belong to this file.
		 *
		 * we are potentially in this deadlock because...
		 * a) the external pageout queue is throttled
		 * b) we're done with the active queue and moved on to the inactive queue
		 * c) we've got a dirty external page
		 *
		 * since we don't know the reason for the external pageout queue being throttled we
		 * must suspect that we are deadlocked, so move the current page onto the active queue
		 * in an effort to cause a page from the active queue to 'age' to the inactive queue
		 *
		 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
		 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
		 * pool the next time we select a victim page... if we can make enough new free pages,
		 * the deadlock will break, the external pageout queue will empty and it will no longer
		 * be throttled
		 *
		 * if we have jetsam configured, keep a count of the pages reactivated this way so
		 * that we can try to find clean pages in the active/inactive queues before
		 * deciding to jetsam a process
		 */
		vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;

		/*
		 * Manually reactivate: the caller already removed 'm' from its
		 * queue (asserted below), so enter it on the active queue and
		 * update the relevant counters ourselves.
		 */
		vm_page_check_pageable_safe(m);
		assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
		vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
		m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
		vm_page_active_count++;
		vm_page_pageable_external_count++;

		/* un-throttle the external pageout I/O thread */
		vm_pageout_adjust_eq_iothrottle(eq, FALSE);

#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM

#pragma unused(force_anonymous)

		*vm_pageout_inactive_external_forced_reactivate_limit -= 1;

		if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
			/* reset the budget before asking jetsam for relief */
			*vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
			/*
			 * Possible deadlock scenario so request jetsam action
			 */

			assert(cur_object);
			vm_object_unlock(cur_object);

			cur_object = VM_OBJECT_NULL;

			/*
			 * VM pageout scan needs to know we have dropped this lock and so set the
			 * object variable we got passed in to NULL.
			 */
			*object = VM_OBJECT_NULL;

			/* drop the page queues lock across the (potentially long) kill */
			vm_page_unlock_queues();

			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);

			/* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
			if (memorystatus_kill_on_VM_page_shortage() == TRUE) {
				VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
			}

			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);

			/*
			 * re-take the page queues lock and tell the caller its
			 * delayed-unlock accounting must restart from scratch.
			 */
			vm_page_lock_queues();
			*delayed_unlock = 1;
		}
#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */

#pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
#pragma unused(delayed_unlock)

		/* no jetsam: steer victim selection toward anonymous pages instead */
		*force_anonymous = TRUE;
#endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
	} else {
		/* internal (anonymous) page: plain reactivation suffices */
		vm_page_activate(m);
		counter_inc(&vm_statistics_reactivations);

#if DEVELOPMENT || DEBUG
		if (is_page_from_bg_q == TRUE) {
			if (cur_object->internal) {
				vm_pageout_rejected_bq_internal++;
			} else {
				vm_pageout_rejected_bq_external++;
			}
		}
#endif /* DEVELOPMENT || DEBUG */

		vm_pageout_state.vm_pageout_inactive_used++;
	}
}
2891
2892
2893 void
vm_page_balance_inactive(int max_to_move)2894 vm_page_balance_inactive(int max_to_move)
2895 {
2896 vm_page_t m;
2897
2898 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2899
2900 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2901 /*
2902 * It is likely that the hibernation code path is
2903 * dealing with these very queues as we are about
2904 * to move pages around in/from them and completely
2905 * change the linkage of the pages.
2906 *
2907 * And so we skip the rebalancing of these queues.
2908 */
2909 return;
2910 }
2911 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2912 vm_page_inactive_count +
2913 vm_page_speculative_count);
2914
2915 while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2916 VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2917
2918 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2919
2920 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2921 assert(!m->vmp_laundry);
2922 assert(VM_PAGE_OBJECT(m) != kernel_object);
2923 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2924
2925 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2926
2927 /*
2928 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2929 *
2930 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2931 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2932 * new reference happens. If no futher references happen on the page after that remote TLB flushes
2933 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2934 * by pageout_scan, which is just fine since the last reference would have happened quite far
2935 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2936 * have happened before we moved the page
2937 */
2938 if (m->vmp_pmapped == TRUE) {
2939 /*
2940 * We might be holding the page queue lock as a
2941 * spin lock and clearing the "referenced" bit could
2942 * take a while if there are lots of mappings of
2943 * that page, so make sure we acquire the lock as
2944 * as mutex to avoid a spinlock timeout.
2945 */
2946 vm_page_lockconvert_queues();
2947 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2948 }
2949
2950 /*
2951 * The page might be absent or busy,
2952 * but vm_page_deactivate can handle that.
2953 * FALSE indicates that we don't want a H/W clear reference
2954 */
2955 vm_page_deactivate_internal(m, FALSE);
2956 }
2957 }
2958
2959 /*
2960 * vm_pageout_scan does the dirty work for the pageout daemon.
2961 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2962 * held and vm_page_free_wanted == 0.
2963 */
2964 void
vm_pageout_scan(void)2965 vm_pageout_scan(void)
2966 {
2967 unsigned int loop_count = 0;
2968 unsigned int inactive_burst_count = 0;
2969 unsigned int reactivated_this_call;
2970 unsigned int reactivate_limit;
2971 vm_page_t local_freeq = NULL;
2972 int local_freed = 0;
2973 int delayed_unlock;
2974 int delayed_unlock_limit = 0;
2975 int refmod_state = 0;
2976 int vm_pageout_deadlock_target = 0;
2977 struct vm_pageout_queue *iq;
2978 struct vm_pageout_queue *eq;
2979 struct vm_speculative_age_q *sq;
2980 struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
2981 boolean_t inactive_throttled = FALSE;
2982 vm_object_t object = NULL;
2983 uint32_t inactive_reclaim_run;
2984 boolean_t grab_anonymous = FALSE;
2985 boolean_t force_anonymous = FALSE;
2986 boolean_t force_speculative_aging = FALSE;
2987 int anons_grabbed = 0;
2988 int page_prev_q_state = 0;
2989 boolean_t page_from_bg_q = FALSE;
2990 uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
2991 vm_object_t m_object = VM_OBJECT_NULL;
2992 int retval = 0;
2993 boolean_t lock_yield_check = FALSE;
2994
2995
2996 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
2997 vm_pageout_vminfo.vm_pageout_freed_speculative,
2998 vm_pageout_state.vm_pageout_inactive_clean,
2999 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3000 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3001
3002 flow_control.state = FCS_IDLE;
3003 iq = &vm_pageout_queue_internal;
3004 eq = &vm_pageout_queue_external;
3005 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3006
3007 /* Ask the pmap layer to return any pages it no longer needs. */
3008 uint64_t pmap_wired_pages_freed = pmap_release_pages_fast();
3009
3010 vm_page_lock_queues();
3011
3012 vm_page_wire_count -= pmap_wired_pages_freed;
3013
3014 delayed_unlock = 1;
3015
3016 /*
3017 * Calculate the max number of referenced pages on the inactive
3018 * queue that we will reactivate.
3019 */
3020 reactivated_this_call = 0;
3021 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3022 vm_page_inactive_count);
3023 inactive_reclaim_run = 0;
3024
3025 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3026
3027 /*
3028 * We must limit the rate at which we send pages to the pagers
3029 * so that we don't tie up too many pages in the I/O queues.
3030 * We implement a throttling mechanism using the laundry count
3031 * to limit the number of pages outstanding to the default
3032 * and external pagers. We can bypass the throttles and look
3033 * for clean pages if the pageout queues don't drain in a timely
3034 * fashion since this may indicate that the pageout paths are
3035 * stalled waiting for memory, which only we can provide.
3036 */
3037
3038 vps_init_page_targets();
3039 assert(object == NULL);
3040 assert(delayed_unlock != 0);
3041
3042 for (;;) {
3043 vm_page_t m;
3044
3045 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3046
3047 if (lock_yield_check) {
3048 lock_yield_check = FALSE;
3049
3050 if (delayed_unlock++ > delayed_unlock_limit) {
3051 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3052 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3053 } else if (vm_pageout_scan_wants_object) {
3054 vm_page_unlock_queues();
3055 mutex_pause(0);
3056 vm_page_lock_queues();
3057 } else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3058 VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3059 }
3060 }
3061
3062 if (vm_upl_wait_for_pages < 0) {
3063 vm_upl_wait_for_pages = 0;
3064 }
3065
3066 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3067
3068 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3069 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3070 }
3071
3072 vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3073
3074 assert(delayed_unlock);
3075
3076 /*
3077 * maintain our balance
3078 */
3079 vm_page_balance_inactive(1);
3080
3081
3082 /**********************************************************************
3083 * above this point we're playing with the active and secluded queues
3084 * below this point we're playing with the throttling mechanisms
3085 * and the inactive queue
3086 **********************************************************************/
3087
3088 if (vm_page_free_count + local_freed >= vm_page_free_target) {
3089 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3090
3091 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3092 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3093 /*
3094 * make sure the pageout I/O threads are running
3095 * throttled in case there are still requests
3096 * in the laundry... since we have met our targets
3097 * we don't need the laundry to be cleaned in a timely
3098 * fashion... so let's avoid interfering with foreground
3099 * activity
3100 */
3101 vm_pageout_adjust_eq_iothrottle(eq, TRUE);
3102
3103 vm_free_page_lock();
3104
3105 if ((vm_page_free_count >= vm_page_free_target) &&
3106 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3107 /*
3108 * done - we have met our target *and*
3109 * there is no one waiting for a page.
3110 */
3111 return_from_scan:
3112 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3113
3114 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3115 vm_pageout_state.vm_pageout_inactive,
3116 vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3117 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
3118 vm_pageout_vminfo.vm_pageout_freed_speculative,
3119 vm_pageout_state.vm_pageout_inactive_clean,
3120 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3121 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3122
3123 return;
3124 }
3125 vm_free_page_unlock();
3126 }
3127
3128 /*
3129 * Before anything, we check if we have any ripe volatile
3130 * objects around. If so, try to purge the first object.
3131 * If the purge fails, fall through to reclaim a page instead.
 * If the purge succeeds, go back to the top and reevaluate
3133 * the new memory situation.
3134 */
3135 retval = vps_purge_object();
3136
3137 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3138 /*
3139 * Success
3140 */
3141 if (object != NULL) {
3142 vm_object_unlock(object);
3143 object = NULL;
3144 }
3145
3146 lock_yield_check = FALSE;
3147 continue;
3148 }
3149
3150 /*
3151 * If our 'aged' queue is empty and we have some speculative pages
3152 * in the other queues, let's go through and see if we need to age
3153 * them.
3154 *
3155 * If we succeeded in aging a speculative Q or just that everything
3156 * looks normal w.r.t queue age and queue counts, we keep going onward.
3157 *
3158 * If, for some reason, we seem to have a mismatch between the spec.
3159 * page count and the page queues, we reset those variables and
3160 * restart the loop (LD TODO: Track this better?).
3161 */
3162 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3163 retval = vps_age_speculative_queue(force_speculative_aging);
3164
3165 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3166 lock_yield_check = FALSE;
3167 continue;
3168 }
3169 }
3170 force_speculative_aging = FALSE;
3171
3172 /*
3173 * Check to see if we need to evict objects from the cache.
3174 *
3175 * Note: 'object' here doesn't have anything to do with
3176 * the eviction part. We just need to make sure we have dropped
3177 * any object lock we might be holding if we need to go down
3178 * into the eviction logic.
3179 */
3180 retval = vps_object_cache_evict(&object);
3181
3182 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3183 lock_yield_check = FALSE;
3184 continue;
3185 }
3186
3187
3188 /*
3189 * Calculate our filecache_min that will affect the loop
3190 * going forward.
3191 */
3192 vps_calculate_filecache_min();
3193
3194 /*
3195 * LD TODO: Use a structure to hold all state variables for a single
3196 * vm_pageout_scan iteration and pass that structure to this function instead.
3197 */
3198 retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3199 &delayed_unlock, &local_freeq, &local_freed,
3200 &vm_pageout_deadlock_target, inactive_burst_count);
3201
3202 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3203 if (loop_count >= vm_page_inactive_count) {
3204 loop_count = 0;
3205 }
3206
3207 inactive_burst_count = 0;
3208
3209 assert(object == NULL);
3210 assert(delayed_unlock != 0);
3211
3212 lock_yield_check = FALSE;
3213 continue;
3214 } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3215 goto return_from_scan;
3216 }
3217
3218 flow_control.state = FCS_IDLE;
3219
3220 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3221 vm_pageout_inactive_external_forced_reactivate_limit);
3222 loop_count++;
3223 inactive_burst_count++;
3224 vm_pageout_state.vm_pageout_inactive++;
3225
3226 /*
3227 * Choose a victim.
3228 */
3229
3230 m = NULL;
3231 retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3232
3233 if (m == NULL) {
3234 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3235 inactive_burst_count = 0;
3236
3237 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3238 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3239 }
3240
3241 lock_yield_check = TRUE;
3242 continue;
3243 }
3244
3245 /*
3246 * if we've gotten here, we have no victim page.
3247 * check to see if we've not finished balancing the queues
3248 * or we have a page on the aged speculative queue that we
3249 * skipped due to force_anonymous == TRUE.. or we have
3250 * speculative pages that we can prematurely age... if
3251 * one of these cases we'll keep going, else panic
3252 */
3253 force_anonymous = FALSE;
3254 VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3255
3256 if (!vm_page_queue_empty(&sq->age_q)) {
3257 lock_yield_check = TRUE;
3258 continue;
3259 }
3260
3261 if (vm_page_speculative_count) {
3262 force_speculative_aging = TRUE;
3263 lock_yield_check = TRUE;
3264 continue;
3265 }
3266 panic("vm_pageout: no victim");
3267
3268 /* NOTREACHED */
3269 }
3270
3271 assert(VM_PAGE_PAGEABLE(m));
3272 m_object = VM_PAGE_OBJECT(m);
3273 force_anonymous = FALSE;
3274
3275 page_prev_q_state = m->vmp_q_state;
3276 /*
3277 * we just found this page on one of our queues...
3278 * it can't also be on the pageout queue, so safe
3279 * to call vm_page_queues_remove
3280 */
3281 bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3282 vm_page_queues_remove(m, TRUE);
3283 if (donate) {
3284 /*
3285 * The compressor needs to see this bit to know
3286 * where this page needs to land. Also if stolen,
3287 * this bit helps put the page back in the right
3288 * special queue where it belongs.
3289 */
3290 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3291 }
3292
3293 assert(!m->vmp_laundry);
3294 assert(!m->vmp_private);
3295 assert(!m->vmp_fictitious);
3296 assert(m_object != kernel_object);
3297 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
3298
3299 vm_pageout_vminfo.vm_pageout_considered_page++;
3300
3301 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3302
3303 /*
3304 * check to see if we currently are working
3305 * with the same object... if so, we've
3306 * already got the lock
3307 */
3308 if (m_object != object) {
3309 boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3310
3311 /*
3312 * vps_switch_object() will always drop the 'object' lock first
3313 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3314 * either 'm_object' or NULL.
3315 */
3316 retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3317
3318 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3319 lock_yield_check = TRUE;
3320 continue;
3321 }
3322 }
3323 assert(m_object == object);
3324 assert(VM_PAGE_OBJECT(m) == m_object);
3325
3326 if (m->vmp_busy) {
3327 /*
3328 * Somebody is already playing with this page.
3329 * Put it back on the appropriate queue
3330 *
3331 */
3332 VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3333
3334 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3335 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3336 }
3337
3338 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3339
3340 lock_yield_check = TRUE;
3341 continue;
3342 }
3343
3344 /*
3345 * if (m->vmp_cleaning && !m->vmp_free_when_done)
3346 * If already cleaning this page in place
3347 * just leave if off the paging queues.
3348 * We can leave the page mapped, and upl_commit_range
3349 * will put it on the clean queue.
3350 *
3351 * if (m->vmp_free_when_done && !m->vmp_cleaning)
3352 * an msync INVALIDATE is in progress...
3353 * this page has been marked for destruction
3354 * after it has been cleaned,
3355 * but not yet gathered into a UPL
3356 * where 'cleaning' will be set...
3357 * just leave it off the paging queues
3358 *
 * if (m->vmp_free_when_done && m->vmp_cleaning)
3360 * an msync INVALIDATE is in progress
3361 * and the UPL has already gathered this page...
3362 * just leave it off the paging queues
3363 */
3364 if (m->vmp_free_when_done || m->vmp_cleaning) {
3365 lock_yield_check = TRUE;
3366 continue;
3367 }
3368
3369
3370 /*
3371 * If it's absent, in error or the object is no longer alive,
3372 * we can reclaim the page... in the no longer alive case,
3373 * there are 2 states the page can be in that preclude us
3374 * from reclaiming it - busy or cleaning - that we've already
3375 * dealt with
3376 */
3377 if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3378 (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3379 if (m->vmp_absent) {
3380 VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3381 } else if (!object->alive ||
3382 (!object->internal &&
3383 object->pager == MEMORY_OBJECT_NULL)) {
3384 VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3385 } else {
3386 VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3387 }
3388 reclaim_page:
3389 if (vm_pageout_deadlock_target) {
3390 VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3391 vm_pageout_deadlock_target--;
3392 }
3393
3394 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3395
3396 if (object->internal) {
3397 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3398 } else {
3399 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3400 }
3401 assert(!m->vmp_cleaning);
3402 assert(!m->vmp_laundry);
3403
3404 if (!object->internal &&
3405 object->pager != NULL &&
3406 object->pager->mo_pager_ops == &shared_region_pager_ops) {
3407 shared_region_pager_reclaimed++;
3408 }
3409
3410 m->vmp_busy = TRUE;
3411
3412 /*
3413 * remove page from object here since we're already
3414 * behind the object lock... defer the rest of the work
3415 * we'd normally do in vm_page_free_prepare_object
3416 * until 'vm_page_free_list' is called
3417 */
3418 if (m->vmp_tabled) {
3419 vm_page_remove(m, TRUE);
3420 }
3421
3422 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3423 m->vmp_snext = local_freeq;
3424 local_freeq = m;
3425 local_freed++;
3426
3427 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3428 vm_pageout_vminfo.vm_pageout_freed_speculative++;
3429 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3430 vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3431 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3432 vm_pageout_vminfo.vm_pageout_freed_internal++;
3433 } else {
3434 vm_pageout_vminfo.vm_pageout_freed_external++;
3435 }
3436
3437 inactive_burst_count = 0;
3438
3439 lock_yield_check = TRUE;
3440 continue;
3441 }
3442 if (object->copy == VM_OBJECT_NULL) {
3443 /*
3444 * No one else can have any interest in this page.
3445 * If this is an empty purgable object, the page can be
3446 * reclaimed even if dirty.
3447 * If the page belongs to a volatile purgable object, we
3448 * reactivate it if the compressor isn't active.
3449 */
3450 if (object->purgable == VM_PURGABLE_EMPTY) {
3451 if (m->vmp_pmapped == TRUE) {
3452 /* unmap the page */
3453 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3454 if (refmod_state & VM_MEM_MODIFIED) {
3455 SET_PAGE_DIRTY(m, FALSE);
3456 }
3457 }
3458 if (m->vmp_dirty || m->vmp_precious) {
3459 /* we saved the cost of cleaning this page ! */
3460 vm_page_purged_count++;
3461 }
3462 goto reclaim_page;
3463 }
3464
3465 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3466 /*
3467 * With the VM compressor, the cost of
3468 * reclaiming a page is much lower (no I/O),
3469 * so if we find a "volatile" page, it's better
3470 * to let it get compressed rather than letting
3471 * it occupy a full page until it gets purged.
3472 * So no need to check for "volatile" here.
3473 */
3474 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
3475 /*
3476 * Avoid cleaning a "volatile" page which might
3477 * be purged soon.
3478 */
3479
3480 /* if it's wired, we can't put it on our queue */
3481 assert(!VM_PAGE_WIRED(m));
3482
3483 /* just stick it back on! */
3484 reactivated_this_call++;
3485
3486 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3487 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3488 }
3489
3490 goto reactivate_page;
3491 }
3492 }
3493 /*
3494 * If it's being used, reactivate.
3495 * (Fictitious pages are either busy or absent.)
3496 * First, update the reference and dirty bits
3497 * to make sure the page is unreferenced.
3498 */
3499 refmod_state = -1;
3500
3501 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3502 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3503
3504 if (refmod_state & VM_MEM_REFERENCED) {
3505 m->vmp_reference = TRUE;
3506 }
3507 if (refmod_state & VM_MEM_MODIFIED) {
3508 SET_PAGE_DIRTY(m, FALSE);
3509 }
3510 }
3511
3512 if (m->vmp_reference || m->vmp_dirty) {
3513 /* deal with a rogue "reusable" page */
3514 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3515 }
3516
3517 if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3518 vm_pageout_state.vm_page_xpmapped_min = 0;
3519 } else {
3520 vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
3521 }
3522
3523 if (!m->vmp_no_cache &&
3524 page_from_bg_q == FALSE &&
3525 (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3526 (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3527 /*
3528 * The page we pulled off the inactive list has
3529 * been referenced. It is possible for other
3530 * processors to be touching pages faster than we
3531 * can clear the referenced bit and traverse the
3532 * inactive queue, so we limit the number of
3533 * reactivations.
3534 */
3535 if (++reactivated_this_call >= reactivate_limit &&
3536 !object->object_is_shared_cache &&
3537 !((m->vmp_realtime ||
3538 object->for_realtime) &&
3539 vm_pageout_protect_realtime)) {
3540 vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3541 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3542 vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3543 if (object->object_is_shared_cache) {
3544 vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3545 } else if (m->vmp_realtime ||
3546 object->for_realtime) {
3547 vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3548 }
3549 } else {
3550 uint32_t isinuse;
3551
3552 if (reactivated_this_call >= reactivate_limit) {
3553 if (object->object_is_shared_cache) {
3554 vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3555 } else if ((m->vmp_realtime ||
3556 object->for_realtime) &&
3557 vm_pageout_protect_realtime) {
3558 vm_pageout_vminfo.vm_pageout_protected_realtime++;
3559 }
3560 }
3561 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3562 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3563 }
3564
3565 vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3566 reactivate_page:
3567 if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3568 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3569 /*
 * no explicit mappings of this object exist
3571 * and it's not open via the filesystem
3572 */
3573 vm_page_deactivate(m);
3574 VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3575 } else {
3576 /*
3577 * The page was/is being used, so put back on active list.
3578 */
3579 vm_page_activate(m);
3580 counter_inc(&vm_statistics_reactivations);
3581 inactive_burst_count = 0;
3582 }
3583 #if DEVELOPMENT || DEBUG
3584 if (page_from_bg_q == TRUE) {
3585 if (m_object->internal) {
3586 vm_pageout_rejected_bq_internal++;
3587 } else {
3588 vm_pageout_rejected_bq_external++;
3589 }
3590 }
3591 #endif /* DEVELOPMENT || DEBUG */
3592
3593 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3594 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3595 }
3596 vm_pageout_state.vm_pageout_inactive_used++;
3597
3598 lock_yield_check = TRUE;
3599 continue;
3600 }
3601 /*
3602 * Make sure we call pmap_get_refmod() if it
3603 * wasn't already called just above, to update
3604 * the dirty bit.
3605 */
3606 if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3607 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3608 if (refmod_state & VM_MEM_MODIFIED) {
3609 SET_PAGE_DIRTY(m, FALSE);
3610 }
3611 }
3612 }
3613
3614 /*
3615 * we've got a candidate page to steal...
3616 *
3617 * m->vmp_dirty is up to date courtesy of the
3618 * preceding check for m->vmp_reference... if
3619 * we get here, then m->vmp_reference had to be
3620 * FALSE (or possibly "reactivate_limit" was
3621 * exceeded), but in either case we called
3622 * pmap_get_refmod() and updated both
3623 * m->vmp_reference and m->vmp_dirty
3624 *
3625 * if it's dirty or precious we need to
 * see if the target queue is throttled
 * if it is, we need to skip over it by moving it back
3628 * to the end of the inactive queue
3629 */
3630
3631 inactive_throttled = FALSE;
3632
3633 if (m->vmp_dirty || m->vmp_precious) {
3634 if (object->internal) {
3635 if (VM_PAGE_Q_THROTTLED(iq)) {
3636 inactive_throttled = TRUE;
3637 }
3638 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3639 inactive_throttled = TRUE;
3640 }
3641 }
3642 throttle_inactive:
3643 if (!VM_DYNAMIC_PAGING_ENABLED() &&
3644 object->internal && m->vmp_dirty &&
3645 (object->purgable == VM_PURGABLE_DENY ||
3646 object->purgable == VM_PURGABLE_NONVOLATILE ||
3647 object->purgable == VM_PURGABLE_VOLATILE)) {
3648 vm_page_check_pageable_safe(m);
3649 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3650 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3651 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3652 vm_page_throttled_count++;
3653
3654 VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3655
3656 inactive_burst_count = 0;
3657
3658 lock_yield_check = TRUE;
3659 continue;
3660 }
3661 if (inactive_throttled == TRUE) {
3662 vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3663 &delayed_unlock, &force_anonymous, page_from_bg_q);
3664
3665 inactive_burst_count = 0;
3666
3667 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3668 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3669 }
3670
3671 lock_yield_check = TRUE;
3672 continue;
3673 }
3674
3675 /*
3676 * we've got a page that we can steal...
3677 * eliminate all mappings and make sure
3678 * we have the up-to-date modified state
3679 *
3680 * if we need to do a pmap_disconnect then we
3681 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3682 * provides the true state atomically... the
3683 * page was still mapped up to the pmap_disconnect
3684 * and may have been dirtied at the last microsecond
3685 *
3686 * Note that if 'pmapped' is FALSE then the page is not
3687 * and has not been in any map, so there is no point calling
3688 * pmap_disconnect(). m->vmp_dirty could have been set in anticipation
3689 * of likely usage of the page.
3690 */
3691 if (m->vmp_pmapped == TRUE) {
3692 int pmap_options;
3693
3694 /*
3695 * Don't count this page as going into the compressor
3696 * if any of these are true:
3697 * 1) compressed pager isn't enabled
3698 * 2) Freezer enabled device with compressed pager
3699 * backend (exclusive use) i.e. most of the VM system
3700 * (including vm_pageout_scan) has no knowledge of
3701 * the compressor
3702 * 3) This page belongs to a file and hence will not be
3703 * sent into the compressor
3704 */
3705 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3706 object->internal == FALSE) {
3707 pmap_options = 0;
3708 } else if (m->vmp_dirty || m->vmp_precious) {
3709 /*
3710 * VM knows that this page is dirty (or
3711 * precious) and needs to be compressed
3712 * rather than freed.
3713 * Tell the pmap layer to count this page
3714 * as "compressed".
3715 */
3716 pmap_options = PMAP_OPTIONS_COMPRESSOR;
3717 } else {
3718 /*
3719 * VM does not know if the page needs to
3720 * be preserved but the pmap layer might tell
3721 * us if any mapping has "modified" it.
 * Let the pmap layer count this page
3723 * as compressed if and only if it has been
3724 * modified.
3725 */
3726 pmap_options =
3727 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3728 }
3729 refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3730 pmap_options,
3731 NULL);
3732 if (refmod_state & VM_MEM_MODIFIED) {
3733 SET_PAGE_DIRTY(m, FALSE);
3734 }
3735 }
3736
3737 /*
3738 * reset our count of pages that have been reclaimed
3739 * since the last page was 'stolen'
3740 */
3741 inactive_reclaim_run = 0;
3742
3743 /*
3744 * If it's clean and not precious, we can free the page.
3745 */
3746 if (!m->vmp_dirty && !m->vmp_precious) {
3747 vm_pageout_state.vm_pageout_inactive_clean++;
3748
3749 /*
3750 * OK, at this point we have found a page we are going to free.
3751 */
3752 #if CONFIG_PHANTOM_CACHE
3753 if (!object->internal) {
3754 vm_phantom_cache_add_ghost(m);
3755 }
3756 #endif
3757 goto reclaim_page;
3758 }
3759
3760 /*
3761 * The page may have been dirtied since the last check
3762 * for a throttled target queue (which may have been skipped
3763 * if the page was clean then). With the dirty page
3764 * disconnected here, we can make one final check.
3765 */
3766 if (object->internal) {
3767 if (VM_PAGE_Q_THROTTLED(iq)) {
3768 inactive_throttled = TRUE;
3769 }
3770 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3771 inactive_throttled = TRUE;
3772 }
3773
3774 if (inactive_throttled == TRUE) {
3775 goto throttle_inactive;
3776 }
3777
3778 #if VM_PRESSURE_EVENTS
3779 #if CONFIG_JETSAM
3780
3781 /*
3782 * If Jetsam is enabled, then the sending
3783 * of memory pressure notifications is handled
3784 * from the same thread that takes care of high-water
3785 * and other jetsams i.e. the memorystatus_thread.
3786 */
3787
3788 #else /* CONFIG_JETSAM */
3789
3790 vm_pressure_response();
3791
3792 #endif /* CONFIG_JETSAM */
3793 #endif /* VM_PRESSURE_EVENTS */
3794
3795 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3796 VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3797 }
3798
3799 if (object->internal) {
3800 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3801 } else {
3802 vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3803 }
3804
3805 /*
3806 * internal pages will go to the compressor...
3807 * external pages will go to the appropriate pager to be cleaned
3808 * and upon completion will end up on 'vm_page_queue_cleaned' which
3809 * is a preferred queue to steal from
3810 */
3811 vm_pageout_cluster(m);
3812 inactive_burst_count = 0;
3813
3814 /*
3815 * back to top of pageout scan loop
3816 */
3817 }
3818 }
3819
3820
3821 void
vm_page_free_reserve(int pages)3822 vm_page_free_reserve(
3823 int pages)
3824 {
3825 int free_after_reserve;
3826
3827 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3828 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3829 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3830 } else {
3831 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3832 }
3833 } else {
3834 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3835 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3836 } else {
3837 vm_page_free_reserved += pages;
3838 }
3839 }
3840 free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3841
3842 vm_page_free_min = vm_page_free_reserved +
3843 VM_PAGE_FREE_MIN(free_after_reserve);
3844
3845 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3846 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3847 }
3848
3849 vm_page_free_target = vm_page_free_reserved +
3850 VM_PAGE_FREE_TARGET(free_after_reserve);
3851
3852 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3853 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3854 }
3855
3856 if (vm_page_free_target < vm_page_free_min + 5) {
3857 vm_page_free_target = vm_page_free_min + 5;
3858 }
3859
3860 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3861 }
3862
/*
 * vm_pageout is the high level pageout daemon.
 *
 * vm_pageout_continue is its continuation: it runs one pass of
 * vm_pageout_scan(), then parks on the vm_page_free_wanted event and
 * re-enters itself via thread_block().  It never returns.
 */
void
vm_pageout_continue(void)
{
	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);

	/* mark the daemon active so vm_pageout_wait() callers block on us */
	vm_free_page_lock();
	vm_pageout_running = TRUE;
	vm_free_page_unlock();

	vm_pageout_scan();
	/*
	 * we hold both the vm_page_queue_free_lock
	 * and the vm_page_queues_lock at this point
	 */
	assert(vm_page_free_wanted == 0);
	assert(vm_page_free_wanted_privileged == 0);
	/* must assert_wait before dropping the locks to avoid a lost wakeup */
	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);

	vm_pageout_running = FALSE;
#if XNU_TARGET_OS_OSX
	/* release anyone blocked in vm_pageout_wait() on this pass finishing */
	if (vm_pageout_waiter) {
		vm_pageout_waiter = FALSE;
		thread_wakeup((event_t)&vm_pageout_waiter);
	}
#endif /* XNU_TARGET_OS_OSX */

	vm_free_page_unlock();
	vm_page_unlock_queues();

	/* sleep; resume at the top of this function when more pages are wanted */
	thread_block((thread_continue_t)vm_pageout_continue);
	/*NOTREACHED*/
}
3900
#if XNU_TARGET_OS_OSX
/*
 * Block the caller until the pageout daemon finishes its current pass,
 * or until 'deadline' (mach absolute time) expires.
 *
 * Returns KERN_SUCCESS when the daemon went idle, or
 * KERN_OPERATION_TIMED_OUT if the deadline passed first.
 */
kern_return_t
vm_pageout_wait(uint64_t deadline)
{
	kern_return_t kr;

	vm_free_page_lock();
	/* loop: a wakeup may race with the daemon starting another pass */
	for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
		vm_pageout_waiter = TRUE;
		/* sleep drops and re-takes vm_page_queue_free_lock */
		if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
			    &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
			    (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
			kr = KERN_OPERATION_TIMED_OUT;
		}
	}
	vm_free_page_unlock();

	return kr;
}
#endif /* XNU_TARGET_OS_OSX */
3921
3922
/*
 * Continuation for the external (file-backed) pageout I/O thread.
 *
 * Drains the external pageout queue: each dirty page is re-looked-up under
 * its object lock and handed to the object's pager via
 * memory_object_data_return().  When the queue is empty the thread goes
 * idle and blocks, resuming here on the next wakeup.  Never returns.
 */
static void
vm_pageout_iothread_external_continue(struct vm_pageout_queue *q, __unused wait_result_t w)
{
	vm_page_t m = NULL;
	vm_object_t object;
	vm_object_offset_t offset;
	memory_object_t pager;

	/* On systems with a compressor, the external IO thread clears its
	 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
	 * creation)
	 */
	if (vm_pageout_state.vm_pageout_internal_iothread != THREAD_NULL) {
		current_thread()->options &= ~TH_OPT_VMPRIV;
	}

	vm_page_lockspin_queues();

	while (!vm_page_queue_empty(&q->pgo_pending)) {
		q->pgo_busy = TRUE;
		vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);

		assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
		VM_PAGE_CHECK(m);
		/*
		 * grab a snapshot of the object and offset this
		 * page is tabled in so that we can relookup this
		 * page after we've taken the object lock - these
		 * fields are stable while we hold the page queues lock
		 * but as soon as we drop it, there is nothing to keep
		 * this page in this object... we hold an activity_in_progress
		 * on this object which will keep it from terminating
		 */
		object = VM_PAGE_OBJECT(m);
		offset = m->vmp_offset;

		m->vmp_q_state = VM_PAGE_NOT_ON_Q;
		VM_PAGE_ZERO_PAGEQ_ENTRY(m);

		vm_page_unlock_queues();

		vm_object_lock(object);

		/* re-lookup: the page may have changed while the queues lock was dropped */
		m = vm_page_lookup(object, offset);

		if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
		    !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
			/*
			 * it's either the same page that someone else has
			 * started cleaning (or it's finished cleaning or
			 * been put back on the pageout queue), or
			 * the page has been freed or we have found a
			 * new page at this offset... in all of these cases
			 * we merely need to release the activity_in_progress
			 * we took when we put the page on the pageout queue
			 */
			vm_object_activity_end(object);
			vm_object_unlock(object);

			vm_page_lockspin_queues();
			continue;
		}
		pager = object->pager;

		if (pager == MEMORY_OBJECT_NULL) {
			/*
			 * This pager has been destroyed by either
			 * memory_object_destroy or vm_object_destroy, and
			 * so there is nowhere for the page to go.
			 */
			if (m->vmp_free_when_done) {
				/*
				 * Just free the page... VM_PAGE_FREE takes
				 * care of cleaning up all the state...
				 * including doing the vm_pageout_throttle_up
				 */
				VM_PAGE_FREE(m);
			} else {
				vm_page_lockspin_queues();

				vm_pageout_throttle_up(m);
				vm_page_activate(m);

				vm_page_unlock_queues();

				/*
				 * And we are done with it.
				 */
			}
			vm_object_activity_end(object);
			vm_object_unlock(object);

			vm_page_lockspin_queues();
			continue;
		}
#if 0
		/*
		 * we don't hold the page queue lock
		 * so this check isn't safe to make
		 */
		VM_PAGE_CHECK(m);
#endif
		/*
		 * give back the activity_in_progress reference we
		 * took when we queued up this page and replace it
		 * it with a paging_in_progress reference that will
		 * also hold the paging offset from changing and
		 * prevent the object from terminating
		 */
		vm_object_activity_end(object);
		vm_object_paging_begin(object);
		vm_object_unlock(object);

		/*
		 * Send the data to the pager.
		 * any pageout clustering happens there
		 */
		memory_object_data_return(pager,
		    m->vmp_offset + object->paging_offset,
		    PAGE_SIZE,
		    NULL,
		    NULL,
		    FALSE,
		    FALSE,
		    0);

		vm_object_lock(object);
		vm_object_paging_end(object);
		vm_object_unlock(object);

		/* pace external pageouts so they don't saturate the I/O subsystem */
		vm_pageout_io_throttle();

		vm_page_lockspin_queues();
	}
	/* queue drained: go idle and wait to be kicked when more work arrives */
	q->pgo_busy = FALSE;
	q->pgo_idle = TRUE;

	assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
	vm_page_unlock_queues();

	thread_block_parameter((thread_continue_t)vm_pageout_iothread_external_continue, (void *) q);
	/*NOTREACHED*/
}
4066
4067
/* max pages a compressor thread batches locally before a bulk vm_page_free_list() */
#define MAX_FREE_BATCH 32
uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
                                     * this thread.
                                     */
4072
4073
void
vm_pageout_iothread_internal_continue(struct cq *, __unused wait_result_t);
/*
 * Continuation for a compressor ("internal pageout") thread.
 *
 * Repeatedly pulls a batch of anonymous pages off its pageout queue,
 * compresses each via vm_pageout_compress_page(), and frees the pages
 * that compressed successfully.  When the queue empties, the thread goes
 * idle and blocks, resuming here on the next wakeup.  Never returns.
 */
void
vm_pageout_iothread_internal_continue(struct cq *cq, __unused wait_result_t w)
{
	struct vm_pageout_queue *q;
	vm_page_t m = NULL;
	boolean_t pgo_draining;
	vm_page_t local_q;              /* batch pulled off q->pgo_pending, linked via vmp_snext */
	int local_cnt;
	vm_page_t local_freeq = NULL;   /* compressed pages awaiting a bulk free */
	int local_freed = 0;
	int local_batch_size;
#if DEVELOPMENT || DEBUG
	int ncomps = 0;
	boolean_t marked_active = FALSE;
	int num_pages_processed = 0;
#endif
	void *chead = NULL;

	KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);

	q = cq->q;
#if DEVELOPMENT || DEBUG
	bool benchmark_accounting = false;
	/*
	 * If we're running the compressor perf test, only process the benchmark pages.
	 * We'll get back to our regular queue once the benchmark is done
	 */
	if (compressor_running_perf_test) {
		q = cq->benchmark_q;
		if (!vm_page_queue_empty(&q->pgo_pending)) {
			benchmark_accounting = true;
		} else {
			q = cq->q;
			benchmark_accounting = false;
		}
	}
#endif /* DEVELOPMENT || DEBUG */

#if __AMP__
	/* E-core-bound compressors use a larger per-thread batch */
	if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
		local_batch_size = (q->pgo_maxlaundry >> 3);
		local_batch_size = MAX(local_batch_size, 16);
	} else {
		local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
	}
#else
	local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
#endif

#if RECORD_THE_COMPRESSED_DATA
	if (q->pgo_laundry) {
		c_compressed_record_init();
	}
#endif
	while (TRUE) {
		int pages_left_on_q = 0;

		local_cnt = 0;
		local_q = NULL;

		KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);

		vm_page_lock_queues();
#if DEVELOPMENT || DEBUG
		if (marked_active == FALSE) {
			vmct_active++;
			vmct_state[cq->id] = VMCT_ACTIVE;
			marked_active = TRUE;
			if (vmct_active == 1) {
				/* first compressor thread to wake starts the epoch clock */
				vm_compressor_epoch_start = mach_absolute_time();
			}
		}
#endif
		KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);

		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);

		/* detach up to local_batch_size pages onto the private local_q list */
		while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
			vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
			assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
			VM_PAGE_CHECK(m);

			m->vmp_q_state = VM_PAGE_NOT_ON_Q;
			VM_PAGE_ZERO_PAGEQ_ENTRY(m);
			m->vmp_laundry = FALSE;

			m->vmp_snext = local_q;
			local_q = m;
			local_cnt++;
		}
		if (local_q == NULL) {
			/* nothing to do; exit still holding the page queues lock */
			break;
		}

		q->pgo_busy = TRUE;

		if ((pgo_draining = q->pgo_draining) == FALSE) {
			vm_pageout_throttle_up_batch(q, local_cnt);
			pages_left_on_q = q->pgo_laundry;
		} else {
			pages_left_on_q = q->pgo_laundry - local_cnt;
		}

		vm_page_unlock_queues();

#if !RECORD_THE_COMPRESSED_DATA
		/* plenty of work left: kick the next compressor thread to help */
		if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
			thread_wakeup((event_t) ((uintptr_t)&cq->q->pgo_pending + cq->id + 1));
		}
#endif
		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);

		while (local_q) {
			KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);

			m = local_q;
			local_q = m->vmp_snext;
			m->vmp_snext = NULL;

			/*
			 * Technically we need the pageq locks to manipulate this field.
			 * However, this page has been removed from all queues and is only
			 * known to this compressor thread dealing with this local queue.
			 *
			 * TODO LIONEL: Add a second localq that is the early localq and
			 * put special pages like this one on that queue in the block above
			 * under the pageq lock to avoid this 'works but not clean' logic.
			 */
			void *donate_queue_head;
#if XNU_TARGET_OS_OSX
			donate_queue_head = &cq->current_early_swapout_chead;
#else /* XNU_TARGET_OS_OSX */
			donate_queue_head = &cq->current_late_swapout_chead;
#endif /* XNU_TARGET_OS_OSX */
			if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
				chead = donate_queue_head;
			} else {
				chead = &cq->current_regular_swapout_chead;
			}

			if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
#if DEVELOPMENT || DEBUG
				ncomps++;
#endif
				KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);

				/* batch the freed page; flush once the batch is full */
				m->vmp_snext = local_freeq;
				local_freeq = m;
				local_freed++;

				if (local_freed >= MAX_FREE_BATCH) {
					OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

					vm_page_free_list(local_freeq, TRUE);

					local_freeq = NULL;
					local_freed = 0;
				}
			}
#if DEVELOPMENT || DEBUG
			num_pages_processed++;
#endif /* DEVELOPMENT || DEBUG */
#if !CONFIG_JETSAM
			/*
			 * free pages are critically low: first return any batched
			 * pages, then wait (as a privileged waiter) for more.
			 */
			while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
				kern_return_t wait_result;
				int need_wakeup = 0;

				if (local_freeq) {
					OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

					vm_page_free_list(local_freeq, TRUE);
					local_freeq = NULL;
					local_freed = 0;

					continue;
				}
				vm_free_page_lock_spin();

				if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
					if (vm_page_free_wanted_privileged++ == 0) {
						need_wakeup = 1;
					}
					wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);

					vm_free_page_unlock();

					if (need_wakeup) {
						/* poke the pageout daemon to replenish the free list */
						thread_wakeup((event_t)&vm_page_free_wanted);
					}

					if (wait_result == THREAD_WAITING) {
						thread_block(THREAD_CONTINUE_NULL);
					}
				} else {
					vm_free_page_unlock();
				}
			}
#endif
		}
		/* flush any remainder of the free batch */
		if (local_freeq) {
			OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

			vm_page_free_list(local_freeq, TRUE);
			local_freeq = NULL;
			local_freed = 0;
		}
		if (pgo_draining == TRUE) {
			/* deferred throttle_up for the draining case (see above) */
			vm_page_lockspin_queues();
			vm_pageout_throttle_up_batch(q, local_cnt);
			vm_page_unlock_queues();
		}
	}
	KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * queue lock is held and our q is empty
	 */
	q->pgo_busy = FALSE;
	q->pgo_idle = TRUE;

	assert_wait((event_t) ((uintptr_t)&cq->q->pgo_pending + cq->id), THREAD_UNINT);
#if DEVELOPMENT || DEBUG
	if (marked_active == TRUE) {
		vmct_active--;
		vmct_state[cq->id] = VMCT_IDLE;

		if (vmct_active == 0) {
			/* last compressor thread going idle closes the epoch */
			vm_compressor_epoch_stop = mach_absolute_time();
			assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
			    "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
			    vm_compressor_epoch_start, vm_compressor_epoch_stop);
			/* This interval includes intervals where one or more
			 * compressor threads were pre-empted
			 */
			vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
		}
	}
	if (compressor_running_perf_test && benchmark_accounting) {
		/*
		 * We could turn ON compressor_running_perf_test while still processing
		 * regular non-benchmark pages. We shouldn't count them here else we
		 * could overshoot. We might also still be populating that benchmark Q
		 * and be under pressure. So we will go back to the regular queues. And
		 * benchmark accounting will be off for that case too.
		 */
		compressor_perf_test_pages_processed += num_pages_processed;
		thread_wakeup(&compressor_perf_test_pages_processed);
	}
#endif
	vm_page_unlock_queues();
#if DEVELOPMENT || DEBUG
	if (__improbable(vm_compressor_time_thread)) {
		vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
		vmct_stats.vmct_pages[cq->id] += ncomps;
		vmct_stats.vmct_iterations[cq->id]++;
		if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
			vmct_stats.vmct_maxpages[cq->id] = ncomps;
		}
		if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
			vmct_stats.vmct_minpages[cq->id] = ncomps;
		}
	}
#endif

	KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);

	thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
	/*NOTREACHED*/
}
4346
4347
/*
 * Compress one page into the compressor pager.
 *
 * current_chead / scratch_buf are the calling compressor thread's
 * per-thread compression state.  On success the page is removed from its
 * object (caller frees it); on failure the page is reactivated.
 * Consumes the object's activity_in_progress reference either way.
 *
 * Returns KERN_SUCCESS if the page was compressed, KERN_FAILURE otherwise.
 */
kern_return_t
vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
{
	vm_object_t object;
	memory_object_t pager;
	int compressed_count_delta;
	kern_return_t retval;

	object = VM_PAGE_OBJECT(m);

	assert(!m->vmp_free_when_done);
	assert(!m->vmp_laundry);

	pager = object->pager;

	if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);

		vm_object_lock(object);

		/*
		 * If there is no memory object for the page, create
		 * one and hand it to the compression pager.
		 */

		if (!object->pager_initialized) {
			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
		}
		if (!object->pager_initialized) {
			vm_object_compressor_pager_create(object);
		}

		pager = object->pager;

		if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
			/*
			 * Still no pager for the object,
			 * or the pager has been destroyed.
			 * Reactivate the page.
			 *
			 * Should only happen if there is no
			 * compression pager
			 */
			PAGE_WAKEUP_DONE(m);

			vm_page_lockspin_queues();
			vm_page_activate(m);
			VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
			vm_page_unlock_queues();

			/*
			 * And we are done with it.
			 */
			vm_object_activity_end(object);
			vm_object_unlock(object);

			return KERN_FAILURE;
		}
		vm_object_unlock(object);

		KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
	}
	assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
	assert(object->activity_in_progress > 0);

	/* hand the page's contents to the compressor; object lock NOT held here */
	retval = vm_compressor_pager_put(
		pager,
		m->vmp_offset + object->paging_offset,
		VM_PAGE_GET_PHYS_PAGE(m),
		current_chead,
		scratch_buf,
		&compressed_count_delta);

	vm_object_lock(object);

	assert(object->activity_in_progress > 0);
	assert(VM_PAGE_OBJECT(m) == object);
	assert( !VM_PAGE_WIRED(m));

	/* fold the pager's compressed-page delta into the object's count */
	vm_compressor_pager_count(pager,
	    compressed_count_delta,
	    FALSE, /* shared_lock */
	    object);

	if (retval == KERN_SUCCESS) {
		/*
		 * If the object is purgeable, its owner's
		 * purgeable ledgers will be updated in
		 * vm_page_remove() but the page still
		 * contributes to the owner's memory footprint,
		 * so account for it as such.
		 */
		if ((object->purgable != VM_PURGABLE_DENY ||
		    object->vo_ledger_tag) &&
		    object->vo_owner != NULL) {
			/* one more compressed purgeable/tagged page */
			vm_object_owner_compressed_update(object,
			    +1);
		}
		counter_inc(&vm_statistics_compressions);

		if (m->vmp_tabled) {
			vm_page_remove(m, TRUE);
		}
	} else {
		/* compression failed: wake waiters and put the page back in circulation */
		PAGE_WAKEUP_DONE(m);

		vm_page_lockspin_queues();

		vm_page_activate(m);
		vm_pageout_vminfo.vm_compressor_failed++;

		vm_page_unlock_queues();
	}
	vm_object_activity_end(object);
	vm_object_unlock(object);

	return retval;
}
4467
4468
4469 static void
vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue * eq,boolean_t req_lowpriority)4470 vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *eq, boolean_t req_lowpriority)
4471 {
4472 uint32_t policy;
4473
4474 if (hibernate_cleaning_in_progress == TRUE) {
4475 req_lowpriority = FALSE;
4476 }
4477
4478 if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority) {
4479 vm_page_unlock_queues();
4480
4481 if (req_lowpriority == TRUE) {
4482 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4483 DTRACE_VM(laundrythrottle);
4484 } else {
4485 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4486 DTRACE_VM(laundryunthrottle);
4487 }
4488 proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid,
4489 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4490
4491 vm_page_lock_queues();
4492 eq->pgo_lowpriority = req_lowpriority;
4493 }
4494 }
4495
4496
/*
 * One-time setup for the external pageout I/O thread, then enter its
 * continuation loop.  Never returns.
 */
static void
vm_pageout_iothread_external(__unused struct cq *c, __unused wait_result_t w)
{
	thread_t self = current_thread();

	self->options |= TH_OPT_VMPRIV;

	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);

	/* start out with throttled pageout I/O */
	proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
	    TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);

	vm_page_lock_queues();

	/* publish this thread's identity/state on the external queue */
	vm_pageout_queue_external.pgo_tid = self->thread_id;
	vm_pageout_queue_external.pgo_lowpriority = TRUE;
	vm_pageout_queue_external.pgo_inited = TRUE;

	vm_page_unlock_queues();

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	vm_pageout_iothread_external_continue(&vm_pageout_queue_external, 0);

	/*NOTREACHED*/
}
4525
4526
/*
 * One-time setup for a compressor ("internal pageout") thread, then enter
 * its continuation loop.  Never returns.
 */
static void
vm_pageout_iothread_internal(struct cq *cq, __unused wait_result_t w)
{
	thread_t self = current_thread();

	self->options |= TH_OPT_VMPRIV;

	vm_page_lock_queues();

	/* publish this thread's identity/state on the internal queue */
	vm_pageout_queue_internal.pgo_tid = self->thread_id;
	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
	vm_pageout_queue_internal.pgo_inited = TRUE;

#if DEVELOPMENT || DEBUG
	/* the benchmark queue is serviced by this same thread */
	vm_pageout_queue_benchmark.pgo_tid = vm_pageout_queue_internal.pgo_tid;
	vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
	vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
	vm_pageout_queue_benchmark.pgo_idle = TRUE;
	vm_pageout_queue_benchmark.pgo_busy = FALSE;
#endif /* DEVELOPMENT || DEBUG */

	vm_page_unlock_queues();

	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
		thread_vm_bind_group_add();
	}

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
	if (vm_compressor_ebound) {
		/*
		 * Use the soft bound option for vm_compressor to allow it to run on
		 * P-cores if E-cluster is unavailable.
		 */
		thread_bind_cluster_type(self, 'E', true);
	}
#endif /* __AMP__ */

	thread_set_thread_name(current_thread(), "VM_compressor");
#if DEVELOPMENT || DEBUG
	/* start the min-pages stat at "infinity" so the first sample sets it */
	vmct_stats.vmct_minpages[cq->id] = INT32_MAX;
#endif
	vm_pageout_iothread_internal_continue(cq, 0);

	/*NOTREACHED*/
}
4576
4577 kern_return_t
vm_set_buffer_cleanup_callout(boolean_t (* func)(int))4578 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4579 {
4580 if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4581 return KERN_SUCCESS;
4582 } else {
4583 return KERN_FAILURE; /* Already set */
4584 }
4585 }
4586
extern boolean_t memorystatus_manual_testing_on;
extern unsigned int memorystatus_level;


#if VM_PRESSURE_EVENTS

/* gate for vm_pressure_response(); set once the notification machinery is up */
boolean_t vm_pressure_events_enabled = FALSE;

extern uint64_t next_warning_notification_sent_at_ts;
extern uint64_t next_critical_notification_sent_at_ts;

#define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */

/*
 * The last time there was change in pressure level OR we forced a check
 * because the system is stuck in a non-normal pressure level.
 */
uint64_t vm_pressure_last_level_transition_abs = 0;

/*
 * This is how long the system waits 'stuck' in an unchanged non-normal pressure
 * level before resending out notifications for that level again.
 */
int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4611
/*
 * Evaluate current memory pressure and, on a level transition (or when the
 * system has been stuck at a non-normal level past the threshold), update
 * memorystatus_vm_pressure_level and wake the pressure-notification thread.
 * Called from the pageout path.
 */
void
vm_pressure_response(void)
{
	vm_pressure_level_t old_level = kVMPressureNormal;
	int new_level = -1;           /* -1 means "no transition decided" */
	unsigned int total_pages;
	uint64_t available_memory = 0;
	uint64_t curr_ts, abs_time_since_level_transition, time_in_ns;
	bool force_check = false;     /* re-notify a stuck non-normal level */
	int time_in_mins;


	if (vm_pressure_events_enabled == FALSE) {
		return;
	}

#if !XNU_TARGET_OS_OSX

	available_memory = (uint64_t) memorystatus_available_pages;

#else /* !XNU_TARGET_OS_OSX */

	available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
	memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;

#endif /* !XNU_TARGET_OS_OSX */

	total_pages = (unsigned int) atop_64(max_mem);
#if CONFIG_SECLUDED_MEMORY
	total_pages -= vm_page_secluded_count;
#endif /* CONFIG_SECLUDED_MEMORY */
	/* percentage of memory still available, published for memorystatus */
	memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);

	if (memorystatus_manual_testing_on) {
		return;
	}

	curr_ts = mach_absolute_time();
	abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;

	absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
	time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
	/* stuck at the same level too long -> force a re-notification pass */
	force_check = (time_in_mins >= vm_pressure_level_transition_threshold);

	old_level = memorystatus_vm_pressure_level;

	switch (memorystatus_vm_pressure_level) {
	case kVMPressureNormal:
	{
		if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			new_level = kVMPressureCritical;
		} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
			new_level = kVMPressureWarning;
		}
		break;
	}

	case kVMPressureWarning:
	case kVMPressureUrgent:
	{
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			new_level = kVMPressureNormal;
		} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			new_level = kVMPressureCritical;
		} else if (force_check) {
			/* re-arm warning notifications for the stuck level */
			new_level = kVMPressureWarning;
			next_warning_notification_sent_at_ts = curr_ts;
		}
		break;
	}

	case kVMPressureCritical:
	{
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			new_level = kVMPressureNormal;
		} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
			new_level = kVMPressureWarning;
		} else if (force_check) {
			/* re-arm critical notifications for the stuck level */
			new_level = kVMPressureCritical;
			next_critical_notification_sent_at_ts = curr_ts;
		}
		break;
	}

	default:
		return;
	}

	if (new_level != -1 || force_check) {
		if (new_level != -1) {
			memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;

			if (new_level != (int) old_level) {
				VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
				    new_level, old_level, 0, 0);
			}
		} else {
			VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
			    new_level, old_level, force_check, 0);
		}

		if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
			/*
			 * We don't want to schedule a wakeup while hibernation is in progress
			 * because that could collide with checks for non-monotonicity in the scheduler.
			 * We do however do all the updates to memorystatus_vm_pressure_level because
			 * we _might_ want to use that for decisions regarding which pages or how
			 * many pages we want to dump in hibernation.
			 */
			return;
		}

		if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
			if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
				thread_wakeup(&vm_pressure_thread);
			}

			if (old_level != memorystatus_vm_pressure_level) {
				thread_wakeup(&vm_pageout_state.vm_pressure_changed);
			}
			vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
		}
	}
}
4737
/*
 * Function called by a kernel thread to either get the current pressure level or
 * wait until memory pressure changes from a given level.
 *
 * If *pressure_level is kVMPressureJetsam, the caller instead waits for a
 * jetsam of a foreground-band process (wait_for_pressure must be TRUE).
 * Returns KERN_SUCCESS with the (new) level in *pressure_level,
 * KERN_INVALID_ARGUMENT on bad input, or KERN_ABORTED if interrupted.
 */
kern_return_t
mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
{
#if !VM_PRESSURE_EVENTS

	return KERN_FAILURE;

#else /* VM_PRESSURE_EVENTS */

	wait_result_t wr = 0;
	vm_pressure_level_t old_level = memorystatus_vm_pressure_level;

	if (pressure_level == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (*pressure_level == kVMPressureJetsam) {
		/* jetsam monitoring only makes sense as a blocking request */
		if (!wait_for_pressure) {
			return KERN_INVALID_ARGUMENT;
		}

		lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
		wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
		    THREAD_INTERRUPTIBLE);
		if (wr == THREAD_WAITING) {
			/* count ourselves so the jetsam path knows to wake us */
			++memorystatus_jetsam_fg_band_waiters;
			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
			wr = thread_block(THREAD_CONTINUE_NULL);
		} else {
			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
		}
		if (wr != THREAD_AWAKENED) {
			return KERN_ABORTED;
		}
		*pressure_level = kVMPressureJetsam;
		return KERN_SUCCESS;
	}

	if (wait_for_pressure == TRUE) {
		/* block until the level differs from the caller's reference level */
		while (old_level == *pressure_level) {
			wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
			    THREAD_INTERRUPTIBLE);
			if (wr == THREAD_WAITING) {
				wr = thread_block(THREAD_CONTINUE_NULL);
			}
			if (wr == THREAD_INTERRUPTED) {
				return KERN_ABORTED;
			}

			if (wr == THREAD_AWAKENED) {
				old_level = memorystatus_vm_pressure_level;
			}
		}
	}

	*pressure_level = old_level;
	return KERN_SUCCESS;
#endif /* VM_PRESSURE_EVENTS */
}
4801
#if VM_PRESSURE_EVENTS
/*
 * Continuation body of the dedicated VM pressure thread: each wakeup on
 * &vm_pressure_thread re-enters this function from the top.
 */
void
vm_pressure_thread(void)
{
	/*
	 * FALSE only on the very first entry; the first pass skips event
	 * dispatch and just performs setup before parking.
	 */
	static boolean_t thread_initialized = FALSE;

	if (thread_initialized == TRUE) {
		/* Flag visible to vm_pressure_response() to avoid redundant wakeups. */
		vm_pageout_state.vm_pressure_thread_running = TRUE;
		consider_vm_pressure_events();
		vm_pageout_state.vm_pressure_thread_running = FALSE;
	}

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	thread_set_thread_name(current_thread(), "VM_pressure");
	thread_initialized = TRUE;
	/* Park until the next pressure wakeup; restart via this same continuation. */
	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
	thread_block((thread_continue_t)vm_pressure_thread);
}
#endif /* VM_PRESSURE_EVENTS */
4824
4825
4826 /*
4827 * called once per-second via "compute_averages"
4828 */
4829 void
compute_pageout_gc_throttle(__unused void * arg)4830 compute_pageout_gc_throttle(__unused void *arg)
4831 {
4832 if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4833 vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4834
4835 thread_wakeup(VM_PAGEOUT_GC_EVENT);
4836 }
4837 }
4838
4839 /*
4840 * vm_pageout_garbage_collect can also be called when the zone allocator needs
4841 * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4842 * jetsams. We need to check if the zone map size is above its jetsam limit to
4843 * decide if this was indeed the case.
4844 *
4845 * We need to do this on a different thread because of the following reasons:
4846 *
4847 * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4848 * itself causing the system to hang. We perform synchronous jetsams if we're
4849 * leaking in the VM map entries zone, so the leaking process could be doing a
4850 * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4851 * jetsam itself. We also need the vm_map lock on the process termination path,
4852 * which would now lead the dying process to deadlock against itself.
4853 *
4854 * 2. The jetsam path might need to allocate zone memory itself. We could try
4855 * using the non-blocking variant of zalloc for this path, but we can still
4856 * end up trying to do a kmem_alloc when the zone maps are almost full.
4857 */
__dead2
void
vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
{
	/* Only two entry modes: one-time init, or a GC pass (see block comment above). */
	assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);

	if (step == VM_PAGEOUT_GC_INIT) {
		/* first time being called is not about GC */
#if CONFIG_THREAD_GROUPS
		thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */
	} else if (zone_map_nearing_exhaustion()) {
		/*
		 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
		 *
		 * Bail out after calling zone_gc (which triggers the
		 * zone-map-exhaustion jetsams). If we fall through, the subsequent
		 * operations that clear out a bunch of caches might allocate zone
		 * memory themselves (for eg. vm_map operations would need VM map
		 * entries). Since the zone map is almost full at this point, we
		 * could end up with a panic. We just need to quickly jetsam a
		 * process and exit here.
		 *
		 * It could so happen that we were woken up to relieve memory
		 * pressure and the zone map also happened to be near its limit at
		 * the time, in which case we'll skip out early. But that should be
		 * ok; if memory pressure persists, the thread will simply be woken
		 * up again.
		 */
		zone_gc(ZONE_GC_JETSAM);
	} else {
		/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
		boolean_t buf_large_zfree = FALSE;
		boolean_t first_try = TRUE;

		stack_collect();

		consider_machine_collect();
		mbuf_drain(FALSE);

		do {
			if (consider_buffer_cache_collect != NULL) {
				buf_large_zfree = (*consider_buffer_cache_collect)(0);
			}
			if (first_try == TRUE || buf_large_zfree == TRUE) {
				/*
				 * zone_gc should be last, because the other operations
				 * might return memory to zones.
				 */
				zone_gc(ZONE_GC_TRIM);
			}
			first_try = FALSE;
			/* Keep trimming while the buffer cache keeps yielding and we're still short. */
		} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);

		consider_machine_adjust();
	}

	/* Park until the next wakeup; every subsequent pass is a GC pass. */
	assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);

	thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
	__builtin_unreachable();
}
4920
4921
4922 #if VM_PAGE_BUCKETS_CHECK
4923 #if VM_PAGE_FAKE_BUCKETS
4924 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4925 #endif /* VM_PAGE_FAKE_BUCKETS */
4926 #endif /* VM_PAGE_BUCKETS_CHECK */
4927
4928
4929
4930 void
vm_set_restrictions(unsigned int num_cpus)4931 vm_set_restrictions(unsigned int num_cpus)
4932 {
4933 int vm_restricted_to_single_processor = 0;
4934
4935 if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
4936 kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
4937 vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
4938 } else {
4939 assert(num_cpus > 0);
4940
4941 if (num_cpus <= 3) {
4942 /*
4943 * on systems with a limited number of CPUS, bind the
4944 * 4 major threads that can free memory and that tend to use
4945 * a fair bit of CPU under pressured conditions to a single processor.
4946 * This insures that these threads don't hog all of the available CPUs
4947 * (important for camera launch), while allowing them to run independently
4948 * w/r to locks... the 4 threads are
4949 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
4950 * vm_compressor_swap_trigger_thread (minor and major compactions),
4951 * memorystatus_thread (jetsams).
4952 *
4953 * the first time the thread is run, it is responsible for checking the
4954 * state of vm_restricted_to_single_processor, and if TRUE it calls
4955 * thread_bind_master... someday this should be replaced with a group
4956 * scheduling mechanism and KPI.
4957 */
4958 vm_pageout_state.vm_restricted_to_single_processor = TRUE;
4959 } else {
4960 vm_pageout_state.vm_restricted_to_single_processor = FALSE;
4961 }
4962 }
4963 }
4964
4965 /*
4966 * Set up vm_config based on the vm_compressor_mode.
4967 * Must run BEFORE the pageout thread starts up.
4968 */
4969 __startup_func
4970 void
vm_config_init(void)4971 vm_config_init(void)
4972 {
4973 bzero(&vm_config, sizeof(vm_config));
4974
4975 switch (vm_compressor_mode) {
4976 case VM_PAGER_DEFAULT:
4977 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
4978 OS_FALLTHROUGH;
4979
4980 case VM_PAGER_COMPRESSOR_WITH_SWAP:
4981 vm_config.compressor_is_present = TRUE;
4982 vm_config.swap_is_present = TRUE;
4983 vm_config.compressor_is_active = TRUE;
4984 vm_config.swap_is_active = TRUE;
4985 break;
4986
4987 case VM_PAGER_COMPRESSOR_NO_SWAP:
4988 vm_config.compressor_is_present = TRUE;
4989 vm_config.swap_is_present = TRUE;
4990 vm_config.compressor_is_active = TRUE;
4991 break;
4992
4993 case VM_PAGER_FREEZER_DEFAULT:
4994 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
4995 OS_FALLTHROUGH;
4996
4997 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
4998 vm_config.compressor_is_present = TRUE;
4999 vm_config.swap_is_present = TRUE;
5000 break;
5001
5002 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5003 vm_config.compressor_is_present = TRUE;
5004 vm_config.swap_is_present = TRUE;
5005 vm_config.compressor_is_active = TRUE;
5006 vm_config.freezer_swap_is_active = TRUE;
5007 break;
5008
5009 case VM_PAGER_NOT_CONFIGURED:
5010 break;
5011
5012 default:
5013 printf("unknown compressor mode - %x\n", vm_compressor_mode);
5014 break;
5015 }
5016 }
5017
/*
 * Create (but do not start) the pageout garbage-collection thread; it is
 * started later from vm_pageout() once VM initialization is far enough along.
 */
__startup_func
static void
vm_pageout_create_gc_thread(void)
{
	thread_t thread;

	if (kernel_thread_create(vm_pageout_garbage_collect,
	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
		panic("vm_pageout_garbage_collect: create failed");
	}
	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
	if (thread->reserved_stack == 0) {
		assert(thread->kernel_stack);
		/* GC must always be runnable: pin its current stack as reserved. */
		thread->reserved_stack = thread->kernel_stack;
	}

	/* thread is started in vm_pageout() */
	vm_pageout_gc_thread = thread;
}
STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5038
5039 void
vm_pageout(void)5040 vm_pageout(void)
5041 {
5042 thread_t self = current_thread();
5043 thread_t thread;
5044 kern_return_t result;
5045 spl_t s;
5046
5047 /*
5048 * Set thread privileges.
5049 */
5050 s = splsched();
5051
5052 #if CONFIG_VPS_DYNAMIC_PRIO
5053
5054 int vps_dynprio_bootarg = 0;
5055
5056 if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) {
5057 vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? TRUE : FALSE);
5058 kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled);
5059 } else {
5060 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
5061 vps_dynamic_priority_enabled = TRUE;
5062 } else {
5063 vps_dynamic_priority_enabled = FALSE;
5064 }
5065 }
5066
5067 if (vps_dynamic_priority_enabled) {
5068 sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
5069 thread_set_eager_preempt(self);
5070 } else {
5071 sched_set_kernel_thread_priority(self, BASEPRI_VM);
5072 }
5073
5074 #else /* CONFIG_VPS_DYNAMIC_PRIO */
5075
5076 vps_dynamic_priority_enabled = FALSE;
5077 sched_set_kernel_thread_priority(self, BASEPRI_VM);
5078
5079 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
5080
5081 thread_lock(self);
5082 self->options |= TH_OPT_VMPRIV;
5083 thread_unlock(self);
5084
5085 if (!self->reserved_stack) {
5086 self->reserved_stack = self->kernel_stack;
5087 }
5088
5089 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
5090 vps_dynamic_priority_enabled == FALSE) {
5091 thread_vm_bind_group_add();
5092 }
5093
5094
5095 #if CONFIG_THREAD_GROUPS
5096 thread_group_vm_add();
5097 #endif /* CONFIG_THREAD_GROUPS */
5098
5099 #if __AMP__
5100 PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
5101 if (vm_pgo_pbound) {
5102 /*
5103 * Use the soft bound option for vm pageout to allow it to run on
5104 * E-cores if P-cluster is unavailable.
5105 */
5106 thread_bind_cluster_type(self, 'P', true);
5107 }
5108 #endif /* __AMP__ */
5109
5110 PE_parse_boot_argn("vmpgo_protect_realtime",
5111 &vm_pageout_protect_realtime,
5112 sizeof(vm_pageout_protect_realtime));
5113 splx(s);
5114
5115 thread_set_thread_name(current_thread(), "VM_pageout_scan");
5116
5117 /*
5118 * Initialize some paging parameters.
5119 */
5120
5121 vm_pageout_state.vm_pressure_thread_running = FALSE;
5122 vm_pageout_state.vm_pressure_changed = FALSE;
5123 vm_pageout_state.memorystatus_purge_on_warning = 2;
5124 vm_pageout_state.memorystatus_purge_on_urgent = 5;
5125 vm_pageout_state.memorystatus_purge_on_critical = 8;
5126 vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
5127 vm_pageout_state.vm_page_speculative_percentage = 5;
5128 vm_pageout_state.vm_page_speculative_target = 0;
5129
5130 vm_pageout_state.vm_pageout_external_iothread = THREAD_NULL;
5131 vm_pageout_state.vm_pageout_internal_iothread = THREAD_NULL;
5132
5133 vm_pageout_state.vm_pageout_swap_wait = 0;
5134 vm_pageout_state.vm_pageout_idle_wait = 0;
5135 vm_pageout_state.vm_pageout_empty_wait = 0;
5136 vm_pageout_state.vm_pageout_burst_wait = 0;
5137 vm_pageout_state.vm_pageout_deadlock_wait = 0;
5138 vm_pageout_state.vm_pageout_deadlock_relief = 0;
5139 vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
5140
5141 vm_pageout_state.vm_pageout_inactive = 0;
5142 vm_pageout_state.vm_pageout_inactive_used = 0;
5143 vm_pageout_state.vm_pageout_inactive_clean = 0;
5144
5145 vm_pageout_state.vm_memory_pressure = 0;
5146 vm_pageout_state.vm_page_filecache_min = 0;
5147 #if CONFIG_JETSAM
5148 vm_pageout_state.vm_page_filecache_min_divisor = 70;
5149 vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
5150 #else
5151 vm_pageout_state.vm_page_filecache_min_divisor = 27;
5152 vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
5153 #endif
5154 vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
5155
5156 vm_pageout_state.vm_pageout_considered_page_last = 0;
5157
5158 if (vm_pageout_state.vm_pageout_swap_wait == 0) {
5159 vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
5160 }
5161
5162 if (vm_pageout_state.vm_pageout_idle_wait == 0) {
5163 vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
5164 }
5165
5166 if (vm_pageout_state.vm_pageout_burst_wait == 0) {
5167 vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
5168 }
5169
5170 if (vm_pageout_state.vm_pageout_empty_wait == 0) {
5171 vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
5172 }
5173
5174 if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
5175 vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
5176 }
5177
5178 if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
5179 vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
5180 }
5181
5182 if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
5183 vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
5184 }
5185 /*
5186 * even if we've already called vm_page_free_reserve
5187 * call it again here to insure that the targets are
5188 * accurately calculated (it uses vm_page_free_count_init)
5189 * calling it with an arg of 0 will not change the reserve
5190 * but will re-calculate free_min and free_target
5191 */
5192 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
5193 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
5194 } else {
5195 vm_page_free_reserve(0);
5196 }
5197
5198 bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
5199 bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5200
5201 vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5202 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5203 vm_pageout_queue_external.pgo_tid = -1;
5204
5205 vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5206 vm_pageout_queue_internal.pgo_tid = -1;
5207
5208 #if DEVELOPMENT || DEBUG
5209 bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
5210 vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
5211 vm_pageout_queue_internal.pgo_tid = -1;
5212 #endif /* DEVELOPMENT || DEBUG */
5213
5214
5215 /* internal pageout thread started when default pager registered first time */
5216 /* external pageout and garbage collection threads started here */
5217
5218 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
5219 BASEPRI_VM,
5220 &vm_pageout_state.vm_pageout_external_iothread);
5221 if (result != KERN_SUCCESS) {
5222 panic("vm_pageout_iothread_external: create failed");
5223 }
5224 thread_set_thread_name(vm_pageout_state.vm_pageout_external_iothread, "VM_pageout_external_iothread");
5225 thread_deallocate(vm_pageout_state.vm_pageout_external_iothread);
5226
5227 thread_mtx_lock(vm_pageout_gc_thread );
5228 thread_start(vm_pageout_gc_thread );
5229 thread_mtx_unlock(vm_pageout_gc_thread);
5230
5231 #if VM_PRESSURE_EVENTS
5232 result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5233 BASEPRI_DEFAULT,
5234 &thread);
5235
5236 if (result != KERN_SUCCESS) {
5237 panic("vm_pressure_thread: create failed");
5238 }
5239
5240 thread_deallocate(thread);
5241 #endif
5242
5243 vm_object_reaper_init();
5244
5245
5246 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5247 vm_compressor_init();
5248 }
5249
5250 #if VM_PRESSURE_EVENTS
5251 vm_pressure_events_enabled = TRUE;
5252 #endif /* VM_PRESSURE_EVENTS */
5253
5254 #if CONFIG_PHANTOM_CACHE
5255 vm_phantom_cache_init();
5256 #endif
5257 #if VM_PAGE_BUCKETS_CHECK
5258 #if VM_PAGE_FAKE_BUCKETS
5259 printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5260 (uint64_t) vm_page_fake_buckets_start,
5261 (uint64_t) vm_page_fake_buckets_end);
5262 pmap_protect(kernel_pmap,
5263 vm_page_fake_buckets_start,
5264 vm_page_fake_buckets_end,
5265 VM_PROT_READ);
5266 // *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
5267 #endif /* VM_PAGE_FAKE_BUCKETS */
5268 #endif /* VM_PAGE_BUCKETS_CHECK */
5269
5270 #if VM_OBJECT_TRACKING
5271 vm_object_tracking_init();
5272 #endif /* VM_OBJECT_TRACKING */
5273
5274 #if __arm64__
5275 // vm_tests();
5276 #endif /* __arm64__ */
5277
5278 vm_pageout_continue();
5279
5280 /*
5281 * Unreached code!
5282 *
5283 * The vm_pageout_continue() call above never returns, so the code below is never
5284 * executed. We take advantage of this to declare several DTrace VM related probe
5285 * points that our kernel doesn't have an analog for. These are probe points that
5286 * exist in Solaris and are in the DTrace documentation, so people may have written
5287 * scripts that use them. Declaring the probe points here means their scripts will
5288 * compile and execute which we want for portability of the scripts, but since this
5289 * section of code is never reached, the probe points will simply never fire. Yes,
5290 * this is basically a hack. The problem is the DTrace probe points were chosen with
5291 * Solaris specific VM events in mind, not portability to different VM implementations.
5292 */
5293
5294 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5295 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5296 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5297 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5298 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5299 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5300 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5301 /*NOTREACHED*/
5302 }
5303
5304
5305
/*
 * Start the internal (compressor) pageout iothreads.  Chooses a thread count
 * from the CPU count (overridable via boot-args), sizes the internal laundry
 * queue, carves one scratch buffer per thread out of a single permanent
 * allocation, and spins up one vm_pageout_iothread_internal per slot.
 */
kern_return_t
vm_pageout_internal_start(void)
{
	kern_return_t   result = KERN_SUCCESS;
	host_basic_info_data_t hinfo;
	vm_offset_t     buf, bufsize;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
#define BSD_HOST 1
	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);

	assert(hinfo.max_cpus > 0);

#if !XNU_TARGET_OS_OSX
	vm_pageout_state.vm_compressor_thread_count = 1;
#else /* !XNU_TARGET_OS_OSX */
	/* macOS: two compressor threads on machines with more than 4 CPUs */
	if (hinfo.max_cpus > 4) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	} else {
		vm_pageout_state.vm_compressor_thread_count = 1;
	}
#endif /* !XNU_TARGET_OS_OSX */
	/* boot-arg override of the default count */
	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
	    sizeof(vm_pageout_state.vm_compressor_thread_count));

#if __AMP__
	PE_parse_boot_argn("vmcomp_ecluster", &vm_compressor_ebound, sizeof(vm_compressor_ebound));
	if (vm_compressor_ebound) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	}
#endif
	/* clamp to [1, min(max_cpus - 1, MAX_COMPRESSOR_THREAD_COUNT)] */
	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
	}
	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
		vm_pageout_state.vm_compressor_thread_count = 1;
	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
	}

	/* scale laundry depth with thread count; boot-arg may override */
	vm_pageout_queue_internal.pgo_maxlaundry =
	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;

	PE_parse_boot_argn("vmpgoi_maxlaundry",
	    &vm_pageout_queue_internal.pgo_maxlaundry,
	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));

#if DEVELOPMENT || DEBUG
	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
#endif /* DEVELOPMENT || DEBUG */

	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;

	/* one permanent allocation, sliced into per-thread scratch buffers below */
	kmem_alloc(kernel_map, &buf,
	    bufsize * vm_pageout_state.vm_compressor_thread_count,
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_COMPRESSOR);

	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
		ciq[i].id = i;
		ciq[i].q = &vm_pageout_queue_internal;
		ciq[i].current_early_swapout_chead = NULL;
		ciq[i].current_regular_swapout_chead = NULL;
		ciq[i].current_late_swapout_chead = NULL;
		ciq[i].scratch_buf = (char *)(buf + i * bufsize);
#if DEVELOPMENT || DEBUG
		ciq[i].benchmark_q = &vm_pageout_queue_benchmark;
#endif /* DEVELOPMENT || DEBUG */

		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
		    (void *)&ciq[i], BASEPRI_VM,
		    &vm_pageout_state.vm_pageout_internal_iothread);

		if (result == KERN_SUCCESS) {
			thread_deallocate(vm_pageout_state.vm_pageout_internal_iothread);
		} else {
			/* stop on the first failure; earlier threads remain running */
			break;
		}
	}
	return result;
}
5389
5390 #if CONFIG_IOSCHED
5391 /*
5392 * To support I/O Expedite for compressed files we mark the upls with special flags.
5393 * The way decmpfs works is that we create a big upl which marks all the pages needed to
5394 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5395 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5396 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5397 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5398 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5399 * by the req upl lock (the reverse link doesnt need synch. since we never inspect this link
5400 * unless the real I/O upl is being destroyed).
5401 */
5402
5403
5404 static void
upl_set_decmp_info(upl_t upl,upl_t src_upl)5405 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5406 {
5407 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5408
5409 upl_lock(src_upl);
5410 if (src_upl->decmp_io_upl) {
5411 /*
5412 * If there is already an alive real I/O UPL, ignore this new UPL.
5413 * This case should rarely happen and even if it does, it just means
5414 * that we might issue a spurious expedite which the driver is expected
5415 * to handle.
5416 */
5417 upl_unlock(src_upl);
5418 return;
5419 }
5420 src_upl->decmp_io_upl = (void *)upl;
5421 src_upl->ref_count++;
5422
5423 upl->flags |= UPL_DECMP_REAL_IO;
5424 upl->decmp_io_upl = (void *)src_upl;
5425 upl_unlock(src_upl);
5426 }
5427 #endif /* CONFIG_IOSCHED */
5428
5429 #if UPL_DEBUG
5430 int upl_debug_enabled = 1;
5431 #else
5432 int upl_debug_enabled = 0;
5433 #endif
5434
/*
 * Allocate and initialize a upl structure of the given type/flags covering
 * `size` bytes.  Internal UPLs carry their page-info array inline; LITE UPLs
 * additionally carry a one-bit-per-page bitmap at the end of the allocation.
 */
static upl_t
upl_create(int type, int flags, upl_size_t size)
{
	upl_t upl;
	vm_size_t page_field_size = 0;  /* bytes for the UPL_LITE page bitmap */
	int upl_flags = 0;
	vm_size_t upl_size = sizeof(struct upl);

	assert(page_aligned(size));

	size = round_page_32(size);

	if (type & UPL_CREATE_LITE) {
		/* one bit per page, rounded up to a 4-byte boundary */
		page_field_size = (atop(size) + 7) >> 3;
		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;

		upl_flags |= UPL_LITE;
	}
	if (type & UPL_CREATE_INTERNAL) {
		/* inline upl_page_info array, one entry per page */
		upl_size += sizeof(struct upl_page_info) * atop(size);

		upl_flags |= UPL_INTERNAL;
	}
	// rdar://88964158
	/* BEGIN IGNORE CODESTYLE */
	__typed_allocators_ignore_push
	upl = (upl_t)kheap_alloc(KHEAP_DEFAULT, upl_size + page_field_size, Z_WAITOK | Z_ZERO);
	__typed_allocators_ignore_pop
	/* END IGNORE CODESTYLE */

	/* NOTE(review): allocation result is used unchecked — presumably
	 * Z_WAITOK cannot fail here; confirm against kheap_alloc semantics. */
	upl->flags = upl_flags | flags;
	upl->ref_count = 1;
	upl_lock_init(upl);
#if CONFIG_IOSCHED
	if (type & UPL_CREATE_IO_TRACKING) {
		/* record the creator's effective I/O policy for prioritization */
		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
	}

	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
		/* Only support expedite on internal UPLs */
		thread_t curthread = current_thread();
		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * atop(size), Z_WAITOK | Z_ZERO);
		upl->flags |= UPL_EXPEDITE_SUPPORTED;
		if (curthread->decmp_upl != NULL) {
			/* we were created on behalf of a decmpfs request UPL: link up */
			upl_set_decmp_info(upl, curthread->decmp_upl);
		}
	}
#endif
#if CONFIG_IOSCHED || UPL_DEBUG
	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
		upl->upl_creator = current_thread();
		upl->flags |= UPL_TRACKED_BY_OBJECT;
	}
#endif

#if UPL_DEBUG
	(void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
#endif /* UPL_DEBUG */

	return upl;
}
5496
/*
 * Tear down and free a upl whose last reference has been dropped (called
 * from upl_deallocate).  Recomputes the trailing allocation sizes using the
 * same arithmetic as upl_create so the kheap_free size matches.
 */
static void
upl_destroy(upl_t upl)
{
	int     page_field_size;  /* bit field in word size buf */
	int     size;

	// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);

	if (upl->ext_ref_count) {
		panic("upl(%p) ext_ref_count", upl);
	}

#if CONFIG_IOSCHED
	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
		/* unlink from the decmpfs request UPL and drop the ref it gave us */
		upl_t src_upl;
		src_upl = upl->decmp_io_upl;
		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
		upl_lock(src_upl);
		src_upl->decmp_io_upl = NULL;
		upl_unlock(src_upl);
		upl_deallocate(src_upl);
	}
#endif /* CONFIG_IOSCHED */

#if CONFIG_IOSCHED || UPL_DEBUG
	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
	    !(upl->flags & UPL_VECTOR)) {
		vm_object_t     object;

		if (upl->flags & UPL_SHADOWED) {
			object = upl->map_object->shadow;
		} else {
			object = upl->map_object;
		}

		/* remove ourselves from the object's list of tracked UPLs */
		vm_object_lock(object);
		queue_remove(&object->uplq, upl, upl_t, uplq);
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
		vm_object_unlock(object);
	}
#endif
	/*
	 * drop a reference on the map_object whether or
	 * not a pageout object is inserted
	 */
	if (upl->flags & UPL_SHADOWED) {
		vm_object_deallocate(upl->map_object);
	}

	if (upl->flags & UPL_DEVICE_MEMORY) {
		size = PAGE_SIZE;
	} else {
		size = upl_adjusted_size(upl, PAGE_MASK);
	}
	page_field_size = 0;

	if (upl->flags & UPL_LITE) {
		/* must mirror the bitmap sizing in upl_create() */
		page_field_size = ((size / PAGE_SIZE) + 7) >> 3;
		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
	}
	upl_lock_destroy(upl);
	/* poison to catch use-after-free through the vector pointer */
	upl->vector_upl = (vector_upl_t) 0xfeedbeef;

#if CONFIG_IOSCHED
	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * (size / PAGE_SIZE));
	}
#endif

	// rdar://88964158
	__typed_allocators_ignore_push
	if (upl->flags & UPL_INTERNAL) {
		kheap_free(KHEAP_DEFAULT, upl,
		    sizeof(struct upl) +
		    (sizeof(struct upl_page_info) * (size / PAGE_SIZE))
		    + page_field_size);
	} else {
		kheap_free(KHEAP_DEFAULT, upl, sizeof(struct upl) + page_field_size);
	}
	__typed_allocators_ignore_pop
}
5579
/*
 * Drop one reference on the upl; on the last reference, run the iodone
 * callout (if any) and destroy the upl.
 */
void
upl_deallocate(upl_t upl)
{
	upl_lock(upl);

	if (--upl->ref_count == 0) {
		if (vector_upl_is_valid(upl)) {
			vector_upl_deallocate(upl);
		}
		/* safe to unlock: we hold the last reference, nobody else can see it */
		upl_unlock(upl);

		if (upl->upl_iodone) {
			upl_callout_iodone(upl);
		}

		upl_destroy(upl);
	} else {
		upl_unlock(upl);
	}
}
5600
5601 #if CONFIG_IOSCHED
5602 void
upl_mark_decmp(upl_t upl)5603 upl_mark_decmp(upl_t upl)
5604 {
5605 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5606 upl->flags |= UPL_DECMP_REQ;
5607 upl->upl_creator->decmp_upl = (void *)upl;
5608 }
5609 }
5610
5611 void
upl_unmark_decmp(upl_t upl)5612 upl_unmark_decmp(upl_t upl)
5613 {
5614 if (upl && (upl->flags & UPL_DECMP_REQ)) {
5615 upl->upl_creator->decmp_upl = NULL;
5616 }
5617 }
5618
5619 #endif /* CONFIG_IOSCHED */
5620
5621 #define VM_PAGE_Q_BACKING_UP(q) \
5622 ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5623
5624 boolean_t must_throttle_writes(void);
5625
5626 boolean_t
must_throttle_writes()5627 must_throttle_writes()
5628 {
5629 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5630 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5631 return TRUE;
5632 }
5633
5634 return FALSE;
5635 }
5636
5637 int vm_page_delayed_work_ctx_needed = 0;
5638 KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5639
5640 __startup_func
5641 static void
vm_page_delayed_work_init_ctx(void)5642 vm_page_delayed_work_init_ctx(void)
5643 {
5644 uint16_t min_delayed_work_ctx_allocated = 16;
5645
5646 /*
5647 * try really hard to always keep NCPU elements around in the zone
5648 * in order for the UPL code to almost always get an element.
5649 */
5650 if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5651 min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5652 }
5653
5654 zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5655 }
5656 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5657
5658 struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)5659 vm_page_delayed_work_get_ctx(void)
5660 {
5661 struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5662
5663 dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5664
5665 if (__probable(dw_ctx)) {
5666 dw_ctx->delayed_owner = current_thread();
5667 } else {
5668 vm_page_delayed_work_ctx_needed++;
5669 }
5670 return dw_ctx ? dw_ctx->dwp : NULL;
5671 }
5672
5673 void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work * dwp)5674 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5675 {
5676 struct vm_page_delayed_work_ctx *ldw_ctx;
5677
5678 ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5679 ldw_ctx->delayed_owner = NULL;
5680
5681 zfree(dw_ctx_zone, ldw_ctx);
5682 }
5683
5684 /*
5685 * Routine: vm_object_upl_request
5686 * Purpose:
5687 * Cause the population of a portion of a vm_object.
5688 * Depending on the nature of the request, the pages
5689 * returned may be contain valid data or be uninitialized.
5690 * A page list structure, listing the physical pages
5691 * will be returned upon request.
5692 * This function is called by the file system or any other
5693 * supplier of backing store to a pager.
5694 * IMPORTANT NOTE: The caller must still respect the relationship
5695 * between the vm_object and its backing memory object. The
5696 * caller MUST NOT substitute changes in the backing file
5697 * without first doing a memory_object_lock_request on the
5698 * target range unless it is know that the pages are not
5699 * shared with another entity at the pager level.
5700 * Copy_in_to:
5701 * if a page list structure is present
5702 * return the mapped physical pages, where a
5703 * page is not present, return a non-initialized
5704 * one. If the no_sync bit is turned on, don't
5705 * call the pager unlock to synchronize with other
5706 * possible copies of the page. Leave pages busy
5707 * in the original object, if a page list structure
5708 * was specified. When a commit of the page list
5709 * pages is done, the dirty bit will be set for each one.
5710 * Copy_out_from:
5711 * If a page list structure is present, return
5712 * all mapped pages. Where a page does not exist
5713 * map a zero filled one. Leave pages busy in
5714 * the original object. If a page list structure
5715 * is not specified, this call is a no-op.
5716 *
5717 * Note: access of default pager objects has a rather interesting
5718 * twist. The caller of this routine, presumably the file system
5719 * page cache handling code, will never actually make a request
5720 * against a default pager backed object. Only the default
5721 * pager will make requests on backing store related vm_objects
5722 * In this way the default pager can maintain the relationship
5723 * between backing store files (abstract memory objects) and
5724 * the vm_objects (cache objects), they support.
5725 *
5726 */
5727
__private_extern__ kern_return_t
vm_object_upl_request(
	vm_object_t             object,
	vm_object_offset_t      offset,
	upl_size_t              size,
	upl_t                   *upl_ptr,
	upl_page_info_array_t   user_page_list,
	unsigned int            *page_list_count,
	upl_control_flags_t     cntrl_flags,
	vm_tag_t                tag)
{
	vm_page_t               dst_page = VM_PAGE_NULL;
	vm_object_offset_t      dst_offset;
	upl_size_t              xfer_size;
	unsigned int            size_in_pages;
	boolean_t               dirty;
	boolean_t               hw_dirty;
	upl_t                   upl = NULL;
	unsigned int            entry;
	vm_page_t               alias_page = NULL;
	int                     refmod_state = 0;
	wpl_array_t             lite_list = NULL;
	vm_object_t             last_copy_object;
	struct vm_page_delayed_work     dw_array;
	struct vm_page_delayed_work     *dwp, *dwp_start;
	bool                    dwp_finish_ctx = TRUE;
	int                     dw_count;
	int                     dw_limit;
	int                     io_tracking_flag = 0;
	int                     grab_options;
	int                     page_grab_count = 0;
	ppnum_t                 phys_page;
	pmap_flush_context      pmap_flush_context_storage;
	boolean_t               pmap_flushes_delayed = FALSE;
#if DEVELOPMENT || DEBUG
	task_t                  task = current_task();
#endif /* DEVELOPMENT || DEBUG */

	dwp_start = dwp = NULL;

	if (cntrl_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		return KERN_INVALID_VALUE;
	}
	if ((!object->internal) && (object->paging_offset != 0)) {
		panic("vm_object_upl_request: external object with non-zero paging offset");
	}
	if (object->phys_contiguous) {
		panic("vm_object_upl_request: contiguous object specified");
	}

	assertf(page_aligned(offset) && page_aligned(size),
	    "offset 0x%llx size 0x%x",
	    offset, size);

	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);

	dw_count = 0;
	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
	dwp_start = vm_page_delayed_work_get_ctx();
	if (dwp_start == NULL) {
		/*
		 * no delayed-work ctx available: fall back to a single
		 * on-stack entry and flush after every page
		 */
		dwp_start = &dw_array;
		dw_limit = 1;
		dwp_finish_ctx = FALSE;
	}

	dwp = dwp_start;

	/* clamp the request to the largest supported UPL */
	if (size > MAX_UPL_SIZE_BYTES) {
		size = MAX_UPL_SIZE_BYTES;
	}

	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	}

#if CONFIG_IOSCHED || UPL_DEBUG
	if (object->io_tracking || upl_debug_enabled) {
		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
	}
#endif
#if CONFIG_IOSCHED
	if (object->io_tracking) {
		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
	}
#endif

	/*
	 * internal UPLs carry the page-info array (and, for LITE, the
	 * lite bitmap) inline, immediately after struct upl
	 */
	if (cntrl_flags & UPL_SET_INTERNAL) {
		if (cntrl_flags & UPL_SET_LITE) {
			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);

			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
			lite_list = (wpl_array_t)
			    (((uintptr_t)user_page_list) +
			    ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
			if (size == 0) {
				user_page_list = NULL;
				lite_list = NULL;
			}
		} else {
			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);

			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
			if (size == 0) {
				user_page_list = NULL;
			}
		}
	} else {
		if (cntrl_flags & UPL_SET_LITE) {
			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);

			lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
			if (size == 0) {
				lite_list = NULL;
			}
		} else {
			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
		}
	}
	*upl_ptr = upl;

	if (user_page_list) {
		user_page_list[0].device = FALSE;
	}

	if (cntrl_flags & UPL_SET_LITE) {
		upl->map_object = object;
	} else {
		/* non-LITE: build a shadow object to map the pages through */
		upl->map_object = vm_object_allocate(size);
		/*
		 * No need to lock the new object: nobody else knows
		 * about it yet, so it's all ours so far.
		 */
		upl->map_object->shadow = object;
		upl->map_object->pageout = TRUE;
		upl->map_object->can_persist = FALSE;
		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
		upl->map_object->vo_shadow_offset = offset;
		upl->map_object->wimg_bits = object->wimg_bits;
		assertf(page_aligned(upl->map_object->vo_shadow_offset),
		    "object %p shadow_offset 0x%llx",
		    upl->map_object, upl->map_object->vo_shadow_offset);

		/* alias page is consumed by vm_pageclean_setup() below */
		alias_page = vm_page_grab_fictitious(TRUE);

		upl->flags |= UPL_SHADOWED;
	}
	if (cntrl_flags & UPL_FOR_PAGEOUT) {
		upl->flags |= UPL_PAGEOUT;
	}

	vm_object_lock(object);
	vm_object_activity_begin(object);

	grab_options = 0;
#if CONFIG_SECLUDED_MEMORY
	if (object->can_grab_secluded) {
		grab_options |= VM_PAGE_GRAB_SECLUDED;
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * we can lock in the paging_offset once paging_in_progress is set
	 */
	upl->u_size = size;
	upl->u_offset = offset + object->paging_offset;

#if CONFIG_IOSCHED || UPL_DEBUG
	if (object->io_tracking || upl_debug_enabled) {
		vm_object_activity_begin(object);
		queue_enter(&object->uplq, upl, upl_t, uplq);
	}
#endif
	if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
		/*
		 * Honor copy-on-write obligations
		 *
		 * The caller is gathering these pages and
		 * might modify their contents. We need to
		 * make sure that the copy object has its own
		 * private copies of these pages before we let
		 * the caller modify them.
		 */
		vm_object_update(object,
		    offset,
		    size,
		    NULL,
		    NULL,
		    FALSE,              /* should_return */
		    MEMORY_OBJECT_COPY_SYNC,
		    VM_PROT_NO_CHANGE);

		VM_PAGEOUT_DEBUG(upl_cow, 1);
		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
	}
	/*
	 * remember which copy object we synchronized with
	 */
	last_copy_object = object->copy;
	entry = 0;

	xfer_size = size;
	dst_offset = offset;
	size_in_pages = size / PAGE_SIZE;

	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
		object->scan_collisions = 0;
	}

	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
		/*
		 * writer throttling: stall briefly (shorter for SSDs) with
		 * the object unlocked before populating the UPL
		 */
		boolean_t       isSSD = FALSE;

#if !XNU_TARGET_OS_OSX
		isSSD = TRUE;
#else /* !XNU_TARGET_OS_OSX */
		vnode_pager_get_isSSD(object->pager, &isSSD);
#endif /* !XNU_TARGET_OS_OSX */
		vm_object_unlock(object);

		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);

		if (isSSD == TRUE) {
			delay(1000 * size_in_pages);
		} else {
			delay(5000 * size_in_pages);
		}
		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);

		vm_object_lock(object);
	}

	/*
	 * main loop: one iteration per page of the request.
	 * "continue" without advancing dst_offset retries the same page.
	 */
	while (xfer_size) {
		dwp->dw_mask = 0;

		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
			vm_object_unlock(object);
			alias_page = vm_page_grab_fictitious(TRUE);
			vm_object_lock(object);
		}
		if (cntrl_flags & UPL_COPYOUT_FROM) {
			/* pageout side: gather existing resident pages */
			upl->flags |= UPL_PAGE_SYNC_DONE;

			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
			    dst_page->vmp_fictitious ||
			    dst_page->vmp_absent ||
			    VMP_ERROR_GET(dst_page) ||
			    dst_page->vmp_cleaning ||
			    (VM_PAGE_WIRED(dst_page))) {
				if (user_page_list) {
					user_page_list[entry].phys_addr = 0;
				}

				goto try_next_page;
			}
			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

			/*
			 * grab this up front...
			 * a high percentage of the time we're going to
			 * need the hardware modification state a bit later
			 * anyway... so we can eliminate an extra call into
			 * the pmap layer by grabbing it here and recording it
			 */
			if (dst_page->vmp_pmapped) {
				refmod_state = pmap_get_refmod(phys_page);
			} else {
				refmod_state = 0;
			}

			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
				/*
				 * page is on inactive list and referenced...
				 * reactivate it now... this gets it out of the
				 * way of vm_pageout_scan which would have to
				 * reactivate it upon tripping over it
				 */
				dwp->dw_mask |= DW_vm_page_activate;
			}
			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
				/*
				 * we're only asking for DIRTY pages to be returned
				 */
				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
					/*
					 * if we were the page stolen by vm_pageout_scan to be
					 * cleaned (as opposed to a buddy being clustered in
					 * or this request is not being driven by a PAGEOUT cluster
					 * then we only need to check for the page being dirty or
					 * precious to decide whether to return it
					 */
					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
						goto check_busy;
					}
					goto dont_return;
				}
				/*
				 * this is a request for a PAGEOUT cluster and this page
				 * is merely along for the ride as a 'buddy'... not only
				 * does it have to be dirty to be returned, but it also
				 * can't have been referenced recently...
				 */
				if ((hibernate_cleaning_in_progress == TRUE ||
				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
					goto check_busy;
				}
dont_return:
				/*
				 * if we reach here, we're not to return
				 * the page... go on to the next one
				 */
				if (dst_page->vmp_laundry == TRUE) {
					/*
					 * if we get here, the page is not 'cleaning' (filtered out above).
					 * since it has been referenced, remove it from the laundry
					 * so we don't pay the cost of an I/O to clean a page
					 * we're just going to take back
					 */
					vm_page_lockspin_queues();

					vm_pageout_steal_laundry(dst_page, TRUE);
					vm_page_activate(dst_page);

					vm_page_unlock_queues();
				}
				if (user_page_list) {
					user_page_list[entry].phys_addr = 0;
				}

				goto try_next_page;
			}
check_busy:
			if (dst_page->vmp_busy) {
				if (cntrl_flags & UPL_NOBLOCK) {
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}
					dwp->dw_mask = 0;

					goto try_next_page;
				}
				/*
				 * someone else is playing with the
				 * page. We will have to wait, then
				 * retry this same offset.
				 */
				PAGE_SLEEP(object, dst_page, THREAD_UNINT);

				continue;
			}
			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
				vm_page_lockspin_queues();

				/* re-check under the queues lock */
				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
					/*
					 * we've buddied up a page for a clustered pageout
					 * that has already been moved to the pageout
					 * queue by pageout_scan... we need to remove
					 * it from the queue and drop the laundry count
					 * on that queue
					 */
					vm_pageout_throttle_up(dst_page);
				}
				vm_page_unlock_queues();
			}
			hw_dirty = refmod_state & VM_MEM_MODIFIED;
			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;

			if (phys_page > upl->highest_page) {
				upl->highest_page = phys_page;
			}

			assert(!pmap_is_noencrypt(phys_page));

			if (cntrl_flags & UPL_SET_LITE) {
				unsigned int    pg_num;

				/* record the page in the lite bitmap */
				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
				lite_list[pg_num >> 5] |= 1U << (pg_num & 31);

				if (hw_dirty) {
					/* batch the TLB flushes; issued after the loop */
					if (pmap_flushes_delayed == FALSE) {
						pmap_flush_context_init(&pmap_flush_context_storage);
						pmap_flushes_delayed = TRUE;
					}
					pmap_clear_refmod_options(phys_page,
					    VM_MEM_MODIFIED,
					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
					    &pmap_flush_context_storage);
				}

				/*
				 * Mark original page as cleaning
				 * in place.
				 */
				dst_page->vmp_cleaning = TRUE;
				dst_page->vmp_precious = FALSE;
			} else {
				/*
				 * use pageclean setup, it is more
				 * convenient even for the pageout
				 * cases here
				 */
				vm_object_lock(upl->map_object);
				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
				vm_object_unlock(upl->map_object);

				alias_page->vmp_absent = FALSE;
				alias_page = NULL;
			}
			if (dirty) {
				SET_PAGE_DIRTY(dst_page, FALSE);
			} else {
				dst_page->vmp_dirty = FALSE;
			}

			if (!dirty) {
				dst_page->vmp_precious = TRUE;
			}

			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
				if (!VM_PAGE_WIRED(dst_page)) {
					dst_page->vmp_free_when_done = TRUE;
				}
			}
		} else {
			/* pagein side: find or create the page */
			if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
				/*
				 * Honor copy-on-write obligations
				 *
				 * The copy object has changed since we
				 * last synchronized for copy-on-write.
				 * Another copy object might have been
				 * inserted while we released the object's
				 * lock.  Since someone could have seen the
				 * original contents of the remaining pages
				 * through that new object, we have to
				 * synchronize with it again for the remaining
				 * pages only.  The previous pages are "busy"
				 * so they can not be seen through the new
				 * mapping.  The new mapping will see our
				 * upcoming changes for those previous pages,
				 * but that's OK since they couldn't see what
				 * was there before.  It's just a race anyway
				 * and there's no guarantee of consistency or
				 * atomicity.  We just don't want new mappings
				 * to see both the *before* and *after* pages.
				 */
				if (object->copy != VM_OBJECT_NULL) {
					vm_object_update(
						object,
						dst_offset,/* current offset */
						xfer_size, /* remaining size */
						NULL,
						NULL,
						FALSE,     /* should_return */
						MEMORY_OBJECT_COPY_SYNC,
						VM_PROT_NO_CHANGE);

					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
				}
				/*
				 * remember the copy object we synced with
				 */
				last_copy_object = object->copy;
			}
			dst_page = vm_page_lookup(object, dst_offset);

			if (dst_page != VM_PAGE_NULL) {
				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
					/*
					 * skip over pages already present in the cache
					 */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}

					goto try_next_page;
				}
				if (dst_page->vmp_fictitious) {
					panic("need corner case for fictitious page");
				}

				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
					/*
					 * someone else is playing with the
					 * page. We will have to wait, then
					 * retry this same offset.
					 */
					PAGE_SLEEP(object, dst_page, THREAD_UNINT);

					continue;
				}
				if (dst_page->vmp_laundry) {
					vm_pageout_steal_laundry(dst_page, FALSE);
				}
			} else {
				if (object->private) {
					/*
					 * This is a nasty wrinkle for users
					 * of upl who encounter device or
					 * private memory however, it is
					 * unavoidable, only a fault can
					 * resolve the actual backing
					 * physical page by asking the
					 * backing device.
					 */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}

					goto try_next_page;
				}
				if (object->scan_collisions) {
					/*
					 * the pageout_scan thread is trying to steal
					 * pages from this object, but has run into our
					 * lock... grab 2 pages from the head of the object...
					 * the first is freed on behalf of pageout_scan, the
					 * 2nd is for our own use... we use vm_object_page_grab
					 * in both cases to avoid taking pages from the free
					 * list since we are under memory pressure and our
					 * lock on this object is getting in the way of
					 * relieving it
					 */
					dst_page = vm_object_page_grab(object);

					if (dst_page != VM_PAGE_NULL) {
						vm_page_release(dst_page,
						    FALSE);
					}

					dst_page = vm_object_page_grab(object);
				}
				if (dst_page == VM_PAGE_NULL) {
					/*
					 * need to allocate a page
					 */
					dst_page = vm_page_grab_options(grab_options);
					if (dst_page != VM_PAGE_NULL) {
						page_grab_count++;
					}
				}
				if (dst_page == VM_PAGE_NULL) {
					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
						/*
						 * we don't want to stall waiting for pages to come onto the free list
						 * while we're already holding absent pages in this UPL
						 * the caller will deal with the empty slots
						 */
						if (user_page_list) {
							user_page_list[entry].phys_addr = 0;
						}

						goto try_next_page;
					}
					/*
					 * no pages available... wait
					 * then try again for the same
					 * offset...
					 */
					vm_object_unlock(object);

					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

					VM_PAGE_WAIT();
					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);

					vm_object_lock(object);

					continue;
				}
				vm_page_insert(dst_page, object, dst_offset);

				dst_page->vmp_absent = TRUE;
				dst_page->vmp_busy = FALSE;

				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
					/*
					 * if UPL_RET_ONLY_ABSENT was specified,
					 * than we're definitely setting up a
					 * upl for a clustered read/pagein
					 * operation... mark the pages as clustered
					 * so upl_commit_range can put them on the
					 * speculative list
					 */
					dst_page->vmp_clustered = TRUE;

					if (!(cntrl_flags & UPL_FILE_IO)) {
						counter_inc(&vm_statistics_pageins);
					}
				}
			}
			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

			dst_page->vmp_overwriting = TRUE;

			if (dst_page->vmp_pmapped) {
				if (!(cntrl_flags & UPL_FILE_IO)) {
					/*
					 * eliminate all mappings from the
					 * original object and its progeny
					 */
					refmod_state = pmap_disconnect(phys_page);
				} else {
					refmod_state = pmap_get_refmod(phys_page);
				}
			} else {
				refmod_state = 0;
			}

			hw_dirty = refmod_state & VM_MEM_MODIFIED;
			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;

			if (cntrl_flags & UPL_SET_LITE) {
				unsigned int    pg_num;

				/* record the page in the lite bitmap */
				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
				lite_list[pg_num >> 5] |= 1U << (pg_num & 31);

				if (hw_dirty) {
					pmap_clear_modify(phys_page);
				}

				/*
				 * Mark original page as cleaning
				 * in place.
				 */
				dst_page->vmp_cleaning = TRUE;
				dst_page->vmp_precious = FALSE;
			} else {
				/*
				 * use pageclean setup, it is more
				 * convenient even for the pageout
				 * cases here
				 */
				vm_object_lock(upl->map_object);
				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
				vm_object_unlock(upl->map_object);

				alias_page->vmp_absent = FALSE;
				alias_page = NULL;
			}

			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
				upl->flags &= ~UPL_CLEAR_DIRTY;
				upl->flags |= UPL_SET_DIRTY;
				dirty = TRUE;
				/*
				 * Page belonging to a code-signed object is about to
				 * be written. Mark it tainted and disconnect it from
				 * all pmaps so processes have to fault it back in and
				 * deal with the tainted bit.
				 */
				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
					vm_page_upl_tainted++;
					if (dst_page->vmp_pmapped) {
						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
						if (refmod_state & VM_MEM_REFERENCED) {
							dst_page->vmp_reference = TRUE;
						}
					}
				}
			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
				/*
				 * clean in place for read implies
				 * that a write will be done on all
				 * the pages that are dirty before
				 * a upl commit is done. The caller
				 * is obligated to preserve the
				 * contents of all pages marked dirty
				 */
				upl->flags |= UPL_CLEAR_DIRTY;
			}
			dst_page->vmp_dirty = dirty;

			if (!dirty) {
				dst_page->vmp_precious = TRUE;
			}

			if (!VM_PAGE_WIRED(dst_page)) {
				/*
				 * deny access to the target page while
				 * it is being worked on
				 */
				dst_page->vmp_busy = TRUE;
			} else {
				dwp->dw_mask |= DW_vm_page_wire;
			}

			/*
			 * We might be about to satisfy a fault which has been
			 * requested. So no need for the "restart" bit.
			 */
			dst_page->vmp_restart = FALSE;
			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
				/*
				 * expect the page to be used
				 */
				dwp->dw_mask |= DW_set_reference;
			}
			if (cntrl_flags & UPL_PRECIOUS) {
				if (object->internal) {
					SET_PAGE_DIRTY(dst_page, FALSE);
					dst_page->vmp_precious = FALSE;
				} else {
					dst_page->vmp_precious = TRUE;
				}
			} else {
				dst_page->vmp_precious = FALSE;
			}
		}
		if (dst_page->vmp_busy) {
			upl->flags |= UPL_HAS_BUSY;
		}

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}
		assert(!pmap_is_noencrypt(phys_page));
		if (user_page_list) {
			/* publish this page's state to the caller's page list */
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
			user_page_list[entry].absent = dst_page->vmp_absent;
			user_page_list[entry].dirty = dst_page->vmp_dirty;
			user_page_list[entry].precious = dst_page->vmp_precious;
			user_page_list[entry].device = FALSE;
			user_page_list[entry].needed = FALSE;
			if (dst_page->vmp_clustered == TRUE) {
				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
			} else {
				user_page_list[entry].speculative = FALSE;
			}
			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
			user_page_list[entry].mark = FALSE;
		}
		/*
		 * if UPL_RET_ONLY_ABSENT is set, then
		 * we are working with a fresh page and we've
		 * just set the clustered flag on it to
		 * indicate that it was drug in as part of a
		 * speculative cluster... so leave it alone
		 */
		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
			/*
			 * someone is explicitly grabbing this page...
			 * update clustered and speculative state
			 *
			 */
			if (dst_page->vmp_clustered) {
				VM_PAGE_CONSUME_CLUSTERED(dst_page);
			}
		}
try_next_page:
		if (dwp->dw_mask) {
			if (dwp->dw_mask & DW_vm_page_activate) {
				counter_inc(&vm_statistics_reactivations);
			}

			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);

			/* flush the batch when the delayed-work array fills */
			if (dw_count >= dw_limit) {
				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);

				dwp = dwp_start;
				dw_count = 0;
			}
		}
		entry++;
		dst_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;
	}
	/* flush any delayed work still queued */
	if (dw_count) {
		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}

	if (alias_page != NULL) {
		VM_PAGE_FREE(alias_page);
	}
	if (pmap_flushes_delayed == TRUE) {
		/* issue the TLB flushes batched up in the LITE/pageout path */
		pmap_flush(&pmap_flush_context_storage);
	}

	if (page_list_count != NULL) {
		if (upl->flags & UPL_INTERNAL) {
			*page_list_count = 0;
		} else if (*page_list_count > entry) {
			*page_list_count = entry;
		}
	}
#if UPL_DEBUG
	upl->upl_state = 1;
#endif
	vm_object_unlock(object);

	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return KERN_SUCCESS;
}
6552
6553 /*
6554 * Routine: vm_object_super_upl_request
6555 * Purpose:
6556 * Cause the population of a portion of a vm_object
6557 * in much the same way as memory_object_upl_request.
6558 * Depending on the nature of the request, the pages
 *	returned may contain valid data or be uninitialized.
6560 * However, the region may be expanded up to the super
6561 * cluster size provided.
6562 */
6563
6564 __private_extern__ kern_return_t
vm_object_super_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_size_t super_cluster,upl_t * upl,upl_page_info_t * user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)6565 vm_object_super_upl_request(
6566 vm_object_t object,
6567 vm_object_offset_t offset,
6568 upl_size_t size,
6569 upl_size_t super_cluster,
6570 upl_t *upl,
6571 upl_page_info_t *user_page_list,
6572 unsigned int *page_list_count,
6573 upl_control_flags_t cntrl_flags,
6574 vm_tag_t tag)
6575 {
6576 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
6577 return KERN_FAILURE;
6578 }
6579
6580 assert(object->paging_in_progress);
6581 offset = offset - object->paging_offset;
6582
6583 if (super_cluster > size) {
6584 vm_object_offset_t base_offset;
6585 upl_size_t super_size;
6586 vm_object_size_t super_size_64;
6587
6588 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6589 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
6590 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6591 super_size = (upl_size_t) super_size_64;
6592 assert(super_size == super_size_64);
6593
6594 if (offset > (base_offset + super_size)) {
6595 panic("vm_object_super_upl_request: Missed target pageout"
6596 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6597 offset, base_offset, super_size, super_cluster,
6598 size, object->paging_offset);
6599 }
6600 /*
6601 * apparently there is a case where the vm requests a
6602 * page to be written out who's offset is beyond the
6603 * object size
6604 */
6605 if ((offset + size) > (base_offset + super_size)) {
6606 super_size_64 = (offset + size) - base_offset;
6607 super_size = (upl_size_t) super_size_64;
6608 assert(super_size == super_size_64);
6609 }
6610
6611 offset = base_offset;
6612 size = super_size;
6613 }
6614 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6615 }
6616
/*
 * NOTE(review): presumably a counter of UPLs created against executable
 * mappings — incremented elsewhere in this file; confirm at the use site.
 */
int cs_executable_create_upl = 0;
/* process-identification helpers provided by BSD (for logging) */
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
6620
6621 kern_return_t
vm_map_create_upl(vm_map_t map,vm_map_address_t offset,upl_size_t * upl_size,upl_t * upl,upl_page_info_array_t page_list,unsigned int * count,upl_control_flags_t * flags,vm_tag_t tag)6622 vm_map_create_upl(
6623 vm_map_t map,
6624 vm_map_address_t offset,
6625 upl_size_t *upl_size,
6626 upl_t *upl,
6627 upl_page_info_array_t page_list,
6628 unsigned int *count,
6629 upl_control_flags_t *flags,
6630 vm_tag_t tag)
6631 {
6632 vm_map_entry_t entry;
6633 upl_control_flags_t caller_flags;
6634 int force_data_sync;
6635 int sync_cow_data;
6636 vm_object_t local_object;
6637 vm_map_offset_t local_offset;
6638 vm_map_offset_t local_start;
6639 kern_return_t ret;
6640 vm_map_address_t original_offset;
6641 vm_map_size_t original_size, adjusted_size;
6642 vm_map_offset_t local_entry_start;
6643 vm_object_offset_t local_entry_offset;
6644 vm_object_offset_t offset_in_mapped_page;
6645 boolean_t release_map = FALSE;
6646
6647 start_with_map:
6648
6649 original_offset = offset;
6650 original_size = *upl_size;
6651 adjusted_size = original_size;
6652
6653 caller_flags = *flags;
6654
6655 if (caller_flags & ~UPL_VALID_FLAGS) {
6656 /*
6657 * For forward compatibility's sake,
6658 * reject any unknown flag.
6659 */
6660 ret = KERN_INVALID_VALUE;
6661 goto done;
6662 }
6663 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6664 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6665
6666 if (upl == NULL) {
6667 ret = KERN_INVALID_ARGUMENT;
6668 goto done;
6669 }
6670
6671 REDISCOVER_ENTRY:
6672 vm_map_lock_read(map);
6673
6674 if (!vm_map_lookup_entry(map, offset, &entry)) {
6675 vm_map_unlock_read(map);
6676 ret = KERN_FAILURE;
6677 goto done;
6678 }
6679
6680 local_entry_start = entry->vme_start;
6681 local_entry_offset = VME_OFFSET(entry);
6682
6683 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6684 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6685 }
6686
6687 if (entry->vme_end - original_offset < adjusted_size) {
6688 adjusted_size = entry->vme_end - original_offset;
6689 assert(adjusted_size > 0);
6690 *upl_size = (upl_size_t) adjusted_size;
6691 assert(*upl_size == adjusted_size);
6692 }
6693
6694 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6695 *flags = 0;
6696
6697 if (!entry->is_sub_map &&
6698 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6699 if (VME_OBJECT(entry)->private) {
6700 *flags = UPL_DEV_MEMORY;
6701 }
6702
6703 if (VME_OBJECT(entry)->phys_contiguous) {
6704 *flags |= UPL_PHYS_CONTIG;
6705 }
6706 }
6707 vm_map_unlock_read(map);
6708 ret = KERN_SUCCESS;
6709 goto done;
6710 }
6711
6712 offset_in_mapped_page = 0;
6713 if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6714 offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6715 *upl_size = (upl_size_t)
6716 (vm_map_round_page(original_offset + adjusted_size,
6717 VM_MAP_PAGE_MASK(map))
6718 - offset);
6719
6720 offset_in_mapped_page = original_offset - offset;
6721 assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6722
6723 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6724 }
6725
6726 if (!entry->is_sub_map) {
6727 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6728 !VME_OBJECT(entry)->phys_contiguous) {
6729 if (*upl_size > MAX_UPL_SIZE_BYTES) {
6730 *upl_size = MAX_UPL_SIZE_BYTES;
6731 }
6732 }
6733
6734 /*
6735 * Create an object if necessary.
6736 */
6737 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6738 if (vm_map_lock_read_to_write(map)) {
6739 goto REDISCOVER_ENTRY;
6740 }
6741
6742 VME_OBJECT_SET(entry,
6743 vm_object_allocate((vm_size_t)
6744 vm_object_round_page((entry->vme_end - entry->vme_start))),
6745 false, 0);
6746 VME_OFFSET_SET(entry, 0);
6747 assert(entry->use_pmap);
6748
6749 vm_map_lock_write_to_read(map);
6750 }
6751
6752 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6753 !(entry->protection & VM_PROT_WRITE)) {
6754 vm_map_unlock_read(map);
6755 ret = KERN_PROTECTION_FAILURE;
6756 goto done;
6757 }
6758 }
6759
6760 #if !XNU_TARGET_OS_OSX
6761 if (map->pmap != kernel_pmap &&
6762 (caller_flags & UPL_COPYOUT_FROM) &&
6763 (entry->protection & VM_PROT_EXECUTE) &&
6764 !(entry->protection & VM_PROT_WRITE)) {
6765 vm_offset_t kaddr;
6766 vm_size_t ksize;
6767
6768 /*
6769 * We're about to create a read-only UPL backed by
6770 * memory from an executable mapping.
6771 * Wiring the pages would result in the pages being copied
6772 * (due to the "MAP_PRIVATE" mapping) and no longer
6773 * code-signed, so no longer eligible for execution.
6774 * Instead, let's copy the data into a kernel buffer and
6775 * create the UPL from this kernel buffer.
6776 * The kernel buffer is then freed, leaving the UPL holding
6777 * the last reference on the VM object, so the memory will
6778 * be released when the UPL is committed.
6779 */
6780
6781 vm_map_unlock_read(map);
6782 entry = VM_MAP_ENTRY_NULL;
6783 /* allocate kernel buffer */
6784 ksize = round_page(*upl_size);
6785 kaddr = 0;
6786 ret = kmem_alloc(kernel_map, &kaddr, ksize,
6787 KMA_PAGEABLE | KMA_DATA, tag);
6788 if (ret == KERN_SUCCESS) {
6789 /* copyin the user data */
6790 ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6791 }
6792 if (ret == KERN_SUCCESS) {
6793 if (ksize > *upl_size) {
6794 /* zero out the extra space in kernel buffer */
6795 memset((void *)(kaddr + *upl_size),
6796 0,
6797 ksize - *upl_size);
6798 }
6799 /* create the UPL from the kernel buffer */
6800 vm_object_offset_t offset_in_object;
6801 vm_object_offset_t offset_in_object_page;
6802
6803 offset_in_object = offset - local_entry_start + local_entry_offset;
6804 offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6805 assert(offset_in_object_page < PAGE_SIZE);
6806 assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6807 *upl_size -= offset_in_object_page + offset_in_mapped_page;
6808 ret = vm_map_create_upl(kernel_map,
6809 (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6810 upl_size, upl, page_list, count, flags, tag);
6811 }
6812 if (kaddr != 0) {
6813 /* free the kernel buffer */
6814 kmem_free(kernel_map, kaddr, ksize);
6815 kaddr = 0;
6816 ksize = 0;
6817 }
6818 #if DEVELOPMENT || DEBUG
6819 DTRACE_VM4(create_upl_from_executable,
6820 vm_map_t, map,
6821 vm_map_address_t, offset,
6822 upl_size_t, *upl_size,
6823 kern_return_t, ret);
6824 #endif /* DEVELOPMENT || DEBUG */
6825 goto done;
6826 }
6827 #endif /* !XNU_TARGET_OS_OSX */
6828
6829 if (!entry->is_sub_map) {
6830 local_object = VME_OBJECT(entry);
6831 assert(local_object != VM_OBJECT_NULL);
6832 }
6833
6834 if (!entry->is_sub_map &&
6835 !entry->needs_copy &&
6836 *upl_size != 0 &&
6837 local_object->vo_size > *upl_size && /* partial UPL */
6838 entry->wired_count == 0 && /* No COW for entries that are wired */
6839 (map->pmap != kernel_pmap) && /* alias checks */
6840 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6841 ||
6842 ( /* case 2 */
6843 local_object->internal &&
6844 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6845 local_object->ref_count > 1))) {
6846 vm_prot_t prot;
6847
6848 /*
6849 * Case 1:
6850 * Set up the targeted range for copy-on-write to avoid
6851 * applying true_share/copy_delay to the entire object.
6852 *
6853 * Case 2:
6854 * This map entry covers only part of an internal
6855 * object. There could be other map entries covering
6856 * other areas of this object and some of these map
6857 * entries could be marked as "needs_copy", which
6858 * assumes that the object is COPY_SYMMETRIC.
6859 * To avoid marking this object as COPY_DELAY and
6860 * "true_share", let's shadow it and mark the new
6861 * (smaller) object as "true_share" and COPY_DELAY.
6862 */
6863
6864 if (vm_map_lock_read_to_write(map)) {
6865 goto REDISCOVER_ENTRY;
6866 }
6867 vm_map_lock_assert_exclusive(map);
6868 assert(VME_OBJECT(entry) == local_object);
6869
6870 vm_map_clip_start(map,
6871 entry,
6872 vm_map_trunc_page(offset,
6873 VM_MAP_PAGE_MASK(map)));
6874 vm_map_clip_end(map,
6875 entry,
6876 vm_map_round_page(offset + *upl_size,
6877 VM_MAP_PAGE_MASK(map)));
6878 if ((entry->vme_end - offset) < *upl_size) {
6879 *upl_size = (upl_size_t) (entry->vme_end - offset);
6880 assert(*upl_size == entry->vme_end - offset);
6881 }
6882
6883 prot = entry->protection & ~VM_PROT_WRITE;
6884 if (override_nx(map, VME_ALIAS(entry)) && prot) {
6885 prot |= VM_PROT_EXECUTE;
6886 }
6887 vm_object_pmap_protect(local_object,
6888 VME_OFFSET(entry),
6889 entry->vme_end - entry->vme_start,
6890 ((entry->is_shared ||
6891 map->mapped_in_other_pmaps)
6892 ? PMAP_NULL
6893 : map->pmap),
6894 VM_MAP_PAGE_SIZE(map),
6895 entry->vme_start,
6896 prot);
6897
6898 assert(entry->wired_count == 0);
6899
6900 /*
6901 * Lock the VM object and re-check its status: if it's mapped
6902 * in another address space, we could still be racing with
6903 * another thread holding that other VM map exclusively.
6904 */
6905 vm_object_lock(local_object);
6906 if (local_object->true_share) {
6907 /* object is already in proper state: no COW needed */
6908 assert(local_object->copy_strategy !=
6909 MEMORY_OBJECT_COPY_SYMMETRIC);
6910 } else {
6911 /* not true_share: ask for copy-on-write below */
6912 assert(local_object->copy_strategy ==
6913 MEMORY_OBJECT_COPY_SYMMETRIC);
6914 entry->needs_copy = TRUE;
6915 }
6916 vm_object_unlock(local_object);
6917
6918 vm_map_lock_write_to_read(map);
6919 }
6920
6921 if (entry->needs_copy) {
6922 /*
6923 * Honor copy-on-write for COPY_SYMMETRIC
6924 * strategy.
6925 */
6926 vm_map_t local_map;
6927 vm_object_t object;
6928 vm_object_offset_t new_offset;
6929 vm_prot_t prot;
6930 boolean_t wired;
6931 vm_map_version_t version;
6932 vm_map_t real_map;
6933 vm_prot_t fault_type;
6934
6935 local_map = map;
6936
6937 if (caller_flags & UPL_COPYOUT_FROM) {
6938 fault_type = VM_PROT_READ | VM_PROT_COPY;
6939 vm_counters.create_upl_extra_cow++;
6940 vm_counters.create_upl_extra_cow_pages +=
6941 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6942 } else {
6943 fault_type = VM_PROT_WRITE;
6944 }
6945 if (vm_map_lookup_and_lock_object(&local_map,
6946 offset, fault_type,
6947 OBJECT_LOCK_EXCLUSIVE,
6948 &version, &object,
6949 &new_offset, &prot, &wired,
6950 NULL,
6951 &real_map, NULL) != KERN_SUCCESS) {
6952 if (fault_type == VM_PROT_WRITE) {
6953 vm_counters.create_upl_lookup_failure_write++;
6954 } else {
6955 vm_counters.create_upl_lookup_failure_copy++;
6956 }
6957 vm_map_unlock_read(local_map);
6958 ret = KERN_FAILURE;
6959 goto done;
6960 }
6961 if (real_map != local_map) {
6962 vm_map_unlock(real_map);
6963 }
6964 vm_map_unlock_read(local_map);
6965
6966 vm_object_unlock(object);
6967
6968 goto REDISCOVER_ENTRY;
6969 }
6970
6971 if (entry->is_sub_map) {
6972 vm_map_t submap;
6973
6974 submap = VME_SUBMAP(entry);
6975 local_start = entry->vme_start;
6976 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6977
6978 vm_map_reference(submap);
6979 vm_map_unlock_read(map);
6980
6981 DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
6982 offset += offset_in_mapped_page;
6983 *upl_size -= offset_in_mapped_page;
6984
6985 if (release_map) {
6986 vm_map_deallocate(map);
6987 }
6988 map = submap;
6989 release_map = TRUE;
6990 offset = local_offset + (offset - local_start);
6991 goto start_with_map;
6992 }
6993
6994 if (sync_cow_data &&
6995 (VME_OBJECT(entry)->shadow ||
6996 VME_OBJECT(entry)->copy)) {
6997 local_object = VME_OBJECT(entry);
6998 local_start = entry->vme_start;
6999 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7000
7001 vm_object_reference(local_object);
7002 vm_map_unlock_read(map);
7003
7004 if (local_object->shadow && local_object->copy) {
7005 vm_object_lock_request(local_object->shadow,
7006 ((vm_object_offset_t)
7007 ((offset - local_start) +
7008 local_offset) +
7009 local_object->vo_shadow_offset),
7010 *upl_size, FALSE,
7011 MEMORY_OBJECT_DATA_SYNC,
7012 VM_PROT_NO_CHANGE);
7013 }
7014 sync_cow_data = FALSE;
7015 vm_object_deallocate(local_object);
7016
7017 goto REDISCOVER_ENTRY;
7018 }
7019 if (force_data_sync) {
7020 local_object = VME_OBJECT(entry);
7021 local_start = entry->vme_start;
7022 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7023
7024 vm_object_reference(local_object);
7025 vm_map_unlock_read(map);
7026
7027 vm_object_lock_request(local_object,
7028 ((vm_object_offset_t)
7029 ((offset - local_start) +
7030 local_offset)),
7031 (vm_object_size_t)*upl_size,
7032 FALSE,
7033 MEMORY_OBJECT_DATA_SYNC,
7034 VM_PROT_NO_CHANGE);
7035
7036 force_data_sync = FALSE;
7037 vm_object_deallocate(local_object);
7038
7039 goto REDISCOVER_ENTRY;
7040 }
7041 if (VME_OBJECT(entry)->private) {
7042 *flags = UPL_DEV_MEMORY;
7043 } else {
7044 *flags = 0;
7045 }
7046
7047 if (VME_OBJECT(entry)->phys_contiguous) {
7048 *flags |= UPL_PHYS_CONTIG;
7049 }
7050
7051 local_object = VME_OBJECT(entry);
7052 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7053 local_start = entry->vme_start;
7054
7055 /*
7056 * Wiring will copy the pages to the shadow object.
7057 * The shadow object will not be code-signed so
7058 * attempting to execute code from these copied pages
7059 * would trigger a code-signing violation.
7060 */
7061 if (entry->protection & VM_PROT_EXECUTE) {
7062 #if MACH_ASSERT
7063 printf("pid %d[%s] create_upl out of executable range from "
7064 "0x%llx to 0x%llx: side effects may include "
7065 "code-signing violations later on\n",
7066 proc_selfpid(),
7067 (get_bsdtask_info(current_task())
7068 ? proc_name_address(get_bsdtask_info(current_task()))
7069 : "?"),
7070 (uint64_t) entry->vme_start,
7071 (uint64_t) entry->vme_end);
7072 #endif /* MACH_ASSERT */
7073 DTRACE_VM2(cs_executable_create_upl,
7074 uint64_t, (uint64_t)entry->vme_start,
7075 uint64_t, (uint64_t)entry->vme_end);
7076 cs_executable_create_upl++;
7077 }
7078
7079 vm_object_lock(local_object);
7080
7081 /*
7082 * Ensure that this object is "true_share" and "copy_delay" now,
7083 * while we're still holding the VM map lock. After we unlock the map,
7084 * anything could happen to that mapping, including some copy-on-write
7085 * activity. We need to make sure that the IOPL will point at the
7086 * same memory as the mapping.
7087 */
7088 if (local_object->true_share) {
7089 assert(local_object->copy_strategy !=
7090 MEMORY_OBJECT_COPY_SYMMETRIC);
7091 } else if (local_object != kernel_object &&
7092 local_object != compressor_object &&
7093 !local_object->phys_contiguous) {
7094 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7095 if (!local_object->true_share &&
7096 vm_object_tracking_btlog) {
7097 btlog_record(vm_object_tracking_btlog, local_object,
7098 VM_OBJECT_TRACKING_OP_TRUESHARE,
7099 btref_get(__builtin_frame_address(0), 0));
7100 }
7101 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7102 local_object->true_share = TRUE;
7103 if (local_object->copy_strategy ==
7104 MEMORY_OBJECT_COPY_SYMMETRIC) {
7105 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7106 }
7107 }
7108
7109 vm_object_reference_locked(local_object);
7110 vm_object_unlock(local_object);
7111
7112 vm_map_unlock_read(map);
7113
7114 offset += offset_in_mapped_page;
7115 assert(*upl_size > offset_in_mapped_page);
7116 *upl_size -= offset_in_mapped_page;
7117
7118 ret = vm_object_iopl_request(local_object,
7119 ((vm_object_offset_t)
7120 ((offset - local_start) + local_offset)),
7121 *upl_size,
7122 upl,
7123 page_list,
7124 count,
7125 caller_flags,
7126 tag);
7127 vm_object_deallocate(local_object);
7128
7129 done:
7130 if (release_map) {
7131 vm_map_deallocate(map);
7132 }
7133
7134 return ret;
7135 }
7136
7137 /*
7138 * Internal routine to enter a UPL into a VM map.
7139 *
7140 * JMM - This should just be doable through the standard
7141 * vm_map_enter() API.
7142 */
7143 kern_return_t
vm_map_enter_upl_range(vm_map_t map,upl_t upl,vm_object_offset_t offset_to_map,upl_size_t size_to_map,vm_prot_t prot_to_map,vm_map_offset_t * dst_addr)7144 vm_map_enter_upl_range(
7145 vm_map_t map,
7146 upl_t upl,
7147 vm_object_offset_t offset_to_map,
7148 upl_size_t size_to_map,
7149 vm_prot_t prot_to_map,
7150 vm_map_offset_t *dst_addr)
7151 {
7152 vm_map_size_t size;
7153 vm_object_offset_t offset;
7154 vm_map_offset_t addr;
7155 vm_page_t m;
7156 kern_return_t kr;
7157 int isVectorUPL = 0, curr_upl = 0;
7158 upl_t vector_upl = NULL;
7159 mach_vm_offset_t vector_upl_dst_addr = 0;
7160 vm_map_t vector_upl_submap = NULL;
7161 upl_offset_t subupl_offset = 0;
7162 upl_size_t subupl_size = 0;
7163
7164 if (upl == UPL_NULL) {
7165 return KERN_INVALID_ARGUMENT;
7166 }
7167
7168 DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7169 assert(map == kernel_map);
7170
7171 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7172 int mapped = 0, valid_upls = 0;
7173 vector_upl = upl;
7174
7175 upl_lock(vector_upl);
7176 for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
7177 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7178 if (upl == NULL) {
7179 continue;
7180 }
7181 valid_upls++;
7182 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7183 mapped++;
7184 }
7185 }
7186
7187 if (mapped) {
7188 if (mapped != valid_upls) {
7189 panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7190 } else {
7191 upl_unlock(vector_upl);
7192 return KERN_FAILURE;
7193 }
7194 }
7195
7196 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7197 panic("TODO4K: vector UPL not implemented");
7198 }
7199
7200 vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7201 vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7202 VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7203 VM_KERN_MEMORY_NONE).kmr_submap;
7204 map = vector_upl_submap;
7205 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7206 curr_upl = 0;
7207 } else {
7208 upl_lock(upl);
7209 }
7210
7211 process_upl_to_enter:
7212 if (isVectorUPL) {
7213 if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
7214 *dst_addr = vector_upl_dst_addr;
7215 upl_unlock(vector_upl);
7216 return KERN_SUCCESS;
7217 }
7218 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7219 if (upl == NULL) {
7220 goto process_upl_to_enter;
7221 }
7222
7223 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7224 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7225 } else {
7226 /*
7227 * check to see if already mapped
7228 */
7229 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7230 upl_unlock(upl);
7231 return KERN_FAILURE;
7232 }
7233 }
7234
7235 if ((!(upl->flags & UPL_SHADOWED)) &&
7236 ((upl->flags & UPL_HAS_BUSY) ||
7237 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7238 vm_object_t object;
7239 vm_page_t alias_page;
7240 vm_object_offset_t new_offset;
7241 unsigned int pg_num;
7242 wpl_array_t lite_list;
7243
7244 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7245 if (upl->flags & UPL_INTERNAL) {
7246 lite_list = (wpl_array_t)
7247 ((((uintptr_t)upl) + sizeof(struct upl))
7248 + ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
7249 } else {
7250 lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
7251 }
7252 object = upl->map_object;
7253 upl->map_object = vm_object_allocate(vm_object_round_page(size));
7254
7255 vm_object_lock(upl->map_object);
7256
7257 upl->map_object->shadow = object;
7258 upl->map_object->pageout = TRUE;
7259 upl->map_object->can_persist = FALSE;
7260 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7261 upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7262 assertf(page_aligned(upl->map_object->vo_shadow_offset),
7263 "object %p shadow_offset 0x%llx",
7264 upl->map_object,
7265 (uint64_t)upl->map_object->vo_shadow_offset);
7266 upl->map_object->wimg_bits = object->wimg_bits;
7267 offset = upl->map_object->vo_shadow_offset;
7268 new_offset = 0;
7269
7270 upl->flags |= UPL_SHADOWED;
7271
7272 while (size) {
7273 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7274 assert(pg_num == new_offset / PAGE_SIZE);
7275
7276 if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
7277 alias_page = vm_page_grab_fictitious(TRUE);
7278
7279 vm_object_lock(object);
7280
7281 m = vm_page_lookup(object, offset);
7282 if (m == VM_PAGE_NULL) {
7283 panic("vm_upl_map: page missing");
7284 }
7285
7286 /*
7287 * Convert the fictitious page to a private
7288 * shadow of the real page.
7289 */
7290 assert(alias_page->vmp_fictitious);
7291 alias_page->vmp_fictitious = FALSE;
7292 alias_page->vmp_private = TRUE;
7293 alias_page->vmp_free_when_done = TRUE;
7294 /*
7295 * since m is a page in the upl it must
7296 * already be wired or BUSY, so it's
7297 * safe to assign the underlying physical
7298 * page to the alias
7299 */
7300 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7301
7302 vm_object_unlock(object);
7303
7304 vm_page_lockspin_queues();
7305 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7306 vm_page_unlock_queues();
7307
7308 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7309
7310 assert(!alias_page->vmp_wanted);
7311 alias_page->vmp_busy = FALSE;
7312 alias_page->vmp_absent = FALSE;
7313 }
7314 size -= PAGE_SIZE;
7315 offset += PAGE_SIZE_64;
7316 new_offset += PAGE_SIZE_64;
7317 }
7318 vm_object_unlock(upl->map_object);
7319 }
7320 if (upl->flags & UPL_SHADOWED) {
7321 if (isVectorUPL) {
7322 offset = 0;
7323 } else {
7324 offset = offset_to_map;
7325 }
7326 } else {
7327 offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7328 if (!isVectorUPL) {
7329 offset += offset_to_map;
7330 }
7331 }
7332
7333 if (isVectorUPL) {
7334 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7335 } else {
7336 size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7337 }
7338
7339 vm_object_reference(upl->map_object);
7340
7341 if (!isVectorUPL) {
7342 *dst_addr = 0;
7343 /*
7344 * NEED A UPL_MAP ALIAS
7345 */
7346 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7347 VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_DATA, VM_KERN_MEMORY_OSFMK,
7348 upl->map_object, offset, FALSE,
7349 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7350
7351 if (kr != KERN_SUCCESS) {
7352 vm_object_deallocate(upl->map_object);
7353 upl_unlock(upl);
7354 return kr;
7355 }
7356 } else {
7357 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7358 VM_FLAGS_FIXED, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_OSFMK,
7359 upl->map_object, offset, FALSE,
7360 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7361 if (kr) {
7362 panic("vm_map_enter failed for a Vector UPL");
7363 }
7364 }
7365 upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7366 /* this will have to be an increment rather than */
7367 /* an assignment. */
7368 vm_object_lock(upl->map_object);
7369
7370 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7371 m = vm_page_lookup(upl->map_object, offset);
7372
7373 if (m) {
7374 m->vmp_pmapped = TRUE;
7375
7376 /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
7377 * but only in kernel space. If this was on a user map,
7378 * we'd have to set the wpmapped bit. */
7379 /* m->vmp_wpmapped = TRUE; */
7380 assert(map->pmap == kernel_pmap);
7381
7382 PMAP_ENTER(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE, kr);
7383
7384 assert(kr == KERN_SUCCESS);
7385 #if KASAN
7386 kasan_notify_address(addr, PAGE_SIZE_64);
7387 #endif
7388 }
7389 offset += PAGE_SIZE_64;
7390 }
7391 vm_object_unlock(upl->map_object);
7392
7393 /*
7394 * hold a reference for the mapping
7395 */
7396 upl->ref_count++;
7397 upl->flags |= UPL_PAGE_LIST_MAPPED;
7398 upl->kaddr = (vm_offset_t) *dst_addr;
7399 assert(upl->kaddr == *dst_addr);
7400
7401 if (isVectorUPL) {
7402 goto process_upl_to_enter;
7403 }
7404
7405 if (!isVectorUPL) {
7406 vm_map_offset_t addr_adjustment;
7407
7408 addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7409 if (addr_adjustment) {
7410 assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7411 DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7412 *dst_addr += addr_adjustment;
7413 }
7414 }
7415
7416 upl_unlock(upl);
7417
7418 return KERN_SUCCESS;
7419 }
7420
7421 kern_return_t
vm_map_enter_upl(vm_map_t map,upl_t upl,vm_map_offset_t * dst_addr)7422 vm_map_enter_upl(
7423 vm_map_t map,
7424 upl_t upl,
7425 vm_map_offset_t *dst_addr)
7426 {
7427 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7428 return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7429 }
7430
7431 /*
7432 * Internal routine to remove a UPL mapping from a VM map.
7433 *
7434 * XXX - This should just be doable through a standard
7435 * vm_map_remove() operation. Otherwise, implicit clean-up
7436 * of the target map won't be able to correctly remove
7437 * these (and release the reference on the UPL). Having
7438 * to do this means we can't map these into user-space
7439 * maps yet.
7440 */
/*
 * Tear down a previously established UPL mapping (see
 * vm_map_enter_upl_range()).  For a vector UPL, every sub-UPL must be
 * mapped; the individual sub-mappings are dropped and then the whole
 * vector submap is destroyed in one shot.  Drops the mapping's
 * reference on the UPL.  Returns KERN_FAILURE if the UPL (or, for a
 * vector UPL, all of its sub-UPLs) is not currently mapped.
 * offset_to_unmap/size_to_unmap are currently unused: the whole
 * mapped range (upl->kaddr / upl->u_mapped_size) is always removed.
 */
kern_return_t
vm_map_remove_upl_range(
	vm_map_t map,
	upl_t upl,
	__unused vm_object_offset_t offset_to_unmap,
	__unused upl_size_t size_to_unmap)
{
	vm_address_t addr;
	upl_size_t size;
	int isVectorUPL = 0, curr_upl = 0;
	upl_t vector_upl = NULL;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		int unmapped = 0, valid_upls = 0;
		vector_upl = upl;
		upl_lock(vector_upl);
		/*
		 * A vector UPL is unmapped all-or-nothing: count the
		 * sub-UPLs and how many of them are NOT mapped.
		 */
		for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
			upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
			if (upl == NULL) {
				continue;
			}
			valid_upls++;
			if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
				unmapped++;
			}
		}

		if (unmapped) {
			if (unmapped != valid_upls) {
				/* a partially-mapped vector UPL should be impossible */
				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
			} else {
				/* nothing mapped: nothing to remove */
				upl_unlock(vector_upl);
				return KERN_FAILURE;
			}
		}
		curr_upl = 0;
	} else {
		upl_lock(upl);
	}

process_upl_to_remove:
	/* re-entered once per sub-UPL for a vector UPL; once for a simple UPL */
	if (isVectorUPL) {
		if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
			/*
			 * All sub-UPL bookkeeping is done: destroy the
			 * submap that held the vector UPL's mappings,
			 * which removes all sub-mappings at once.
			 */
			vm_map_t v_upl_submap;
			vm_offset_t v_upl_submap_dst_addr;
			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);

			kmem_free_guard(map, v_upl_submap_dst_addr,
			    vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
			vm_map_deallocate(v_upl_submap);
			upl_unlock(vector_upl);
			return KERN_SUCCESS;
		}

		upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
		if (upl == NULL) {
			goto process_upl_to_remove;
		}
	}

	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
		/* capture the mapped range before clearing the bookkeeping */
		addr = upl->kaddr;
		size = upl->u_mapped_size;

		assert(upl->ref_count > 1);
		upl->ref_count--; /* removing mapping ref */

		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
		upl->kaddr = (vm_offset_t) 0;
		upl->u_mapped_size = 0;

		if (isVectorUPL) {
			/*
			 * If it's a Vectored UPL, we'll be removing the entire
			 * submap anyways, so no need to remove individual UPL
			 * element mappings from within the submap
			 */
			goto process_upl_to_remove;
		}

		/* drop the UPL lock before taking the map lock in vm_map_remove() */
		upl_unlock(upl);

		vm_map_remove(map,
		    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
		    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
		return KERN_SUCCESS;
	}
	/* simple UPL that was never mapped */
	upl_unlock(upl);

	return KERN_FAILURE;
}
7536
7537 kern_return_t
vm_map_remove_upl(vm_map_t map,upl_t upl)7538 vm_map_remove_upl(
7539 vm_map_t map,
7540 upl_t upl)
7541 {
7542 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7543 return vm_map_remove_upl_range(map, upl, 0, upl_size);
7544 }
7545
7546 kern_return_t
upl_commit_range(upl_t upl,upl_offset_t offset,upl_size_t size,int flags,upl_page_info_t * page_list,mach_msg_type_number_t count,boolean_t * empty)7547 upl_commit_range(
7548 upl_t upl,
7549 upl_offset_t offset,
7550 upl_size_t size,
7551 int flags,
7552 upl_page_info_t *page_list,
7553 mach_msg_type_number_t count,
7554 boolean_t *empty)
7555 {
7556 upl_size_t xfer_size, subupl_size;
7557 vm_object_t shadow_object;
7558 vm_object_t object;
7559 vm_object_t m_object;
7560 vm_object_offset_t target_offset;
7561 upl_offset_t subupl_offset = offset;
7562 int entry;
7563 wpl_array_t lite_list;
7564 int occupied;
7565 int clear_refmod = 0;
7566 int pgpgout_count = 0;
7567 struct vm_page_delayed_work dw_array;
7568 struct vm_page_delayed_work *dwp, *dwp_start;
7569 bool dwp_finish_ctx = TRUE;
7570 int dw_count;
7571 int dw_limit;
7572 int isVectorUPL = 0;
7573 upl_t vector_upl = NULL;
7574 boolean_t should_be_throttled = FALSE;
7575
7576 vm_page_t nxt_page = VM_PAGE_NULL;
7577 int fast_path_possible = 0;
7578 int fast_path_full_commit = 0;
7579 int throttle_page = 0;
7580 int unwired_count = 0;
7581 int local_queue_count = 0;
7582 vm_page_t first_local, last_local;
7583 vm_object_offset_t obj_start, obj_end, obj_offset;
7584 kern_return_t kr = KERN_SUCCESS;
7585
7586 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7587
7588 dwp_start = dwp = NULL;
7589
7590 subupl_size = size;
7591 *empty = FALSE;
7592
7593 if (upl == UPL_NULL) {
7594 return KERN_INVALID_ARGUMENT;
7595 }
7596
7597 dw_count = 0;
7598 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7599 dwp_start = vm_page_delayed_work_get_ctx();
7600 if (dwp_start == NULL) {
7601 dwp_start = &dw_array;
7602 dw_limit = 1;
7603 dwp_finish_ctx = FALSE;
7604 }
7605
7606 dwp = dwp_start;
7607
7608 if (count == 0) {
7609 page_list = NULL;
7610 }
7611
7612 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7613 vector_upl = upl;
7614 upl_lock(vector_upl);
7615 } else {
7616 upl_lock(upl);
7617 }
7618
7619 process_upl_to_commit:
7620
7621 if (isVectorUPL) {
7622 size = subupl_size;
7623 offset = subupl_offset;
7624 if (size == 0) {
7625 upl_unlock(vector_upl);
7626 kr = KERN_SUCCESS;
7627 goto done;
7628 }
7629 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7630 if (upl == NULL) {
7631 upl_unlock(vector_upl);
7632 kr = KERN_FAILURE;
7633 goto done;
7634 }
7635 page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
7636 subupl_size -= size;
7637 subupl_offset += size;
7638 }
7639
7640 #if UPL_DEBUG
7641 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7642 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7643
7644 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7645 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7646
7647 upl->upl_commit_index++;
7648 }
7649 #endif
7650 if (upl->flags & UPL_DEVICE_MEMORY) {
7651 xfer_size = 0;
7652 } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7653 xfer_size = size;
7654 } else {
7655 if (!isVectorUPL) {
7656 upl_unlock(upl);
7657 } else {
7658 upl_unlock(vector_upl);
7659 }
7660 DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7661 kr = KERN_FAILURE;
7662 goto done;
7663 }
7664 if (upl->flags & UPL_SET_DIRTY) {
7665 flags |= UPL_COMMIT_SET_DIRTY;
7666 }
7667 if (upl->flags & UPL_CLEAR_DIRTY) {
7668 flags |= UPL_COMMIT_CLEAR_DIRTY;
7669 }
7670
7671 if (upl->flags & UPL_INTERNAL) {
7672 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
7673 + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));
7674 } else {
7675 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
7676 }
7677
7678 object = upl->map_object;
7679
7680 if (upl->flags & UPL_SHADOWED) {
7681 vm_object_lock(object);
7682 shadow_object = object->shadow;
7683 } else {
7684 shadow_object = object;
7685 }
7686 entry = offset / PAGE_SIZE;
7687 target_offset = (vm_object_offset_t)offset;
7688
7689 if (upl->flags & UPL_KERNEL_OBJECT) {
7690 vm_object_lock_shared(shadow_object);
7691 } else {
7692 vm_object_lock(shadow_object);
7693 }
7694
7695 VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7696
7697 if (upl->flags & UPL_ACCESS_BLOCKED) {
7698 assert(shadow_object->blocked_access);
7699 shadow_object->blocked_access = FALSE;
7700 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7701 }
7702
7703 if (shadow_object->code_signed) {
7704 /*
7705 * CODE SIGNING:
7706 * If the object is code-signed, do not let this UPL tell
7707 * us if the pages are valid or not. Let the pages be
7708 * validated by VM the normal way (when they get mapped or
7709 * copied).
7710 */
7711 flags &= ~UPL_COMMIT_CS_VALIDATED;
7712 }
7713 if (!page_list) {
7714 /*
7715 * No page list to get the code-signing info from !?
7716 */
7717 flags &= ~UPL_COMMIT_CS_VALIDATED;
7718 }
7719 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7720 should_be_throttled = TRUE;
7721 }
7722
7723 if ((upl->flags & UPL_IO_WIRE) &&
7724 !(flags & UPL_COMMIT_FREE_ABSENT) &&
7725 !isVectorUPL &&
7726 shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7727 shadow_object->purgable != VM_PURGABLE_EMPTY) {
7728 if (!vm_page_queue_empty(&shadow_object->memq)) {
7729 if (size == shadow_object->vo_size) {
7730 nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7731 fast_path_full_commit = 1;
7732 }
7733 fast_path_possible = 1;
7734
7735 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7736 (shadow_object->purgable == VM_PURGABLE_DENY ||
7737 shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7738 shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7739 throttle_page = 1;
7740 }
7741 }
7742 }
7743 first_local = VM_PAGE_NULL;
7744 last_local = VM_PAGE_NULL;
7745
7746 obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7747 obj_end = obj_start + xfer_size;
7748 obj_start = vm_object_trunc_page(obj_start);
7749 obj_end = vm_object_round_page(obj_end);
7750 for (obj_offset = obj_start;
7751 obj_offset < obj_end;
7752 obj_offset += PAGE_SIZE) {
7753 vm_page_t t, m;
7754
7755 dwp->dw_mask = 0;
7756 clear_refmod = 0;
7757
7758 m = VM_PAGE_NULL;
7759
7760 if (upl->flags & UPL_LITE) {
7761 unsigned int pg_num;
7762
7763 if (nxt_page != VM_PAGE_NULL) {
7764 m = nxt_page;
7765 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7766 target_offset = m->vmp_offset;
7767 }
7768 pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7769 assert(pg_num == target_offset / PAGE_SIZE);
7770
7771 if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
7772 lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
7773
7774 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7775 m = vm_page_lookup(shadow_object, obj_offset);
7776 }
7777 } else {
7778 m = NULL;
7779 }
7780 }
7781 if (upl->flags & UPL_SHADOWED) {
7782 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7783 t->vmp_free_when_done = FALSE;
7784
7785 VM_PAGE_FREE(t);
7786
7787 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7788 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7789 }
7790 }
7791 }
7792 if (m == VM_PAGE_NULL) {
7793 goto commit_next_page;
7794 }
7795
7796 m_object = VM_PAGE_OBJECT(m);
7797
7798 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7799 assert(m->vmp_busy);
7800
7801 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7802 goto commit_next_page;
7803 }
7804
7805 if (flags & UPL_COMMIT_CS_VALIDATED) {
7806 /*
7807 * CODE SIGNING:
7808 * Set the code signing bits according to
7809 * what the UPL says they should be.
7810 */
7811 m->vmp_cs_validated |= page_list[entry].cs_validated;
7812 m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7813 m->vmp_cs_nx |= page_list[entry].cs_nx;
7814 }
7815 if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7816 m->vmp_written_by_kernel = TRUE;
7817 }
7818
7819 if (upl->flags & UPL_IO_WIRE) {
7820 if (page_list) {
7821 page_list[entry].phys_addr = 0;
7822 }
7823
7824 if (flags & UPL_COMMIT_SET_DIRTY) {
7825 SET_PAGE_DIRTY(m, FALSE);
7826 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7827 m->vmp_dirty = FALSE;
7828
7829 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7830 m->vmp_cs_validated &&
7831 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7832 /*
7833 * CODE SIGNING:
7834 * This page is no longer dirty
7835 * but could have been modified,
7836 * so it will need to be
7837 * re-validated.
7838 */
7839 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7840
7841 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7842
7843 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7844 }
7845 clear_refmod |= VM_MEM_MODIFIED;
7846 }
7847 if (upl->flags & UPL_ACCESS_BLOCKED) {
7848 /*
7849 * We blocked access to the pages in this UPL.
7850 * Clear the "busy" bit and wake up any waiter
7851 * for this page.
7852 */
7853 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7854 }
7855 if (fast_path_possible) {
7856 assert(m_object->purgable != VM_PURGABLE_EMPTY);
7857 assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7858 if (m->vmp_absent) {
7859 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7860 assert(m->vmp_wire_count == 0);
7861 assert(m->vmp_busy);
7862
7863 m->vmp_absent = FALSE;
7864 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7865 } else {
7866 if (m->vmp_wire_count == 0) {
7867 panic("wire_count == 0, m = %p, obj = %p", m, shadow_object);
7868 }
7869 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7870
7871 /*
7872 * XXX FBDP need to update some other
7873 * counters here (purgeable_wired_count)
7874 * (ledgers), ...
7875 */
7876 assert(m->vmp_wire_count > 0);
7877 m->vmp_wire_count--;
7878
7879 if (m->vmp_wire_count == 0) {
7880 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7881 unwired_count++;
7882 }
7883 }
7884 if (m->vmp_wire_count == 0) {
7885 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7886
7887 if (last_local == VM_PAGE_NULL) {
7888 assert(first_local == VM_PAGE_NULL);
7889
7890 last_local = m;
7891 first_local = m;
7892 } else {
7893 assert(first_local != VM_PAGE_NULL);
7894
7895 m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7896 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7897 first_local = m;
7898 }
7899 local_queue_count++;
7900
7901 if (throttle_page) {
7902 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7903 } else {
7904 if (flags & UPL_COMMIT_INACTIVATE) {
7905 if (shadow_object->internal) {
7906 m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7907 } else {
7908 m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7909 }
7910 } else {
7911 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7912 }
7913 }
7914 }
7915 } else {
7916 if (flags & UPL_COMMIT_INACTIVATE) {
7917 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7918 clear_refmod |= VM_MEM_REFERENCED;
7919 }
7920 if (m->vmp_absent) {
7921 if (flags & UPL_COMMIT_FREE_ABSENT) {
7922 dwp->dw_mask |= DW_vm_page_free;
7923 } else {
7924 m->vmp_absent = FALSE;
7925 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7926
7927 if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7928 dwp->dw_mask |= DW_vm_page_activate;
7929 }
7930 }
7931 } else {
7932 dwp->dw_mask |= DW_vm_page_unwire;
7933 }
7934 }
7935 goto commit_next_page;
7936 }
7937 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7938
7939 if (page_list) {
7940 page_list[entry].phys_addr = 0;
7941 }
7942
7943 /*
7944 * make sure to clear the hardware
7945 * modify or reference bits before
7946 * releasing the BUSY bit on this page
7947 * otherwise we risk losing a legitimate
7948 * change of state
7949 */
7950 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7951 m->vmp_dirty = FALSE;
7952
7953 clear_refmod |= VM_MEM_MODIFIED;
7954 }
7955 if (m->vmp_laundry) {
7956 dwp->dw_mask |= DW_vm_pageout_throttle_up;
7957 }
7958
7959 if (VM_PAGE_WIRED(m)) {
7960 m->vmp_free_when_done = FALSE;
7961 }
7962
7963 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7964 m->vmp_cs_validated &&
7965 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7966 /*
7967 * CODE SIGNING:
7968 * This page is no longer dirty
7969 * but could have been modified,
7970 * so it will need to be
7971 * re-validated.
7972 */
7973 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7974
7975 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7976
7977 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7978 }
7979 if (m->vmp_overwriting) {
7980 /*
7981 * the (COPY_OUT_FROM == FALSE) request_page_list case
7982 */
7983 if (m->vmp_busy) {
7984 #if CONFIG_PHANTOM_CACHE
7985 if (m->vmp_absent && !m_object->internal) {
7986 dwp->dw_mask |= DW_vm_phantom_cache_update;
7987 }
7988 #endif
7989 m->vmp_absent = FALSE;
7990
7991 dwp->dw_mask |= DW_clear_busy;
7992 } else {
7993 /*
7994 * alternate (COPY_OUT_FROM == FALSE) page_list case
7995 * Occurs when the original page was wired
7996 * at the time of the list request
7997 */
7998 assert(VM_PAGE_WIRED(m));
7999
8000 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
8001 }
8002 m->vmp_overwriting = FALSE;
8003 }
8004 m->vmp_cleaning = FALSE;
8005
8006 if (m->vmp_free_when_done) {
8007 /*
8008 * With the clean queue enabled, UPL_PAGEOUT should
8009 * no longer set the pageout bit. Its pages now go
8010 * to the clean queue.
8011 *
8012 * We don't use the cleaned Q anymore and so this
8013 * assert isn't correct. The code for the clean Q
8014 * still exists and might be used in the future. If we
8015 * go back to the cleaned Q, we will re-enable this
8016 * assert.
8017 *
8018 * assert(!(upl->flags & UPL_PAGEOUT));
8019 */
8020 assert(!m_object->internal);
8021
8022 m->vmp_free_when_done = FALSE;
8023
8024 if ((flags & UPL_COMMIT_SET_DIRTY) ||
8025 (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
8026 /*
8027 * page was re-dirtied after we started
8028 * the pageout... reactivate it since
8029 * we don't know whether the on-disk
8030 * copy matches what is now in memory
8031 */
8032 SET_PAGE_DIRTY(m, FALSE);
8033
8034 dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
8035
8036 if (upl->flags & UPL_PAGEOUT) {
8037 counter_inc(&vm_statistics_reactivations);
8038 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
8039 }
8040 } else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) {
8041 /*
8042 * Someone else might still be handling this
8043 * page (vm_fault() for example), so let's not
8044 * free it or "un-busy" it!
8045 * Put that page in the "speculative" queue
8046 * for now (since we would otherwise have freed
8047 * it) and let whoever is keeping the page
8048 * "busy" move it if needed when they're done
8049 * with it.
8050 */
8051 dwp->dw_mask |= DW_vm_page_speculate;
8052 } else {
8053 /*
8054 * page has been successfully cleaned
8055 * go ahead and free it for other use
8056 */
8057 if (m_object->internal) {
8058 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
8059 } else {
8060 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
8061 }
8062 m->vmp_dirty = FALSE;
8063 if (!(upl->flags & UPL_HAS_BUSY)) {
8064 assert(!m->vmp_busy);
8065 }
8066 m->vmp_busy = TRUE;
8067
8068 dwp->dw_mask |= DW_vm_page_free;
8069 }
8070 goto commit_next_page;
8071 }
8072 /*
8073 * It is a part of the semantic of COPYOUT_FROM
8074 * UPLs that a commit implies cache sync
8075 * between the vm page and the backing store
8076 * this can be used to strip the precious bit
8077 * as well as clean
8078 */
8079 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
8080 m->vmp_precious = FALSE;
8081 }
8082
8083 if (flags & UPL_COMMIT_SET_DIRTY) {
8084 SET_PAGE_DIRTY(m, FALSE);
8085 } else {
8086 m->vmp_dirty = FALSE;
8087 }
8088
8089 /* with the clean queue on, move *all* cleaned pages to the clean queue */
8090 if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
8091 pgpgout_count++;
8092
8093 counter_inc(&vm_statistics_pageouts);
8094 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
8095
8096 dwp->dw_mask |= DW_enqueue_cleaned;
8097 } else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
8098 /*
8099 * page coming back in from being 'frozen'...
8100 * it was dirty before it was frozen, so keep it so
8101 * the vm_page_activate will notice that it really belongs
8102 * on the throttle queue and put it there
8103 */
8104 SET_PAGE_DIRTY(m, FALSE);
8105 dwp->dw_mask |= DW_vm_page_activate;
8106 } else {
8107 if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
8108 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8109 clear_refmod |= VM_MEM_REFERENCED;
8110 } else if (!VM_PAGE_PAGEABLE(m)) {
8111 if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
8112 dwp->dw_mask |= DW_vm_page_speculate;
8113 } else if (m->vmp_reference) {
8114 dwp->dw_mask |= DW_vm_page_activate;
8115 } else {
8116 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8117 clear_refmod |= VM_MEM_REFERENCED;
8118 }
8119 }
8120 }
8121 if (upl->flags & UPL_ACCESS_BLOCKED) {
			/*
			 * We blocked access to the pages in this UPL.
			 * Clear the "busy" bit on this page before we
			 * wake up any waiter.
			 */
8127 dwp->dw_mask |= DW_clear_busy;
8128 }
8129 /*
8130 * Wakeup any thread waiting for the page to be un-cleaning.
8131 */
8132 dwp->dw_mask |= DW_PAGE_WAKEUP;
8133
8134 commit_next_page:
8135 if (clear_refmod) {
8136 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
8137 }
8138
8139 target_offset += PAGE_SIZE_64;
8140 xfer_size -= PAGE_SIZE;
8141 entry++;
8142
8143 if (dwp->dw_mask) {
8144 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8145 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8146
8147 if (dw_count >= dw_limit) {
8148 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8149
8150 dwp = dwp_start;
8151 dw_count = 0;
8152 }
8153 } else {
8154 if (dwp->dw_mask & DW_clear_busy) {
8155 m->vmp_busy = FALSE;
8156 }
8157
8158 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8159 PAGE_WAKEUP(m);
8160 }
8161 }
8162 }
8163 }
8164 if (dw_count) {
8165 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8166 dwp = dwp_start;
8167 dw_count = 0;
8168 }
8169
8170 if (fast_path_possible) {
8171 assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
8172 assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
8173
8174 if (local_queue_count || unwired_count) {
8175 if (local_queue_count) {
8176 vm_page_t first_target;
8177 vm_page_queue_head_t *target_queue;
8178
8179 if (throttle_page) {
8180 target_queue = &vm_page_queue_throttled;
8181 } else {
8182 if (flags & UPL_COMMIT_INACTIVATE) {
8183 if (shadow_object->internal) {
8184 target_queue = &vm_page_queue_anonymous;
8185 } else {
8186 target_queue = &vm_page_queue_inactive;
8187 }
8188 } else {
8189 target_queue = &vm_page_queue_active;
8190 }
8191 }
8192 /*
8193 * Transfer the entire local queue to a regular LRU page queues.
8194 */
8195 vm_page_lockspin_queues();
8196
8197 first_target = (vm_page_t) vm_page_queue_first(target_queue);
8198
8199 if (vm_page_queue_empty(target_queue)) {
8200 target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8201 } else {
8202 first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8203 }
8204
8205 target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
8206 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
8207 last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
8208
8209 /*
8210 * Adjust the global page counts.
8211 */
8212 if (throttle_page) {
8213 vm_page_throttled_count += local_queue_count;
8214 } else {
8215 if (flags & UPL_COMMIT_INACTIVATE) {
8216 if (shadow_object->internal) {
8217 vm_page_anonymous_count += local_queue_count;
8218 }
8219 vm_page_inactive_count += local_queue_count;
8220
8221 token_new_pagecount += local_queue_count;
8222 } else {
8223 vm_page_active_count += local_queue_count;
8224 }
8225
8226 if (shadow_object->internal) {
8227 vm_page_pageable_internal_count += local_queue_count;
8228 } else {
8229 vm_page_pageable_external_count += local_queue_count;
8230 }
8231 }
8232 } else {
8233 vm_page_lockspin_queues();
8234 }
8235 if (unwired_count) {
8236 vm_page_wire_count -= unwired_count;
8237 VM_CHECK_MEMORYSTATUS;
8238 }
8239 vm_page_unlock_queues();
8240
8241 VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
8242 }
8243 }
8244 occupied = 1;
8245
8246 if (upl->flags & UPL_DEVICE_MEMORY) {
8247 occupied = 0;
8248 } else if (upl->flags & UPL_LITE) {
8249 int pg_num;
8250 int i;
8251
8252 occupied = 0;
8253
8254 if (!fast_path_full_commit) {
8255 pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8256 pg_num = (pg_num + 31) >> 5;
8257
8258 for (i = 0; i < pg_num; i++) {
8259 if (lite_list[i] != 0) {
8260 occupied = 1;
8261 break;
8262 }
8263 }
8264 }
8265 } else {
8266 if (vm_page_queue_empty(&upl->map_object->memq)) {
8267 occupied = 0;
8268 }
8269 }
8270 if (occupied == 0) {
8271 /*
8272 * If this UPL element belongs to a Vector UPL and is
8273 * empty, then this is the right function to deallocate
8274 * it. So go ahead set the *empty variable. The flag
8275 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8276 * should be considered relevant for the Vector UPL and not
8277 * the internal UPLs.
8278 */
8279 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8280 *empty = TRUE;
8281 }
8282
8283 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8284 /*
8285 * this is not a paging object
8286 * so we need to drop the paging reference
8287 * that was taken when we created the UPL
8288 * against this object
8289 */
8290 vm_object_activity_end(shadow_object);
8291 vm_object_collapse(shadow_object, 0, TRUE);
8292 } else {
			/*
			 * we donated the paging reference to
			 * the map object... vm_pageout_object_terminate
			 * will drop this reference
			 */
8298 }
8299 }
8300 VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
8301 vm_object_unlock(shadow_object);
8302 if (object != shadow_object) {
8303 vm_object_unlock(object);
8304 }
8305
8306 if (!isVectorUPL) {
8307 upl_unlock(upl);
8308 } else {
8309 /*
8310 * If we completed our operations on an UPL that is
8311 * part of a Vectored UPL and if empty is TRUE, then
8312 * we should go ahead and deallocate this UPL element.
8313 * Then we check if this was the last of the UPL elements
8314 * within that Vectored UPL. If so, set empty to TRUE
8315 * so that in ubc_upl_commit_range or ubc_upl_commit, we
8316 * can go ahead and deallocate the Vector UPL too.
8317 */
8318 if (*empty == TRUE) {
8319 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
8320 upl_deallocate(upl);
8321 }
8322 goto process_upl_to_commit;
8323 }
8324 if (pgpgout_count) {
8325 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
8326 }
8327
8328 kr = KERN_SUCCESS;
8329 done:
8330 if (dwp_start && dwp_finish_ctx) {
8331 vm_page_delayed_work_finish_ctx(dwp_start);
8332 dwp_start = dwp = NULL;
8333 }
8334
8335 return kr;
8336 }
8337
/*
 * Abort (back out) the pages covered by [offset, offset+size) of a UPL,
 * applying the UPL_ABORT_* disposition bits in "error" to each page.
 * On return, *empty is TRUE when the UPL no longer holds any pages, so
 * the caller may deallocate it.  Handles both plain and vectored UPLs;
 * for a vectored UPL each sub-UPL is processed (and freed when emptied)
 * in turn.
 */
kern_return_t
upl_abort_range(
	upl_t                   upl,
	upl_offset_t            offset,
	upl_size_t              size,
	int                     error,
	boolean_t               *empty)
{
	upl_page_info_t         *user_page_list = NULL;
	upl_size_t              xfer_size, subupl_size;
	vm_object_t             shadow_object;
	vm_object_t             object;
	vm_object_offset_t      target_offset;
	upl_offset_t            subupl_offset = offset;
	int                     entry;
	wpl_array_t             lite_list;
	int                     occupied;
	struct vm_page_delayed_work     dw_array;
	struct vm_page_delayed_work     *dwp, *dwp_start;
	bool                    dwp_finish_ctx = TRUE;
	int                     dw_count;
	int                     dw_limit;
	int                     isVectorUPL = 0;
	upl_t                   vector_upl = NULL;
	vm_object_offset_t      obj_start, obj_end, obj_offset;
	kern_return_t           kr = KERN_SUCCESS;

	// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);

	dwp_start = dwp = NULL;

	subupl_size = size;
	*empty = FALSE;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * An I/O-wired UPL with no request to dump its pages is handled as
	 * a commit, freeing any still-absent pages along the way.
	 */
	if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
		return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
	}

	/*
	 * Batch page-queue operations through a delayed-work context to
	 * limit page-queue lock hold times; fall back to a single on-stack
	 * entry (dw_limit = 1) when no context can be obtained.
	 */
	dw_count = 0;
	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
	dwp_start = vm_page_delayed_work_get_ctx();
	if (dwp_start == NULL) {
		dwp_start = &dw_array;
		dw_limit = 1;
		dwp_finish_ctx = FALSE;
	}

	dwp = dwp_start;

	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		vector_upl = upl;
		upl_lock(vector_upl);
	} else {
		upl_lock(upl);
	}

process_upl_to_abort:
	/*
	 * For a vectored UPL, peel off one sub-UPL per pass until the
	 * requested range is exhausted (size == 0).
	 */
	if (isVectorUPL) {
		size = subupl_size;
		offset = subupl_offset;
		if (size == 0) {
			upl_unlock(vector_upl);
			kr = KERN_SUCCESS;
			goto done;
		}
		upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
		if (upl == NULL) {
			upl_unlock(vector_upl);
			kr = KERN_FAILURE;
			goto done;
		}
		subupl_size -= size;
		subupl_offset += size;
	}

	*empty = FALSE;

#if UPL_DEBUG
	/* record a backtrace of this abort for UPL debugging */
	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
		(void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);

		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
		upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;

		upl->upl_commit_index++;
	}
#endif
	if (upl->flags & UPL_DEVICE_MEMORY) {
		/* device memory has no pages to walk */
		xfer_size = 0;
	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
		xfer_size = size;
	} else {
		/* requested range extends past the end of the UPL: fail */
		if (!isVectorUPL) {
			upl_unlock(upl);
		} else {
			upl_unlock(vector_upl);
		}
		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
		kr = KERN_FAILURE;
		goto done;
	}
	/*
	 * Locate the "lite" bitmap (and, for internal UPLs, the page info
	 * array) that trail the upl structure in memory.
	 */
	if (upl->flags & UPL_INTERNAL) {
		lite_list = (wpl_array_t)
		    ((((uintptr_t)upl) + sizeof(struct upl))
		    + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));

		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
	} else {
		lite_list = (wpl_array_t)
		    (((uintptr_t)upl) + sizeof(struct upl));
	}
	object = upl->map_object;

	if (upl->flags & UPL_SHADOWED) {
		vm_object_lock(object);
		shadow_object = object->shadow;
	} else {
		shadow_object = object;
	}

	entry = offset / PAGE_SIZE;
	target_offset = (vm_object_offset_t)offset;

	/* kernel-object UPLs never modify the object, a shared lock suffices */
	if (upl->flags & UPL_KERNEL_OBJECT) {
		vm_object_lock_shared(shadow_object);
	} else {
		vm_object_lock(shadow_object);
	}

	if (upl->flags & UPL_ACCESS_BLOCKED) {
		assert(shadow_object->blocked_access);
		shadow_object->blocked_access = FALSE;
		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
	}

	if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
		panic("upl_abort_range: kernel_object being DUMPED");
	}

	/*
	 * Convert the UPL-relative range into page-aligned offsets within
	 * the shadow object, then visit each page in the range.
	 */
	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
	obj_end = obj_start + xfer_size;
	obj_start = vm_object_trunc_page(obj_start);
	obj_end = vm_object_round_page(obj_end);
	for (obj_offset = obj_start;
	    obj_offset < obj_end;
	    obj_offset += PAGE_SIZE) {
		vm_page_t       t, m;
		unsigned int    pg_num;
		boolean_t       needed;

		pg_num = (unsigned int) (target_offset / PAGE_SIZE);
		assert(pg_num == target_offset / PAGE_SIZE);

		needed = FALSE;

		if (user_page_list) {
			needed = user_page_list[pg_num].needed;
		}

		dwp->dw_mask = 0;
		m = VM_PAGE_NULL;

		if (upl->flags & UPL_LITE) {
			/* clear this page's bit in the lite bitmap, if set */
			if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
				lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));

				if (!(upl->flags & UPL_KERNEL_OBJECT)) {
					m = vm_page_lookup(shadow_object, obj_offset);
				}
			}
		}
		if (upl->flags & UPL_SHADOWED) {
			/* free the shadow page; operate on the backing page instead */
			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
				t->vmp_free_when_done = FALSE;

				VM_PAGE_FREE(t);

				if (m == VM_PAGE_NULL) {
					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
				}
			}
		}
		if ((upl->flags & UPL_KERNEL_OBJECT)) {
			goto abort_next_page;
		}

		if (m != VM_PAGE_NULL) {
			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);

			if (m->vmp_absent) {
				boolean_t must_free = TRUE;

				/*
				 * COPYOUT = FALSE case
				 * check for error conditions which must
				 * be passed back to the pages customer
				 */
				if (error & UPL_ABORT_RESTART) {
					m->vmp_restart = TRUE;
					m->vmp_absent = FALSE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				} else if (error & UPL_ABORT_UNAVAILABLE) {
					m->vmp_restart = FALSE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				} else if (error & UPL_ABORT_ERROR) {
					m->vmp_restart = FALSE;
					m->vmp_absent = FALSE;
					m->vmp_error = TRUE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				}
				if (m->vmp_clustered && needed == FALSE) {
					/*
					 * This page was a part of a speculative
					 * read-ahead initiated by the kernel
					 * itself.  No one is expecting this
					 * page and no one will clean up its
					 * error state if it ever becomes valid
					 * in the future.
					 * We have to free it here.
					 */
					must_free = TRUE;
				}
				m->vmp_cleaning = FALSE;

				if (m->vmp_overwriting && !m->vmp_busy) {
					/*
					 * this shouldn't happen since
					 * this is an 'absent' page, but
					 * it doesn't hurt to check for
					 * the 'alternate' method of
					 * stabilizing the page...
					 * we will mark 'busy' to be cleared
					 * in the following code which will
					 * take care of the primary stabilization
					 * method (i.e. setting 'busy' to TRUE)
					 */
					dwp->dw_mask |= DW_vm_page_unwire;
				}
				m->vmp_overwriting = FALSE;

				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);

				if (must_free == TRUE) {
					dwp->dw_mask |= DW_vm_page_free;
				} else {
					dwp->dw_mask |= DW_vm_page_activate;
				}
			} else {
				/*
				 * Handle the trusted pager throttle.
				 */
				if (m->vmp_laundry) {
					dwp->dw_mask |= DW_vm_pageout_throttle_up;
				}

				if (upl->flags & UPL_ACCESS_BLOCKED) {
					/*
					 * We blocked access to the pages in this UPL.
					 * Clear the "busy" bit and wake up any waiter
					 * for this page.
					 */
					dwp->dw_mask |= DW_clear_busy;
				}
				if (m->vmp_overwriting) {
					if (m->vmp_busy) {
						dwp->dw_mask |= DW_clear_busy;
					} else {
						/*
						 * deal with the 'alternate' method
						 * of stabilizing the page...
						 * we will either free the page
						 * or mark 'busy' to be cleared
						 * in the following code which will
						 * take care of the primary stabilization
						 * method (i.e. setting 'busy' to TRUE)
						 */
						dwp->dw_mask |= DW_vm_page_unwire;
					}
					m->vmp_overwriting = FALSE;
				}
				m->vmp_free_when_done = FALSE;
				m->vmp_cleaning = FALSE;

				if (error & UPL_ABORT_DUMP_PAGES) {
					/* caller wants the page discarded outright */
					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

					dwp->dw_mask |= DW_vm_page_free;
				} else {
					if (!(dwp->dw_mask & DW_vm_page_unwire)) {
						if (error & UPL_ABORT_REFERENCE) {
							/*
							 * we've been told to explicitly
							 * reference this page... for
							 * file I/O, this is done by
							 * implementing an LRU on the inactive q
							 */
							dwp->dw_mask |= DW_vm_page_lru;
						} else if (!VM_PAGE_PAGEABLE(m)) {
							dwp->dw_mask |= DW_vm_page_deactivate_internal;
						}
					}
					dwp->dw_mask |= DW_PAGE_WAKEUP;
				}
			}
		}
abort_next_page:
		target_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;
		entry++;

		if (dwp->dw_mask) {
			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
				/* queue the work; flush the batch when it fills up */
				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);

				if (dw_count >= dw_limit) {
					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);

					dwp = dwp_start;
					dw_count = 0;
				}
			} else {
				/* trivial work only: do it inline rather than batching */
				if (dwp->dw_mask & DW_clear_busy) {
					m->vmp_busy = FALSE;
				}

				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
					PAGE_WAKEUP(m);
				}
			}
		}
	}
	/* flush any remaining batched page work */
	if (dw_count) {
		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}

	/* determine whether the UPL still holds any pages */
	occupied = 1;

	if (upl->flags & UPL_DEVICE_MEMORY) {
		occupied = 0;
	} else if (upl->flags & UPL_LITE) {
		int     pg_num;
		int     i;

		pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
		pg_num = (pg_num + 31) >> 5;
		occupied = 0;

		for (i = 0; i < pg_num; i++) {
			if (lite_list[i] != 0) {
				occupied = 1;
				break;
			}
		}
	} else {
		if (vm_page_queue_empty(&upl->map_object->memq)) {
			occupied = 0;
		}
	}
	if (occupied == 0) {
		/*
		 * If this UPL element belongs to a Vector UPL and is
		 * empty, then this is the right function to deallocate
		 * it. So go ahead set the *empty variable. The flag
		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
		 * should be considered relevant for the Vector UPL and
		 * not the internal UPLs.
		 */
		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
			*empty = TRUE;
		}

		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
			/*
			 * this is not a paging object
			 * so we need to drop the paging reference
			 * that was taken when we created the UPL
			 * against this object
			 */
			vm_object_activity_end(shadow_object);
			vm_object_collapse(shadow_object, 0, TRUE);
		} else {
			/*
			 * we donated the paging reference to
			 * the map object... vm_pageout_object_terminate
			 * will drop this reference
			 */
		}
	}
	vm_object_unlock(shadow_object);
	if (object != shadow_object) {
		vm_object_unlock(object);
	}

	if (!isVectorUPL) {
		upl_unlock(upl);
	} else {
		/*
		 * If we completed our operations on an UPL that is
		 * part of a Vectored UPL and if empty is TRUE, then
		 * we should go ahead and deallocate this UPL element.
		 * Then we check if this was the last of the UPL elements
		 * within that Vectored UPL. If so, set empty to TRUE
		 * so that in ubc_upl_abort_range or ubc_upl_abort, we
		 * can go ahead and deallocate the Vector UPL too.
		 */
		if (*empty == TRUE) {
			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
			upl_deallocate(upl);
		}
		goto process_upl_to_abort;
	}

	kr = KERN_SUCCESS;

done:
	/* release the delayed-work context if we allocated one */
	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return kr;
}
8770
8771
8772 kern_return_t
upl_abort(upl_t upl,int error)8773 upl_abort(
8774 upl_t upl,
8775 int error)
8776 {
8777 boolean_t empty;
8778
8779 if (upl == UPL_NULL) {
8780 return KERN_INVALID_ARGUMENT;
8781 }
8782
8783 return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8784 }
8785
8786
8787 /* an option on commit should be wire */
8788 kern_return_t
upl_commit(upl_t upl,upl_page_info_t * page_list,mach_msg_type_number_t count)8789 upl_commit(
8790 upl_t upl,
8791 upl_page_info_t *page_list,
8792 mach_msg_type_number_t count)
8793 {
8794 boolean_t empty;
8795
8796 if (upl == UPL_NULL) {
8797 return KERN_INVALID_ARGUMENT;
8798 }
8799
8800 return upl_commit_range(upl, 0, upl->u_size, 0,
8801 page_list, count, &empty);
8802 }
8803
8804
/*
 * Mark the absent pages backing an I/O-wired (UPL_IO_WIRE) UPL as valid:
 * every busy+absent page in the UPL's range becomes resident, dirty and
 * wired, and the global/object wired-page accounting is updated under
 * "tag".  Panics on any state that contradicts the iopl contract
 * (vector UPL, unsupported flags, kernel/compressor/purgeable-volatile
 * object, missing or inconsistent pages).
 */
void
iopl_valid_data(
	upl_t    upl,
	vm_tag_t tag)
{
	vm_object_t     object;
	vm_offset_t     offset;
	vm_page_t       m, nxt_page = VM_PAGE_NULL;
	upl_size_t      size;
	int             wired_count = 0;

	if (upl == NULL) {
		panic("iopl_valid_data: NULL upl");
	}
	if (vector_upl_is_valid(upl)) {
		panic("iopl_valid_data: vector upl");
	}
	/* only a plain, external, non-blocked I/O-wire UPL is supported */
	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
		panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
	}

	object = upl->map_object;

	if (object == kernel_object || object == compressor_object) {
		panic("iopl_valid_data: object == kernel or compressor");
	}

	if (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) {
		panic("iopl_valid_data: object %p purgable %d",
		    object, object->purgable);
	}

	size = upl_adjusted_size(upl, PAGE_MASK);

	vm_object_lock(object);
	VM_OBJECT_WIRED_PAGE_UPDATE_START(object);

	bool whole_object;

	/*
	 * If the UPL spans the whole object and every page is resident,
	 * walk the object's page list directly; otherwise look up each
	 * page by offset.
	 */
	if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
		nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
		whole_object = true;
	} else {
		offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
		whole_object = false;
	}

	while (size) {
		if (whole_object) {
			/*
			 * NOTE(review): in the whole-object walk, the
			 * residency precondition above is what guarantees
			 * there are enough pages on the list; 'm' is only
			 * advanced while nxt_page is non-NULL.
			 */
			if (nxt_page != VM_PAGE_NULL) {
				m = nxt_page;
				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
			}
		} else {
			m = vm_page_lookup(object, offset);
			offset += PAGE_SIZE;

			if (m == VM_PAGE_NULL) {
				panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
			}
		}
		if (m->vmp_busy) {
			/* busy implies the page was left absent by the iopl setup */
			if (!m->vmp_absent) {
				panic("iopl_valid_data: busy page w/o absent");
			}

			if (m->vmp_pageq.next || m->vmp_pageq.prev) {
				panic("iopl_valid_data: busy+absent page on page queue");
			}
			if (m->vmp_reusable) {
				panic("iopl_valid_data: %p is reusable", m);
			}

			/* the I/O filled this page: make it valid, dirty and wired */
			m->vmp_absent = FALSE;
			m->vmp_dirty = TRUE;
			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
			assert(m->vmp_wire_count == 0);
			m->vmp_wire_count++;
			assert(m->vmp_wire_count);
			if (m->vmp_wire_count == 1) {
				m->vmp_q_state = VM_PAGE_IS_WIRED;
				wired_count++;
			} else {
				panic("iopl_valid_data: %p already wired", m);
			}

			/* clear busy and wake any thread waiting on the page */
			PAGE_WAKEUP_DONE(m);
		}
		size -= PAGE_SIZE;
	}
	if (wired_count) {
		/* fold the newly wired pages into per-object and global counts */
		VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
		assert(object->resident_page_count >= object->wired_page_count);

		/* no need to adjust purgeable accounting for this object: */
		assert(object->purgable != VM_PURGABLE_VOLATILE);
		assert(object->purgable != VM_PURGABLE_EMPTY);

		vm_page_lockspin_queues();
		vm_page_wire_count += wired_count;
		vm_page_unlock_queues();
	}
	VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
	vm_object_unlock(object);
}
8911
8912
8913 void
vm_object_set_pmap_cache_attr(vm_object_t object,upl_page_info_array_t user_page_list,unsigned int num_pages,boolean_t batch_pmap_op)8914 vm_object_set_pmap_cache_attr(
8915 vm_object_t object,
8916 upl_page_info_array_t user_page_list,
8917 unsigned int num_pages,
8918 boolean_t batch_pmap_op)
8919 {
8920 unsigned int cache_attr = 0;
8921
8922 cache_attr = object->wimg_bits & VM_WIMG_MASK;
8923 assert(user_page_list);
8924 if (cache_attr != VM_WIMG_USE_DEFAULT) {
8925 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8926 }
8927 }
8928
8929
8930 boolean_t vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t);
8931 kern_return_t vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t, vm_object_offset_t *, int, int*);
8932
8933
8934
8935 boolean_t
vm_object_iopl_wire_full(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,wpl_array_t lite_list,upl_control_flags_t cntrl_flags,vm_tag_t tag)8936 vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8937 wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag)
8938 {
8939 vm_page_t dst_page;
8940 unsigned int entry;
8941 int page_count;
8942 int delayed_unlock = 0;
8943 boolean_t retval = TRUE;
8944 ppnum_t phys_page;
8945
8946 vm_object_lock_assert_exclusive(object);
8947 assert(object->purgable != VM_PURGABLE_VOLATILE);
8948 assert(object->purgable != VM_PURGABLE_EMPTY);
8949 assert(object->pager == NULL);
8950 assert(object->copy == NULL);
8951 assert(object->shadow == NULL);
8952
8953 page_count = object->resident_page_count;
8954 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
8955
8956 vm_page_lock_queues();
8957
8958 while (page_count--) {
8959 if (dst_page->vmp_busy ||
8960 dst_page->vmp_fictitious ||
8961 dst_page->vmp_absent ||
8962 VMP_ERROR_GET(dst_page) ||
8963 dst_page->vmp_cleaning ||
8964 dst_page->vmp_restart ||
8965 dst_page->vmp_laundry) {
8966 retval = FALSE;
8967 goto done;
8968 }
8969 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8970 retval = FALSE;
8971 goto done;
8972 }
8973 dst_page->vmp_reference = TRUE;
8974
8975 vm_page_wire(dst_page, tag, FALSE);
8976
8977 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8978 SET_PAGE_DIRTY(dst_page, FALSE);
8979 }
8980 entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
8981 assert(entry >= 0 && entry < object->resident_page_count);
8982 lite_list[entry >> 5] |= 1U << (entry & 31);
8983
8984 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8985
8986 if (phys_page > upl->highest_page) {
8987 upl->highest_page = phys_page;
8988 }
8989
8990 if (user_page_list) {
8991 user_page_list[entry].phys_addr = phys_page;
8992 user_page_list[entry].absent = dst_page->vmp_absent;
8993 user_page_list[entry].dirty = dst_page->vmp_dirty;
8994 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
8995 user_page_list[entry].precious = dst_page->vmp_precious;
8996 user_page_list[entry].device = FALSE;
8997 user_page_list[entry].speculative = FALSE;
8998 user_page_list[entry].cs_validated = FALSE;
8999 user_page_list[entry].cs_tainted = FALSE;
9000 user_page_list[entry].cs_nx = FALSE;
9001 user_page_list[entry].needed = FALSE;
9002 user_page_list[entry].mark = FALSE;
9003 }
9004 if (delayed_unlock++ > 256) {
9005 delayed_unlock = 0;
9006 lck_mtx_yield(&vm_page_queue_lock);
9007
9008 VM_CHECK_MEMORYSTATUS;
9009 }
9010 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
9011 }
9012 done:
9013 vm_page_unlock_queues();
9014
9015 VM_CHECK_MEMORYSTATUS;
9016
9017 return retval;
9018 }
9019
9020
/*
 * Fast path for vm_object_iopl_request() when the object has no resident
 * pages: grab page_count fresh pages, insert them into "object" starting
 * at *dst_offset, and wire them for the IOPL.
 *
 * Pages are zero-filled unless UPL_NOZEROFILL/UPL_NOZEROFILLIO was
 * requested, in which case they are marked absent (left busy) so the
 * caller can fill them via I/O.  Only non-absent pages are wired here.
 *
 * On return, *dst_offset has been advanced past the pages inserted and
 * *page_grab_count is set to the number of pages inserted.  Returns
 * KERN_SUCCESS, or MACH_SEND_INTERRUPTED if an interruptible wait for
 * free pages was aborted (the caller then frees the wired pages).
 *
 * Caller must hold the object lock exclusively; the object must not be
 * volatile/empty purgeable and must have no pager, copy or shadow
 * (all asserted below).
 */
kern_return_t
vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
    wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag, vm_object_offset_t *dst_offset,
    int page_count, int* page_grab_count)
{
	vm_page_t       dst_page;
	boolean_t       no_zero_fill = FALSE;
	int             interruptible;
	int             pages_wired = 0;
	int             pages_inserted = 0;
	int             entry = 0;
	uint64_t        delayed_ledger_update = 0;      /* bytes to credit to the owner's ledger, accumulated by vm_page_insert_internal() */
	kern_return_t   ret = KERN_SUCCESS;
	int             grab_options;
	ppnum_t         phys_page;

	vm_object_lock_assert_exclusive(object);
	assert(object->purgable != VM_PURGABLE_VOLATILE);
	assert(object->purgable != VM_PURGABLE_EMPTY);
	assert(object->pager == NULL);
	assert(object->copy == NULL);
	assert(object->shadow == NULL);

	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
		interruptible = THREAD_ABORTSAFE;
	} else {
		interruptible = THREAD_UNINT;
	}

	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
		no_zero_fill = TRUE;
	}

	grab_options = 0;
#if CONFIG_SECLUDED_MEMORY
	if (object->can_grab_secluded) {
		grab_options |= VM_PAGE_GRAB_SECLUDED;
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	while (page_count--) {
		/*
		 * Grab a free page, waiting for one (interruptibly if
		 * requested) when none is available.  The wait counter is
		 * advertised via vm_upl_wait_for_pages for debug events.
		 */
		while ((dst_page = vm_page_grab_options(grab_options))
		    == VM_PAGE_NULL) {
			OSAddAtomic(page_count, &vm_upl_wait_for_pages);

			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

			if (vm_page_wait(interruptible) == FALSE) {
				/*
				 * interrupted case
				 */
				OSAddAtomic(-page_count, &vm_upl_wait_for_pages);

				VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);

				ret = MACH_SEND_INTERRUPTED;
				goto done;
			}
			OSAddAtomic(-page_count, &vm_upl_wait_for_pages);

			VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
		}
		if (no_zero_fill == FALSE) {
			vm_page_zero_fill(dst_page);
		} else {
			/* caller will fill the page via I/O; keep it absent (and busy) */
			dst_page->vmp_absent = TRUE;
		}

		dst_page->vmp_reference = TRUE;

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			SET_PAGE_DIRTY(dst_page, FALSE);
		}
		if (dst_page->vmp_absent == FALSE) {
			/*
			 * Freshly grabbed page, not on any queue and not yet
			 * wired: wire it directly.  The global wire count is
			 * batched and added under the queues lock at "done".
			 */
			assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
			assert(dst_page->vmp_wire_count == 0);
			dst_page->vmp_wire_count++;
			dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
			assert(dst_page->vmp_wire_count);
			pages_wired++;
			PAGE_WAKEUP_DONE(dst_page);
		}
		pages_inserted++;

		/*
		 * Insert into the object; ledger updates are deferred and
		 * accumulated in delayed_ledger_update (credited once below).
		 */
		vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);

		lite_list[entry >> 5] |= 1U << (entry & 31);

		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].absent = dst_page->vmp_absent;
			user_page_list[entry].dirty = dst_page->vmp_dirty;
			user_page_list[entry].free_when_done = FALSE;
			user_page_list[entry].precious = FALSE;
			user_page_list[entry].device = FALSE;
			user_page_list[entry].speculative = FALSE;
			user_page_list[entry].cs_validated = FALSE;
			user_page_list[entry].cs_tainted = FALSE;
			user_page_list[entry].cs_nx = FALSE;
			user_page_list[entry].needed = FALSE;
			user_page_list[entry].mark = FALSE;
		}
		entry++;
		*dst_offset += PAGE_SIZE_64;
	}
done:
	/* batch-update the global wire count under the queues lock */
	if (pages_wired) {
		vm_page_lockspin_queues();
		vm_page_wire_count += pages_wired;
		vm_page_unlock_queues();
	}
	/* account the new pages as internal or external residency */
	if (pages_inserted) {
		if (object->internal) {
			OSAddAtomic(pages_inserted, &vm_page_internal_count);
		} else {
			OSAddAtomic(pages_inserted, &vm_page_external_count);
		}
	}
	/*
	 * Apply the ledger credit accumulated by vm_page_insert_internal():
	 * more non-volatile bytes for the owning task, plus footprint when
	 * the object's ledger tag says footprint applies.
	 */
	if (delayed_ledger_update) {
		task_t          owner;
		int             ledger_idx_volatile;
		int             ledger_idx_nonvolatile;
		int             ledger_idx_volatile_compressed;
		int             ledger_idx_nonvolatile_compressed;
		boolean_t       do_footprint;

		owner = VM_OBJECT_OWNER(object);
		assert(owner);

		vm_object_ledger_tag_ledgers(object,
		    &ledger_idx_volatile,
		    &ledger_idx_nonvolatile,
		    &ledger_idx_volatile_compressed,
		    &ledger_idx_nonvolatile_compressed,
		    &do_footprint);

		/* more non-volatile bytes */
		ledger_credit(owner->ledger,
		    ledger_idx_nonvolatile,
		    delayed_ledger_update);
		if (do_footprint) {
			/* more footprint */
			ledger_credit(owner->ledger,
			    task_ledgers.phys_footprint,
			    delayed_ledger_update);
		}
	}

	assert(page_grab_count);
	*page_grab_count = pages_inserted;

	return ret;
}
9180
9181
9182
9183 kern_return_t
vm_object_iopl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)9184 vm_object_iopl_request(
9185 vm_object_t object,
9186 vm_object_offset_t offset,
9187 upl_size_t size,
9188 upl_t *upl_ptr,
9189 upl_page_info_array_t user_page_list,
9190 unsigned int *page_list_count,
9191 upl_control_flags_t cntrl_flags,
9192 vm_tag_t tag)
9193 {
9194 vm_page_t dst_page;
9195 vm_object_offset_t dst_offset;
9196 upl_size_t xfer_size;
9197 upl_t upl = NULL;
9198 unsigned int entry;
9199 wpl_array_t lite_list = NULL;
9200 int no_zero_fill = FALSE;
9201 unsigned int size_in_pages;
9202 int page_grab_count = 0;
9203 u_int32_t psize;
9204 kern_return_t ret;
9205 vm_prot_t prot;
9206 struct vm_object_fault_info fault_info = {};
9207 struct vm_page_delayed_work dw_array;
9208 struct vm_page_delayed_work *dwp, *dwp_start;
9209 bool dwp_finish_ctx = TRUE;
9210 int dw_count;
9211 int dw_limit;
9212 int dw_index;
9213 boolean_t caller_lookup;
9214 int io_tracking_flag = 0;
9215 int interruptible;
9216 ppnum_t phys_page;
9217
9218 boolean_t set_cache_attr_needed = FALSE;
9219 boolean_t free_wired_pages = FALSE;
9220 boolean_t fast_path_empty_req = FALSE;
9221 boolean_t fast_path_full_req = FALSE;
9222
9223 #if DEVELOPMENT || DEBUG
9224 task_t task = current_task();
9225 #endif /* DEVELOPMENT || DEBUG */
9226
9227 dwp_start = dwp = NULL;
9228
9229 vm_object_offset_t original_offset = offset;
9230 upl_size_t original_size = size;
9231
9232 // DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
9233
9234 size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
9235 offset = vm_object_trunc_page(offset);
9236 if (size != original_size || offset != original_offset) {
9237 DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
9238 }
9239
9240 if (cntrl_flags & ~UPL_VALID_FLAGS) {
9241 /*
9242 * For forward compatibility's sake,
9243 * reject any unknown flag.
9244 */
9245 return KERN_INVALID_VALUE;
9246 }
9247 if (vm_lopage_needed == FALSE) {
9248 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
9249 }
9250
9251 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
9252 if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
9253 return KERN_INVALID_VALUE;
9254 }
9255
9256 if (object->phys_contiguous) {
9257 if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
9258 return KERN_INVALID_ADDRESS;
9259 }
9260
9261 if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
9262 return KERN_INVALID_ADDRESS;
9263 }
9264 }
9265 }
9266 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9267 no_zero_fill = TRUE;
9268 }
9269
9270 if (cntrl_flags & UPL_COPYOUT_FROM) {
9271 prot = VM_PROT_READ;
9272 } else {
9273 prot = VM_PROT_READ | VM_PROT_WRITE;
9274 }
9275
9276 if ((!object->internal) && (object->paging_offset != 0)) {
9277 panic("vm_object_iopl_request: external object with non-zero paging offset");
9278 }
9279
9280
9281 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
9282
9283 #if CONFIG_IOSCHED || UPL_DEBUG
9284 if ((object->io_tracking && object != kernel_object) || upl_debug_enabled) {
9285 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
9286 }
9287 #endif
9288
9289 #if CONFIG_IOSCHED
9290 if (object->io_tracking) {
9291 /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
9292 if (object != kernel_object) {
9293 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
9294 }
9295 }
9296 #endif
9297
9298 if (object->phys_contiguous) {
9299 psize = PAGE_SIZE;
9300 } else {
9301 psize = size;
9302
9303 dw_count = 0;
9304 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
9305 dwp_start = vm_page_delayed_work_get_ctx();
9306 if (dwp_start == NULL) {
9307 dwp_start = &dw_array;
9308 dw_limit = 1;
9309 dwp_finish_ctx = FALSE;
9310 }
9311
9312 dwp = dwp_start;
9313 }
9314
9315 if (cntrl_flags & UPL_SET_INTERNAL) {
9316 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9317
9318 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
9319 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
9320 ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
9321 if (size == 0) {
9322 user_page_list = NULL;
9323 lite_list = NULL;
9324 }
9325 } else {
9326 upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9327
9328 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
9329 if (size == 0) {
9330 lite_list = NULL;
9331 }
9332 }
9333 if (user_page_list) {
9334 user_page_list[0].device = FALSE;
9335 }
9336 *upl_ptr = upl;
9337
9338 if (cntrl_flags & UPL_NOZEROFILLIO) {
9339 DTRACE_VM4(upl_nozerofillio,
9340 vm_object_t, object,
9341 vm_object_offset_t, offset,
9342 upl_size_t, size,
9343 upl_t, upl);
9344 }
9345
9346 upl->map_object = object;
9347 upl->u_offset = original_offset;
9348 upl->u_size = original_size;
9349
9350 size_in_pages = size / PAGE_SIZE;
9351
9352 if (object == kernel_object &&
9353 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
9354 upl->flags |= UPL_KERNEL_OBJECT;
9355 #if UPL_DEBUG
9356 vm_object_lock(object);
9357 #else
9358 vm_object_lock_shared(object);
9359 #endif
9360 } else {
9361 vm_object_lock(object);
9362 vm_object_activity_begin(object);
9363 }
9364 /*
9365 * paging in progress also protects the paging_offset
9366 */
9367 upl->u_offset = original_offset + object->paging_offset;
9368
9369 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9370 /*
9371 * The user requested that access to the pages in this UPL
9372 * be blocked until the UPL is commited or aborted.
9373 */
9374 upl->flags |= UPL_ACCESS_BLOCKED;
9375 }
9376
9377 #if CONFIG_IOSCHED || UPL_DEBUG
9378 if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9379 vm_object_activity_begin(object);
9380 queue_enter(&object->uplq, upl, upl_t, uplq);
9381 }
9382 #endif
9383
9384 if (object->phys_contiguous) {
9385 if (upl->flags & UPL_ACCESS_BLOCKED) {
9386 assert(!object->blocked_access);
9387 object->blocked_access = TRUE;
9388 }
9389
9390 vm_object_unlock(object);
9391
9392 /*
9393 * don't need any shadow mappings for this one
9394 * since it is already I/O memory
9395 */
9396 upl->flags |= UPL_DEVICE_MEMORY;
9397
9398 upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
9399
9400 if (user_page_list) {
9401 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
9402 user_page_list[0].device = TRUE;
9403 }
9404 if (page_list_count != NULL) {
9405 if (upl->flags & UPL_INTERNAL) {
9406 *page_list_count = 0;
9407 } else {
9408 *page_list_count = 1;
9409 }
9410 }
9411
9412 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9413 #if DEVELOPMENT || DEBUG
9414 if (task != NULL) {
9415 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9416 }
9417 #endif /* DEVELOPMENT || DEBUG */
9418 return KERN_SUCCESS;
9419 }
9420 if (object != kernel_object && object != compressor_object) {
9421 /*
9422 * Protect user space from future COW operations
9423 */
9424 #if VM_OBJECT_TRACKING_OP_TRUESHARE
9425 if (!object->true_share &&
9426 vm_object_tracking_btlog) {
9427 btlog_record(vm_object_tracking_btlog, object,
9428 VM_OBJECT_TRACKING_OP_TRUESHARE,
9429 btref_get(__builtin_frame_address(0), 0));
9430 }
9431 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
9432
9433 vm_object_lock_assert_exclusive(object);
9434 object->true_share = TRUE;
9435
9436 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
9437 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
9438 }
9439 }
9440
9441 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
9442 object->copy != VM_OBJECT_NULL) {
9443 /*
9444 * Honor copy-on-write obligations
9445 *
9446 * The caller is gathering these pages and
9447 * might modify their contents. We need to
9448 * make sure that the copy object has its own
9449 * private copies of these pages before we let
9450 * the caller modify them.
9451 *
9452 * NOTE: someone else could map the original object
9453 * after we've done this copy-on-write here, and they
9454 * could then see an inconsistent picture of the memory
9455 * while it's being modified via the UPL. To prevent this,
9456 * we would have to block access to these pages until the
9457 * UPL is released. We could use the UPL_BLOCK_ACCESS
9458 * code path for that...
9459 */
9460 vm_object_update(object,
9461 offset,
9462 size,
9463 NULL,
9464 NULL,
9465 FALSE, /* should_return */
9466 MEMORY_OBJECT_COPY_SYNC,
9467 VM_PROT_NO_CHANGE);
9468 VM_PAGEOUT_DEBUG(iopl_cow, 1);
9469 VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
9470 }
9471 if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
9472 object->purgable != VM_PURGABLE_VOLATILE &&
9473 object->purgable != VM_PURGABLE_EMPTY &&
9474 object->copy == NULL &&
9475 size == object->vo_size &&
9476 offset == 0 &&
9477 object->shadow == NULL &&
9478 object->pager == NULL) {
9479 if (object->resident_page_count == size_in_pages) {
9480 assert(object != compressor_object);
9481 assert(object != kernel_object);
9482 fast_path_full_req = TRUE;
9483 } else if (object->resident_page_count == 0) {
9484 assert(object != compressor_object);
9485 assert(object != kernel_object);
9486 fast_path_empty_req = TRUE;
9487 set_cache_attr_needed = TRUE;
9488 }
9489 }
9490
9491 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9492 interruptible = THREAD_ABORTSAFE;
9493 } else {
9494 interruptible = THREAD_UNINT;
9495 }
9496
9497 entry = 0;
9498
9499 xfer_size = size;
9500 dst_offset = offset;
9501
9502 if (fast_path_full_req) {
9503 if (vm_object_iopl_wire_full(object, upl, user_page_list, lite_list, cntrl_flags, tag) == TRUE) {
9504 goto finish;
9505 }
9506 /*
9507 * we couldn't complete the processing of this request on the fast path
9508 * so fall through to the slow path and finish up
9509 */
9510 } else if (fast_path_empty_req) {
9511 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9512 ret = KERN_MEMORY_ERROR;
9513 goto return_err;
9514 }
9515 ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
9516
9517 if (ret) {
9518 free_wired_pages = TRUE;
9519 goto return_err;
9520 }
9521 goto finish;
9522 }
9523
9524 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
9525 fault_info.lo_offset = offset;
9526 fault_info.hi_offset = offset + xfer_size;
9527 fault_info.mark_zf_absent = TRUE;
9528 fault_info.interruptible = interruptible;
9529 fault_info.batch_pmap_op = TRUE;
9530
9531 while (xfer_size) {
9532 vm_fault_return_t result;
9533
9534 dwp->dw_mask = 0;
9535
9536 if (fast_path_full_req) {
9537 /*
9538 * if we get here, it means that we ran into a page
9539 * state we couldn't handle in the fast path and
9540 * bailed out to the slow path... since the order
9541 * we look at pages is different between the 2 paths,
9542 * the following check is needed to determine whether
9543 * this page was already processed in the fast path
9544 */
9545 if (lite_list[entry >> 5] & (1 << (entry & 31))) {
9546 goto skip_page;
9547 }
9548 }
9549 dst_page = vm_page_lookup(object, dst_offset);
9550
9551 if (dst_page == VM_PAGE_NULL ||
9552 dst_page->vmp_busy ||
9553 VMP_ERROR_GET(dst_page) ||
9554 dst_page->vmp_restart ||
9555 dst_page->vmp_absent ||
9556 dst_page->vmp_fictitious) {
9557 if (object == kernel_object) {
9558 panic("vm_object_iopl_request: missing/bad page in kernel object");
9559 }
9560 if (object == compressor_object) {
9561 panic("vm_object_iopl_request: missing/bad page in compressor object");
9562 }
9563
9564 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9565 ret = KERN_MEMORY_ERROR;
9566 goto return_err;
9567 }
9568 set_cache_attr_needed = TRUE;
9569
9570 /*
9571 * We just looked up the page and the result remains valid
9572 * until the object lock is release, so send it to
9573 * vm_fault_page() (as "dst_page"), to avoid having to
9574 * look it up again there.
9575 */
9576 caller_lookup = TRUE;
9577
9578 do {
9579 vm_page_t top_page;
9580 kern_return_t error_code;
9581
9582 fault_info.cluster_size = xfer_size;
9583
9584 vm_object_paging_begin(object);
9585
9586 result = vm_fault_page(object, dst_offset,
9587 prot | VM_PROT_WRITE, FALSE,
9588 caller_lookup,
9589 &prot, &dst_page, &top_page,
9590 (int *)0,
9591 &error_code, no_zero_fill,
9592 &fault_info);
9593
9594 /* our lookup is no longer valid at this point */
9595 caller_lookup = FALSE;
9596
9597 switch (result) {
9598 case VM_FAULT_SUCCESS:
9599 page_grab_count++;
9600
9601 if (!dst_page->vmp_absent) {
9602 PAGE_WAKEUP_DONE(dst_page);
9603 } else {
9604 /*
9605 * we only get back an absent page if we
9606 * requested that it not be zero-filled
9607 * because we are about to fill it via I/O
9608 *
9609 * absent pages should be left BUSY
9610 * to prevent them from being faulted
9611 * into an address space before we've
9612 * had a chance to complete the I/O on
9613 * them since they may contain info that
9614 * shouldn't be seen by the faulting task
9615 */
9616 }
9617 /*
9618 * Release paging references and
9619 * top-level placeholder page, if any.
9620 */
9621 if (top_page != VM_PAGE_NULL) {
9622 vm_object_t local_object;
9623
9624 local_object = VM_PAGE_OBJECT(top_page);
9625
9626 /*
9627 * comparing 2 packed pointers
9628 */
9629 if (top_page->vmp_object != dst_page->vmp_object) {
9630 vm_object_lock(local_object);
9631 VM_PAGE_FREE(top_page);
9632 vm_object_paging_end(local_object);
9633 vm_object_unlock(local_object);
9634 } else {
9635 VM_PAGE_FREE(top_page);
9636 vm_object_paging_end(local_object);
9637 }
9638 }
9639 vm_object_paging_end(object);
9640 break;
9641
9642 case VM_FAULT_RETRY:
9643 vm_object_lock(object);
9644 break;
9645
9646 case VM_FAULT_MEMORY_SHORTAGE:
9647 OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
9648
9649 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9650
9651 if (vm_page_wait(interruptible)) {
9652 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9653
9654 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9655 vm_object_lock(object);
9656
9657 break;
9658 }
9659 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9660
9661 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9662 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
9663 OS_FALLTHROUGH;
9664
9665 case VM_FAULT_INTERRUPTED:
9666 error_code = MACH_SEND_INTERRUPTED;
9667 OS_FALLTHROUGH;
9668 case VM_FAULT_MEMORY_ERROR:
9669 memory_error:
9670 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
9671
9672 vm_object_lock(object);
9673 goto return_err;
9674
9675 case VM_FAULT_SUCCESS_NO_VM_PAGE:
9676 /* success but no page: fail */
9677 vm_object_paging_end(object);
9678 vm_object_unlock(object);
9679 goto memory_error;
9680
9681 default:
9682 panic("vm_object_iopl_request: unexpected error"
9683 " 0x%x from vm_fault_page()\n", result);
9684 }
9685 } while (result != VM_FAULT_SUCCESS);
9686 }
9687 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9688
9689 if (upl->flags & UPL_KERNEL_OBJECT) {
9690 goto record_phys_addr;
9691 }
9692
9693 if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
9694 dst_page->vmp_busy = TRUE;
9695 goto record_phys_addr;
9696 }
9697
9698 if (dst_page->vmp_cleaning) {
9699 /*
9700 * Someone else is cleaning this page in place.
9701 * In theory, we should be able to proceed and use this
9702 * page but they'll probably end up clearing the "busy"
9703 * bit on it in upl_commit_range() but they didn't set
9704 * it, so they would clear our "busy" bit and open
9705 * us to race conditions.
9706 * We'd better wait for the cleaning to complete and
9707 * then try again.
9708 */
9709 VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
9710 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
9711 continue;
9712 }
9713 if (dst_page->vmp_laundry) {
9714 vm_pageout_steal_laundry(dst_page, FALSE);
9715 }
9716
9717 if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
9718 phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
9719 vm_page_t low_page;
9720 int refmod;
9721
9722 /*
9723 * support devices that can't DMA above 32 bits
9724 * by substituting pages from a pool of low address
9725 * memory for any pages we find above the 4G mark
9726 * can't substitute if the page is already wired because
9727 * we don't know whether that physical address has been
9728 * handed out to some other 64 bit capable DMA device to use
9729 */
9730 if (VM_PAGE_WIRED(dst_page)) {
9731 ret = KERN_PROTECTION_FAILURE;
9732 goto return_err;
9733 }
9734 low_page = vm_page_grablo();
9735
9736 if (low_page == VM_PAGE_NULL) {
9737 ret = KERN_RESOURCE_SHORTAGE;
9738 goto return_err;
9739 }
9740 /*
9741 * from here until the vm_page_replace completes
9742 * we musn't drop the object lock... we don't
9743 * want anyone refaulting this page in and using
9744 * it after we disconnect it... we want the fault
9745 * to find the new page being substituted.
9746 */
9747 if (dst_page->vmp_pmapped) {
9748 refmod = pmap_disconnect(phys_page);
9749 } else {
9750 refmod = 0;
9751 }
9752
9753 if (!dst_page->vmp_absent) {
9754 vm_page_copy(dst_page, low_page);
9755 }
9756
9757 low_page->vmp_reference = dst_page->vmp_reference;
9758 low_page->vmp_dirty = dst_page->vmp_dirty;
9759 low_page->vmp_absent = dst_page->vmp_absent;
9760
9761 if (refmod & VM_MEM_REFERENCED) {
9762 low_page->vmp_reference = TRUE;
9763 }
9764 if (refmod & VM_MEM_MODIFIED) {
9765 SET_PAGE_DIRTY(low_page, FALSE);
9766 }
9767
9768 vm_page_replace(low_page, object, dst_offset);
9769
9770 dst_page = low_page;
9771 /*
9772 * vm_page_grablo returned the page marked
9773 * BUSY... we don't need a PAGE_WAKEUP_DONE
9774 * here, because we've never dropped the object lock
9775 */
9776 if (!dst_page->vmp_absent) {
9777 dst_page->vmp_busy = FALSE;
9778 }
9779
9780 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9781 }
9782 if (!dst_page->vmp_busy) {
9783 dwp->dw_mask |= DW_vm_page_wire;
9784 }
9785
9786 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9787 /*
9788 * Mark the page "busy" to block any future page fault
9789 * on this page in addition to wiring it.
9790 * We'll also remove the mapping
9791 * of all these pages before leaving this routine.
9792 */
9793 assert(!dst_page->vmp_fictitious);
9794 dst_page->vmp_busy = TRUE;
9795 }
9796 /*
9797 * expect the page to be used
9798 * page queues lock must be held to set 'reference'
9799 */
9800 dwp->dw_mask |= DW_set_reference;
9801
9802 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9803 SET_PAGE_DIRTY(dst_page, TRUE);
9804 /*
9805 * Page belonging to a code-signed object is about to
9806 * be written. Mark it tainted and disconnect it from
9807 * all pmaps so processes have to fault it back in and
9808 * deal with the tainted bit.
9809 */
9810 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
9811 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
9812 vm_page_iopl_tainted++;
9813 if (dst_page->vmp_pmapped) {
9814 int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
9815 if (refmod & VM_MEM_REFERENCED) {
9816 dst_page->vmp_reference = TRUE;
9817 }
9818 }
9819 }
9820 }
9821 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
9822 pmap_sync_page_attributes_phys(phys_page);
9823 dst_page->vmp_written_by_kernel = FALSE;
9824 }
9825
9826 record_phys_addr:
9827 if (dst_page->vmp_busy) {
9828 upl->flags |= UPL_HAS_BUSY;
9829 }
9830
9831 lite_list[entry >> 5] |= 1U << (entry & 31);
9832
9833 if (phys_page > upl->highest_page) {
9834 upl->highest_page = phys_page;
9835 }
9836
9837 if (user_page_list) {
9838 user_page_list[entry].phys_addr = phys_page;
9839 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
9840 user_page_list[entry].absent = dst_page->vmp_absent;
9841 user_page_list[entry].dirty = dst_page->vmp_dirty;
9842 user_page_list[entry].precious = dst_page->vmp_precious;
9843 user_page_list[entry].device = FALSE;
9844 user_page_list[entry].needed = FALSE;
9845 if (dst_page->vmp_clustered == TRUE) {
9846 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
9847 } else {
9848 user_page_list[entry].speculative = FALSE;
9849 }
9850 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
9851 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
9852 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
9853 user_page_list[entry].mark = FALSE;
9854 }
9855 if (object != kernel_object && object != compressor_object) {
9856 /*
9857 * someone is explicitly grabbing this page...
9858 * update clustered and speculative state
9859 *
9860 */
9861 if (dst_page->vmp_clustered) {
9862 VM_PAGE_CONSUME_CLUSTERED(dst_page);
9863 }
9864 }
9865 skip_page:
9866 entry++;
9867 dst_offset += PAGE_SIZE_64;
9868 xfer_size -= PAGE_SIZE;
9869
9870 if (dwp->dw_mask) {
9871 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
9872
9873 if (dw_count >= dw_limit) {
9874 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9875
9876 dwp = dwp_start;
9877 dw_count = 0;
9878 }
9879 }
9880 }
9881 assert(entry == size_in_pages);
9882
9883 if (dw_count) {
9884 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9885 dwp = dwp_start;
9886 dw_count = 0;
9887 }
9888 finish:
9889 if (user_page_list && set_cache_attr_needed == TRUE) {
9890 vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
9891 }
9892
9893 if (page_list_count != NULL) {
9894 if (upl->flags & UPL_INTERNAL) {
9895 *page_list_count = 0;
9896 } else if (*page_list_count > size_in_pages) {
9897 *page_list_count = size_in_pages;
9898 }
9899 }
9900 vm_object_unlock(object);
9901
9902 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9903 /*
9904 * We've marked all the pages "busy" so that future
9905 * page faults will block.
9906 * Now remove the mapping for these pages, so that they
9907 * can't be accessed without causing a page fault.
9908 */
9909 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
9910 PMAP_NULL,
9911 PAGE_SIZE,
9912 0, VM_PROT_NONE);
9913 assert(!object->blocked_access);
9914 object->blocked_access = TRUE;
9915 }
9916
9917 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9918 #if DEVELOPMENT || DEBUG
9919 if (task != NULL) {
9920 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9921 }
9922 #endif /* DEVELOPMENT || DEBUG */
9923
9924 if (dwp_start && dwp_finish_ctx) {
9925 vm_page_delayed_work_finish_ctx(dwp_start);
9926 dwp_start = dwp = NULL;
9927 }
9928
9929 return KERN_SUCCESS;
9930
9931 return_err:
9932 dw_index = 0;
9933
9934 for (; offset < dst_offset; offset += PAGE_SIZE) {
9935 boolean_t need_unwire;
9936
9937 dst_page = vm_page_lookup(object, offset);
9938
9939 if (dst_page == VM_PAGE_NULL) {
9940 panic("vm_object_iopl_request: Wired page missing.");
9941 }
9942
9943 /*
9944 * if we've already processed this page in an earlier
9945 * dw_do_work, we need to undo the wiring... we will
9946 * leave the dirty and reference bits on if they
9947 * were set, since we don't have a good way of knowing
9948 * what the previous state was and we won't get here
9949 * under any normal circumstances... we will always
9950 * clear BUSY and wakeup any waiters via vm_page_free
9951 * or PAGE_WAKEUP_DONE
9952 */
9953 need_unwire = TRUE;
9954
9955 if (dw_count) {
9956 if ((dwp_start)[dw_index].dw_m == dst_page) {
9957 /*
9958 * still in the deferred work list
9959 * which means we haven't yet called
9960 * vm_page_wire on this page
9961 */
9962 need_unwire = FALSE;
9963
9964 dw_index++;
9965 dw_count--;
9966 }
9967 }
9968 vm_page_lock_queues();
9969
9970 if (dst_page->vmp_absent || free_wired_pages == TRUE) {
9971 vm_page_free(dst_page);
9972
9973 need_unwire = FALSE;
9974 } else {
9975 if (need_unwire == TRUE) {
9976 vm_page_unwire(dst_page, TRUE);
9977 }
9978
9979 PAGE_WAKEUP_DONE(dst_page);
9980 }
9981 vm_page_unlock_queues();
9982
9983 if (need_unwire == TRUE) {
9984 counter_inc(&vm_statistics_reactivations);
9985 }
9986 }
9987 #if UPL_DEBUG
9988 upl->upl_state = 2;
9989 #endif
9990 if (!(upl->flags & UPL_KERNEL_OBJECT)) {
9991 vm_object_activity_end(object);
9992 vm_object_collapse(object, 0, TRUE);
9993 }
9994 vm_object_unlock(object);
9995 upl_destroy(upl);
9996
9997 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
9998 #if DEVELOPMENT || DEBUG
9999 if (task != NULL) {
10000 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
10001 }
10002 #endif /* DEVELOPMENT || DEBUG */
10003
10004 if (dwp_start && dwp_finish_ctx) {
10005 vm_page_delayed_work_finish_ctx(dwp_start);
10006 dwp_start = dwp = NULL;
10007 }
10008 return ret;
10009 }
10010
/*
 * upl_transpose:
 *	Swap the backing VM objects of two full-size, non-vector UPLs
 *	(companion to vm_object_transpose(), which exchanges the objects'
 *	backing store), keeping any per-object UPL tracking queues
 *	consistent.  Both UPLs are locked in address order to avoid
 *	deadlock.  Returns KERN_INVALID_ARGUMENT for NULL/identical/vector
 *	UPLs and KERN_INVALID_VALUE for UPLs that don't cover whole,
 *	equally-sized objects.
 */
kern_return_t
upl_transpose(
	upl_t           upl1,
	upl_t           upl2)
{
	kern_return_t   retval;
	boolean_t       upls_locked;
	vm_object_t     object1, object2;

	/* LD: Should mapped UPLs be eligible for a transpose? */
	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
		/* need two distinct, non-vector UPLs */
		return KERN_INVALID_ARGUMENT;
	}

	upls_locked = FALSE;

	/*
	 * Since we need to lock both UPLs at the same time,
	 * avoid deadlocks by always taking locks in the same order.
	 */
	if (upl1 < upl2) {
		upl_lock(upl1);
		upl_lock(upl2);
	} else {
		upl_lock(upl2);
		upl_lock(upl1);
	}
	upls_locked = TRUE;             /* the UPLs will need to be unlocked */

	object1 = upl1->map_object;
	object2 = upl2->map_object;

	if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
	    upl1->u_size != upl2->u_size) {
		/*
		 * We deal only with full objects, not subsets.
		 * That's because we exchange the entire backing store info
		 * for the objects: pager, resident pages, etc... We can't do
		 * only part of it.
		 */
		retval = KERN_INVALID_VALUE;
		goto done;
	}

	/*
	 * Tranpose the VM objects' backing store.
	 */
	retval = vm_object_transpose(object1, object2,
	    upl_adjusted_size(upl1, PAGE_MASK));

	if (retval == KERN_SUCCESS) {
		/*
		 * Make each UPL point to the correct VM object, i.e. the
		 * object holding the pages that the UPL refers to...
		 */
#if CONFIG_IOSCHED || UPL_DEBUG
		/*
		 * The UPLs may sit on their objects' uplq tracking queues;
		 * if so, requeue each UPL onto the object it will describe
		 * after the swap.  Both objects are locked around the
		 * remove/re-enter to keep the queues consistent.
		 */
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
			vm_object_lock(object1);
			vm_object_lock(object2);
		}
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_remove(&object1->uplq, upl1, upl_t, uplq);
		}
		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_remove(&object2->uplq, upl2, upl_t, uplq);
		}
#endif
		upl1->map_object = object2;
		upl2->map_object = object1;

#if CONFIG_IOSCHED || UPL_DEBUG
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_enter(&object2->uplq, upl1, upl_t, uplq);
		}
		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_enter(&object1->uplq, upl2, upl_t, uplq);
		}
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
			vm_object_unlock(object2);
			vm_object_unlock(object1);
		}
#endif
	}

done:
	/*
	 * Cleanup.
	 */
	if (upls_locked) {
		upl_unlock(upl1);
		upl_unlock(upl2);
		upls_locked = FALSE;
	}

	return retval;
}
10107
10108 void
upl_range_needed(upl_t upl,int index,int count)10109 upl_range_needed(
10110 upl_t upl,
10111 int index,
10112 int count)
10113 {
10114 upl_page_info_t *user_page_list;
10115 int size_in_pages;
10116
10117 if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
10118 return;
10119 }
10120
10121 size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
10122
10123 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
10124
10125 while (count-- && index < size_in_pages) {
10126 user_page_list[index++].needed = TRUE;
10127 }
10128 }
10129
10130
10131 /*
10132 * Reserve of virtual addresses in the kernel address space.
10133 * We need to map the physical pages in the kernel, so that we
10134 * can call the code-signing or slide routines with a kernel
10135 * virtual address. We keep this pool of pre-allocated kernel
10136 * virtual addresses so that we don't have to scan the kernel's
10137 * virtaul address space each time we need to work with
10138 * a physical page.
10139 */
10140 SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
10141 #define VM_PAGING_NUM_PAGES 64
10142 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
10143 bool vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
10144 int vm_paging_max_index = 0;
10145 int vm_paging_page_waiter = 0;
10146 int vm_paging_page_waiter_total = 0;
10147
10148 unsigned long vm_paging_no_kernel_page = 0;
10149 unsigned long vm_paging_objects_mapped = 0;
10150 unsigned long vm_paging_pages_mapped = 0;
10151 unsigned long vm_paging_objects_mapped_slow = 0;
10152 unsigned long vm_paging_pages_mapped_slow = 0;
10153
/*
 * Startup-time allocation of the pageable kernel VA range backing the
 * vm_paging pool (VM_PAGING_NUM_PAGES pages).  KMA_PAGEABLE means no
 * physical pages are grabbed here; pages are entered on demand by
 * vm_paging_map_object().  KMA_NOFAIL/KMA_PERMANENT: the boot-time
 * allocation must succeed and is never freed.
 */
__startup_func
static void
vm_paging_map_init(void)
{
	kmem_alloc(kernel_map, &vm_paging_base_address,
	    ptoa(VM_PAGING_NUM_PAGES),
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
	    VM_KERN_MEMORY_NONE);
}
STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
10164
10165 /*
10166 * vm_paging_map_object:
10167 * Maps part of a VM object's pages in the kernel
10168 * virtual address space, using the pre-allocated
10169 * kernel virtual addresses, if possible.
10170 * Context:
10171 * The VM object is locked. This lock will get
10172 * dropped and re-acquired though, so the caller
10173 * must make sure the VM object is kept alive
10174 * (by holding a VM map that has a reference
10175 * on it, for example, or taking an extra reference).
10176 * The page should also be kept busy to prevent
10177 * it from being reclaimed.
10178 */
kern_return_t
vm_paging_map_object(
	vm_page_t               page,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection,
	boolean_t               can_unlock_object,
	vm_map_size_t           *size,          /* IN/OUT */
	vm_map_offset_t         *address,       /* OUT */
	boolean_t               *need_unmap)    /* OUT */
{
	kern_return_t           kr;
	vm_map_offset_t         page_map_offset;
	vm_map_size_t           map_size;
	vm_object_offset_t      object_offset;
	int                     i;

	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
		/* use permanent 1-to-1 kernel mapping of physical memory ? */
		*address = (vm_map_offset_t)
		    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
		*need_unmap = FALSE;
		return KERN_SUCCESS;

		/*
		 * NOTE(review): everything from here to the end of this
		 * if-block is unreachable because of the unconditional
		 * return above.  This looks like the remnant of removed
		 * architecture #if conditionals (the early return was once
		 * taken only on platforms with a 1-to-1 physical mapping,
		 * and the pool-based path below compiled otherwise) —
		 * confirm against upstream history before deleting.
		 */
		assert(page->vmp_busy);
		/*
		 * Use one of the pre-allocated kernel virtual addresses
		 * and just enter the VM page in the kernel address space
		 * at that virtual address.
		 */
		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);

		/*
		 * Try and find an available kernel virtual address
		 * from our pre-allocated pool.
		 */
		page_map_offset = 0;
		for (;;) {
			/* linear scan for a free pool slot */
			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
				if (vm_paging_page_inuse[i] == FALSE) {
					page_map_offset =
					    vm_paging_base_address +
					    (i * PAGE_SIZE);
					break;
				}
			}
			if (page_map_offset != 0) {
				/* found a space to map our page ! */
				break;
			}

			if (can_unlock_object) {
				/*
				 * If we can afford to unlock the VM object,
				 * let's take the slow path now...
				 */
				break;
			}
			/*
			 * We can't afford to unlock the VM object, so
			 * let's wait for a space to become available...
			 */
			vm_paging_page_waiter_total++;
			vm_paging_page_waiter++;
			kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
			if (kr == THREAD_WAITING) {
				/* drop the pool lock while blocked; re-take after wakeup */
				simple_unlock(&vm_paging_lock);
				kr = thread_block(THREAD_CONTINUE_NULL);
				simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
			}
			vm_paging_page_waiter--;
			/* ... and try again */
		}

		if (page_map_offset != 0) {
			/*
			 * We found a kernel virtual address;
			 * map the physical page to that virtual address.
			 */
			if (i > vm_paging_max_index) {
				vm_paging_max_index = i;
			}
			vm_paging_page_inuse[i] = TRUE;
			simple_unlock(&vm_paging_lock);

			page->vmp_pmapped = TRUE;

			/*
			 * Keep the VM object locked over the PMAP_ENTER
			 * and the actual use of the page by the kernel,
			 * or this pmap mapping might get undone by a
			 * vm_object_pmap_protect() call...
			 */
			PMAP_ENTER(kernel_pmap,
			    page_map_offset,
			    page,
			    protection,
			    VM_PROT_NONE,
			    0,
			    TRUE,
			    kr);
			assert(kr == KERN_SUCCESS);
			vm_paging_objects_mapped++;
			vm_paging_pages_mapped++;
			*address = page_map_offset;
			*need_unmap = TRUE;

#if KASAN
			kasan_notify_address(page_map_offset, PAGE_SIZE);
#endif

			/* all done and mapped, ready to use ! */
			return KERN_SUCCESS;
		}

		/*
		 * We ran out of pre-allocated kernel virtual
		 * addresses. Just map the page in the kernel
		 * the slow and regular way.
		 */
		vm_paging_no_kernel_page++;
		simple_unlock(&vm_paging_lock);
	}

	if (!can_unlock_object) {
		/* the slow path below must drop the object lock; bail out */
		*address = 0;
		*size = 0;
		*need_unmap = FALSE;
		return KERN_NOT_SUPPORTED;
	}

	object_offset = vm_object_trunc_page(offset);
	map_size = vm_map_round_page(*size,
	    VM_MAP_PAGE_MASK(kernel_map));

	/*
	 * Try and map the required range of the object
	 * in the kernel_map. Given that allocation is
	 * for pageable memory, it shouldn't contain
	 * pointers and is mapped into the data range.
	 */

	vm_object_reference_locked(object);     /* for the map entry */
	vm_object_unlock(object);

	kr = vm_map_enter(kernel_map,
	    address,
	    map_size,
	    0,
	    VM_FLAGS_ANYWHERE,
	    VM_MAP_KERNEL_FLAGS_DATA,
	    VM_KERN_MEMORY_NONE,
	    object,
	    object_offset,
	    FALSE,
	    protection,
	    VM_PROT_ALL,
	    VM_INHERIT_NONE);
	if (kr != KERN_SUCCESS) {
		*address = 0;
		*size = 0;
		*need_unmap = FALSE;
		vm_object_deallocate(object);   /* for the map entry */
		vm_object_lock(object);         /* caller expects it locked */
		return kr;
	}

	*size = map_size;

	/*
	 * Enter the mapped pages in the page table now.
	 */
	vm_object_lock(object);
	/*
	 * VM object must be kept locked from before PMAP_ENTER()
	 * until after the kernel is done accessing the page(s).
	 * Otherwise, the pmap mappings in the kernel could be
	 * undone by a call to vm_object_pmap_protect().
	 */

	for (page_map_offset = 0;
	    map_size != 0;
	    map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
		page = vm_page_lookup(object, offset + page_map_offset);
		if (page == VM_PAGE_NULL) {
			/* caller was supposed to have all pages resident */
			printf("vm_paging_map_object: no page !?");
			vm_object_unlock(object);
			vm_map_remove(kernel_map, *address, *size);
			*address = 0;
			*size = 0;
			*need_unmap = FALSE;
			vm_object_lock(object);
			return KERN_MEMORY_ERROR;
		}
		page->vmp_pmapped = TRUE;

		PMAP_ENTER(kernel_pmap,
		    *address + page_map_offset,
		    page,
		    protection,
		    VM_PROT_NONE,
		    0,
		    TRUE,
		    kr);
		assert(kr == KERN_SUCCESS);
#if KASAN
		kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
#endif
	}

	vm_paging_objects_mapped_slow++;
	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);

	*need_unmap = TRUE;

	return KERN_SUCCESS;
}
10396
10397 /*
10398 * vm_paging_unmap_object:
10399 * Unmaps part of a VM object's pages from the kernel
10400 * virtual address space.
10401 * Context:
10402 * The VM object is locked. This lock will get
10403 * dropped and re-acquired though.
10404 */
10405 void
vm_paging_unmap_object(vm_object_t object,vm_map_offset_t start,vm_map_offset_t end)10406 vm_paging_unmap_object(
10407 vm_object_t object,
10408 vm_map_offset_t start,
10409 vm_map_offset_t end)
10410 {
10411 int i;
10412
10413 if ((vm_paging_base_address == 0) ||
10414 (start < vm_paging_base_address) ||
10415 (end > (vm_paging_base_address
10416 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
10417 /*
10418 * We didn't use our pre-allocated pool of
10419 * kernel virtual address. Deallocate the
10420 * virtual memory.
10421 */
10422 if (object != VM_OBJECT_NULL) {
10423 vm_object_unlock(object);
10424 }
10425 vm_map_remove(kernel_map, start, end);
10426 if (object != VM_OBJECT_NULL) {
10427 vm_object_lock(object);
10428 }
10429 } else {
10430 /*
10431 * We used a kernel virtual address from our
10432 * pre-allocated pool. Put it back in the pool
10433 * for next time.
10434 */
10435 assert(end - start == PAGE_SIZE);
10436 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
10437 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
10438
10439 /* undo the pmap mapping */
10440 pmap_remove(kernel_pmap, start, end);
10441
10442 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10443 vm_paging_page_inuse[i] = FALSE;
10444 if (vm_paging_page_waiter) {
10445 thread_wakeup(&vm_paging_page_waiter);
10446 }
10447 simple_unlock(&vm_paging_lock);
10448 }
10449 }
10450
10451
10452 /*
10453 * page->vmp_object must be locked
10454 */
10455 void
vm_pageout_steal_laundry(vm_page_t page,boolean_t queues_locked)10456 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10457 {
10458 if (!queues_locked) {
10459 vm_page_lockspin_queues();
10460 }
10461
10462 page->vmp_free_when_done = FALSE;
10463 /*
10464 * need to drop the laundry count...
10465 * we may also need to remove it
10466 * from the I/O paging queue...
10467 * vm_pageout_throttle_up handles both cases
10468 *
10469 * the laundry and pageout_queue flags are cleared...
10470 */
10471 vm_pageout_throttle_up(page);
10472
10473 if (!queues_locked) {
10474 vm_page_unlock_queues();
10475 }
10476 }
10477
10478 upl_t
vector_upl_create(vm_offset_t upl_offset)10479 vector_upl_create(vm_offset_t upl_offset)
10480 {
10481 int i = 0;
10482 upl_t upl;
10483 vector_upl_t vector_upl = kalloc_type(struct _vector_upl, Z_WAITOK);
10484
10485 upl = upl_create(0, UPL_VECTOR, 0);
10486 upl->vector_upl = vector_upl;
10487 upl->u_offset = upl_offset;
10488 vector_upl->size = 0;
10489 vector_upl->offset = upl_offset;
10490 vector_upl->invalid_upls = 0;
10491 vector_upl->num_upls = 0;
10492 vector_upl->pagelist = NULL;
10493
10494 for (i = 0; i < MAX_VECTOR_UPL_ELEMENTS; i++) {
10495 vector_upl->upl_iostates[i].size = 0;
10496 vector_upl->upl_iostates[i].offset = 0;
10497 }
10498 return upl;
10499 }
10500
10501 void
vector_upl_deallocate(upl_t upl)10502 vector_upl_deallocate(upl_t upl)
10503 {
10504 if (upl) {
10505 vector_upl_t vector_upl = upl->vector_upl;
10506 if (vector_upl) {
10507 if (vector_upl->invalid_upls != vector_upl->num_upls) {
10508 panic("Deallocating non-empty Vectored UPL");
10509 }
10510 kfree_data(vector_upl->pagelist, sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE));
10511 vector_upl->invalid_upls = 0;
10512 vector_upl->num_upls = 0;
10513 vector_upl->pagelist = NULL;
10514 vector_upl->size = 0;
10515 vector_upl->offset = 0;
10516 kfree_type(struct _vector_upl, vector_upl);
10517 vector_upl = (vector_upl_t)0xfeedfeed;
10518 } else {
10519 panic("vector_upl_deallocate was passed a non-vectored upl");
10520 }
10521 } else {
10522 panic("vector_upl_deallocate was passed a NULL upl");
10523 }
10524 }
10525
10526 boolean_t
vector_upl_is_valid(upl_t upl)10527 vector_upl_is_valid(upl_t upl)
10528 {
10529 if (upl && ((upl->flags & UPL_VECTOR) == UPL_VECTOR)) {
10530 vector_upl_t vector_upl = upl->vector_upl;
10531 if (vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef) {
10532 return FALSE;
10533 } else {
10534 return TRUE;
10535 }
10536 }
10537 return FALSE;
10538 }
10539
10540 boolean_t
vector_upl_set_subupl(upl_t upl,upl_t subupl,uint32_t io_size)10541 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
10542 {
10543 if (vector_upl_is_valid(upl)) {
10544 vector_upl_t vector_upl = upl->vector_upl;
10545
10546 if (vector_upl) {
10547 if (subupl) {
10548 if (io_size) {
10549 if (io_size < PAGE_SIZE) {
10550 io_size = PAGE_SIZE;
10551 }
10552 subupl->vector_upl = (void*)vector_upl;
10553 vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
10554 vector_upl->size += io_size;
10555 upl->u_size += io_size;
10556 } else {
10557 uint32_t i = 0, invalid_upls = 0;
10558 for (i = 0; i < vector_upl->num_upls; i++) {
10559 if (vector_upl->upl_elems[i] == subupl) {
10560 break;
10561 }
10562 }
10563 if (i == vector_upl->num_upls) {
10564 panic("Trying to remove sub-upl when none exists");
10565 }
10566
10567 vector_upl->upl_elems[i] = NULL;
10568 invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
10569 relaxed);
10570 if (invalid_upls == vector_upl->num_upls) {
10571 return TRUE;
10572 } else {
10573 return FALSE;
10574 }
10575 }
10576 } else {
10577 panic("vector_upl_set_subupl was passed a NULL upl element");
10578 }
10579 } else {
10580 panic("vector_upl_set_subupl was passed a non-vectored upl");
10581 }
10582 } else {
10583 panic("vector_upl_set_subupl was passed a NULL upl");
10584 }
10585
10586 return FALSE;
10587 }
10588
10589 void
vector_upl_set_pagelist(upl_t upl)10590 vector_upl_set_pagelist(upl_t upl)
10591 {
10592 if (vector_upl_is_valid(upl)) {
10593 uint32_t i = 0;
10594 vector_upl_t vector_upl = upl->vector_upl;
10595
10596 if (vector_upl) {
10597 vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
10598
10599 vector_upl->pagelist = kalloc_data(sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE), Z_WAITOK);
10600
10601 for (i = 0; i < vector_upl->num_upls; i++) {
10602 cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upl_elems[i], PAGE_MASK) / PAGE_SIZE;
10603 bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10604 pagelist_size += cur_upl_pagelist_size;
10605 if (vector_upl->upl_elems[i]->highest_page > upl->highest_page) {
10606 upl->highest_page = vector_upl->upl_elems[i]->highest_page;
10607 }
10608 }
10609 assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10610 } else {
10611 panic("vector_upl_set_pagelist was passed a non-vectored upl");
10612 }
10613 } else {
10614 panic("vector_upl_set_pagelist was passed a NULL upl");
10615 }
10616 }
10617
10618 upl_t
vector_upl_subupl_byindex(upl_t upl,uint32_t index)10619 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10620 {
10621 if (vector_upl_is_valid(upl)) {
10622 vector_upl_t vector_upl = upl->vector_upl;
10623 if (vector_upl) {
10624 if (index < vector_upl->num_upls) {
10625 return vector_upl->upl_elems[index];
10626 }
10627 } else {
10628 panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10629 }
10630 }
10631 return NULL;
10632 }
10633
/*
 * vector_upl_subupl_byoffset:
 *	Find the sub-UPL of vector UPL 'upl' whose recorded iostate range
 *	covers *upl_offset, and rewrite *upl_offset / *upl_size (both
 *	IN/OUT) from vector-relative to sub-UPL-relative terms.  Returns
 *	NULL when no element covers the offset or when the covering
 *	element has already been committed/aborted.
 */
upl_t
vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
{
	if (vector_upl_is_valid(upl)) {
		uint32_t i = 0;
		vector_upl_t vector_upl = upl->vector_upl;

		if (vector_upl) {
			upl_t subupl = NULL;
			vector_upl_iostates_t subupl_state;

			for (i = 0; i < vector_upl->num_upls; i++) {
				subupl = vector_upl->upl_elems[i];
				subupl_state = vector_upl->upl_iostates[i];
				/* first element whose iostate range reaches *upl_offset */
				if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
					/* We could have been passed an offset/size pair that belongs
					 * to an UPL element that has already been committed/aborted.
					 * If so, return NULL.
					 */
					if (subupl == NULL) {
						return NULL;
					}
					if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
						/* clip *upl_size to what this element covers */
						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
						if (*upl_size > subupl_state.size) {
							*upl_size = subupl_state.size;
						}
					}
					if (*upl_offset >= subupl_state.offset) {
						/* rebase the offset relative to this element */
						*upl_offset -= subupl_state.offset;
					} else if (i) {
						/* a later element should never start past the offset */
						panic("Vector UPL offset miscalculation");
					}
					return subupl;
				}
			}
		} else {
			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
		}
	}
	return NULL;
}
10676
10677 void
vector_upl_get_submap(upl_t upl,vm_map_t * v_upl_submap,vm_offset_t * submap_dst_addr)10678 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10679 {
10680 *v_upl_submap = NULL;
10681
10682 if (vector_upl_is_valid(upl)) {
10683 vector_upl_t vector_upl = upl->vector_upl;
10684 if (vector_upl) {
10685 *v_upl_submap = vector_upl->submap;
10686 *submap_dst_addr = vector_upl->submap_dst_addr;
10687 } else {
10688 panic("vector_upl_get_submap was passed a non-vectored UPL");
10689 }
10690 } else {
10691 panic("vector_upl_get_submap was passed a null UPL");
10692 }
10693 }
10694
10695 void
vector_upl_set_submap(upl_t upl,vm_map_t submap,vm_offset_t submap_dst_addr)10696 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10697 {
10698 if (vector_upl_is_valid(upl)) {
10699 vector_upl_t vector_upl = upl->vector_upl;
10700 if (vector_upl) {
10701 vector_upl->submap = submap;
10702 vector_upl->submap_dst_addr = submap_dst_addr;
10703 } else {
10704 panic("vector_upl_get_submap was passed a non-vectored UPL");
10705 }
10706 } else {
10707 panic("vector_upl_get_submap was passed a NULL UPL");
10708 }
10709 }
10710
10711 void
vector_upl_set_iostate(upl_t upl,upl_t subupl,upl_offset_t offset,upl_size_t size)10712 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10713 {
10714 if (vector_upl_is_valid(upl)) {
10715 uint32_t i = 0;
10716 vector_upl_t vector_upl = upl->vector_upl;
10717
10718 if (vector_upl) {
10719 for (i = 0; i < vector_upl->num_upls; i++) {
10720 if (vector_upl->upl_elems[i] == subupl) {
10721 break;
10722 }
10723 }
10724
10725 if (i == vector_upl->num_upls) {
10726 panic("setting sub-upl iostate when none exists");
10727 }
10728
10729 vector_upl->upl_iostates[i].offset = offset;
10730 if (size < PAGE_SIZE) {
10731 size = PAGE_SIZE;
10732 }
10733 vector_upl->upl_iostates[i].size = size;
10734 } else {
10735 panic("vector_upl_set_iostate was passed a non-vectored UPL");
10736 }
10737 } else {
10738 panic("vector_upl_set_iostate was passed a NULL UPL");
10739 }
10740 }
10741
10742 void
vector_upl_get_iostate(upl_t upl,upl_t subupl,upl_offset_t * offset,upl_size_t * size)10743 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10744 {
10745 if (vector_upl_is_valid(upl)) {
10746 uint32_t i = 0;
10747 vector_upl_t vector_upl = upl->vector_upl;
10748
10749 if (vector_upl) {
10750 for (i = 0; i < vector_upl->num_upls; i++) {
10751 if (vector_upl->upl_elems[i] == subupl) {
10752 break;
10753 }
10754 }
10755
10756 if (i == vector_upl->num_upls) {
10757 panic("getting sub-upl iostate when none exists");
10758 }
10759
10760 *offset = vector_upl->upl_iostates[i].offset;
10761 *size = vector_upl->upl_iostates[i].size;
10762 } else {
10763 panic("vector_upl_get_iostate was passed a non-vectored UPL");
10764 }
10765 } else {
10766 panic("vector_upl_get_iostate was passed a NULL UPL");
10767 }
10768 }
10769
10770 void
vector_upl_get_iostate_byindex(upl_t upl,uint32_t index,upl_offset_t * offset,upl_size_t * size)10771 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10772 {
10773 if (vector_upl_is_valid(upl)) {
10774 vector_upl_t vector_upl = upl->vector_upl;
10775 if (vector_upl) {
10776 if (index < vector_upl->num_upls) {
10777 *offset = vector_upl->upl_iostates[index].offset;
10778 *size = vector_upl->upl_iostates[index].size;
10779 } else {
10780 *offset = *size = 0;
10781 }
10782 } else {
10783 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
10784 }
10785 } else {
10786 panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10787 }
10788 }
10789
10790 upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)10791 upl_get_internal_vectorupl_pagelist(upl_t upl)
10792 {
10793 return ((vector_upl_t)(upl->vector_upl))->pagelist;
10794 }
10795
10796 void *
upl_get_internal_vectorupl(upl_t upl)10797 upl_get_internal_vectorupl(upl_t upl)
10798 {
10799 return upl->vector_upl;
10800 }
10801
/*
 * Byte offset from the start of an internal UPL to its embedded
 * upl_page_info_t array (the array sits immediately after struct upl,
 * matching the pointer arithmetic in upl_range_needed()).
 */
vm_size_t
upl_get_internal_pagelist_offset(void)
{
	return sizeof(struct upl);
}
10807
10808 void
upl_clear_dirty(upl_t upl,boolean_t value)10809 upl_clear_dirty(
10810 upl_t upl,
10811 boolean_t value)
10812 {
10813 if (value) {
10814 upl->flags |= UPL_CLEAR_DIRTY;
10815 } else {
10816 upl->flags &= ~UPL_CLEAR_DIRTY;
10817 }
10818 }
10819
10820 void
upl_set_referenced(upl_t upl,boolean_t value)10821 upl_set_referenced(
10822 upl_t upl,
10823 boolean_t value)
10824 {
10825 upl_lock(upl);
10826 if (value) {
10827 upl->ext_ref_count++;
10828 } else {
10829 if (!upl->ext_ref_count) {
10830 panic("upl_set_referenced not %p", upl);
10831 }
10832 upl->ext_ref_count--;
10833 }
10834 upl_unlock(upl);
10835 }
10836
10837 #if CONFIG_IOSCHED
10838 void
upl_set_blkno(upl_t upl,vm_offset_t upl_offset,int io_size,int64_t blkno)10839 upl_set_blkno(
10840 upl_t upl,
10841 vm_offset_t upl_offset,
10842 int io_size,
10843 int64_t blkno)
10844 {
10845 int i, j;
10846 if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
10847 return;
10848 }
10849
10850 assert(upl->upl_reprio_info != 0);
10851 for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
10852 UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
10853 }
10854 }
10855 #endif
10856
10857 void inline
memoryshot(unsigned int event,unsigned int control)10858 memoryshot(unsigned int event, unsigned int control)
10859 {
10860 if (vm_debug_events) {
10861 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10862 vm_page_active_count, vm_page_inactive_count,
10863 vm_page_free_count, vm_page_speculative_count,
10864 vm_page_throttled_count);
10865 } else {
10866 (void) event;
10867 (void) control;
10868 }
10869 }
10870
10871 #ifdef MACH_BSD
10872
/* TRUE if this page-list entry describes device memory. */
boolean_t
upl_device_page(upl_page_info_t *upl)
{
	return UPL_DEVICE_PAGE(upl);
}
/* TRUE if the page at 'index' is present in the UPL. */
boolean_t
upl_page_present(upl_page_info_t *upl, int index)
{
	return UPL_PAGE_PRESENT(upl, index);
}
/* TRUE if the page at 'index' was speculatively read in. */
boolean_t
upl_speculative_page(upl_page_info_t *upl, int index)
{
	return UPL_SPECULATIVE_PAGE(upl, index);
}
/* TRUE if the page at 'index' is dirty. */
boolean_t
upl_dirty_page(upl_page_info_t *upl, int index)
{
	return UPL_DIRTY_PAGE(upl, index);
}
/* TRUE if the page at 'index' holds valid data. */
boolean_t
upl_valid_page(upl_page_info_t *upl, int index)
{
	return UPL_VALID_PAGE(upl, index);
}
/* Physical page number of the page at 'index'. */
ppnum_t
upl_phys_page(upl_page_info_t *upl, int index)
{
	return UPL_PHYS_PAGE(upl, index);
}

/* Set the caller-owned 'mark' bit on the page-list entry at 'index'. */
void
upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
{
	upl[index].mark = v;
}

/* Read back the caller-owned 'mark' bit of the entry at 'index'. */
boolean_t
upl_page_get_mark(upl_page_info_t *upl, int index)
{
	return upl[index].mark;
}
10915
/*
 * vm_countdirtypages:
 *	Debug/statistics helper.  Walks the inactive, throttled and
 *	anonymous page queues and prints their combined dirty /
 *	free_when_done / precious counts ("IN Q"), then walks the active
 *	queue and prints its counts ("AC Q").  Each queue is traversed
 *	under the page-queues lock, which is dropped between queues.
 */
void
vm_countdirtypages(void)
{
	vm_page_t m;
	int dpages;
	int pgopages;
	int precpages;


	dpages = 0;
	pgopages = 0;
	precpages = 0;

	/* inactive queue */
	vm_page_lock_queues();
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
	do {
		if (m == (vm_page_t)0) {
			break;
		}

		if (m->vmp_dirty) {
			dpages++;
		}
		if (m->vmp_free_when_done) {
			pgopages++;
		}
		if (m->vmp_precious) {
			precpages++;
		}

		assert(VM_PAGE_OBJECT(m) != kernel_object);
		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
		if (m == (vm_page_t)0) {
			break;
		}
	} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
	vm_page_unlock_queues();

	/* throttled queue: every page here is dirty by definition */
	vm_page_lock_queues();
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
	do {
		if (m == (vm_page_t)0) {
			break;
		}

		dpages++;
		assert(m->vmp_dirty);
		assert(!m->vmp_free_when_done);
		assert(VM_PAGE_OBJECT(m) != kernel_object);
		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
		if (m == (vm_page_t)0) {
			break;
		}
	} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
	vm_page_unlock_queues();

	/* anonymous queue */
	vm_page_lock_queues();
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
	do {
		if (m == (vm_page_t)0) {
			break;
		}

		if (m->vmp_dirty) {
			dpages++;
		}
		if (m->vmp_free_when_done) {
			pgopages++;
		}
		if (m->vmp_precious) {
			precpages++;
		}

		assert(VM_PAGE_OBJECT(m) != kernel_object);
		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
		if (m == (vm_page_t)0) {
			break;
		}
	} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
	vm_page_unlock_queues();

	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);

	/* restart the tallies for the active queue */
	dpages = 0;
	pgopages = 0;
	precpages = 0;

	vm_page_lock_queues();
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);

	do {
		if (m == (vm_page_t)0) {
			break;
		}
		if (m->vmp_dirty) {
			dpages++;
		}
		if (m->vmp_free_when_done) {
			pgopages++;
		}
		if (m->vmp_precious) {
			precpages++;
		}

		assert(VM_PAGE_OBJECT(m) != kernel_object);
		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
		if (m == (vm_page_t)0) {
			break;
		}
	} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
	vm_page_unlock_queues();

	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
}
11030 #endif /* MACH_BSD */
11031
11032
11033 #if CONFIG_IOSCHED
11034 int
upl_get_cached_tier(upl_t upl)11035 upl_get_cached_tier(upl_t upl)
11036 {
11037 assert(upl);
11038 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
11039 return upl->upl_priority;
11040 }
11041 return -1;
11042 }
11043 #endif /* CONFIG_IOSCHED */
11044
11045
11046 void
upl_callout_iodone(upl_t upl)11047 upl_callout_iodone(upl_t upl)
11048 {
11049 struct upl_io_completion *upl_ctx = upl->upl_iodone;
11050
11051 if (upl_ctx) {
11052 void (*iodone_func)(void *, int) = upl_ctx->io_done;
11053
11054 assert(upl_ctx->io_done);
11055
11056 (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
11057 }
11058 }
11059
11060 void
upl_set_iodone(upl_t upl,void * upl_iodone)11061 upl_set_iodone(upl_t upl, void *upl_iodone)
11062 {
11063 upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
11064 }
11065
11066 void
upl_set_iodone_error(upl_t upl,int error)11067 upl_set_iodone_error(upl_t upl, int error)
11068 {
11069 struct upl_io_completion *upl_ctx = upl->upl_iodone;
11070
11071 if (upl_ctx) {
11072 upl_ctx->io_error = error;
11073 }
11074 }
11075
11076
11077 ppnum_t
upl_get_highest_page(upl_t upl)11078 upl_get_highest_page(
11079 upl_t upl)
11080 {
11081 return upl->highest_page;
11082 }
11083
11084 upl_size_t
upl_get_size(upl_t upl)11085 upl_get_size(
11086 upl_t upl)
11087 {
11088 return upl_adjusted_size(upl, PAGE_MASK);
11089 }
11090
11091 upl_size_t
upl_adjusted_size(upl_t upl,vm_map_offset_t pgmask)11092 upl_adjusted_size(
11093 upl_t upl,
11094 vm_map_offset_t pgmask)
11095 {
11096 vm_object_offset_t start_offset, end_offset;
11097
11098 start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
11099 end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
11100
11101 return (upl_size_t)(end_offset - start_offset);
11102 }
11103
11104 vm_object_offset_t
upl_adjusted_offset(upl_t upl,vm_map_offset_t pgmask)11105 upl_adjusted_offset(
11106 upl_t upl,
11107 vm_map_offset_t pgmask)
11108 {
11109 return trunc_page_mask_64(upl->u_offset, pgmask);
11110 }
11111
11112 vm_object_offset_t
upl_get_data_offset(upl_t upl)11113 upl_get_data_offset(
11114 upl_t upl)
11115 {
11116 return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
11117 }
11118
11119 upl_t
upl_associated_upl(upl_t upl)11120 upl_associated_upl(upl_t upl)
11121 {
11122 return upl->associated_upl;
11123 }
11124
11125 void
upl_set_associated_upl(upl_t upl,upl_t associated_upl)11126 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
11127 {
11128 upl->associated_upl = associated_upl;
11129 }
11130
11131 struct vnode *
upl_lookup_vnode(upl_t upl)11132 upl_lookup_vnode(upl_t upl)
11133 {
11134 if (!upl->map_object->internal) {
11135 return vnode_pager_lookup_vnode(upl->map_object->pager);
11136 } else {
11137 return NULL;
11138 }
11139 }
11140
11141 #if UPL_DEBUG
11142 kern_return_t
upl_ubc_alias_set(upl_t upl,uintptr_t alias1,uintptr_t alias2)11143 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
11144 {
11145 upl->ubc_alias1 = alias1;
11146 upl->ubc_alias2 = alias2;
11147 return KERN_SUCCESS;
11148 }
11149 int
upl_ubc_alias_get(upl_t upl,uintptr_t * al,uintptr_t * al2)11150 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
11151 {
11152 if (al) {
11153 *al = upl->ubc_alias1;
11154 }
11155 if (al2) {
11156 *al2 = upl->ubc_alias2;
11157 }
11158 return KERN_SUCCESS;
11159 }
11160 #endif /* UPL_DEBUG */
11161
11162 #if VM_PRESSURE_EVENTS
11163 /*
11164 * Upward trajectory.
11165 */
11166 extern boolean_t vm_compressor_low_on_space(void);
11167
11168 boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)11169 VM_PRESSURE_NORMAL_TO_WARNING(void)
11170 {
11171 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11172 /* Available pages below our threshold */
11173 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11174 /* No frozen processes to kill */
11175 if (memorystatus_frozen_count == 0) {
11176 /* Not enough suspended processes available. */
11177 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11178 return TRUE;
11179 }
11180 }
11181 }
11182 return FALSE;
11183 } else {
11184 return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
11185 }
11186 }
11187
11188 boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)11189 VM_PRESSURE_WARNING_TO_CRITICAL(void)
11190 {
11191 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11192 /* Available pages below our threshold */
11193 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11194 return TRUE;
11195 }
11196 return FALSE;
11197 } else {
11198 return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11199 }
11200 }
11201
11202 /*
11203 * Downward trajectory.
11204 */
11205 boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)11206 VM_PRESSURE_WARNING_TO_NORMAL(void)
11207 {
11208 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11209 /* Available pages above our threshold */
11210 unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
11211 if (memorystatus_available_pages > target_threshold) {
11212 return TRUE;
11213 }
11214 return FALSE;
11215 } else {
11216 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
11217 }
11218 }
11219
11220 boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)11221 VM_PRESSURE_CRITICAL_TO_WARNING(void)
11222 {
11223 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11224 /* Available pages above our threshold */
11225 unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
11226 if (memorystatus_available_pages > target_threshold) {
11227 return TRUE;
11228 }
11229 return FALSE;
11230 } else {
11231 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11232 }
11233 }
11234 #endif /* VM_PRESSURE_EVENTS */
11235
11236 #if DEVELOPMENT || DEBUG
/* True while a compressor benchmark run is in flight; checked under the
 * page-queues lock to reject concurrent runs. */
bool compressor_running_perf_test;
/* Pages the compressor has processed for the current benchmark run;
 * incremented elsewhere (compressor side) and waited on here. */
uint64_t compressor_perf_test_pages_processed;

/* Forward declaration; definition below. Reports elapsed time (ns), bytes
 * submitted, and compressor pool growth for a benchmark over `buf`. */
kern_return_t
run_compressor_perf_test(
	user_addr_t buf,
	size_t buffer_size,
	uint64_t *time,
	uint64_t *bytes_compressed,
	uint64_t *compressor_growth);
11247
11248 static kern_return_t
move_pages_to_queue(vm_map_t map,user_addr_t start_addr,size_t buffer_size,vm_page_queue_head_t * queue,size_t * pages_moved)11249 move_pages_to_queue(
11250 vm_map_t map,
11251 user_addr_t start_addr,
11252 size_t buffer_size,
11253 vm_page_queue_head_t *queue,
11254 size_t *pages_moved)
11255 {
11256 kern_return_t err = KERN_SUCCESS;
11257 vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
11258 boolean_t addr_in_map = FALSE;
11259 user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
11260 vm_object_t curr_object = VM_OBJECT_NULL;
11261 *pages_moved = 0;
11262
11263
11264 if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
11265 /*
11266 * We don't currently support benchmarking maps with a different page size
11267 * than the kernel.
11268 */
11269 return KERN_INVALID_ARGUMENT;
11270 }
11271
11272 if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
11273 return KERN_INVALID_ARGUMENT;
11274 }
11275
11276 vm_map_lock_read(map);
11277 curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
11278 end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
11279
11280
11281 while (curr_addr < end_addr) {
11282 addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
11283 if (!addr_in_map) {
11284 err = KERN_INVALID_ARGUMENT;
11285 break;
11286 }
11287 curr_object = VME_OBJECT(curr_entry);
11288 if (curr_object) {
11289 vm_object_lock(curr_object);
11290 /* We really only want anonymous memory that's in the top level map and object here. */
11291 if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
11292 curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
11293 err = KERN_INVALID_ARGUMENT;
11294 vm_object_unlock(curr_object);
11295 break;
11296 }
11297 vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
11298 vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
11299 (curr_entry->vme_start + VME_OFFSET(curr_entry));
11300 vm_map_offset_t curr_offset = start_offset;
11301 vm_page_t curr_page;
11302 while (curr_offset < end_offset) {
11303 curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
11304 if (curr_page != VM_PAGE_NULL) {
11305 vm_page_lock_queues();
11306 if (curr_page->vmp_laundry) {
11307 vm_pageout_steal_laundry(curr_page, TRUE);
11308 }
11309 /*
11310 * we've already factored out pages in the laundry which
11311 * means this page can't be on the pageout queue so it's
11312 * safe to do the vm_page_queues_remove
11313 */
11314 bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
11315 vm_page_queues_remove(curr_page, TRUE);
11316 if (donate) {
11317 /*
11318 * The compressor needs to see this bit to know
11319 * where this page needs to land. Also if stolen,
11320 * this bit helps put the page back in the right
11321 * special queue where it belongs.
11322 */
11323 curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
11324 }
11325 // Clear the referenced bit so we ensure this gets paged out
11326 curr_page->vmp_reference = false;
11327 if (curr_page->vmp_pmapped) {
11328 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
11329 VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
11330 }
11331 vm_page_queue_enter(queue, curr_page, vmp_pageq);
11332 vm_page_unlock_queues();
11333 *pages_moved += 1;
11334 }
11335 curr_offset += PAGE_SIZE_64;
11336 curr_addr += PAGE_SIZE_64;
11337 }
11338 }
11339 vm_object_unlock(curr_object);
11340 }
11341 vm_map_unlock_read(map);
11342 return err;
11343 }
11344
/*
 * Local queue for processing benchmark pages.
 * Can't be allocated on the stack because the pointer has to
 * be packable (VM_PAGE_PACKED_ALIGNED placement requirement).
 */
vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
/*
 * Measure compressor throughput over the caller-supplied user buffer.
 *
 * Moves the buffer's resident anonymous pages onto a private queue,
 * pushes them through vm_pageout_page_queue() into the compressor, and
 * waits until the compressor reports them processed.  Outputs:
 *   *time              - elapsed nanoseconds for the compression pass
 *   *bytes_compressed  - bytes submitted (pages processed * PAGE_SIZE)
 *   *compressor_growth - growth of c_segment_compressed_bytes during the run
 *
 * Returns KERN_NOT_SUPPORTED without an active compressor,
 * KERN_INVALID_ARGUMENT for the kernel task or an unsuitable buffer, and
 * KERN_RESOURCE_SHORTAGE if a benchmark is already running.
 */
kern_return_t
run_compressor_perf_test(
	user_addr_t buf,
	size_t buffer_size,
	uint64_t *time,
	uint64_t *bytes_compressed,
	uint64_t *compressor_growth)
{
	kern_return_t err = KERN_SUCCESS;
	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		/* Nothing to benchmark without a compressor configured. */
		return KERN_NOT_SUPPORTED;
	}
	if (current_task() == kernel_task) {
		/* Pages are sourced from the calling task's user map. */
		return KERN_INVALID_ARGUMENT;
	}
	vm_page_lock_queues();
	if (compressor_running_perf_test) {
		/* Only run one instance of the benchmark at a time. */
		vm_page_unlock_queues();
		return KERN_RESOURCE_SHORTAGE;
	}
	/* NOTE(review): the flag is tested here but only set further down,
	 * under a separate lock hold — presumably callers are otherwise
	 * serialized; confirm there is no window for two runs to both pass
	 * this check. */
	vm_page_unlock_queues();
	size_t page_count = 0;
	vm_map_t map;
	vm_page_t p, next;
	uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
	uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
	*bytes_compressed = *compressor_growth = 0;

	vm_page_queue_init(&compressor_perf_test_queue);
	map = current_task()->map;
	err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
	if (err != KERN_SUCCESS) {
		/* Queue may still hold pages moved before the failure; the
		 * cleanup path below requeues them. */
		goto out;
	}

	vm_page_lock_queues();
	compressor_running_perf_test = true;
	compressor_perf_test_pages_processed = 0;
	/*
	 * At this point the compressor threads should only process the benchmark queue
	 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
	 * to determine how many compressed bytes we ended up using.
	 */
	compressed_bytes_start = c_segment_compressed_bytes;
	vm_page_unlock_queues();

	compressor_perf_test_start = mach_absolute_time();
	page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);

	vm_page_lock_queues();
	/*
	 * Depending on when this test is run we could overshoot or be right on the mark
	 * with our page_count. So the comparison is of the _less than_ variety.
	 */
	while (compressor_perf_test_pages_processed < page_count) {
		/* Sleep until the compressor wakes us via this counter's address. */
		assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
		vm_page_unlock_queues();
		thread_block(THREAD_CONTINUE_NULL);
		vm_page_lock_queues();
	}
	compressor_perf_test_end = mach_absolute_time();
	compressed_bytes_end = c_segment_compressed_bytes;
	vm_page_unlock_queues();


out:
	/*
	 * If we errored out above, then we could still have some pages
	 * on the local queue. Make sure to put them back on the active queue before
	 * returning so they're not orphaned.
	 */
	vm_page_lock_queues();
	/* On the error path both timestamps are still 0, so *time is set to 0.
	 * NOTE(review): this conversion runs with the page-queues lock held —
	 * presumably cheap arithmetic; confirm. */
	absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
	p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
	while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
		/* Grab the successor before requeueing, since enqueueing rewrites vmp_pageq. */
		next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);

		vm_page_enqueue_active(p, FALSE);
		p = next;
	}

	compressor_running_perf_test = false;
	vm_page_unlock_queues();
	if (err == KERN_SUCCESS) {
		*bytes_compressed = page_count * PAGE_SIZE_64;
		*compressor_growth = compressed_bytes_end - compressed_bytes_start;
	}

	/*
	 * pageout_scan will consider waking the compactor swapper
	 * before it blocks. Do the same thing here before we return
	 * to ensure that back to back benchmark runs can't overly fragment the
	 * compressor pool.
	 */
	vm_consider_waking_compactor_swapper();
	return err;
}
11449 #endif /* DEVELOPMENT || DEBUG */
11450