1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67 #include <ptrauth.h>
68
69 #include <debug.h>
70 #include <mach_pagemap.h>
71 #include <mach_cluster_stats.h>
72
73 #include <mach/mach_types.h>
74 #include <mach/memory_object.h>
75 #include <mach/memory_object_default.h>
76 #include <mach/memory_object_control_server.h>
77 #include <mach/mach_host_server.h>
78 #include <mach/upl.h>
79 #include <mach/vm_map.h>
80 #include <mach/vm_param.h>
81 #include <mach/vm_statistics.h>
82 #include <mach/sdt.h>
83
84 #include <kern/kern_types.h>
85 #include <kern/counter.h>
86 #include <kern/host_statistics.h>
87 #include <kern/machine.h>
88 #include <kern/misc_protos.h>
89 #include <kern/sched.h>
90 #include <kern/thread.h>
91 #include <kern/kalloc.h>
92 #include <kern/zalloc_internal.h>
93 #include <kern/policy_internal.h>
94 #include <kern/thread_group.h>
95
96 #include <machine/vm_tuning.h>
97 #include <machine/commpage.h>
98
99 #include <vm/pmap.h>
100 #include <vm/vm_compressor_pager.h>
101 #include <vm/vm_fault.h>
102 #include <vm/vm_map.h>
103 #include <vm/vm_object.h>
104 #include <vm/vm_page.h>
105 #include <vm/vm_pageout.h>
106 #include <vm/vm_protos.h> /* must be last */
107 #include <vm/memory_object.h>
108 #include <vm/vm_purgeable_internal.h>
109 #include <vm/vm_shared_region.h>
110 #include <vm/vm_compressor.h>
111
112 #include <san/kasan.h>
113
114 #if CONFIG_PHANTOM_CACHE
115 #include <vm/vm_phantom_cache.h>
116 #endif
117
118 #if UPL_DEBUG
119 #include <libkern/OSDebug.h>
120 #endif
121
122 extern int cs_debug;
123
124 extern void mbuf_drain(boolean_t);
125
126 #if VM_PRESSURE_EVENTS
127 #if CONFIG_JETSAM
128 extern unsigned int memorystatus_available_pages;
129 extern unsigned int memorystatus_available_pages_pressure;
130 extern unsigned int memorystatus_available_pages_critical;
131 #else /* CONFIG_JETSAM */
132 extern uint64_t memorystatus_available_pages;
133 extern uint64_t memorystatus_available_pages_pressure;
134 extern uint64_t memorystatus_available_pages_critical;
135 #endif /* CONFIG_JETSAM */
136
137 extern unsigned int memorystatus_frozen_count;
138 extern unsigned int memorystatus_suspended_count;
139 extern vm_pressure_level_t memorystatus_vm_pressure_level;
140
141 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
142 extern uint32_t memorystatus_jetsam_fg_band_waiters;
143
144 void vm_pressure_response(void);
145 extern void consider_vm_pressure_events(void);
146
147 #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
148 #endif /* VM_PRESSURE_EVENTS */
149
150 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
151 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
152 boolean_t vps_dynamic_priority_enabled = FALSE;
153
/*
 * Pageout tunables.  Each one is only defined here when no earlier
 * definition exists, so any of them may be overridden ahead of this point.
 */

/* maximum iterations of the inactive queue w/o stealing/cleaning a page */
#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
#if XNU_TARGET_OS_OSX
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
#else /* XNU_TARGET_OS_OSX */
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
#endif /* XNU_TARGET_OS_OSX */
#endif /* VM_PAGEOUT_BURST_INACTIVE_THROTTLE */

/* number of pages to move to break deadlock */
#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
#define VM_PAGEOUT_DEADLOCK_RELIEF 100
#endif /* VM_PAGEOUT_DEADLOCK_RELIEF */

/* maximum pageouts on a given pageout queue */
#ifndef VM_PAGE_LAUNDRY_MAX
#define VM_PAGE_LAUNDRY_MAX 128UL
#endif /* VM_PAGE_LAUNDRY_MAX */

/*
 * Wait intervals, all expressed in milliseconds.
 */
#ifndef VM_PAGEOUT_BURST_WAIT
#define VM_PAGEOUT_BURST_WAIT 1
#endif /* VM_PAGEOUT_BURST_WAIT */

#ifndef VM_PAGEOUT_EMPTY_WAIT
#define VM_PAGEOUT_EMPTY_WAIT 50
#endif /* VM_PAGEOUT_EMPTY_WAIT */

#ifndef VM_PAGEOUT_DEADLOCK_WAIT
#define VM_PAGEOUT_DEADLOCK_WAIT 100
#endif /* VM_PAGEOUT_DEADLOCK_WAIT */

#ifndef VM_PAGEOUT_IDLE_WAIT
#define VM_PAGEOUT_IDLE_WAIT 10
#endif /* VM_PAGEOUT_IDLE_WAIT */

#ifndef VM_PAGEOUT_SWAP_WAIT
#define VM_PAGEOUT_SWAP_WAIT 10
#endif /* VM_PAGEOUT_SWAP_WAIT */
189
190
/*
 * Speculative queue target for "total" pages, derived from
 * vm_pageout_state.vm_page_speculative_percentage.
 * NOTE(review): the divisor is (100 / percentage) in integer arithmetic, so
 * a percentage of 0 or above 100 would divide by zero -- presumably the
 * percentage is range-limited where it is set; confirm.
 */
#ifndef VM_PAGE_SPECULATIVE_TARGET
#define VM_PAGE_SPECULATIVE_TARGET(total) \
	((total) / (100 / vm_pageout_state.vm_page_speculative_percentage))
#endif /* VM_PAGE_SPECULATIVE_TARGET */


/*
 *	For a usable LRU approximation the inactive queue has to be
 *	big enough that a page sitting on it gets a chance to be
 *	referenced again.  This macro gives the fraction of
 *	active+inactive pages that should be inactive (one half); the
 *	pageout daemon uses it to update vm_page_inactive_target.
 *
 *	The pageout daemon starts running when vm_page_free_count is
 *	below vm_page_free_target and vm_page_inactive_count is below
 *	vm_page_inactive_target.
 */

#ifndef VM_PAGE_INACTIVE_TARGET
#define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) / 2)
#endif /* VM_PAGE_INACTIVE_TARGET */

/*
 *	Once running, the pageout daemon continues until
 *	vm_page_free_count meets or exceeds vm_page_free_target.
 */

#ifndef VM_PAGE_FREE_TARGET
#if XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
#else /* XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
#endif /* XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_FREE_TARGET */


/*
 *	The pageout daemon always starts running once
 *	vm_page_free_count falls below vm_page_free_min.
 */

#ifndef VM_PAGE_FREE_MIN
#if XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
#else /* XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
#endif /* XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_FREE_MIN */

/* Per-platform limits for the reserved, min and target free-page values. */
#if XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_RESERVED_LIMIT     1700
#define VM_PAGE_FREE_MIN_LIMIT          3500
#define VM_PAGE_FREE_TARGET_LIMIT       4000
#else /* XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_RESERVED_LIMIT     100
#define VM_PAGE_FREE_MIN_LIMIT          1500
#define VM_PAGE_FREE_TARGET_LIMIT       2000
#endif /* XNU_TARGET_OS_OSX */

/*
 *	Below vm_page_free_reserved free pages, only vm-privileged
 *	threads may allocate.  vm-privilege lets the pageout daemon and
 *	default pager (plus any other threads default pageout depends
 *	on) keep operating by dipping into the reserved pool.
 */

#ifndef VM_PAGE_FREE_RESERVED
#define VM_PAGE_FREE_RESERVED(n)        \
	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
#endif /* VM_PAGE_FREE_RESERVED */

/*
 *	Pages dequeued from the inactive list are reactivated (put back
 *	on the active queue) when they turn out to be referenced.  If
 *	other processors reference pages faster than the referenced bit
 *	can be turned off, the free list could starve, so the number of
 *	reactivations per call of vm_pageout_scan() is limited.
 */
#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000

/*
 * NOTE(review): in the XNU_TARGET_OS_OSX variant MAX() makes
 * VM_PAGE_REACTIVATE_LIMIT_MAX a lower bound on the result, despite its
 * name -- confirm that this is intended.
 */
#ifndef VM_PAGE_REACTIVATE_LIMIT
#if XNU_TARGET_OS_OSX
#define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) / 20, VM_PAGE_REACTIVATE_LIMIT_MAX))
#else /* XNU_TARGET_OS_OSX */
#define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
#endif /* XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_REACTIVATE_LIMIT */
#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
280
281 extern boolean_t hibernate_cleaning_in_progress;
282
283 /*
284 * Forward declarations for internal routines.
285 */
286 struct cq {
287 struct vm_pageout_queue *q;
288 void *current_chead;
289 char *scratch_buf;
290 int id;
291 };
292
293 struct cq ciq[MAX_COMPRESSOR_THREAD_COUNT];
294
295
296 #if VM_PRESSURE_EVENTS
297 void vm_pressure_thread(void);
298
299 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
300 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
301
302 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
303 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
304 #endif
305
306 static void vm_pageout_iothread_external(void);
307 static void vm_pageout_iothread_internal(struct cq *cq);
308 static void vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *, boolean_t);
309
310 extern void vm_pageout_continue(void);
311 extern void vm_pageout_scan(void);
312
313 boolean_t vm_pageout_running = FALSE;
314
315 uint32_t vm_page_upl_tainted = 0;
316 uint32_t vm_page_iopl_tainted = 0;
317
318 #if XNU_TARGET_OS_OSX
319 static boolean_t vm_pageout_waiter = FALSE;
320 #endif /* XNU_TARGET_OS_OSX */
321
322
323 #if DEVELOPMENT || DEBUG
324 struct vm_pageout_debug vm_pageout_debug;
325 #endif
326 struct vm_pageout_vminfo vm_pageout_vminfo;
327 struct vm_pageout_state vm_pageout_state;
328 struct vm_config vm_config;
329
330 struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
331 struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
332
333 int vm_upl_wait_for_pages = 0;
334 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
335
336 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
337
338 int vm_debug_events = 0;
339
340 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
341
342 #if CONFIG_MEMORYSTATUS
343 extern boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async);
344
345 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
346 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
347
348 #endif
349
350 #if __AMP__
351 int vm_compressor_ebound = 1;
352 int vm_pgo_pbound = 0;
353 extern void thread_bind_cluster_type(thread_t, char, bool);
354 #endif /* __AMP__ */
355
356
357 /*
358 * Routine: vm_pageout_object_terminate
359 * Purpose:
360 * Destroy the pageout_object, and perform all of the
361 * required cleanup actions.
362 *
363 * In/Out conditions:
364 * The object must be locked, and will be returned locked.
365 */
366 void
vm_pageout_object_terminate(vm_object_t object)367 vm_pageout_object_terminate(
368 vm_object_t object)
369 {
370 vm_object_t shadow_object;
371
372 /*
373 * Deal with the deallocation (last reference) of a pageout object
374 * (used for cleaning-in-place) by dropping the paging references/
375 * freeing pages in the original object.
376 */
377
378 assert(object->pageout);
379 shadow_object = object->shadow;
380 vm_object_lock(shadow_object);
381
382 while (!vm_page_queue_empty(&object->memq)) {
383 vm_page_t p, m;
384 vm_object_offset_t offset;
385
386 p = (vm_page_t) vm_page_queue_first(&object->memq);
387
388 assert(p->vmp_private);
389 assert(p->vmp_free_when_done);
390 p->vmp_free_when_done = FALSE;
391 assert(!p->vmp_cleaning);
392 assert(!p->vmp_laundry);
393
394 offset = p->vmp_offset;
395 VM_PAGE_FREE(p);
396 p = VM_PAGE_NULL;
397
398 m = vm_page_lookup(shadow_object,
399 offset + object->vo_shadow_offset);
400
401 if (m == VM_PAGE_NULL) {
402 continue;
403 }
404
405 assert((m->vmp_dirty) || (m->vmp_precious) ||
406 (m->vmp_busy && m->vmp_cleaning));
407
408 /*
409 * Handle the trusted pager throttle.
410 * Also decrement the burst throttle (if external).
411 */
412 vm_page_lock_queues();
413 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
414 vm_pageout_throttle_up(m);
415 }
416
417 /*
418 * Handle the "target" page(s). These pages are to be freed if
419 * successfully cleaned. Target pages are always busy, and are
420 * wired exactly once. The initial target pages are not mapped,
421 * (so cannot be referenced or modified) but converted target
422 * pages may have been modified between the selection as an
423 * adjacent page and conversion to a target.
424 */
425 if (m->vmp_free_when_done) {
426 assert(m->vmp_busy);
427 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
428 assert(m->vmp_wire_count == 1);
429 m->vmp_cleaning = FALSE;
430 m->vmp_free_when_done = FALSE;
431 /*
432 * Revoke all access to the page. Since the object is
433 * locked, and the page is busy, this prevents the page
434 * from being dirtied after the pmap_disconnect() call
435 * returns.
436 *
437 * Since the page is left "dirty" but "not modifed", we
438 * can detect whether the page was redirtied during
439 * pageout by checking the modify state.
440 */
441 if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
442 SET_PAGE_DIRTY(m, FALSE);
443 } else {
444 m->vmp_dirty = FALSE;
445 }
446
447 if (m->vmp_dirty) {
448 vm_page_unwire(m, TRUE); /* reactivates */
449 counter_inc(&vm_statistics_reactivations);
450 PAGE_WAKEUP_DONE(m);
451 } else {
452 vm_page_free(m); /* clears busy, etc. */
453 }
454 vm_page_unlock_queues();
455 continue;
456 }
457 /*
458 * Handle the "adjacent" pages. These pages were cleaned in
459 * place, and should be left alone.
460 * If prep_pin_count is nonzero, then someone is using the
461 * page, so make it active.
462 */
463 if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
464 if (m->vmp_reference) {
465 vm_page_activate(m);
466 } else {
467 vm_page_deactivate(m);
468 }
469 }
470 if (m->vmp_overwriting) {
471 /*
472 * the (COPY_OUT_FROM == FALSE) request_page_list case
473 */
474 if (m->vmp_busy) {
475 /*
476 * We do not re-set m->vmp_dirty !
477 * The page was busy so no extraneous activity
478 * could have occurred. COPY_INTO is a read into the
479 * new pages. CLEAN_IN_PLACE does actually write
480 * out the pages but handling outside of this code
481 * will take care of resetting dirty. We clear the
482 * modify however for the Programmed I/O case.
483 */
484 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
485
486 m->vmp_busy = FALSE;
487 m->vmp_absent = FALSE;
488 } else {
489 /*
490 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
491 * Occurs when the original page was wired
492 * at the time of the list request
493 */
494 assert(VM_PAGE_WIRED(m));
495 vm_page_unwire(m, TRUE); /* reactivates */
496 }
497 m->vmp_overwriting = FALSE;
498 } else {
499 m->vmp_dirty = FALSE;
500 }
501 m->vmp_cleaning = FALSE;
502
503 /*
504 * Wakeup any thread waiting for the page to be un-cleaning.
505 */
506 PAGE_WAKEUP(m);
507 vm_page_unlock_queues();
508 }
509 /*
510 * Account for the paging reference taken in vm_paging_object_allocate.
511 */
512 vm_object_activity_end(shadow_object);
513 vm_object_unlock(shadow_object);
514
515 assert(object->ref_count == 0);
516 assert(object->paging_in_progress == 0);
517 assert(object->activity_in_progress == 0);
518 assert(object->resident_page_count == 0);
519 return;
520 }
521
522 /*
523 * Routine: vm_pageclean_setup
524 *
525 * Purpose: setup a page to be cleaned (made non-dirty), but not
526 * necessarily flushed from the VM page cache.
527 * This is accomplished by cleaning in place.
528 *
529 * The page must not be busy, and new_object
530 * must be locked.
531 *
532 */
533 static void
vm_pageclean_setup(vm_page_t m,vm_page_t new_m,vm_object_t new_object,vm_object_offset_t new_offset)534 vm_pageclean_setup(
535 vm_page_t m,
536 vm_page_t new_m,
537 vm_object_t new_object,
538 vm_object_offset_t new_offset)
539 {
540 assert(!m->vmp_busy);
541 #if 0
542 assert(!m->vmp_cleaning);
543 #endif
544
545 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
546
547 /*
548 * Mark original page as cleaning in place.
549 */
550 m->vmp_cleaning = TRUE;
551 SET_PAGE_DIRTY(m, FALSE);
552 m->vmp_precious = FALSE;
553
554 /*
555 * Convert the fictitious page to a private shadow of
556 * the real page.
557 */
558 assert(new_m->vmp_fictitious);
559 assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
560 new_m->vmp_fictitious = FALSE;
561 new_m->vmp_private = TRUE;
562 new_m->vmp_free_when_done = TRUE;
563 VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
564
565 vm_page_lockspin_queues();
566 vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
567 vm_page_unlock_queues();
568
569 vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
570 assert(!new_m->vmp_wanted);
571 new_m->vmp_busy = FALSE;
572 }
573
574 /*
575 * Routine: vm_pageout_initialize_page
576 * Purpose:
577 * Causes the specified page to be initialized in
578 * the appropriate memory object. This routine is used to push
579 * pages into a copy-object when they are modified in the
580 * permanent object.
581 *
582 * The page is moved to a temporary object and paged out.
583 *
584 * In/out conditions:
585 * The page in question must not be on any pageout queues.
586 * The object to which it belongs must be locked.
587 * The page must be busy, but not hold a paging reference.
588 *
589 * Implementation:
590 * Move this page to a completely new object.
591 */
592 void
vm_pageout_initialize_page(vm_page_t m)593 vm_pageout_initialize_page(
594 vm_page_t m)
595 {
596 vm_object_t object;
597 vm_object_offset_t paging_offset;
598 memory_object_t pager;
599
600 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
601
602 object = VM_PAGE_OBJECT(m);
603
604 assert(m->vmp_busy);
605 assert(object->internal);
606
607 /*
608 * Verify that we really want to clean this page
609 */
610 assert(!m->vmp_absent);
611 assert(!m->vmp_error);
612 assert(m->vmp_dirty);
613
614 /*
615 * Create a paging reference to let us play with the object.
616 */
617 paging_offset = m->vmp_offset + object->paging_offset;
618
619 if (m->vmp_absent || m->vmp_error || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
620 panic("reservation without pageout?"); /* alan */
621
622 VM_PAGE_FREE(m);
623 vm_object_unlock(object);
624
625 return;
626 }
627
628 /*
629 * If there's no pager, then we can't clean the page. This should
630 * never happen since this should be a copy object and therefore not
631 * an external object, so the pager should always be there.
632 */
633
634 pager = object->pager;
635
636 if (pager == MEMORY_OBJECT_NULL) {
637 panic("missing pager for copy object");
638
639 VM_PAGE_FREE(m);
640 return;
641 }
642
643 /*
644 * set the page for future call to vm_fault_list_request
645 */
646 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
647 SET_PAGE_DIRTY(m, FALSE);
648
649 /*
650 * keep the object from collapsing or terminating
651 */
652 vm_object_paging_begin(object);
653 vm_object_unlock(object);
654
655 /*
656 * Write the data to its pager.
657 * Note that the data is passed by naming the new object,
658 * not a virtual address; the pager interface has been
659 * manipulated to use the "internal memory" data type.
660 * [The object reference from its allocation is donated
661 * to the eventual recipient.]
662 */
663 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
664
665 vm_object_lock(object);
666 vm_object_paging_end(object);
667 }
668
669
670 /*
671 * vm_pageout_cluster:
672 *
673 * Given a page, queue it to the appropriate I/O thread,
674 * which will page it out and attempt to clean adjacent pages
675 * in the same operation.
676 *
677 * The object and queues must be locked. We will take a
678 * paging reference to prevent deallocation or collapse when we
679 * release the object lock back at the call site. The I/O thread
680 * is responsible for consuming this reference
681 *
682 * The page must not be on any pageout queue.
683 */
#if DEVELOPMENT || DEBUG
/*
 * Compressor-thread bookkeeping, only present in DEVELOPMENT and DEBUG
 * builds.
 */
vmct_stats_t vmct_stats;

int32_t vmct_active = 0;
uint64_t vm_compressor_epoch_start = 0;
uint64_t vm_compressor_epoch_stop = 0;

/* State recorded per compressor thread in vmct_state[]. */
typedef enum vmct_state_t {
	VMCT_IDLE,
	VMCT_AWAKENED,
	VMCT_ACTIVE,
} vmct_state_t;

vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
#endif /* DEVELOPMENT || DEBUG */
698
699
700 void
vm_pageout_cluster(vm_page_t m)701 vm_pageout_cluster(vm_page_t m)
702 {
703 vm_object_t object = VM_PAGE_OBJECT(m);
704 struct vm_pageout_queue *q;
705
706 VM_PAGE_CHECK(m);
707 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
708 vm_object_lock_assert_exclusive(object);
709
710 /*
711 * Only a certain kind of page is appreciated here.
712 */
713 assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
714 assert(!m->vmp_cleaning && !m->vmp_laundry);
715 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
716
717 /*
718 * protect the object from collapse or termination
719 */
720 vm_object_activity_begin(object);
721
722 if (object->internal == TRUE) {
723 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
724
725 m->vmp_busy = TRUE;
726
727 q = &vm_pageout_queue_internal;
728 } else {
729 q = &vm_pageout_queue_external;
730 }
731
732 /*
733 * pgo_laundry count is tied to the laundry bit
734 */
735 m->vmp_laundry = TRUE;
736 q->pgo_laundry++;
737
738 m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
739 vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
740
741 if (q->pgo_idle == TRUE) {
742 q->pgo_idle = FALSE;
743 thread_wakeup((event_t) &q->pgo_pending);
744 }
745 VM_PAGE_CHECK(m);
746 }
747
748
749 /*
750 * A page is back from laundry or we are stealing it back from
751 * the laundering state. See if there are some pages waiting to
752 * go to laundry and if we can let some of them go now.
753 *
754 * Object and page queues must be locked.
755 */
756 void
vm_pageout_throttle_up(vm_page_t m)757 vm_pageout_throttle_up(
758 vm_page_t m)
759 {
760 struct vm_pageout_queue *q;
761 vm_object_t m_object;
762
763 m_object = VM_PAGE_OBJECT(m);
764
765 assert(m_object != VM_OBJECT_NULL);
766 assert(m_object != kernel_object);
767
768 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
769 vm_object_lock_assert_exclusive(m_object);
770
771 if (m_object->internal == TRUE) {
772 q = &vm_pageout_queue_internal;
773 } else {
774 q = &vm_pageout_queue_external;
775 }
776
777 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
778 vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
779 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
780
781 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
782
783 vm_object_activity_end(m_object);
784
785 VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
786 }
787 if (m->vmp_laundry == TRUE) {
788 m->vmp_laundry = FALSE;
789 q->pgo_laundry--;
790
791 if (q->pgo_throttled == TRUE) {
792 q->pgo_throttled = FALSE;
793 thread_wakeup((event_t) &q->pgo_laundry);
794 }
795 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
796 q->pgo_draining = FALSE;
797 thread_wakeup((event_t) (&q->pgo_laundry + 1));
798 }
799 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
800 }
801 }
802
803
804 static void
vm_pageout_throttle_up_batch(struct vm_pageout_queue * q,int batch_cnt)805 vm_pageout_throttle_up_batch(
806 struct vm_pageout_queue *q,
807 int batch_cnt)
808 {
809 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
810
811 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
812
813 q->pgo_laundry -= batch_cnt;
814
815 if (q->pgo_throttled == TRUE) {
816 q->pgo_throttled = FALSE;
817 thread_wakeup((event_t) &q->pgo_laundry);
818 }
819 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
820 q->pgo_draining = FALSE;
821 thread_wakeup((event_t) (&q->pgo_laundry + 1));
822 }
823 }
824
825
826
/*
 * VM memory pressure monitoring.
 *
 * Pageout activity is accumulated in a ring of buckets,
 * vm_pageout_stats[], whose current bucket is
 * vm_pageout_stats[vm_pageout_stat_now].
 *
 * record_memory_pressure() moves "vm_pageout_stat_now" forward, to start
 * accumulating the number of reclaimed pages in a new, zeroed
 * vm_pageout_stats[] bucket.
 *
 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
 * The caller provides the number of seconds ("nsecs") worth of statistics
 * it wants; each second corresponds to 8 buckets, so the ring holds up to
 * 30 seconds on DEVELOPMENT/DEBUG kernels and 1 second otherwise.
 * It computes the number of pages reclaimed in the past "nsecs" seconds and
 * also returns the number of pages the system still needs to reclaim at this
 * moment in time.
 */

/*
 * Number of buckets in the ring: 8 per second of history plus the bucket
 * currently being filled.  The expansion is fully parenthesized so that the
 * macro behaves as a single value inside any larger expression.
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif

struct vm_pageout_stat {
	/* page counts */
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	/* per-bucket event counters */
	unsigned int pages_grabbed;
	unsigned int pages_freed;

	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	/* the four freed_* counters are what the pressure code sums */
	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0
}, };

/* index of the bucket currently being filled */
unsigned int vm_pageout_stat_now = 0;

/* Ring index arithmetic: the bucket before / after index "i". */
#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)

#if VM_PAGE_BUCKETS_CHECK
int vm_page_buckets_check_interval = 80; /* in eighths of a second */
#endif /* VM_PAGE_BUCKETS_CHECK */
912
913
914 void
915 record_memory_pressure(void);
916 void
record_memory_pressure(void)917 record_memory_pressure(void)
918 {
919 unsigned int vm_pageout_next;
920
921 #if VM_PAGE_BUCKETS_CHECK
922 /* check the consistency of VM page buckets at regular interval */
923 static int counter = 0;
924 if ((++counter % vm_page_buckets_check_interval) == 0) {
925 vm_page_buckets_check();
926 }
927 #endif /* VM_PAGE_BUCKETS_CHECK */
928
929 vm_pageout_state.vm_memory_pressure =
930 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
931 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
932 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
933 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
934
935 commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
936
937 /* move "now" forward */
938 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
939
940 bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
941
942 vm_pageout_stat_now = vm_pageout_next;
943 }
944
945
946 /*
947 * IMPORTANT
948 * mach_vm_ctl_page_free_wanted() is called indirectly, via
949 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
950 * it must be safe in the restricted stackshot context. Locks and/or
951 * blocking are not allowable.
952 */
953 unsigned int
mach_vm_ctl_page_free_wanted(void)954 mach_vm_ctl_page_free_wanted(void)
955 {
956 unsigned int page_free_target, page_free_count, page_free_wanted;
957
958 page_free_target = vm_page_free_target;
959 page_free_count = vm_page_free_count;
960 if (page_free_target > page_free_count) {
961 page_free_wanted = page_free_target - page_free_count;
962 } else {
963 page_free_wanted = 0;
964 }
965
966 return page_free_wanted;
967 }
968
969
970 /*
971 * IMPORTANT:
972 * mach_vm_pressure_monitor() is called when taking a stackshot, with
973 * wait_for_pressure FALSE, so that code path must remain safe in the
974 * restricted stackshot context. No blocking or locks are allowable.
975 * on that code path.
976 */
977
978 kern_return_t
mach_vm_pressure_monitor(boolean_t wait_for_pressure,unsigned int nsecs_monitored,unsigned int * pages_reclaimed_p,unsigned int * pages_wanted_p)979 mach_vm_pressure_monitor(
980 boolean_t wait_for_pressure,
981 unsigned int nsecs_monitored,
982 unsigned int *pages_reclaimed_p,
983 unsigned int *pages_wanted_p)
984 {
985 wait_result_t wr;
986 unsigned int vm_pageout_then, vm_pageout_now;
987 unsigned int pages_reclaimed;
988 unsigned int units_of_monitor;
989
990 units_of_monitor = 8 * nsecs_monitored;
991 /*
992 * We don't take the vm_page_queue_lock here because we don't want
993 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
994 * thread when it's trying to reclaim memory. We don't need fully
995 * accurate monitoring anyway...
996 */
997
998 if (wait_for_pressure) {
999 /* wait until there's memory pressure */
1000 while (vm_page_free_count >= vm_page_free_target) {
1001 wr = assert_wait((event_t) &vm_page_free_wanted,
1002 THREAD_INTERRUPTIBLE);
1003 if (wr == THREAD_WAITING) {
1004 wr = thread_block(THREAD_CONTINUE_NULL);
1005 }
1006 if (wr == THREAD_INTERRUPTED) {
1007 return KERN_ABORTED;
1008 }
1009 if (wr == THREAD_AWAKENED) {
1010 /*
1011 * The memory pressure might have already
1012 * been relieved but let's not block again
1013 * and let's report that there was memory
1014 * pressure at some point.
1015 */
1016 break;
1017 }
1018 }
1019 }
1020
1021 /* provide the number of pages the system wants to reclaim */
1022 if (pages_wanted_p != NULL) {
1023 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1024 }
1025
1026 if (pages_reclaimed_p == NULL) {
1027 return KERN_SUCCESS;
1028 }
1029
1030 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1031 vm_pageout_now = vm_pageout_stat_now;
1032 pages_reclaimed = 0;
1033 for (vm_pageout_then =
1034 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1035 vm_pageout_then != vm_pageout_now &&
1036 units_of_monitor-- != 0;
1037 vm_pageout_then =
1038 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1039 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1040 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1041 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1042 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1043 }
1044 *pages_reclaimed_p = pages_reclaimed;
1045
1046 return KERN_SUCCESS;
1047 }
1048
1049
1050
1051 #if DEVELOPMENT || DEBUG
1052
/* sweeps one page queue; defined below, after its only visible caller */
static void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);

/*
 * Flag (a plain boolean, not a real condition variable) used to make
 * sure there is only a single sweep going on at a time.
 * vm_pageout_disconnect_all_pages() tests and sets it while holding
 * the page queues lock.
 */
boolean_t       vm_pageout_disconnect_all_pages_active = FALSE;
1061
1062
1063 void
vm_pageout_disconnect_all_pages()1064 vm_pageout_disconnect_all_pages()
1065 {
1066 vm_page_lock_queues();
1067
1068 if (vm_pageout_disconnect_all_pages_active == TRUE) {
1069 vm_page_unlock_queues();
1070 return;
1071 }
1072 vm_pageout_disconnect_all_pages_active = TRUE;
1073 vm_page_unlock_queues();
1074
1075 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1076 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1077 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);
1078
1079 vm_pageout_disconnect_all_pages_active = FALSE;
1080 }
1081
1082
/*
 * vm_pageout_disconnect_all_pages_in_queue:
 *
 * Visits up to "qcount" pages of page queue "q", always taking the page
 * currently at the front of the queue.  For every page whose object can
 * be locked and that is not in a transient state, any pmap mappings are
 * removed with pmap_disconnect().  Each visited page is then removed
 * from and re-entered on "q" so that the next iteration sees a
 * different page at the front.
 *
 * The page queues lock is taken here and is held across the loop except
 * while pausing after a failed object try-lock; it is also yielded
 * periodically via lck_mtx_yield().  A kdebug event is emitted at start
 * (queue, qcount) and at end (queue, pages disconnected, object locks
 * taken, number of pauses).
 */
void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t       m;
	vm_object_t     t_object = NULL;        /* object whose try-lock failed most recently */
	vm_object_t     l_object = NULL;        /* object we currently hold locked, if any */
	vm_object_t     m_object = NULL;        /* object owning the page being examined */
	int             delayed_unlock = 0;     /* pages processed since the queues lock was last yielded/dropped */
	int             try_failed_count = 0;   /* consecutive try-lock failures on t_object */
	int             disconnected_count = 0; /* trace statistic */
	int             paused_count = 0;       /* trace statistic */
	int             object_locked_count = 0;        /* trace statistic */

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
	    q, qcount, 0, 0, 0);

	vm_page_lock_queues();

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			/* a new object restarts the back-off sequence */
			if (m_object != t_object) {
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				/* give up on this page after more than 20 failed attempts */
				if (try_failed_count > 20) {
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				paused_count++;

				/*
				 * retry the page now at the front of the queue;
				 * qcount is deliberately not decremented
				 */
				t_object = m_object;
				continue;
			}
			object_locked_count++;

			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error || m->vmp_free_when_done) {
			/*
			 * page or object is in a transient state:
			 * leave its mappings alone and just re-queue it
			 */
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

			disconnected_count++;
		}
reenter_pg_on_q:
		/*
		 * rotate the page within its queue so the next iteration
		 * picks up a different page (NOTE(review): this relies on
		 * vm_page_queue_enter() not inserting at the front --
		 * confirm against its definition)
		 */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);

		qcount--;
		try_failed_count = 0;

		/* periodically let other threads get at the page queues lock */
		if (delayed_unlock++ > 128) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
	    q, disconnected_count, object_locked_count, paused_count, 0);
}
1187
1188 #endif
1189
1190
/* sweeps one page queue; defined below, after its only visible caller */
static void
vm_pageout_page_queue(vm_page_queue_head_t *, int);

/*
 * Flag (a plain boolean, not a real condition variable) used to make
 * sure there is only a single sweep going on at a time.
 * vm_pageout_anonymous_pages() tests, sets and clears it while holding
 * the page queues lock.
 */
boolean_t       vm_pageout_anonymous_pages_active = FALSE;
1199
1200
1201 void
vm_pageout_anonymous_pages()1202 vm_pageout_anonymous_pages()
1203 {
1204 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1205 vm_page_lock_queues();
1206
1207 if (vm_pageout_anonymous_pages_active == TRUE) {
1208 vm_page_unlock_queues();
1209 return;
1210 }
1211 vm_pageout_anonymous_pages_active = TRUE;
1212 vm_page_unlock_queues();
1213
1214 vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1215 vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1216 vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count);
1217
1218 if (VM_CONFIG_SWAP_IS_PRESENT) {
1219 vm_consider_swapping();
1220 }
1221
1222 vm_page_lock_queues();
1223 vm_pageout_anonymous_pages_active = FALSE;
1224 vm_page_unlock_queues();
1225 }
1226 }
1227
1228
/*
 * vm_pageout_page_queue:
 *
 * Visits up to "qcount" pages of page queue "q", always taking the page
 * currently at the front of the queue, and tries to push each eligible
 * page of an internal object out through vm_pageout_cluster().
 *
 * Per page, in order:
 *   - pages of non-internal objects, pages whose object cannot be
 *     locked after repeated tries, and pages in a transient state are
 *     simply re-queued;
 *   - a referenced page has its reference bit cleared and is re-queued
 *     (it gets another pass before being pushed out);
 *   - a mapped page is disconnected from all pmaps;
 *   - a page that is neither dirty nor precious is freed outright;
 *   - otherwise the object's pager is set up if necessary and the page
 *     is removed from its queue and handed to vm_pageout_cluster().
 *
 * While the internal pageout queue is throttled, the loop sleeps until
 * it is woken on that queue's drain event.  The page queues lock is
 * taken here; it is dropped around blocking operations and yielded
 * periodically.
 */
void
vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t       m;
	vm_object_t     t_object = NULL;        /* object whose try-lock failed most recently */
	vm_object_t     l_object = NULL;        /* object we currently hold locked, if any */
	vm_object_t     m_object = NULL;        /* object owning the page being examined */
	int             delayed_unlock = 0;     /* pages processed since the queues lock was last yielded/dropped */
	int             try_failed_count = 0;   /* consecutive try-lock failures on t_object */
	int             refmod_state;
	int             pmap_options;
	struct          vm_pageout_queue *iq;
	ppnum_t         phys_page;


	iq = &vm_pageout_queue_internal;

	vm_page_lock_queues();

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		if (VM_PAGE_Q_THROTTLED(iq)) {
			/*
			 * the internal pageout queue is throttled: drop
			 * every lock we hold and sleep until we are woken
			 * on its drain event, then start this iteration over
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			iq->pgo_draining = TRUE;

			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
			vm_page_unlock_queues();

			thread_block(THREAD_CONTINUE_NULL);

			vm_page_lock_queues();
			delayed_unlock = 0;
			continue;
		}
		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/* only pages of internal objects are handled here */
			if (!m_object->internal) {
				goto reenter_pg_on_q;
			}

			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			/* a new object restarts the back-off sequence */
			if (m_object != t_object) {
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				/* give up on this page after more than 20 failed attempts */
				if (try_failed_count > 20) {
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				/*
				 * retry the page now at the front of the queue;
				 * qcount is deliberately not decremented
				 */
				t_object = m_object;
				continue;
			}
			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error || m->vmp_free_when_done) {
			/*
			 * page is not to be cleaned:
			 * re-queue it and move on
			 */
			goto reenter_pg_on_q;
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(m);

		/* fold the pmap layer's referenced/modified bits into the page */
		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(phys_page);

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}
		if (m->vmp_reference == TRUE) {
			/*
			 * recently referenced: clear the reference and
			 * re-queue the page instead of pushing it out now
			 */
			m->vmp_reference = FALSE;
			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			/*
			 * remove all mappings; the option tells the pmap
			 * layer whether the page is known to be headed for
			 * the compressor or only if it turns out modified
			 */
			if (m->vmp_dirty || m->vmp_precious) {
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		if (!m->vmp_dirty && !m->vmp_precious) {
			/*
			 * clean and not precious: nothing to save, so free
			 * it; the page queues lock is dropped around
			 * VM_PAGE_FREE()
			 */
			vm_page_unlock_queues();
			VM_PAGE_FREE(m);
			vm_page_lock_queues();
			delayed_unlock = 0;

			goto next_pg;
		}
		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
			if (!m_object->pager_initialized) {
				vm_page_unlock_queues();

				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);

				/* the collapse may have provided a pager; create one only if it did not */
				if (!m_object->pager_initialized) {
					vm_object_compressor_pager_create(m_object);
				}

				vm_page_lock_queues();
				delayed_unlock = 0;
			}
			/* still no usable pager: skip this page */
			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
				goto reenter_pg_on_q;
			}
			/*
			 * vm_object_compressor_pager_create will drop the object lock
			 * which means 'm' may no longer be valid to use
			 */
			continue;
		}
		/*
		 * we've already factored out pages in the laundry which
		 * means this page can't be on the pageout queue so it's
		 * safe to do the vm_page_queues_remove
		 */
		vm_page_queues_remove(m, TRUE);

		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		vm_pageout_cluster(m);

		goto next_pg;

reenter_pg_on_q:
		/*
		 * rotate the page within its queue so the next iteration
		 * picks up a different page (NOTE(review): this relies on
		 * vm_page_queue_enter() not inserting at the front --
		 * confirm against its definition)
		 */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);
next_pg:
		qcount--;
		try_failed_count = 0;

		/* periodically let other threads get at the page queues lock */
		if (delayed_unlock++ > 128) {
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();
}
1415
1416
1417
1418 /*
1419 * function in BSD to apply I/O throttle to the pageout thread
1420 */
1421 extern void vm_pageout_io_throttle(void);
1422
/*
 * VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)
 *
 * "m" must be a page of VM object "obj" (asserted).  If the page is
 * marked vmp_reusable, or the object is marked all_reusable, calls
 * vm_object_reuse_pages() on the single-page range
 * [m->vmp_offset, m->vmp_offset + PAGE_SIZE_64) with FALSE as the last
 * argument.
 *
 * Both macro arguments are evaluated more than once, so they must be
 * free of side effects.
 */
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
	MACRO_BEGIN                                                     \
	/* \
	 * If a "reusable" page somehow made it back into \
	 * the active queue, it's been re-used and is not \
	 * quite re-usable. \
	 * If the VM object was "all_reusable", consider it \
	 * as "all re-used" instead of converting it to \
	 * "partially re-used", which could be expensive. \
	 */                                                             \
	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
	if ((m)->vmp_reusable ||                                        \
	    (obj)->all_reusable) {                                      \
	        vm_object_reuse_pages((obj),                            \
	                              (m)->vmp_offset,                  \
	                              (m)->vmp_offset + PAGE_SIZE_64,   \
	                              FALSE);                           \
	}                                                               \
	MACRO_END
1442
1443
/*
 * NOTE(review): presumably bounds on how many pages the scan handles
 * before it must release the page queues lock -- the users of these
 * limits are outside this part of the file; confirm there.
 */
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT  64
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024

/*
 * Values for flow_control.state.
 * NOTE(review): the state machine that uses them is not in this part
 * of the file; the meanings are only what the names suggest.
 */
#define FCS_IDLE                0
#define FCS_DELAYED             1
#define FCS_DEADLOCK_DETECTED   2

struct flow_control {
	int             state;  /* one of the FCS_* values above */
	mach_timespec_t ts;     /* timestamp associated with the state (usage not visible here) */
};


#if CONFIG_BACKGROUND_QUEUE
/* counters for background-queue handling; updated elsewhere in the file */
uint64_t vm_pageout_rejected_bq_internal = 0;
uint64_t vm_pageout_rejected_bq_external = 0;
uint64_t vm_pageout_skipped_bq_internal = 0;
#endif

#define ANONS_GRABBED_LIMIT     2


#if 0
/* prototype of the compiled-out helper kept below for reference */
static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
#endif
static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);

/*
 * Values for the "action" argument of vm_pageout_prepare_to_block():
 * what to do, if anything, while no locks are held.
 */
#define VM_PAGEOUT_PB_NO_ACTION                         0       /* just drop and retake the locks */
#define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1       /* call vm_consider_waking_compactor_swapper() */
#define VM_PAGEOUT_PB_THREAD_YIELD                      2       /* call thread_yield_internal(1) */
1475
/*
 * vm_pageout_delayed_unlock: currently compiled out (#if 0).
 *
 * As written, it gives other threads a chance at the page queues lock,
 * which the caller is expected to hold:
 *   - if pages are batched on *local_freeq, the lock is dropped, the
 *     batch is freed with vm_page_free_list() (bracketed by
 *     VM_PAGEOUT_FREELIST debug events), the batch head and count are
 *     reset, and the lock is retaken;
 *   - otherwise the lock is merely yielded with lck_mtx_yield().
 * In both cases *delayed_unlock is reset to 1.
 */
#if 0
static void
vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
{
	if (*local_freeq) {
		vm_page_unlock_queues();

		VM_DEBUG_CONSTANT_EVENT(
			vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
			vm_page_free_count, 0, 0, 1);

		vm_page_free_list(*local_freeq, TRUE);

		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
		    vm_page_free_count, *local_freed, 0, 1);

		*local_freeq = NULL;
		*local_freed = 0;

		vm_page_lock_queues();
	} else {
		lck_mtx_yield(&vm_page_queue_lock);
	}
	*delayed_unlock = 1;
}
#endif
1502
1503
1504 static void
vm_pageout_prepare_to_block(vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int action)1505 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1506 vm_page_t *local_freeq, int *local_freed, int action)
1507 {
1508 vm_page_unlock_queues();
1509
1510 if (*object != NULL) {
1511 vm_object_unlock(*object);
1512 *object = NULL;
1513 }
1514 if (*local_freeq) {
1515 vm_page_free_list(*local_freeq, TRUE);
1516
1517 *local_freeq = NULL;
1518 *local_freed = 0;
1519 }
1520 *delayed_unlock = 1;
1521
1522 switch (action) {
1523 case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1524 vm_consider_waking_compactor_swapper();
1525 break;
1526 case VM_PAGEOUT_PB_THREAD_YIELD:
1527 thread_yield_internal(1);
1528 break;
1529 case VM_PAGEOUT_PB_NO_ACTION:
1530 default:
1531 break;
1532 }
1533 vm_page_lock_queues();
1534 }
1535
1536
/*
 * Values of the cumulative vm_pageout_vminfo counters as of the
 * previous update_vm_info() sample; update_vm_info() subtracts them
 * from the current values to obtain per-sample deltas.
 */
static struct vm_pageout_vminfo last;

/* value of the vm_page_grab_count counter at the previous update_vm_info() sample */
uint64_t last_vm_page_pages_grabbed = 0;
1540
1541 extern uint32_t c_segment_pages_compressed;
1542
1543 extern uint64_t shared_region_pager_reclaimed;
1544 extern struct memory_object_pager_ops shared_region_pager_ops;
1545
/*
 * update_vm_info:
 *
 * Fills in the current slot of the statistics ring,
 * vm_pageout_stats[vm_pageout_stat_now], emits the VM_INFO1..VM_INFO9
 * kdebug events describing it, and finally calls
 * record_memory_pressure().
 *
 * Two kinds of values are recorded:
 *   - instantaneous page counts, copied straight from the globals;
 *   - activity since the previous sample, computed as the difference
 *     between a cumulative counter and the value saved in "last" (or
 *     last_vm_page_pages_grabbed), which is then advanced.
 *
 * The pageout-scan specific deltas are only computed when the scan
 * considered at least one page during this sample.  When it did not,
 * the matching "last" baselines are left alone, so the next non-idle
 * sample accounts for everything accumulated since.
 */
void
update_vm_info(void)
{
	unsigned long tmp;
	uint64_t tmp64;

	/* instantaneous queue and memory counts */
	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;

	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;


	/*
	 * deltas that are always computed; each counter is read once
	 * into a temporary so the stored delta and the new baseline
	 * agree even if the counter keeps moving
	 */
	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
	last.vm_pageout_considered_page = tmp;

	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
	last.vm_pageout_compressions = tmp64;

	tmp = vm_pageout_vminfo.vm_compressor_failed;
	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
	last.vm_compressor_failed = tmp;

	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
	last.vm_compressor_pages_grabbed = tmp64;

	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
	last.vm_phantom_cache_found_ghost = tmp;

	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
	last.vm_phantom_cache_added_ghost = tmp;

	tmp64 = counter_load(&vm_page_grab_count);
	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
	last_vm_page_pages_grabbed = tmp64;

	tmp = vm_pageout_vminfo.vm_page_pages_freed;
	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
	last.vm_page_pages_freed = tmp;


	/* pageout-scan deltas: only when the scan looked at pages this sample */
	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
		last.vm_pageout_pages_evicted = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
		last.vm_pageout_pages_purged = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
		last.vm_pageout_freed_speculative = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
		last.vm_pageout_freed_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
		last.vm_pageout_inactive_referenced = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
		last.vm_pageout_scan_inactive_throttled_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
		last.vm_pageout_inactive_dirty_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
		last.vm_pageout_freed_cleaned = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
		last.vm_pageout_inactive_nolock = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
		last.vm_pageout_scan_inactive_throttled_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
		last.vm_pageout_skipped_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
		last.vm_pageout_skipped_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
		last.vm_pageout_reactivation_limit_exceeded = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
		last.vm_pageout_inactive_force_reclaim = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
		last.vm_pageout_freed_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
		last.vm_pageout_considered_bq_internal = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
		last.vm_pageout_considered_bq_external = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
		last.vm_pageout_filecache_min_reactivated = tmp;

		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
		last.vm_pageout_inactive_dirty_internal = tmp;
	}

	/* VM_INFO1..3: instantaneous counts, emitted for every sample */
	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
	    0);

	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
	    0,
	    0);

	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
	    0);

	/* VM_INFO4..8: only when there was scan or compressor activity this sample */
	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].considered,
		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
		    vm_pageout_stats[vm_pageout_stat_now].skipped_external,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
		    vm_pageout_stats[vm_pageout_stat_now].freed_internal,
		    0);

		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
		    0);
	}
	/* VM_INFO9: grab/free/phantom-cache activity, emitted for every sample */
	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
	    0);

	record_memory_pressure();
}
1746
1747 extern boolean_t hibernation_vmqueues_inspection;
1748
/*
 * Return values for functions called by vm_pageout_scan
 * that control its flow.
 *
 * PROCEED -- vm_pageout_scan will keep making forward progress.
 * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
 * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
 *
 * These are plain ints: the helpers that use them are declared as
 * returning 'int', so the compiler does not check the returned value
 * against this set.
 */

#define VM_PAGEOUT_SCAN_PROCEED                 (0)     /* carry on with the current iteration */
#define VM_PAGEOUT_SCAN_DONE_RETURN             (1)     /* caller should return */
#define VM_PAGEOUT_SCAN_NEXT_ITERATION          (2)     /* caller should 'continue' its loop */
1761
1762 /*
1763 * This function is called only from vm_pageout_scan and
1764 * it moves overflow secluded pages (one-at-a-time) to the
1765 * batched 'local' free Q or active Q.
1766 */
1767 static void
vps_deal_with_secluded_page_overflow(vm_page_t * local_freeq,int * local_freed)1768 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1769 {
1770 #if CONFIG_SECLUDED_MEMORY
1771 /*
1772 * Deal with secluded_q overflow.
1773 */
1774 if (vm_page_secluded_count > vm_page_secluded_target) {
1775 vm_page_t secluded_page;
1776
1777 /*
1778 * SECLUDED_AGING_BEFORE_ACTIVE:
1779 * Excess secluded pages go to the active queue and
1780 * will later go to the inactive queue.
1781 */
1782 assert((vm_page_secluded_count_free +
1783 vm_page_secluded_count_inuse) ==
1784 vm_page_secluded_count);
1785 secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1786 assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1787
1788 vm_page_queues_remove(secluded_page, FALSE);
1789 assert(!secluded_page->vmp_fictitious);
1790 assert(!VM_PAGE_WIRED(secluded_page));
1791
1792 if (secluded_page->vmp_object == 0) {
1793 /* transfer to free queue */
1794 assert(secluded_page->vmp_busy);
1795 secluded_page->vmp_snext = *local_freeq;
1796 *local_freeq = secluded_page;
1797 *local_freed += 1;
1798 } else {
1799 /* transfer to head of active queue */
1800 vm_page_enqueue_active(secluded_page, FALSE);
1801 secluded_page = VM_PAGE_NULL;
1802 }
1803 }
1804 #else /* CONFIG_SECLUDED_MEMORY */
1805
1806 #pragma unused(local_freeq)
1807 #pragma unused(local_freed)
1808
1809 return;
1810
1811 #endif /* CONFIG_SECLUDED_MEMORY */
1812 }
1813
1814 /*
1815 * This function is called only from vm_pageout_scan and
1816 * it initializes the loop targets for vm_pageout_scan().
1817 */
1818 static void
vps_init_page_targets(void)1819 vps_init_page_targets(void)
1820 {
1821 /*
1822 * LD TODO: Other page targets should be calculated here too.
1823 */
1824 vm_page_anonymous_min = vm_page_inactive_target / 20;
1825
1826 if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1827 vm_pageout_state.vm_page_speculative_percentage = 50;
1828 } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1829 vm_pageout_state.vm_page_speculative_percentage = 1;
1830 }
1831
1832 vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1833 vm_page_inactive_count);
1834 }
1835
1836 /*
1837 * This function is called only from vm_pageout_scan and
1838 * it purges a single VM object at-a-time and will either
1839 * make vm_pageout_scan() restart the loop or keeping moving forward.
1840 */
1841 static int
vps_purge_object()1842 vps_purge_object()
1843 {
1844 int force_purge;
1845
1846 assert(available_for_purge >= 0);
1847 force_purge = 0; /* no force-purging */
1848
1849 #if VM_PRESSURE_EVENTS
1850 vm_pressure_level_t pressure_level;
1851
1852 pressure_level = memorystatus_vm_pressure_level;
1853
1854 if (pressure_level > kVMPressureNormal) {
1855 if (pressure_level >= kVMPressureCritical) {
1856 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1857 } else if (pressure_level >= kVMPressureUrgent) {
1858 force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
1859 } else if (pressure_level >= kVMPressureWarning) {
1860 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1861 }
1862 }
1863 #endif /* VM_PRESSURE_EVENTS */
1864
1865 if (available_for_purge || force_purge) {
1866 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
1867
1868 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1869 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
1870 VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
1871 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1872 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1873
1874 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1875 }
1876 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
1877 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1878 }
1879
1880 return VM_PAGEOUT_SCAN_PROCEED;
1881 }
1882
1883 /*
1884 * This function is called only from vm_pageout_scan and
1885 * it will try to age the next speculative Q if the oldest
1886 * one is empty.
1887 */
1888 static int
vps_age_speculative_queue(boolean_t force_speculative_aging)1889 vps_age_speculative_queue(boolean_t force_speculative_aging)
1890 {
1891 #define DELAY_SPECULATIVE_AGE 1000
1892
1893 /*
1894 * try to pull pages from the aging bins...
1895 * see vm_page.h for an explanation of how
1896 * this mechanism works
1897 */
1898 boolean_t can_steal = FALSE;
1899 int num_scanned_queues;
1900 static int delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/
1901 mach_timespec_t ts;
1902 struct vm_speculative_age_q *aq;
1903 struct vm_speculative_age_q *sq;
1904
1905 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1906
1907 aq = &vm_page_queue_speculative[speculative_steal_index];
1908
1909 num_scanned_queues = 0;
1910 while (vm_page_queue_empty(&aq->age_q) &&
1911 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1912 speculative_steal_index++;
1913
1914 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1915 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1916 }
1917
1918 aq = &vm_page_queue_speculative[speculative_steal_index];
1919 }
1920
1921 if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
1922 /*
1923 * XXX We've scanned all the speculative
1924 * queues but still haven't found one
1925 * that is not empty, even though
1926 * vm_page_speculative_count is not 0.
1927 */
1928 if (!vm_page_queue_empty(&sq->age_q)) {
1929 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1930 }
1931 #if DEVELOPMENT || DEBUG
1932 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
1933 #endif
1934 /* readjust... */
1935 vm_page_speculative_count = 0;
1936 /* ... and continue */
1937 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1938 }
1939
1940 if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
1941 can_steal = TRUE;
1942 } else {
1943 if (!delay_speculative_age) {
1944 mach_timespec_t ts_fully_aged;
1945
1946 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
1947 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
1948 * 1000 * NSEC_PER_USEC;
1949
1950 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1951
1952 clock_sec_t sec;
1953 clock_nsec_t nsec;
1954 clock_get_system_nanotime(&sec, &nsec);
1955 ts.tv_sec = (unsigned int) sec;
1956 ts.tv_nsec = nsec;
1957
1958 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
1959 can_steal = TRUE;
1960 } else {
1961 delay_speculative_age++;
1962 }
1963 } else {
1964 delay_speculative_age++;
1965 if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
1966 delay_speculative_age = 0;
1967 }
1968 }
1969 }
1970 if (can_steal == TRUE) {
1971 vm_page_speculate_ageit(aq);
1972 }
1973
1974 return VM_PAGEOUT_SCAN_PROCEED;
1975 }
1976
1977 /*
1978 * This function is called only from vm_pageout_scan and
1979 * it evicts a single VM object from the cache.
1980 */
1981 static int inline
vps_object_cache_evict(vm_object_t * object_to_unlock)1982 vps_object_cache_evict(vm_object_t *object_to_unlock)
1983 {
1984 static int cache_evict_throttle = 0;
1985 struct vm_speculative_age_q *sq;
1986
1987 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1988
1989 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
1990 int pages_evicted;
1991
1992 if (*object_to_unlock != NULL) {
1993 vm_object_unlock(*object_to_unlock);
1994 *object_to_unlock = NULL;
1995 }
1996 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
1997
1998 pages_evicted = vm_object_cache_evict(100, 10);
1999
2000 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
2001
2002 if (pages_evicted) {
2003 vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2004
2005 VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2006 vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2007 memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2008
2009 /*
2010 * we just freed up to 100 pages,
2011 * so go back to the top of the main loop
2012 * and re-evaulate the memory situation
2013 */
2014 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2015 } else {
2016 cache_evict_throttle = 1000;
2017 }
2018 }
2019 if (cache_evict_throttle) {
2020 cache_evict_throttle--;
2021 }
2022
2023 return VM_PAGEOUT_SCAN_PROCEED;
2024 }
2025
2026
2027 /*
2028 * This function is called only from vm_pageout_scan and
2029 * it calculates the filecache min. that needs to be maintained
2030 * as we start to steal pages.
2031 */
2032 static void
vps_calculate_filecache_min(void)2033 vps_calculate_filecache_min(void)
2034 {
2035 int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2036
2037 #if CONFIG_JETSAM
2038 /*
2039 * don't let the filecache_min fall below 15% of available memory
2040 * on systems with an active compressor that isn't nearing its
2041 * limits w/r to accepting new data
2042 *
2043 * on systems w/o the compressor/swapper, the filecache is always
2044 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2045 * since most (if not all) of the anonymous pages are in the
2046 * throttled queue (which isn't counted as available) which
2047 * effectively disables this filter
2048 */
2049 if (vm_compressor_low_on_space() || divisor == 0) {
2050 vm_pageout_state.vm_page_filecache_min = 0;
2051 } else {
2052 vm_pageout_state.vm_page_filecache_min =
2053 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2054 }
2055 #else
2056 if (vm_compressor_out_of_space() || divisor == 0) {
2057 vm_pageout_state.vm_page_filecache_min = 0;
2058 } else {
2059 /*
2060 * don't let the filecache_min fall below the specified critical level
2061 */
2062 vm_pageout_state.vm_page_filecache_min =
2063 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2064 }
2065 #endif
2066 if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2067 vm_pageout_state.vm_page_filecache_min = 0;
2068 }
2069 }
2070
2071 /*
2072 * This function is called only from vm_pageout_scan and
2073 * it updates the flow control time to detect if VM pageoutscan
2074 * isn't making progress.
2075 */
2076 static void
vps_flow_control_reset_deadlock_timer(struct flow_control * flow_control)2077 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2078 {
2079 mach_timespec_t ts;
2080 clock_sec_t sec;
2081 clock_nsec_t nsec;
2082
2083 ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2084 ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2085 clock_get_system_nanotime(&sec, &nsec);
2086 flow_control->ts.tv_sec = (unsigned int) sec;
2087 flow_control->ts.tv_nsec = nsec;
2088 ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2089
2090 flow_control->state = FCS_DELAYED;
2091
2092 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2093 }
2094
2095 /*
2096 * This function is called only from vm_pageout_scan and
2097 * it is the flow control logic of VM pageout scan which
2098 * controls if it should block and for how long.
2099 * Any blocking of vm_pageout_scan happens ONLY in this function.
2100 */
2101 static int
vps_flow_control(struct flow_control * flow_control,int * anons_grabbed,vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int * vm_pageout_deadlock_target,unsigned int inactive_burst_count)2102 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2103 vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2104 {
2105 boolean_t exceeded_burst_throttle = FALSE;
2106 unsigned int msecs = 0;
2107 uint32_t inactive_external_count;
2108 mach_timespec_t ts;
2109 struct vm_pageout_queue *iq;
2110 struct vm_pageout_queue *eq;
2111 struct vm_speculative_age_q *sq;
2112
2113 iq = &vm_pageout_queue_internal;
2114 eq = &vm_pageout_queue_external;
2115 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2116
2117 /*
2118 * Sometimes we have to pause:
2119 * 1) No inactive pages - nothing to do.
2120 * 2) Loop control - no acceptable pages found on the inactive queue
2121 * within the last vm_pageout_burst_inactive_throttle iterations
2122 * 3) Flow control - default pageout queue is full
2123 */
2124 if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2125 vm_page_queue_empty(&vm_page_queue_anonymous) &&
2126 vm_page_queue_empty(&vm_page_queue_cleaned) &&
2127 vm_page_queue_empty(&sq->age_q)) {
2128 VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2129 msecs = vm_pageout_state.vm_pageout_empty_wait;
2130 } else if (inactive_burst_count >=
2131 MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2132 (vm_page_inactive_count +
2133 vm_page_speculative_count))) {
2134 VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2135 msecs = vm_pageout_state.vm_pageout_burst_wait;
2136
2137 exceeded_burst_throttle = TRUE;
2138 } else if (VM_PAGE_Q_THROTTLED(iq) &&
2139 VM_DYNAMIC_PAGING_ENABLED()) {
2140 clock_sec_t sec;
2141 clock_nsec_t nsec;
2142
2143 switch (flow_control->state) {
2144 case FCS_IDLE:
2145 if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2146 vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2147 /*
2148 * since the compressor is running independently of vm_pageout_scan
2149 * let's not wait for it just yet... as long as we have a healthy supply
2150 * of filecache pages to work with, let's keep stealing those.
2151 */
2152 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2153
2154 if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2155 (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2156 *anons_grabbed = ANONS_GRABBED_LIMIT;
2157 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2158 return VM_PAGEOUT_SCAN_PROCEED;
2159 }
2160 }
2161
2162 vps_flow_control_reset_deadlock_timer(flow_control);
2163 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2164
2165 break;
2166
2167 case FCS_DELAYED:
2168 clock_get_system_nanotime(&sec, &nsec);
2169 ts.tv_sec = (unsigned int) sec;
2170 ts.tv_nsec = nsec;
2171
2172 if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2173 /*
2174 * the pageout thread for the default pager is potentially
2175 * deadlocked since the
2176 * default pager queue has been throttled for more than the
2177 * allowable time... we need to move some clean pages or dirty
2178 * pages belonging to the external pagers if they aren't throttled
2179 * vm_page_free_wanted represents the number of threads currently
2180 * blocked waiting for pages... we'll move one page for each of
2181 * these plus a fixed amount to break the logjam... once we're done
2182 * moving this number of pages, we'll re-enter the FSC_DELAYED state
2183 * with a new timeout target since we have no way of knowing
2184 * whether we've broken the deadlock except through observation
2185 * of the queue associated with the default pager... we need to
2186 * stop moving pages and allow the system to run to see what
2187 * state it settles into.
2188 */
2189
2190 *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2191 vm_page_free_wanted + vm_page_free_wanted_privileged;
2192 VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2193 flow_control->state = FCS_DEADLOCK_DETECTED;
2194 thread_wakeup(VM_PAGEOUT_GC_EVENT);
2195 return VM_PAGEOUT_SCAN_PROCEED;
2196 }
2197 /*
2198 * just resniff instead of trying
2199 * to compute a new delay time... we're going to be
2200 * awakened immediately upon a laundry completion,
2201 * so we won't wait any longer than necessary
2202 */
2203 msecs = vm_pageout_state.vm_pageout_idle_wait;
2204 break;
2205
2206 case FCS_DEADLOCK_DETECTED:
2207 if (*vm_pageout_deadlock_target) {
2208 return VM_PAGEOUT_SCAN_PROCEED;
2209 }
2210
2211 vps_flow_control_reset_deadlock_timer(flow_control);
2212 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2213
2214 break;
2215 }
2216 } else {
2217 /*
2218 * No need to pause...
2219 */
2220 return VM_PAGEOUT_SCAN_PROCEED;
2221 }
2222
2223 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2224
2225 vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2226 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2227
2228 if (vm_page_free_count >= vm_page_free_target) {
2229 /*
2230 * we're here because
2231 * 1) someone else freed up some pages while we had
2232 * the queues unlocked above
2233 * and we've hit one of the 3 conditions that
2234 * cause us to pause the pageout scan thread
2235 *
2236 * since we already have enough free pages,
2237 * let's avoid stalling and return normally
2238 *
2239 * before we return, make sure the pageout I/O threads
2240 * are running throttled in case there are still requests
2241 * in the laundry... since we have enough free pages
2242 * we don't need the laundry to be cleaned in a timely
2243 * fashion... so let's avoid interfering with foreground
2244 * activity
2245 *
2246 * we don't want to hold vm_page_queue_free_lock when
2247 * calling vm_pageout_adjust_eq_iothrottle (since it
2248 * may cause other locks to be taken), we do the intitial
2249 * check outside of the lock. Once we take the lock,
2250 * we recheck the condition since it may have changed.
2251 * if it has, no problem, we will make the threads
2252 * non-throttled before actually blocking
2253 */
2254 vm_pageout_adjust_eq_iothrottle(eq, TRUE);
2255 }
2256 lck_mtx_lock(&vm_page_queue_free_lock);
2257
2258 if (vm_page_free_count >= vm_page_free_target &&
2259 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2260 return VM_PAGEOUT_SCAN_DONE_RETURN;
2261 }
2262 lck_mtx_unlock(&vm_page_queue_free_lock);
2263
2264 if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2265 /*
2266 * we're most likely about to block due to one of
2267 * the 3 conditions that cause vm_pageout_scan to
2268 * not be able to make forward progress w/r
2269 * to providing new pages to the free queue,
2270 * so unthrottle the I/O threads in case we
2271 * have laundry to be cleaned... it needs
2272 * to be completed ASAP.
2273 *
2274 * even if we don't block, we want the io threads
2275 * running unthrottled since the sum of free +
2276 * clean pages is still under our free target
2277 */
2278 vm_pageout_adjust_eq_iothrottle(eq, FALSE);
2279 }
2280 if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2281 /*
2282 * if we get here we're below our free target and
2283 * we're stalling due to a full laundry queue or
2284 * we don't have any inactive pages other then
2285 * those in the clean queue...
2286 * however, we have pages on the clean queue that
2287 * can be moved to the free queue, so let's not
2288 * stall the pageout scan
2289 */
2290 flow_control->state = FCS_IDLE;
2291 return VM_PAGEOUT_SCAN_PROCEED;
2292 }
2293 if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2294 flow_control->state = FCS_IDLE;
2295 return VM_PAGEOUT_SCAN_PROCEED;
2296 }
2297
2298 VM_CHECK_MEMORYSTATUS;
2299
2300 if (flow_control->state != FCS_IDLE) {
2301 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2302 }
2303
2304 iq->pgo_throttled = TRUE;
2305 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2306
2307 vm_page_unlock_queues();
2308
2309 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2310
2311 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2312 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2313 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2314
2315 thread_block(THREAD_CONTINUE_NULL);
2316
2317 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2318 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2319 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2320
2321 vm_page_lock_queues();
2322
2323 iq->pgo_throttled = FALSE;
2324
2325 vps_init_page_targets();
2326
2327 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2328 }
2329
2330 /*
2331 * This function is called only from vm_pageout_scan and
2332 * it will find and return the most appropriate page to be
2333 * reclaimed.
2334 */
2335 static int
vps_choose_victim_page(vm_page_t * victim_page,int * anons_grabbed,boolean_t * grab_anonymous,boolean_t force_anonymous,boolean_t * is_page_from_bg_q,unsigned int * reactivated_this_call)2336 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2337 boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2338 {
2339 vm_page_t m = NULL;
2340 vm_object_t m_object = VM_OBJECT_NULL;
2341 uint32_t inactive_external_count;
2342 struct vm_speculative_age_q *sq;
2343 struct vm_pageout_queue *iq;
2344 int retval = VM_PAGEOUT_SCAN_PROCEED;
2345
2346 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2347 iq = &vm_pageout_queue_internal;
2348
2349 *is_page_from_bg_q = FALSE;
2350
2351 m = NULL;
2352 m_object = VM_OBJECT_NULL;
2353
2354 if (VM_DYNAMIC_PAGING_ENABLED()) {
2355 assert(vm_page_throttled_count == 0);
2356 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2357 }
2358
2359 /*
2360 * Try for a clean-queue inactive page.
2361 * These are pages that vm_pageout_scan tried to steal earlier, but
2362 * were dirty and had to be cleaned. Pick them up now that they are clean.
2363 */
2364 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2365 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2366
2367 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2368
2369 goto found_page;
2370 }
2371
2372 /*
2373 * The next most eligible pages are ones we paged in speculatively,
2374 * but which have not yet been touched and have been aged out.
2375 */
2376 if (!vm_page_queue_empty(&sq->age_q)) {
2377 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2378
2379 assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2380
2381 if (!m->vmp_dirty || force_anonymous == FALSE) {
2382 goto found_page;
2383 } else {
2384 m = NULL;
2385 }
2386 }
2387
2388 #if CONFIG_BACKGROUND_QUEUE
2389 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2390 vm_object_t bg_m_object = NULL;
2391
2392 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2393
2394 bg_m_object = VM_PAGE_OBJECT(m);
2395
2396 if (!VM_PAGE_PAGEABLE(m)) {
2397 /*
2398 * This page is on the background queue
2399 * but not on a pageable queue. This is
2400 * likely a transient state and whoever
2401 * took it out of its pageable queue
2402 * will likely put it back on a pageable
2403 * queue soon but we can't deal with it
2404 * at this point, so let's ignore this
2405 * page.
2406 */
2407 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2408 if (bg_m_object->internal &&
2409 (VM_PAGE_Q_THROTTLED(iq) ||
2410 vm_compressor_out_of_space() == TRUE ||
2411 vm_page_free_count < (vm_page_free_reserved / 4))) {
2412 vm_pageout_skipped_bq_internal++;
2413 } else {
2414 *is_page_from_bg_q = TRUE;
2415
2416 if (bg_m_object->internal) {
2417 vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2418 } else {
2419 vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2420 }
2421 goto found_page;
2422 }
2423 }
2424 }
2425 #endif /* CONFIG_BACKGROUND_QUEUE */
2426
2427 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2428
2429 if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2430 (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2431 *grab_anonymous = TRUE;
2432 *anons_grabbed = 0;
2433
2434 if (VM_CONFIG_SWAP_IS_ACTIVE) {
2435 vm_pageout_vminfo.vm_pageout_skipped_external++;
2436 } else {
2437 if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2438 /*
2439 * No swap and we are in dangerously low levels of free memory.
2440 * If we keep going ahead with anonymous pages, we are going to run into a situation
2441 * where the compressor will be stuck waiting for free pages (if it isn't already).
2442 *
2443 * So, pick a file backed page...
2444 */
2445 *grab_anonymous = FALSE;
2446 *anons_grabbed = ANONS_GRABBED_LIMIT;
2447 vm_pageout_vminfo.vm_pageout_skipped_internal++;
2448 }
2449 }
2450 goto want_anonymous;
2451 }
2452 *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2453
2454 #if CONFIG_JETSAM
2455 /* If the file-backed pool has accumulated
2456 * significantly more pages than the jetsam
2457 * threshold, prefer to reclaim those
2458 * inline to minimise compute overhead of reclaiming
2459 * anonymous pages.
2460 * This calculation does not account for the CPU local
2461 * external page queues, as those are expected to be
2462 * much smaller relative to the global pools.
2463 */
2464
2465 struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2466
2467 if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2468 if (vm_page_pageable_external_count >
2469 vm_pageout_state.vm_page_filecache_min) {
2470 if ((vm_page_pageable_external_count *
2471 vm_pageout_memorystatus_fb_factor_dr) >
2472 (memorystatus_available_pages_critical *
2473 vm_pageout_memorystatus_fb_factor_nr)) {
2474 *grab_anonymous = FALSE;
2475
2476 VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2477 }
2478 }
2479 if (*grab_anonymous) {
2480 VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2481 }
2482 }
2483 #endif /* CONFIG_JETSAM */
2484
2485 want_anonymous:
2486 if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2487 if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2488 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2489
2490 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2491 *anons_grabbed = 0;
2492
2493 if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2494 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2495 if ((++(*reactivated_this_call) % 100)) {
2496 vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2497
2498 vm_page_activate(m);
2499 counter_inc(&vm_statistics_reactivations);
2500 #if CONFIG_BACKGROUND_QUEUE
2501 #if DEVELOPMENT || DEBUG
2502 if (*is_page_from_bg_q == TRUE) {
2503 if (m_object->internal) {
2504 vm_pageout_rejected_bq_internal++;
2505 } else {
2506 vm_pageout_rejected_bq_external++;
2507 }
2508 }
2509 #endif /* DEVELOPMENT || DEBUG */
2510 #endif /* CONFIG_BACKGROUND_QUEUE */
2511 vm_pageout_state.vm_pageout_inactive_used++;
2512
2513 m = NULL;
2514 retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2515
2516 goto found_page;
2517 }
2518
2519 /*
2520 * steal 1 of the file backed pages even if
2521 * we are under the limit that has been set
2522 * for a healthy filecache
2523 */
2524 }
2525 }
2526 goto found_page;
2527 }
2528 }
2529 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2530 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2531
2532 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2533 *anons_grabbed += 1;
2534
2535 goto found_page;
2536 }
2537
2538 m = NULL;
2539
2540 found_page:
2541 *victim_page = m;
2542
2543 return retval;
2544 }
2545
2546 /*
2547 * This function is called only from vm_pageout_scan and
2548 * it will put a page back on the active/inactive queue
2549 * if we can't reclaim it for some reason.
2550 */
2551 static void
vps_requeue_page(vm_page_t m,int page_prev_q_state,__unused boolean_t page_from_bg_q)2552 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2553 {
2554 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2555 vm_page_enqueue_inactive(m, FALSE);
2556 } else {
2557 vm_page_activate(m);
2558 }
2559
2560 #if CONFIG_BACKGROUND_QUEUE
2561 #if DEVELOPMENT || DEBUG
2562 vm_object_t m_object = VM_PAGE_OBJECT(m);
2563
2564 if (page_from_bg_q == TRUE) {
2565 if (m_object->internal) {
2566 vm_pageout_rejected_bq_internal++;
2567 } else {
2568 vm_pageout_rejected_bq_external++;
2569 }
2570 }
2571 #endif /* DEVELOPMENT || DEBUG */
2572 #endif /* CONFIG_BACKGROUND_QUEUE */
2573 }
2574
2575 /*
2576 * This function is called only from vm_pageout_scan and
2577 * it will try to grab the victim page's VM object (m_object)
2578 * which differs from the previous victim page's object (object).
2579 */
2580 static int
vps_switch_object(vm_page_t m,vm_object_t m_object,vm_object_t * object,int page_prev_q_state,boolean_t avoid_anon_pages,boolean_t page_from_bg_q)2581 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2582 {
2583 struct vm_speculative_age_q *sq;
2584
2585 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2586
2587 /*
2588 * the object associated with candidate page is
2589 * different from the one we were just working
2590 * with... dump the lock if we still own it
2591 */
2592 if (*object != NULL) {
2593 vm_object_unlock(*object);
2594 *object = NULL;
2595 }
2596 /*
2597 * Try to lock object; since we've alread got the
2598 * page queues lock, we can only 'try' for this one.
2599 * if the 'try' fails, we need to do a mutex_pause
2600 * to allow the owner of the object lock a chance to
2601 * run... otherwise, we're likely to trip over this
2602 * object in the same state as we work our way through
2603 * the queue... clumps of pages associated with the same
2604 * object are fairly typical on the inactive and active queues
2605 */
2606 if (!vm_object_lock_try_scan(m_object)) {
2607 vm_page_t m_want = NULL;
2608
2609 vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2610
2611 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2612 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2613 }
2614
2615 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2616
2617 m->vmp_reference = FALSE;
2618
2619 if (!m_object->object_is_shared_cache) {
2620 /*
2621 * don't apply this optimization if this is the shared cache
2622 * object, it's too easy to get rid of very hot and important
2623 * pages...
2624 * m->vmp_object must be stable since we hold the page queues lock...
2625 * we can update the scan_collisions field sans the object lock
2626 * since it is a separate field and this is the only spot that does
2627 * a read-modify-write operation and it is never executed concurrently...
2628 * we can asynchronously set this field to 0 when creating a UPL, so it
2629 * is possible for the value to be a bit non-determistic, but that's ok
2630 * since it's only used as a hint
2631 */
2632 m_object->scan_collisions = 1;
2633 }
2634 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2635 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2636 } else if (!vm_page_queue_empty(&sq->age_q)) {
2637 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2638 } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2639 !vm_page_queue_empty(&vm_page_queue_inactive)) {
2640 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2641 } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2642 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2643 }
2644
2645 /*
2646 * this is the next object we're going to be interested in
2647 * try to make sure its available after the mutex_pause
2648 * returns control
2649 */
2650 if (m_want) {
2651 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2652 }
2653
2654 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2655
2656 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2657 } else {
2658 *object = m_object;
2659 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2660 }
2661
2662 return VM_PAGEOUT_SCAN_PROCEED;
2663 }
2664
2665 /*
2666 * This function is called only from vm_pageout_scan and
2667 * it notices that pageout scan may be rendered ineffective
2668 * due to a FS deadlock and will jetsam a process if possible.
2669 * If jetsam isn't supported, it'll move the page to the active
2670 * queue to try and get some different pages pushed onwards so
2671 * we can try to get out of this scenario.
2672 */
2673 static void
vps_deal_with_throttled_queues(vm_page_t m,vm_object_t * object,uint32_t * vm_pageout_inactive_external_forced_reactivate_limit,int * delayed_unlock,boolean_t * force_anonymous,__unused boolean_t is_page_from_bg_q)2674 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2675 int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2676 {
2677 struct vm_pageout_queue *eq;
2678 vm_object_t cur_object = VM_OBJECT_NULL;
2679
2680 cur_object = *object;
2681
2682 eq = &vm_pageout_queue_external;
2683
2684 if (cur_object->internal == FALSE) {
2685 /*
2686 * we need to break up the following potential deadlock case...
2687 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2688 * b) The thread doing the writing is waiting for pages while holding the truncate lock
2689 * c) Most of the pages in the inactive queue belong to this file.
2690 *
2691 * we are potentially in this deadlock because...
2692 * a) the external pageout queue is throttled
2693 * b) we're done with the active queue and moved on to the inactive queue
2694 * c) we've got a dirty external page
2695 *
2696 * since we don't know the reason for the external pageout queue being throttled we
2697 * must suspect that we are deadlocked, so move the current page onto the active queue
2698 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2699 *
2700 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2701 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2702 * pool the next time we select a victim page... if we can make enough new free pages,
2703 * the deadlock will break, the external pageout queue will empty and it will no longer
2704 * be throttled
2705 *
2706 * if we have jetsam configured, keep a count of the pages reactivated this way so
2707 * that we can try to find clean pages in the active/inactive queues before
2708 * deciding to jetsam a process
2709 */
2710 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2711
2712 vm_page_check_pageable_safe(m);
2713 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2714 vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2715 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2716 vm_page_active_count++;
2717 vm_page_pageable_external_count++;
2718
2719 vm_pageout_adjust_eq_iothrottle(eq, FALSE);
2720
2721 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2722
2723 #pragma unused(force_anonymous)
2724
2725 *vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2726
2727 if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2728 *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2729 /*
2730 * Possible deadlock scenario so request jetsam action
2731 */
2732
2733 assert(cur_object);
2734 vm_object_unlock(cur_object);
2735
2736 cur_object = VM_OBJECT_NULL;
2737
2738 /*
2739 * VM pageout scan needs to know we have dropped this lock and so set the
2740 * object variable we got passed in to NULL.
2741 */
2742 *object = VM_OBJECT_NULL;
2743
2744 vm_page_unlock_queues();
2745
2746 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
2747 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2748
2749 /* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
2750 if (memorystatus_kill_on_VM_page_shortage(FALSE) == TRUE) {
2751 VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
2752 }
2753
2754 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
2755 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2756
2757 vm_page_lock_queues();
2758 *delayed_unlock = 1;
2759 }
2760 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2761
2762 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2763 #pragma unused(delayed_unlock)
2764
2765 *force_anonymous = TRUE;
2766 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2767 } else {
2768 vm_page_activate(m);
2769 counter_inc(&vm_statistics_reactivations);
2770
2771 #if CONFIG_BACKGROUND_QUEUE
2772 #if DEVELOPMENT || DEBUG
2773 if (is_page_from_bg_q == TRUE) {
2774 if (cur_object->internal) {
2775 vm_pageout_rejected_bq_internal++;
2776 } else {
2777 vm_pageout_rejected_bq_external++;
2778 }
2779 }
2780 #endif /* DEVELOPMENT || DEBUG */
2781 #endif /* CONFIG_BACKGROUND_QUEUE */
2782
2783 vm_pageout_state.vm_pageout_inactive_used++;
2784 }
2785 }
2786
2787
2788 void
vm_page_balance_inactive(int max_to_move)2789 vm_page_balance_inactive(int max_to_move)
2790 {
2791 vm_page_t m;
2792
2793 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2794
2795 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2796 /*
2797 * It is likely that the hibernation code path is
2798 * dealing with these very queues as we are about
2799 * to move pages around in/from them and completely
2800 * change the linkage of the pages.
2801 *
2802 * And so we skip the rebalancing of these queues.
2803 */
2804 return;
2805 }
2806 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2807 vm_page_inactive_count +
2808 vm_page_speculative_count);
2809
2810 while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2811 VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2812
2813 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2814
2815 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2816 assert(!m->vmp_laundry);
2817 assert(VM_PAGE_OBJECT(m) != kernel_object);
2818 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2819
2820 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2821
2822 /*
2823 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2824 *
2825 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2826 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2827 * new reference happens. If no futher references happen on the page after that remote TLB flushes
2828 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2829 * by pageout_scan, which is just fine since the last reference would have happened quite far
2830 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2831 * have happened before we moved the page
2832 */
2833 if (m->vmp_pmapped == TRUE) {
2834 /*
2835 * We might be holding the page queue lock as a
2836 * spin lock and clearing the "referenced" bit could
2837 * take a while if there are lots of mappings of
2838 * that page, so make sure we acquire the lock as
2839 * as mutex to avoid a spinlock timeout.
2840 */
2841 vm_page_lockconvert_queues();
2842 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2843 }
2844
2845 /*
2846 * The page might be absent or busy,
2847 * but vm_page_deactivate can handle that.
2848 * FALSE indicates that we don't want a H/W clear reference
2849 */
2850 vm_page_deactivate_internal(m, FALSE);
2851 }
2852 }
2853
2854
/*
 *	vm_pageout_scan does the dirty work for the pageout daemon.
 *	It returns with both vm_page_queue_free_lock and vm_page_queue_lock
 *	held and vm_page_free_wanted == 0.
 *
 *	Overview of one pass through the main loop, as implemented below:
 *	  1. if the previous pass asked for it (lock_yield_check), either
 *	     flush the local free list / yield the page queue lock, or pause
 *	     briefly when vm_pageout_scan_wants_object is set
 *	  2. call vps_deal_with_secluded_page_overflow() and move at most one
 *	     page from the active to the inactive queue
 *	     (vm_page_balance_inactive(1))
 *	  3. if the free target is met and nobody is waiting for a page,
 *	     return
 *	  4. try to purge a volatile object, age the speculative queues, evict
 *	     from the object cache and apply flow control; each of these
 *	     helpers can restart the loop by returning
 *	     VM_PAGEOUT_SCAN_NEXT_ITERATION
 *	  5. choose a victim page, get its object locked, and then requeue,
 *	     reactivate, free (via local_freeq), throttle, or hand the page to
 *	     vm_pageout_cluster()
 *
 *	There are two ways out, both through the 'return_from_scan' label:
 *	the free-target check in step 3, and vps_flow_control() returning
 *	VM_PAGEOUT_SCAN_DONE_RETURN.
 *	NOTE(review): the second path relies on vps_flow_control() leaving the
 *	locks and vm_page_free_wanted in the state described above; that is
 *	not visible in this function -- confirm in vps_flow_control().
 */
void
vm_pageout_scan(void)
{
	unsigned int loop_count = 0;
	unsigned int inactive_burst_count = 0;
	unsigned int reactivated_this_call;
	unsigned int reactivate_limit;
	vm_page_t   local_freeq = NULL;         /* pages reclaimed here, chained through vmp_snext */
	int         local_freed = 0;            /* number of pages on local_freeq */
	int         delayed_unlock;
	int         delayed_unlock_limit = 0;
	int         refmod_state = 0;
	int         vm_pageout_deadlock_target = 0;
	struct  vm_pageout_queue *iq;
	struct  vm_pageout_queue *eq;
	struct  vm_speculative_age_q *sq;
	struct  flow_control    flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
	boolean_t inactive_throttled = FALSE;
	vm_object_t     object = NULL;          /* object whose lock we currently hold, if any */
	uint32_t        inactive_reclaim_run;
	boolean_t       grab_anonymous = FALSE;
	boolean_t       force_anonymous = FALSE;
	boolean_t       force_speculative_aging = FALSE;
	int             anons_grabbed = 0;
	int             page_prev_q_state = 0;  /* queue the current victim was taken from */
	boolean_t       page_from_bg_q = FALSE;
	uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
	vm_object_t     m_object = VM_OBJECT_NULL;      /* object owning the current victim */
	int             retval = 0;
	boolean_t       lock_yield_check = FALSE;       /* run the yield/flush check at the top of the next pass */


	VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
	    vm_pageout_vminfo.vm_pageout_freed_speculative,
	    vm_pageout_state.vm_pageout_inactive_clean,
	    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
	    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);

	flow_control.state = FCS_IDLE;
	iq = &vm_pageout_queue_internal;
	eq = &vm_pageout_queue_external;
	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/* Ask the pmap layer to return any pages it no longer needs. */
	uint64_t pmap_wired_pages_freed = pmap_release_pages_fast();

	vm_page_lock_queues();

	/* account for the pages the pmap layer just gave back */
	vm_page_wire_count -= pmap_wired_pages_freed;

	delayed_unlock = 1;

	/*
	 *	Calculate the max number of referenced pages on the inactive
	 *	queue that we will reactivate.
	 */
	reactivated_this_call = 0;
	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
	    vm_page_inactive_count);
	inactive_reclaim_run = 0;

	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;

	/*
	 *	We must limit the rate at which we send pages to the pagers
	 *	so that we don't tie up too many pages in the I/O queues.
	 *	We implement a throttling mechanism using the laundry count
	 *	to limit the number of pages outstanding to the default
	 *	and external pagers.  We can bypass the throttles and look
	 *	for clean pages if the pageout queues don't drain in a timely
	 *	fashion since this may indicate that the pageout paths are
	 *	stalled waiting for memory, which only we can provide.
	 */

	vps_init_page_targets();
	assert(object == NULL);
	assert(delayed_unlock != 0);

	for (;;) {
		vm_page_t m;

		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);

		/*
		 * Only the paths that set lock_yield_check = TRUE before
		 * 'continue' get here; the flag is consumed (cleared)
		 * immediately.
		 */
		if (lock_yield_check) {
			lock_yield_check = FALSE;

			if (delayed_unlock++ > delayed_unlock_limit) {
				/* remember whether there was anything to free before the helper runs */
				int freed = local_freed;

				vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
				    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
				if (freed == 0) {
					lck_mtx_yield(&vm_page_queue_lock);
				}
			} else if (vm_pageout_scan_wants_object) {
				/* briefly drop the page queue lock and pause before retaking it */
				vm_page_unlock_queues();
				mutex_pause(0);
				vm_page_lock_queues();
			}
		}

		/* clamp a negative waiter count to zero before using it below */
		if (vm_upl_wait_for_pages < 0) {
			vm_upl_wait_for_pages = 0;
		}

		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;

		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
		}

		vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);

		assert(delayed_unlock);

		/*
		 * maintain our balance
		 */
		vm_page_balance_inactive(1);


		/**********************************************************************
		* above this point we're playing with the active and secluded queues
		* below this point we're playing with the throttling mechanisms
		* and the inactive queue
		**********************************************************************/

		/* pages sitting on our local free list count towards the target */
		if (vm_page_free_count + local_freed >= vm_page_free_target) {
			vm_pageout_scan_wants_object = VM_OBJECT_NULL;

			vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
			    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
			/*
			 * make sure the pageout I/O threads are running
			 * throttled in case there are still requests
			 * in the laundry... since we have met our targets
			 * we don't need the laundry to be cleaned in a timely
			 * fashion... so let's avoid interfering with foreground
			 * activity
			 */
			vm_pageout_adjust_eq_iothrottle(eq, TRUE);

			lck_mtx_lock(&vm_page_queue_free_lock);

			/* re-check the target, this time against the global count only and under the free lock */
			if ((vm_page_free_count >= vm_page_free_target) &&
			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
				/*
				 * done - we have met our target *and*
				 * there is no one waiting for a page.
				 */
				/*
				 * Also the target of the 'goto' taken when
				 * vps_flow_control() returns
				 * VM_PAGEOUT_SCAN_DONE_RETURN.  On the path
				 * through this 'if', vm_page_queue_free_lock
				 * was taken just above and is not released
				 * before returning.
				 */
return_from_scan:
				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);

				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
				    vm_pageout_state.vm_pageout_inactive,
				    vm_pageout_state.vm_pageout_inactive_used, 0, 0);
				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
				    vm_pageout_vminfo.vm_pageout_freed_speculative,
				    vm_pageout_state.vm_pageout_inactive_clean,
				    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
				    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);

				return;
			}
			/* target not really met, or someone is waiting: keep scanning */
			lck_mtx_unlock(&vm_page_queue_free_lock);
		}

		/*
		 * Before anything, we check if we have any ripe volatile
		 * objects around. If so, try to purge the first object.
		 * If the purge fails, fall through to reclaim a page instead.
		 * If the purge succeeds, go back to the top and reevaluate
		 * the new memory situation.
		 */
		retval = vps_purge_object();

		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
			/*
			 * Success
			 */
			if (object != NULL) {
				vm_object_unlock(object);
				object = NULL;
			}

			lock_yield_check = FALSE;
			continue;
		}

		/*
		 * If our 'aged' queue is empty and we have some speculative pages
		 * in the other queues, let's go through and see if we need to age
		 * them.
		 *
		 * If we succeeded in aging a speculative Q or just that everything
		 * looks normal w.r.t queue age and queue counts, we keep going onward.
		 *
		 * If, for some reason, we seem to have a mismatch between the spec.
		 * page count and the page queues, we reset those variables and
		 * restart the loop (LD TODO: Track this better?).
		 */
		if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
			retval = vps_age_speculative_queue(force_speculative_aging);

			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
				lock_yield_check = FALSE;
				continue;
			}
		}
		/* the forced-aging request (set in the no-victim path below) lasts for one pass only */
		force_speculative_aging = FALSE;

		/*
		 * Check to see if we need to evict objects from the cache.
		 *
		 * Note: 'object' here doesn't have anything to do with
		 * the eviction part. We just need to make sure we have dropped
		 * any object lock we might be holding if we need to go down
		 * into the eviction logic.
		 */
		retval = vps_object_cache_evict(&object);

		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
			lock_yield_check = FALSE;
			continue;
		}


		/*
		 * Calculate our filecache_min that will affect the loop
		 * going forward.
		 */
		vps_calculate_filecache_min();

		/*
		 * LD TODO: Use a structure to hold all state variables for a single
		 * vm_pageout_scan iteration and pass that structure to this function instead.
		 */
		retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
		    &delayed_unlock, &local_freeq, &local_freed,
		    &vm_pageout_deadlock_target, inactive_burst_count);

		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
			if (loop_count >= vm_page_inactive_count) {
				loop_count = 0;
			}

			inactive_burst_count = 0;

			assert(object == NULL);
			assert(delayed_unlock != 0);

			lock_yield_check = FALSE;
			continue;
		} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
			goto return_from_scan;
		}

		flow_control.state = FCS_IDLE;

		/* the limit can only shrink over the course of one call */
		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
		    vm_pageout_inactive_external_forced_reactivate_limit);
		loop_count++;
		inactive_burst_count++;
		vm_pageout_state.vm_pageout_inactive++;

		/*
		 * Choose a victim.
		 */

		m = NULL;
		retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);

		if (m == NULL) {
			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
				inactive_burst_count = 0;

				/*
				 * NOTE(review): no page was selected on this
				 * pass, so page_prev_q_state still holds the
				 * value recorded for an earlier victim (or its
				 * initial 0) -- confirm this debug counter is
				 * meant to be driven by that stale value.
				 */
				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
				}

				lock_yield_check = TRUE;
				continue;
			}

			/*
			 * if we've gotten here, we have no victim page.
			 * check to see if we've not finished balancing the queues
			 * or we have a page on the aged speculative queue that we
			 * skipped due to force_anonymous == TRUE.. or we have
			 * speculative pages that we can prematurely age... if
			 * one of these cases we'll keep going, else panic
			 */
			force_anonymous = FALSE;
			VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);

			if (!vm_page_queue_empty(&sq->age_q)) {
				lock_yield_check = TRUE;
				continue;
			}

			if (vm_page_speculative_count) {
				/* ask the next pass to age the speculative queues regardless */
				force_speculative_aging = TRUE;
				lock_yield_check = TRUE;
				continue;
			}
			panic("vm_pageout: no victim");

			/* NOTREACHED */
		}

		assert(VM_PAGE_PAGEABLE(m));
		m_object = VM_PAGE_OBJECT(m);
		force_anonymous = FALSE;

		/* record where the page came from before taking it off its queue */
		page_prev_q_state = m->vmp_q_state;
		/*
		 * we just found this page on one of our queues...
		 * it can't also be on the pageout queue, so safe
		 * to call vm_page_queues_remove
		 */
		vm_page_queues_remove(m, TRUE);

		assert(!m->vmp_laundry);
		assert(!m->vmp_private);
		assert(!m->vmp_fictitious);
		assert(m_object != kernel_object);
		assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);

		vm_pageout_vminfo.vm_pageout_considered_page++;

		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != object) {
			boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);

			/*
			 * vps_switch_object() will always drop the 'object' lock first
			 * and then try to acquire the 'm_object' lock. So 'object' has to point to
			 * either 'm_object' or NULL.
			 */
			retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);

			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
				lock_yield_check = TRUE;
				continue;
			}
		}
		assert(m_object == object);
		assert(VM_PAGE_OBJECT(m) == m_object);

		if (m->vmp_busy) {
			/*
			 *	Somebody is already playing with this page.
			 *	Put it back on the appropriate queue
			 *
			 */
			VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);

			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
			}

			vps_requeue_page(m, page_prev_q_state, page_from_bg_q);

			lock_yield_check = TRUE;
			continue;
		}

		/*
		 *   if (m->vmp_cleaning && !m->vmp_free_when_done)
		 *	If already cleaning this page in place
		 *	just leave it off the paging queues.
		 *	We can leave the page mapped, and upl_commit_range
		 *	will put it on the clean queue.
		 *
		 *   if (m->vmp_free_when_done && !m->vmp_cleaning)
		 *	an msync INVALIDATE is in progress...
		 *	this page has been marked for destruction
		 *      after it has been cleaned,
		 *      but not yet gathered into a UPL
		 *	where 'cleaning' will be set...
		 *	just leave it off the paging queues
		 *
		 *   if (m->vmp_free_when_done && m->vmp_cleaning)
		 *	an msync INVALIDATE is in progress
		 *	and the UPL has already gathered this page...
		 *	just leave it off the paging queues
		 */
		if (m->vmp_free_when_done || m->vmp_cleaning) {
			lock_yield_check = TRUE;
			continue;
		}


		/*
		 *	If it's absent, in error or the object is no longer alive,
		 *	we can reclaim the page... in the no longer alive case,
		 *	there are 2 states the page can be in that preclude us
		 *	from reclaiming it - busy or cleaning - that we've already
		 *	dealt with
		 */
		if (m->vmp_absent || m->vmp_error || !object->alive) {
			if (m->vmp_absent) {
				VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
			} else if (!object->alive) {
				VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
			} else {
				VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
			}
			/*
			 * Common "free this page" tail.  Besides falling in
			 * from above, it is entered by 'goto' from the
			 * empty-purgeable-object case and from the
			 * clean-page case further down.
			 */
reclaim_page:
			if (vm_pageout_deadlock_target) {
				VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
				vm_pageout_deadlock_target--;
			}

			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);

			if (object->internal) {
				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
			} else {
				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
			}
			assert(!m->vmp_cleaning);
			assert(!m->vmp_laundry);

			/* statistics only: count reclaims of pages backed by the shared region pager */
			if (!object->internal &&
			    object->pager != NULL &&
			    object->pager->mo_pager_ops == &shared_region_pager_ops) {
				shared_region_pager_reclaimed++;
			}

			m->vmp_busy = TRUE;

			/*
			 * remove page from object here since we're already
			 * behind the object lock... defer the rest of the work
			 * we'd normally do in vm_page_free_prepare_object
			 * until 'vm_page_free_list' is called
			 */
			if (m->vmp_tabled) {
				vm_page_remove(m, TRUE);
			}

			/* push the page onto the front of the local free list */
			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
			m->vmp_snext = local_freeq;
			local_freeq = m;
			local_freed++;

			/* attribute the freed page to the queue it was taken from */
			if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
				vm_pageout_vminfo.vm_pageout_freed_speculative++;
			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
				vm_pageout_vminfo.vm_pageout_freed_cleaned++;
			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
				vm_pageout_vminfo.vm_pageout_freed_internal++;
			} else {
				vm_pageout_vminfo.vm_pageout_freed_external++;
			}

			inactive_burst_count = 0;

			lock_yield_check = TRUE;
			continue;
		}
		if (object->copy == VM_OBJECT_NULL) {
			/*
			 * No one else can have any interest in this page.
			 * If this is an empty purgable object, the page can be
			 * reclaimed even if dirty.
			 * If the page belongs to a volatile purgable object, we
			 * reactivate it if the compressor isn't active.
			 */
			if (object->purgable == VM_PURGABLE_EMPTY) {
				if (m->vmp_pmapped == TRUE) {
					/* unmap the page */
					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
					if (refmod_state & VM_MEM_MODIFIED) {
						SET_PAGE_DIRTY(m, FALSE);
					}
				}
				if (m->vmp_dirty || m->vmp_precious) {
					/* we saved the cost of cleaning this page ! */
					vm_page_purged_count++;
				}
				goto reclaim_page;
			}

			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
				/*
				 * With the VM compressor, the cost of
				 * reclaiming a page is much lower (no I/O),
				 * so if we find a "volatile" page, it's better
				 * to let it get compressed rather than letting
				 * it occupy a full page until it gets purged.
				 * So no need to check for "volatile" here.
				 */
			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
				/*
				 * Avoid cleaning a "volatile" page which might
				 * be purged soon.
				 */

				/* if it's wired, we can't put it on our queue */
				assert(!VM_PAGE_WIRED(m));

				/* just stick it back on! */
				reactivated_this_call++;

				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
				}

				goto reactivate_page;
			}
		}
		/*
		 *	If it's being used, reactivate.
		 *	(Fictitious pages are either busy or absent.)
		 *	First, update the reference and dirty bits
		 *	to make sure the page is unreferenced.
		 */
		/* -1 is the "pmap_get_refmod() not called yet" marker tested further down */
		refmod_state = -1;

		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		if (m->vmp_reference || m->vmp_dirty) {
			/* deal with a rogue "reusable" page */
			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
		}

		/* recompute the xpmapped minimum, guarding the division against a zero divisor */
		if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
			vm_pageout_state.vm_page_xpmapped_min = 0;
		} else {
			vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
		}

		if (!m->vmp_no_cache &&
		    page_from_bg_q == FALSE &&
		    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
		    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
			/*
			 * The page we pulled off the inactive list has
			 * been referenced.  It is possible for other
			 * processors to be touching pages faster than we
			 * can clear the referenced bit and traverse the
			 * inactive queue, so we limit the number of
			 * reactivations.
			 */
			if (++reactivated_this_call >= reactivate_limit) {
				vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
				vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
			} else {
				uint32_t isinuse;

				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
				}

				vm_pageout_vminfo.vm_pageout_inactive_referenced++;
				/*
				 * Also entered by 'goto' from the volatile
				 * purgeable case above and, since this label
				 * sits inside the block declaring 'isinuse',
				 * that variable is only read after
				 * vnode_pager_get_isinuse() has been called
				 * on it (the '&&' chain below short-circuits
				 * otherwise).
				 */
reactivate_page:
				if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
				    vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
					/*
					 * no explicit mappings of this object exist
					 * and it's not open via the filesystem
					 */
					vm_page_deactivate(m);
					VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
				} else {
					/*
					 * The page was/is being used, so put back on active list.
					 */
					vm_page_activate(m);
					counter_inc(&vm_statistics_reactivations);
					inactive_burst_count = 0;
				}
#if CONFIG_BACKGROUND_QUEUE
#if DEVELOPMENT || DEBUG
				if (page_from_bg_q == TRUE) {
					if (m_object->internal) {
						vm_pageout_rejected_bq_internal++;
					} else {
						vm_pageout_rejected_bq_external++;
					}
				}
#endif /* DEVELOPMENT || DEBUG */
#endif /* CONFIG_BACKGROUND_QUEUE */

				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
				}
				vm_pageout_state.vm_pageout_inactive_used++;

				lock_yield_check = TRUE;
				continue;
			}
			/*
			 *	Make sure we call pmap_get_refmod() if it
			 *	wasn't already called just above, to update
			 *	the dirty bit.
			 */
			if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
				refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
				if (refmod_state & VM_MEM_MODIFIED) {
					SET_PAGE_DIRTY(m, FALSE);
				}
			}
		}

		/*
		 * we've got a candidate page to steal...
		 *
		 * m->vmp_dirty is up to date courtesy of the
		 * preceding check for m->vmp_reference... if
		 * we get here, then m->vmp_reference had to be
		 * FALSE (or possibly "reactivate_limit" was
		 * exceeded), but in either case we called
		 * pmap_get_refmod() and updated both
		 * m->vmp_reference and m->vmp_dirty
		 *
		 * if it's dirty or precious we need to
		 * see if the target queue is throttled
		 * if it is, we need to skip over it by moving it back
		 * to the end of the inactive queue
		 */

		inactive_throttled = FALSE;

		if (m->vmp_dirty || m->vmp_precious) {
			if (object->internal) {
				if (VM_PAGE_Q_THROTTLED(iq)) {
					inactive_throttled = TRUE;
				}
			} else if (VM_PAGE_Q_THROTTLED(eq)) {
				inactive_throttled = TRUE;
			}
		}
		/*
		 * Re-entered by 'goto' from the final throttle check below,
		 * after the page has been disconnected from all pmaps.
		 */
throttle_inactive:
		if (!VM_DYNAMIC_PAGING_ENABLED() &&
		    object->internal && m->vmp_dirty &&
		    (object->purgable == VM_PURGABLE_DENY ||
		    object->purgable == VM_PURGABLE_NONVOLATILE ||
		    object->purgable == VM_PURGABLE_VOLATILE)) {
			/* park the dirty internal page on the throttled queue */
			vm_page_check_pageable_safe(m);
			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
			vm_page_throttled_count++;

			VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);

			inactive_burst_count = 0;

			lock_yield_check = TRUE;
			continue;
		}
		if (inactive_throttled == TRUE) {
			vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
			    &delayed_unlock, &force_anonymous, page_from_bg_q);

			inactive_burst_count = 0;

			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
			}

			lock_yield_check = TRUE;
			continue;
		}

		/*
		 * we've got a page that we can steal...
		 * eliminate all mappings and make sure
		 * we have the up-to-date modified state
		 *
		 * if we need to do a pmap_disconnect then we
		 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
		 * provides the true state atomically... the
		 * page was still mapped up to the pmap_disconnect
		 * and may have been dirtied at the last microsecond
		 *
		 * Note that if 'pmapped' is FALSE then the page is not
		 * and has not been in any map, so there is no point calling
		 * pmap_disconnect().  m->vmp_dirty could have been set in anticipation
		 * of likely usage of the page.
		 */
		if (m->vmp_pmapped == TRUE) {
			int pmap_options;

			/*
			 * Don't count this page as going into the compressor
			 * if any of these are true:
			 * 1) compressed pager isn't enabled
			 * 2) Freezer enabled device with compressed pager
			 *    backend (exclusive use) i.e. most of the VM system
			 *    (including vm_pageout_scan) has no knowledge of
			 *    the compressor
			 * 3) This page belongs to a file and hence will not be
			 *    sent into the compressor
			 */
			if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
			    object->internal == FALSE) {
				pmap_options = 0;
			} else if (m->vmp_dirty || m->vmp_precious) {
				/*
				 * VM knows that this page is dirty (or
				 * precious) and needs to be compressed
				 * rather than freed.
				 * Tell the pmap layer to count this page
				 * as "compressed".
				 */
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				/*
				 * VM does not know if the page needs to
				 * be preserved but the pmap layer might tell
				 * us if any mapping has "modified" it.
				 * Let the pmap layer count this page
				 * as compressed if and only if it has been
				 * modified.
				 */
				pmap_options =
				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
			    pmap_options,
			    NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		/*
		 * reset our count of pages that have been reclaimed
		 * since the last page was 'stolen'
		 */
		inactive_reclaim_run = 0;

		/*
		 *	If it's clean and not precious, we can free the page.
		 */
		if (!m->vmp_dirty && !m->vmp_precious) {
			vm_pageout_state.vm_pageout_inactive_clean++;

			/*
			 * OK, at this point we have found a page we are going to free.
			 */
#if CONFIG_PHANTOM_CACHE
			if (!object->internal) {
				vm_phantom_cache_add_ghost(m);
			}
#endif
			goto reclaim_page;
		}

		/*
		 * The page may have been dirtied since the last check
		 * for a throttled target queue (which may have been skipped
		 * if the page was clean then).  With the dirty page
		 * disconnected here, we can make one final check.
		 */
		if (object->internal) {
			if (VM_PAGE_Q_THROTTLED(iq)) {
				inactive_throttled = TRUE;
			}
		} else if (VM_PAGE_Q_THROTTLED(eq)) {
			inactive_throttled = TRUE;
		}

		if (inactive_throttled == TRUE) {
			goto throttle_inactive;
		}

#if VM_PRESSURE_EVENTS
#if CONFIG_JETSAM

		/*
		 * If Jetsam is enabled, then the sending
		 * of memory pressure notifications is handled
		 * from the same thread that takes care of high-water
		 * and other jetsams i.e. the memorystatus_thread.
		 */

#else /* CONFIG_JETSAM */

		vm_pressure_response();

#endif /* CONFIG_JETSAM */
#endif /* VM_PRESSURE_EVENTS */

		if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
			VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
		}

		if (object->internal) {
			vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
		} else {
			vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
		}

		/*
		 * internal pages will go to the compressor...
		 * external pages will go to the appropriate pager to be cleaned
		 * and upon completion will end up on 'vm_page_queue_cleaned' which
		 * is a preferred queue to steal from
		 */
		vm_pageout_cluster(m);
		inactive_burst_count = 0;

		/*
		 * back to top of pageout scan loop
		 *
		 * NOTE(review): unlike the 'continue' paths above, this
		 * fall-through does not set lock_yield_check, so the next
		 * pass skips the yield/flush check at the top of the loop --
		 * confirm that is intended.
		 */
	}
}
3688
3689
3690 void
vm_page_free_reserve(int pages)3691 vm_page_free_reserve(
3692 int pages)
3693 {
3694 int free_after_reserve;
3695
3696 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3697 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3698 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3699 } else {
3700 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3701 }
3702 } else {
3703 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3704 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3705 } else {
3706 vm_page_free_reserved += pages;
3707 }
3708 }
3709 free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3710
3711 vm_page_free_min = vm_page_free_reserved +
3712 VM_PAGE_FREE_MIN(free_after_reserve);
3713
3714 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3715 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3716 }
3717
3718 vm_page_free_target = vm_page_free_reserved +
3719 VM_PAGE_FREE_TARGET(free_after_reserve);
3720
3721 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3722 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3723 }
3724
3725 if (vm_page_free_target < vm_page_free_min + 5) {
3726 vm_page_free_target = vm_page_free_min + 5;
3727 }
3728
3729 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3730 }
3731
/*
 *	vm_pageout is the high level pageout daemon.
 *
 *	vm_pageout_continue is one cycle of that daemon: mark the daemon as
 *	running, call vm_pageout_scan(), then go to sleep on
 *	&vm_page_free_wanted with this same function registered as the
 *	continuation passed to thread_block().
 */

void
vm_pageout_continue(void)
{
	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);

	/* vm_pageout_running is written under vm_page_queue_free_lock */
	lck_mtx_lock(&vm_page_queue_free_lock);
	vm_pageout_running = TRUE;
	lck_mtx_unlock(&vm_page_queue_free_lock);

	vm_pageout_scan();
	/*
	 * we hold both the vm_page_queue_free_lock
	 * and the vm_page_queues_lock at this point
	 */
	assert(vm_page_free_wanted == 0);
	assert(vm_page_free_wanted_privileged == 0);
	/*
	 * Register the wait before the locks are dropped below; the
	 * actual blocking happens in thread_block() at the end.
	 */
	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);

	vm_pageout_running = FALSE;
#if XNU_TARGET_OS_OSX
	/* wake whoever flagged itself as waiting for this pass to finish */
	if (vm_pageout_waiter) {
		vm_pageout_waiter = FALSE;
		thread_wakeup((event_t)&vm_pageout_waiter);
	}
#endif /* XNU_TARGET_OS_OSX */

	lck_mtx_unlock(&vm_page_queue_free_lock);
	vm_page_unlock_queues();

	/* block with this function as the continuation; control is not expected to come back here */
	thread_block((thread_continue_t)vm_pageout_continue);
	/*NOTREACHED*/
}
3769
#if XNU_TARGET_OS_OSX
/*
 * vm_pageout_wait:
 *
 * Sleep on &vm_pageout_waiter, under vm_page_queue_free_lock, for as long
 * as vm_pageout_running is set.
 *
 * Returns KERN_SUCCESS once vm_pageout_running is seen clear, or
 * KERN_OPERATION_TIMED_OUT as soon as a sleep ends with anything other
 * than THREAD_AWAKENED.
 */
kern_return_t
vm_pageout_wait(uint64_t deadline)
{
	kern_return_t   status = KERN_SUCCESS;

	lck_mtx_lock(&vm_page_queue_free_lock);

	while (vm_pageout_running) {
		/* flag ourselves so the pageout thread knows to issue a wakeup */
		vm_pageout_waiter = TRUE;

		if (lck_mtx_sleep_deadline(&vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
		    (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline) != THREAD_AWAKENED) {
			status = KERN_OPERATION_TIMED_OUT;
			break;
		}
	}

	lck_mtx_unlock(&vm_page_queue_free_lock);

	return status;
}
#endif /* XNU_TARGET_OS_OSX */
3790
3791
3792 static void
vm_pageout_iothread_external_continue(struct vm_pageout_queue * q)3793 vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
3794 {
3795 vm_page_t m = NULL;
3796 vm_object_t object;
3797 vm_object_offset_t offset;
3798 memory_object_t pager;
3799
3800 /* On systems with a compressor, the external IO thread clears its
3801 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3802 * creation)
3803 */
3804 if (vm_pageout_state.vm_pageout_internal_iothread != THREAD_NULL) {
3805 current_thread()->options &= ~TH_OPT_VMPRIV;
3806 }
3807
3808 vm_page_lockspin_queues();
3809
3810 while (!vm_page_queue_empty(&q->pgo_pending)) {
3811 q->pgo_busy = TRUE;
3812 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3813
3814 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3815 VM_PAGE_CHECK(m);
3816 /*
3817 * grab a snapshot of the object and offset this
3818 * page is tabled in so that we can relookup this
3819 * page after we've taken the object lock - these
3820 * fields are stable while we hold the page queues lock
3821 * but as soon as we drop it, there is nothing to keep
3822 * this page in this object... we hold an activity_in_progress
3823 * on this object which will keep it from terminating
3824 */
3825 object = VM_PAGE_OBJECT(m);
3826 offset = m->vmp_offset;
3827
3828 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3829 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3830
3831 vm_page_unlock_queues();
3832
3833 vm_object_lock(object);
3834
3835 m = vm_page_lookup(object, offset);
3836
3837 if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
3838 !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
3839 /*
3840 * it's either the same page that someone else has
3841 * started cleaning (or it's finished cleaning or
3842 * been put back on the pageout queue), or
3843 * the page has been freed or we have found a
3844 * new page at this offset... in all of these cases
3845 * we merely need to release the activity_in_progress
3846 * we took when we put the page on the pageout queue
3847 */
3848 vm_object_activity_end(object);
3849 vm_object_unlock(object);
3850
3851 vm_page_lockspin_queues();
3852 continue;
3853 }
3854 pager = object->pager;
3855
3856 if (pager == MEMORY_OBJECT_NULL) {
3857 /*
3858 * This pager has been destroyed by either
3859 * memory_object_destroy or vm_object_destroy, and
3860 * so there is nowhere for the page to go.
3861 */
3862 if (m->vmp_free_when_done) {
3863 /*
3864 * Just free the page... VM_PAGE_FREE takes
3865 * care of cleaning up all the state...
3866 * including doing the vm_pageout_throttle_up
3867 */
3868 VM_PAGE_FREE(m);
3869 } else {
3870 vm_page_lockspin_queues();
3871
3872 vm_pageout_throttle_up(m);
3873 vm_page_activate(m);
3874
3875 vm_page_unlock_queues();
3876
3877 /*
3878 * And we are done with it.
3879 */
3880 }
3881 vm_object_activity_end(object);
3882 vm_object_unlock(object);
3883
3884 vm_page_lockspin_queues();
3885 continue;
3886 }
3887 #if 0
3888 /*
3889 * we don't hold the page queue lock
3890 * so this check isn't safe to make
3891 */
3892 VM_PAGE_CHECK(m);
3893 #endif
3894 /*
3895 * give back the activity_in_progress reference we
3896 * took when we queued up this page and replace it
3897 * it with a paging_in_progress reference that will
3898 * also hold the paging offset from changing and
3899 * prevent the object from terminating
3900 */
3901 vm_object_activity_end(object);
3902 vm_object_paging_begin(object);
3903 vm_object_unlock(object);
3904
3905 /*
3906 * Send the data to the pager.
3907 * any pageout clustering happens there
3908 */
3909 memory_object_data_return(pager,
3910 m->vmp_offset + object->paging_offset,
3911 PAGE_SIZE,
3912 NULL,
3913 NULL,
3914 FALSE,
3915 FALSE,
3916 0);
3917
3918 vm_object_lock(object);
3919 vm_object_paging_end(object);
3920 vm_object_unlock(object);
3921
3922 vm_pageout_io_throttle();
3923
3924 vm_page_lockspin_queues();
3925 }
3926 q->pgo_busy = FALSE;
3927 q->pgo_idle = TRUE;
3928
3929 assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
3930 vm_page_unlock_queues();
3931
3932 thread_block_parameter((thread_continue_t)vm_pageout_iothread_external_continue, (void *) q);
3933 /*NOTREACHED*/
3934 }
3935
3936
3937 #define MAX_FREE_BATCH 32
3938 uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
3939 * this thread.
3940 */
3941
3942
3943 void
3944 vm_pageout_iothread_internal_continue(struct cq *);
3945 void
vm_pageout_iothread_internal_continue(struct cq * cq)3946 vm_pageout_iothread_internal_continue(struct cq *cq)
3947 {
3948 struct vm_pageout_queue *q;
3949 vm_page_t m = NULL;
3950 boolean_t pgo_draining;
3951 vm_page_t local_q;
3952 int local_cnt;
3953 vm_page_t local_freeq = NULL;
3954 int local_freed = 0;
3955 int local_batch_size;
3956 #if DEVELOPMENT || DEBUG
3957 int ncomps = 0;
3958 boolean_t marked_active = FALSE;
3959 #endif
3960 KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
3961
3962 q = cq->q;
3963 #if __AMP__
3964 if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
3965 local_batch_size = (q->pgo_maxlaundry >> 3);
3966 local_batch_size = MAX(local_batch_size, 16);
3967 } else {
3968 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
3969 }
3970 #else
3971 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
3972 #endif
3973
3974 #if RECORD_THE_COMPRESSED_DATA
3975 if (q->pgo_laundry) {
3976 c_compressed_record_init();
3977 }
3978 #endif
3979 while (TRUE) {
3980 int pages_left_on_q = 0;
3981
3982 local_cnt = 0;
3983 local_q = NULL;
3984
3985 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
3986
3987 vm_page_lock_queues();
3988 #if DEVELOPMENT || DEBUG
3989 if (marked_active == FALSE) {
3990 vmct_active++;
3991 vmct_state[cq->id] = VMCT_ACTIVE;
3992 marked_active = TRUE;
3993 if (vmct_active == 1) {
3994 vm_compressor_epoch_start = mach_absolute_time();
3995 }
3996 }
3997 #endif
3998 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
3999
4000 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
4001
4002 while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4003 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4004 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4005 VM_PAGE_CHECK(m);
4006
4007 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4008 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4009 m->vmp_laundry = FALSE;
4010
4011 m->vmp_snext = local_q;
4012 local_q = m;
4013 local_cnt++;
4014 }
4015 if (local_q == NULL) {
4016 break;
4017 }
4018
4019 q->pgo_busy = TRUE;
4020
4021 if ((pgo_draining = q->pgo_draining) == FALSE) {
4022 vm_pageout_throttle_up_batch(q, local_cnt);
4023 pages_left_on_q = q->pgo_laundry;
4024 } else {
4025 pages_left_on_q = q->pgo_laundry - local_cnt;
4026 }
4027
4028 vm_page_unlock_queues();
4029
4030 #if !RECORD_THE_COMPRESSED_DATA
4031 if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4032 thread_wakeup((event_t) ((uintptr_t)&q->pgo_pending + cq->id + 1));
4033 }
4034 #endif
4035 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
4036
4037 while (local_q) {
4038 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
4039
4040 m = local_q;
4041 local_q = m->vmp_snext;
4042 m->vmp_snext = NULL;
4043
4044 if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4045 #if DEVELOPMENT || DEBUG
4046 ncomps++;
4047 #endif
4048 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);
4049
4050 m->vmp_snext = local_freeq;
4051 local_freeq = m;
4052 local_freed++;
4053
4054 if (local_freed >= MAX_FREE_BATCH) {
4055 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4056
4057 vm_page_free_list(local_freeq, TRUE);
4058
4059 local_freeq = NULL;
4060 local_freed = 0;
4061 }
4062 }
4063 #if !CONFIG_JETSAM
4064 while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4065 kern_return_t wait_result;
4066 int need_wakeup = 0;
4067
4068 if (local_freeq) {
4069 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4070
4071 vm_page_free_list(local_freeq, TRUE);
4072 local_freeq = NULL;
4073 local_freed = 0;
4074
4075 continue;
4076 }
4077 lck_mtx_lock_spin(&vm_page_queue_free_lock);
4078
4079 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4080 if (vm_page_free_wanted_privileged++ == 0) {
4081 need_wakeup = 1;
4082 }
4083 wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4084
4085 lck_mtx_unlock(&vm_page_queue_free_lock);
4086
4087 if (need_wakeup) {
4088 thread_wakeup((event_t)&vm_page_free_wanted);
4089 }
4090
4091 if (wait_result == THREAD_WAITING) {
4092 thread_block(THREAD_CONTINUE_NULL);
4093 }
4094 } else {
4095 lck_mtx_unlock(&vm_page_queue_free_lock);
4096 }
4097 }
4098 #endif
4099 }
4100 if (local_freeq) {
4101 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4102
4103 vm_page_free_list(local_freeq, TRUE);
4104 local_freeq = NULL;
4105 local_freed = 0;
4106 }
4107 if (pgo_draining == TRUE) {
4108 vm_page_lockspin_queues();
4109 vm_pageout_throttle_up_batch(q, local_cnt);
4110 vm_page_unlock_queues();
4111 }
4112 }
4113 KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
4114
4115 /*
4116 * queue lock is held and our q is empty
4117 */
4118 q->pgo_busy = FALSE;
4119 q->pgo_idle = TRUE;
4120
4121 assert_wait((event_t) ((uintptr_t)&q->pgo_pending + cq->id), THREAD_UNINT);
4122 #if DEVELOPMENT || DEBUG
4123 if (marked_active == TRUE) {
4124 vmct_active--;
4125 vmct_state[cq->id] = VMCT_IDLE;
4126
4127 if (vmct_active == 0) {
4128 vm_compressor_epoch_stop = mach_absolute_time();
4129 assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4130 "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4131 vm_compressor_epoch_start, vm_compressor_epoch_stop);
4132 /* This interval includes intervals where one or more
4133 * compressor threads were pre-empted
4134 */
4135 vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4136 }
4137 }
4138 #endif
4139 vm_page_unlock_queues();
4140 #if DEVELOPMENT || DEBUG
4141 if (__improbable(vm_compressor_time_thread)) {
4142 vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
4143 vmct_stats.vmct_pages[cq->id] += ncomps;
4144 vmct_stats.vmct_iterations[cq->id]++;
4145 if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
4146 vmct_stats.vmct_maxpages[cq->id] = ncomps;
4147 }
4148 if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
4149 vmct_stats.vmct_minpages[cq->id] = ncomps;
4150 }
4151 }
4152 #endif
4153
4154 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4155
4156 thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4157 /*NOTREACHED*/
4158 }
4159
4160
4161 kern_return_t
vm_pageout_compress_page(void ** current_chead,char * scratch_buf,vm_page_t m)4162 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4163 {
4164 vm_object_t object;
4165 memory_object_t pager;
4166 int compressed_count_delta;
4167 kern_return_t retval;
4168
4169 object = VM_PAGE_OBJECT(m);
4170
4171 assert(!m->vmp_free_when_done);
4172 assert(!m->vmp_laundry);
4173
4174 pager = object->pager;
4175
4176 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4177 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
4178
4179 vm_object_lock(object);
4180
4181 /*
4182 * If there is no memory object for the page, create
4183 * one and hand it to the compression pager.
4184 */
4185
4186 if (!object->pager_initialized) {
4187 vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4188 }
4189 if (!object->pager_initialized) {
4190 vm_object_compressor_pager_create(object);
4191 }
4192
4193 pager = object->pager;
4194
4195 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4196 /*
4197 * Still no pager for the object,
4198 * or the pager has been destroyed.
4199 * Reactivate the page.
4200 *
4201 * Should only happen if there is no
4202 * compression pager
4203 */
4204 PAGE_WAKEUP_DONE(m);
4205
4206 vm_page_lockspin_queues();
4207 vm_page_activate(m);
4208 VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4209 vm_page_unlock_queues();
4210
4211 /*
4212 * And we are done with it.
4213 */
4214 vm_object_activity_end(object);
4215 vm_object_unlock(object);
4216
4217 return KERN_FAILURE;
4218 }
4219 vm_object_unlock(object);
4220
4221 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
4222 }
4223 assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4224 assert(object->activity_in_progress > 0);
4225
4226 retval = vm_compressor_pager_put(
4227 pager,
4228 m->vmp_offset + object->paging_offset,
4229 VM_PAGE_GET_PHYS_PAGE(m),
4230 current_chead,
4231 scratch_buf,
4232 &compressed_count_delta);
4233
4234 vm_object_lock(object);
4235
4236 assert(object->activity_in_progress > 0);
4237 assert(VM_PAGE_OBJECT(m) == object);
4238 assert( !VM_PAGE_WIRED(m));
4239
4240 vm_compressor_pager_count(pager,
4241 compressed_count_delta,
4242 FALSE, /* shared_lock */
4243 object);
4244
4245 if (retval == KERN_SUCCESS) {
4246 /*
4247 * If the object is purgeable, its owner's
4248 * purgeable ledgers will be updated in
4249 * vm_page_remove() but the page still
4250 * contributes to the owner's memory footprint,
4251 * so account for it as such.
4252 */
4253 if ((object->purgable != VM_PURGABLE_DENY ||
4254 object->vo_ledger_tag) &&
4255 object->vo_owner != NULL) {
4256 /* one more compressed purgeable/tagged page */
4257 vm_object_owner_compressed_update(object,
4258 +1);
4259 }
4260 counter_inc(&vm_statistics_compressions);
4261
4262 if (m->vmp_tabled) {
4263 vm_page_remove(m, TRUE);
4264 }
4265 } else {
4266 PAGE_WAKEUP_DONE(m);
4267
4268 vm_page_lockspin_queues();
4269
4270 vm_page_activate(m);
4271 vm_pageout_vminfo.vm_compressor_failed++;
4272
4273 vm_page_unlock_queues();
4274 }
4275 vm_object_activity_end(object);
4276 vm_object_unlock(object);
4277
4278 return retval;
4279 }
4280
4281
4282 static void
vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue * eq,boolean_t req_lowpriority)4283 vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *eq, boolean_t req_lowpriority)
4284 {
4285 uint32_t policy;
4286
4287 if (hibernate_cleaning_in_progress == TRUE) {
4288 req_lowpriority = FALSE;
4289 }
4290
4291 if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority) {
4292 vm_page_unlock_queues();
4293
4294 if (req_lowpriority == TRUE) {
4295 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4296 DTRACE_VM(laundrythrottle);
4297 } else {
4298 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4299 DTRACE_VM(laundryunthrottle);
4300 }
4301 proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid,
4302 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4303
4304 vm_page_lock_queues();
4305 eq->pgo_lowpriority = req_lowpriority;
4306 }
4307 }
4308
4309
4310 static void
vm_pageout_iothread_external(void)4311 vm_pageout_iothread_external(void)
4312 {
4313 thread_t self = current_thread();
4314
4315 self->options |= TH_OPT_VMPRIV;
4316
4317 DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4318
4319 proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4320 TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4321
4322 vm_page_lock_queues();
4323
4324 vm_pageout_queue_external.pgo_tid = self->thread_id;
4325 vm_pageout_queue_external.pgo_lowpriority = TRUE;
4326 vm_pageout_queue_external.pgo_inited = TRUE;
4327
4328 vm_page_unlock_queues();
4329
4330 #if CONFIG_THREAD_GROUPS
4331 thread_group_vm_add();
4332 #endif /* CONFIG_THREAD_GROUPS */
4333
4334 vm_pageout_iothread_external_continue(&vm_pageout_queue_external);
4335
4336 /*NOTREACHED*/
4337 }
4338
4339
4340 static void
vm_pageout_iothread_internal(struct cq * cq)4341 vm_pageout_iothread_internal(struct cq *cq)
4342 {
4343 thread_t self = current_thread();
4344
4345 self->options |= TH_OPT_VMPRIV;
4346
4347 vm_page_lock_queues();
4348
4349 vm_pageout_queue_internal.pgo_tid = self->thread_id;
4350 vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4351 vm_pageout_queue_internal.pgo_inited = TRUE;
4352
4353 vm_page_unlock_queues();
4354
4355 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4356 thread_vm_bind_group_add();
4357 }
4358
4359 #if CONFIG_THREAD_GROUPS
4360 thread_group_vm_add();
4361 #endif /* CONFIG_THREAD_GROUPS */
4362
4363 #if __AMP__
4364 if (vm_compressor_ebound) {
4365 /*
4366 * Use the soft bound option for vm_compressor to allow it to run on
4367 * P-cores if E-cluster is unavailable.
4368 */
4369 thread_bind_cluster_type(self, 'E', true);
4370 }
4371 #endif /* __AMP__ */
4372
4373 thread_set_thread_name(current_thread(), "VM_compressor");
4374 #if DEVELOPMENT || DEBUG
4375 vmct_stats.vmct_minpages[cq->id] = INT32_MAX;
4376 #endif
4377 vm_pageout_iothread_internal_continue(cq);
4378
4379 /*NOTREACHED*/
4380 }
4381
4382 kern_return_t
vm_set_buffer_cleanup_callout(boolean_t (* func)(int))4383 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4384 {
4385 if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4386 return KERN_SUCCESS;
4387 } else {
4388 return KERN_FAILURE; /* Already set */
4389 }
4390 }
4391
4392 extern boolean_t memorystatus_manual_testing_on;
4393 extern unsigned int memorystatus_level;
4394
4395
4396 #if VM_PRESSURE_EVENTS
4397
4398 boolean_t vm_pressure_events_enabled = FALSE;
4399
4400 extern uint64_t next_warning_notification_sent_at_ts;
4401 extern uint64_t next_critical_notification_sent_at_ts;
4402
4403 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */
4404
4405 /*
4406 * The last time there was change in pressure level OR we forced a check
4407 * because the system is stuck in a non-normal pressure level.
4408 */
4409 uint64_t vm_pressure_last_level_transition_abs = 0;
4410
/*
 * This is how long the system waits 'stuck' in an unchanged non-normal pressure
 * level before resending out notifications for that level again.
 */
4415 int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4416
4417 void
vm_pressure_response(void)4418 vm_pressure_response(void)
4419 {
4420 vm_pressure_level_t old_level = kVMPressureNormal;
4421 int new_level = -1;
4422 unsigned int total_pages;
4423 uint64_t available_memory = 0;
4424 uint64_t curr_ts, abs_time_since_level_transition, time_in_ns;
4425 bool force_check = false;
4426 int time_in_mins;
4427
4428
4429 if (vm_pressure_events_enabled == FALSE) {
4430 return;
4431 }
4432
4433 #if !XNU_TARGET_OS_OSX
4434
4435 available_memory = (uint64_t) memorystatus_available_pages;
4436
4437 #else /* !XNU_TARGET_OS_OSX */
4438
4439 available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4440 memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4441
4442 #endif /* !XNU_TARGET_OS_OSX */
4443
4444 total_pages = (unsigned int) atop_64(max_mem);
4445 #if CONFIG_SECLUDED_MEMORY
4446 total_pages -= vm_page_secluded_count;
4447 #endif /* CONFIG_SECLUDED_MEMORY */
4448 memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4449
4450 if (memorystatus_manual_testing_on) {
4451 return;
4452 }
4453
4454 curr_ts = mach_absolute_time();
4455 abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4456
4457 absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4458 time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4459 force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
4460
4461 old_level = memorystatus_vm_pressure_level;
4462
4463 switch (memorystatus_vm_pressure_level) {
4464 case kVMPressureNormal:
4465 {
4466 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4467 new_level = kVMPressureCritical;
4468 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4469 new_level = kVMPressureWarning;
4470 }
4471 break;
4472 }
4473
4474 case kVMPressureWarning:
4475 case kVMPressureUrgent:
4476 {
4477 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4478 new_level = kVMPressureNormal;
4479 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4480 new_level = kVMPressureCritical;
4481 } else if (force_check) {
4482 new_level = kVMPressureWarning;
4483 next_warning_notification_sent_at_ts = curr_ts;
4484 }
4485 break;
4486 }
4487
4488 case kVMPressureCritical:
4489 {
4490 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4491 new_level = kVMPressureNormal;
4492 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4493 new_level = kVMPressureWarning;
4494 } else if (force_check) {
4495 new_level = kVMPressureCritical;
4496 next_critical_notification_sent_at_ts = curr_ts;
4497 }
4498 break;
4499 }
4500
4501 default:
4502 return;
4503 }
4504
4505 if (new_level != -1 || force_check) {
4506 if (new_level != -1) {
4507 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4508
4509 if (new_level != (int) old_level) {
4510 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4511 new_level, old_level, 0, 0);
4512 }
4513 } else {
4514 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4515 new_level, old_level, force_check, 0);
4516 }
4517
4518 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
4519 /*
4520 * We don't want to schedule a wakeup while hibernation is in progress
4521 * because that could collide with checks for non-monotonicity in the scheduler.
4522 * We do however do all the updates to memorystatus_vm_pressure_level because
4523 * we _might_ want to use that for decisions regarding which pages or how
4524 * many pages we want to dump in hibernation.
4525 */
4526 return;
4527 }
4528
4529 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4530 if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4531 thread_wakeup(&vm_pressure_thread);
4532 }
4533
4534 if (old_level != memorystatus_vm_pressure_level) {
4535 thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4536 }
4537 vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4538 }
4539 }
4540 }
4541 #endif /* VM_PRESSURE_EVENTS */
4542
4543 /*
4544 * Function called by a kernel thread to either get the current pressure level or
4545 * wait until memory pressure changes from a given level.
4546 */
4547 kern_return_t
mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure,__unused unsigned int * pressure_level)4548 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
4549 {
4550 #if !VM_PRESSURE_EVENTS
4551
4552 return KERN_FAILURE;
4553
4554 #else /* VM_PRESSURE_EVENTS */
4555
4556 wait_result_t wr = 0;
4557 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4558
4559 if (pressure_level == NULL) {
4560 return KERN_INVALID_ARGUMENT;
4561 }
4562
4563 if (*pressure_level == kVMPressureJetsam) {
4564 if (!wait_for_pressure) {
4565 return KERN_INVALID_ARGUMENT;
4566 }
4567
4568 lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
4569 wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
4570 THREAD_INTERRUPTIBLE);
4571 if (wr == THREAD_WAITING) {
4572 ++memorystatus_jetsam_fg_band_waiters;
4573 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4574 wr = thread_block(THREAD_CONTINUE_NULL);
4575 } else {
4576 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4577 }
4578 if (wr != THREAD_AWAKENED) {
4579 return KERN_ABORTED;
4580 }
4581 *pressure_level = kVMPressureJetsam;
4582 return KERN_SUCCESS;
4583 }
4584
4585 if (wait_for_pressure == TRUE) {
4586 while (old_level == *pressure_level) {
4587 wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4588 THREAD_INTERRUPTIBLE);
4589 if (wr == THREAD_WAITING) {
4590 wr = thread_block(THREAD_CONTINUE_NULL);
4591 }
4592 if (wr == THREAD_INTERRUPTED) {
4593 return KERN_ABORTED;
4594 }
4595
4596 if (wr == THREAD_AWAKENED) {
4597 old_level = memorystatus_vm_pressure_level;
4598 }
4599 }
4600 }
4601
4602 *pressure_level = old_level;
4603 return KERN_SUCCESS;
4604 #endif /* VM_PRESSURE_EVENTS */
4605 }
4606
#if VM_PRESSURE_EVENTS
/*
 *	vm_pressure_thread:
 *
 *	Body of the "VM_pressure" thread.  The first invocation only
 *	performs setup; every later invocation runs
 *	consider_vm_pressure_events() with vm_pressure_thread_running
 *	set around the call.  The thread then blocks on
 *	&vm_pressure_thread with this function as its continuation.
 */
void
vm_pressure_thread(void)
{
	static boolean_t        ran_once = FALSE;

	if (ran_once) {
		vm_pageout_state.vm_pressure_thread_running = TRUE;
		consider_vm_pressure_events();
		vm_pageout_state.vm_pressure_thread_running = FALSE;
	}

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	thread_set_thread_name(current_thread(), "VM_pressure");
	ran_once = TRUE;

	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
	thread_block((thread_continue_t)vm_pressure_thread);
}
#endif /* VM_PRESSURE_EVENTS */
4629
4630
4631 /*
4632 * called once per-second via "compute_averages"
4633 */
4634 void
compute_pageout_gc_throttle(__unused void * arg)4635 compute_pageout_gc_throttle(__unused void *arg)
4636 {
4637 if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4638 vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4639
4640 thread_wakeup(VM_PAGEOUT_GC_EVENT);
4641 }
4642 }
4643
4644 /*
4645 * vm_pageout_garbage_collect can also be called when the zone allocator needs
4646 * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4647 * jetsams. We need to check if the zone map size is above its jetsam limit to
4648 * decide if this was indeed the case.
4649 *
4650 * We need to do this on a different thread because of the following reasons:
4651 *
4652 * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4653 * itself causing the system to hang. We perform synchronous jetsams if we're
4654 * leaking in the VM map entries zone, so the leaking process could be doing a
4655 * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4656 * jetsam itself. We also need the vm_map lock on the process termination path,
4657 * which would now lead the dying process to deadlock against itself.
4658 *
4659 * 2. The jetsam path might need to allocate zone memory itself. We could try
4660 * using the non-blocking variant of zalloc for this path, but we can still
4661 * end up trying to do a kernel_memory_allocate when the zone maps are almost
4662 * full.
4663 */
4664 __dead2
4665 void
vm_pageout_garbage_collect(void * step,wait_result_t wr __unused)4666 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4667 {
4668 assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4669
4670 if (step == VM_PAGEOUT_GC_INIT) {
4671 /* first time being called is not about GC */
4672 #if CONFIG_THREAD_GROUPS
4673 thread_group_vm_add();
4674 #endif /* CONFIG_THREAD_GROUPS */
4675 } else if (zone_map_nearing_exhaustion()) {
4676 /*
4677 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4678 *
4679 * Bail out after calling zone_gc (which triggers the
4680 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4681 * operations that clear out a bunch of caches might allocate zone
4682 * memory themselves (for eg. vm_map operations would need VM map
4683 * entries). Since the zone map is almost full at this point, we
4684 * could end up with a panic. We just need to quickly jetsam a
4685 * process and exit here.
4686 *
4687 * It could so happen that we were woken up to relieve memory
4688 * pressure and the zone map also happened to be near its limit at
4689 * the time, in which case we'll skip out early. But that should be
4690 * ok; if memory pressure persists, the thread will simply be woken
4691 * up again.
4692 */
4693 zone_gc(ZONE_GC_JETSAM);
4694 } else {
4695 /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4696 boolean_t buf_large_zfree = FALSE;
4697 boolean_t first_try = TRUE;
4698
4699 stack_collect();
4700
4701 consider_machine_collect();
4702 mbuf_drain(FALSE);
4703
4704 do {
4705 if (consider_buffer_cache_collect != NULL) {
4706 buf_large_zfree = (*consider_buffer_cache_collect)(0);
4707 }
4708 if (first_try == TRUE || buf_large_zfree == TRUE) {
4709 /*
4710 * zone_gc should be last, because the other operations
4711 * might return memory to zones.
4712 */
4713 zone_gc(ZONE_GC_TRIM);
4714 }
4715 first_try = FALSE;
4716 } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
4717
4718 consider_machine_adjust();
4719 }
4720
4721 assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);
4722
4723 thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
4724 __builtin_unreachable();
4725 }
4726
4727
4728 #if VM_PAGE_BUCKETS_CHECK
4729 #if VM_PAGE_FAKE_BUCKETS
4730 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4731 #endif /* VM_PAGE_FAKE_BUCKETS */
4732 #endif /* VM_PAGE_BUCKETS_CHECK */
4733
4734
4735
4736 void
vm_set_restrictions(unsigned int num_cpus)4737 vm_set_restrictions(unsigned int num_cpus)
4738 {
4739 int vm_restricted_to_single_processor = 0;
4740
4741 if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
4742 kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
4743 vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
4744 } else {
4745 assert(num_cpus > 0);
4746
4747 if (num_cpus <= 3) {
4748 /*
4749 * on systems with a limited number of CPUS, bind the
4750 * 4 major threads that can free memory and that tend to use
4751 * a fair bit of CPU under pressured conditions to a single processor.
4752 * This insures that these threads don't hog all of the available CPUs
4753 * (important for camera launch), while allowing them to run independently
4754 * w/r to locks... the 4 threads are
4755 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
4756 * vm_compressor_swap_trigger_thread (minor and major compactions),
4757 * memorystatus_thread (jetsams).
4758 *
4759 * the first time the thread is run, it is responsible for checking the
4760 * state of vm_restricted_to_single_processor, and if TRUE it calls
4761 * thread_bind_master... someday this should be replaced with a group
4762 * scheduling mechanism and KPI.
4763 */
4764 vm_pageout_state.vm_restricted_to_single_processor = TRUE;
4765 } else {
4766 vm_pageout_state.vm_restricted_to_single_processor = FALSE;
4767 }
4768 }
4769 }
4770
4771 /*
4772 * Set up vm_config based on the vm_compressor_mode.
4773 * Must run BEFORE the pageout thread starts up.
4774 */
4775 __startup_func
4776 void
vm_config_init(void)4777 vm_config_init(void)
4778 {
4779 bzero(&vm_config, sizeof(vm_config));
4780
4781 switch (vm_compressor_mode) {
4782 case VM_PAGER_DEFAULT:
4783 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
4784 OS_FALLTHROUGH;
4785
4786 case VM_PAGER_COMPRESSOR_WITH_SWAP:
4787 vm_config.compressor_is_present = TRUE;
4788 vm_config.swap_is_present = TRUE;
4789 vm_config.compressor_is_active = TRUE;
4790 vm_config.swap_is_active = TRUE;
4791 break;
4792
4793 case VM_PAGER_COMPRESSOR_NO_SWAP:
4794 vm_config.compressor_is_present = TRUE;
4795 vm_config.swap_is_present = TRUE;
4796 vm_config.compressor_is_active = TRUE;
4797 break;
4798
4799 case VM_PAGER_FREEZER_DEFAULT:
4800 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
4801 OS_FALLTHROUGH;
4802
4803 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
4804 vm_config.compressor_is_present = TRUE;
4805 vm_config.swap_is_present = TRUE;
4806 break;
4807
4808 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
4809 vm_config.compressor_is_present = TRUE;
4810 vm_config.swap_is_present = TRUE;
4811 vm_config.compressor_is_active = TRUE;
4812 vm_config.freezer_swap_is_active = TRUE;
4813 break;
4814
4815 case VM_PAGER_NOT_CONFIGURED:
4816 break;
4817
4818 default:
4819 printf("unknown compressor mode - %x\n", vm_compressor_mode);
4820 break;
4821 }
4822 }
4823
4824 __startup_func
4825 static void
vm_pageout_create_gc_thread(void)4826 vm_pageout_create_gc_thread(void)
4827 {
4828 thread_t thread;
4829
4830 if (kernel_thread_create(vm_pageout_garbage_collect,
4831 VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
4832 panic("vm_pageout_garbage_collect: create failed");
4833 }
4834 thread_set_thread_name(thread, "VM_pageout_garbage_collect");
4835 if (thread->reserved_stack == 0) {
4836 assert(thread->kernel_stack);
4837 thread->reserved_stack = thread->kernel_stack;
4838 }
4839
4840 /* thread is started in vm_pageout() */
4841 vm_pageout_gc_thread = thread;
4842 }
4843 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
4844
4845 void
vm_pageout(void)4846 vm_pageout(void)
4847 {
4848 thread_t self = current_thread();
4849 thread_t thread;
4850 kern_return_t result;
4851 spl_t s;
4852
4853 /*
4854 * Set thread privileges.
4855 */
4856 s = splsched();
4857
4858 #if CONFIG_VPS_DYNAMIC_PRIO
4859
4860 int vps_dynprio_bootarg = 0;
4861
4862 if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) {
4863 vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? TRUE : FALSE);
4864 kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled);
4865 } else {
4866 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4867 vps_dynamic_priority_enabled = TRUE;
4868 } else {
4869 vps_dynamic_priority_enabled = FALSE;
4870 }
4871 }
4872
4873 if (vps_dynamic_priority_enabled) {
4874 sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
4875 thread_set_eager_preempt(self);
4876 } else {
4877 sched_set_kernel_thread_priority(self, BASEPRI_VM);
4878 }
4879
4880 #else /* CONFIG_VPS_DYNAMIC_PRIO */
4881
4882 vps_dynamic_priority_enabled = FALSE;
4883 sched_set_kernel_thread_priority(self, BASEPRI_VM);
4884
4885 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
4886
4887 thread_lock(self);
4888 self->options |= TH_OPT_VMPRIV;
4889 thread_unlock(self);
4890
4891 if (!self->reserved_stack) {
4892 self->reserved_stack = self->kernel_stack;
4893 }
4894
4895 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
4896 vps_dynamic_priority_enabled == FALSE) {
4897 thread_vm_bind_group_add();
4898 }
4899
4900
4901 #if CONFIG_THREAD_GROUPS
4902 thread_group_vm_add();
4903 #endif /* CONFIG_THREAD_GROUPS */
4904
4905 #if __AMP__
4906 PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
4907 if (vm_pgo_pbound) {
4908 /*
4909 * Use the soft bound option for vm pageout to allow it to run on
4910 * E-cores if P-cluster is unavailable.
4911 */
4912 thread_bind_cluster_type(self, 'P', true);
4913 }
4914 #endif /* __AMP__ */
4915
4916 splx(s);
4917
4918 thread_set_thread_name(current_thread(), "VM_pageout_scan");
4919
4920 /*
4921 * Initialize some paging parameters.
4922 */
4923
4924 vm_pageout_state.vm_pressure_thread_running = FALSE;
4925 vm_pageout_state.vm_pressure_changed = FALSE;
4926 vm_pageout_state.memorystatus_purge_on_warning = 2;
4927 vm_pageout_state.memorystatus_purge_on_urgent = 5;
4928 vm_pageout_state.memorystatus_purge_on_critical = 8;
4929 vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
4930 vm_pageout_state.vm_page_speculative_percentage = 5;
4931 vm_pageout_state.vm_page_speculative_target = 0;
4932
4933 vm_pageout_state.vm_pageout_external_iothread = THREAD_NULL;
4934 vm_pageout_state.vm_pageout_internal_iothread = THREAD_NULL;
4935
4936 vm_pageout_state.vm_pageout_swap_wait = 0;
4937 vm_pageout_state.vm_pageout_idle_wait = 0;
4938 vm_pageout_state.vm_pageout_empty_wait = 0;
4939 vm_pageout_state.vm_pageout_burst_wait = 0;
4940 vm_pageout_state.vm_pageout_deadlock_wait = 0;
4941 vm_pageout_state.vm_pageout_deadlock_relief = 0;
4942 vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
4943
4944 vm_pageout_state.vm_pageout_inactive = 0;
4945 vm_pageout_state.vm_pageout_inactive_used = 0;
4946 vm_pageout_state.vm_pageout_inactive_clean = 0;
4947
4948 vm_pageout_state.vm_memory_pressure = 0;
4949 vm_pageout_state.vm_page_filecache_min = 0;
4950 #if CONFIG_JETSAM
4951 vm_pageout_state.vm_page_filecache_min_divisor = 70;
4952 vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
4953 #else
4954 vm_pageout_state.vm_page_filecache_min_divisor = 27;
4955 vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
4956 #endif
4957 vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
4958
4959 vm_pageout_state.vm_pageout_considered_page_last = 0;
4960
4961 if (vm_pageout_state.vm_pageout_swap_wait == 0) {
4962 vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
4963 }
4964
4965 if (vm_pageout_state.vm_pageout_idle_wait == 0) {
4966 vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
4967 }
4968
4969 if (vm_pageout_state.vm_pageout_burst_wait == 0) {
4970 vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
4971 }
4972
4973 if (vm_pageout_state.vm_pageout_empty_wait == 0) {
4974 vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
4975 }
4976
4977 if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
4978 vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
4979 }
4980
4981 if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
4982 vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
4983 }
4984
4985 if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
4986 vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
4987 }
4988 /*
4989 * even if we've already called vm_page_free_reserve
4990 * call it again here to insure that the targets are
4991 * accurately calculated (it uses vm_page_free_count_init)
4992 * calling it with an arg of 0 will not change the reserve
4993 * but will re-calculate free_min and free_target
4994 */
4995 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
4996 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
4997 } else {
4998 vm_page_free_reserve(0);
4999 }
5000
5001
5002 vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5003 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5004 vm_pageout_queue_external.pgo_laundry = 0;
5005 vm_pageout_queue_external.pgo_idle = FALSE;
5006 vm_pageout_queue_external.pgo_busy = FALSE;
5007 vm_pageout_queue_external.pgo_throttled = FALSE;
5008 vm_pageout_queue_external.pgo_draining = FALSE;
5009 vm_pageout_queue_external.pgo_lowpriority = FALSE;
5010 vm_pageout_queue_external.pgo_tid = -1;
5011 vm_pageout_queue_external.pgo_inited = FALSE;
5012
5013 vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5014 vm_pageout_queue_internal.pgo_maxlaundry = 0;
5015 vm_pageout_queue_internal.pgo_laundry = 0;
5016 vm_pageout_queue_internal.pgo_idle = FALSE;
5017 vm_pageout_queue_internal.pgo_busy = FALSE;
5018 vm_pageout_queue_internal.pgo_throttled = FALSE;
5019 vm_pageout_queue_internal.pgo_draining = FALSE;
5020 vm_pageout_queue_internal.pgo_lowpriority = FALSE;
5021 vm_pageout_queue_internal.pgo_tid = -1;
5022 vm_pageout_queue_internal.pgo_inited = FALSE;
5023
5024 /* internal pageout thread started when default pager registered first time */
5025 /* external pageout and garbage collection threads started here */
5026
5027 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
5028 BASEPRI_VM,
5029 &vm_pageout_state.vm_pageout_external_iothread);
5030 if (result != KERN_SUCCESS) {
5031 panic("vm_pageout_iothread_external: create failed");
5032 }
5033 thread_set_thread_name(vm_pageout_state.vm_pageout_external_iothread, "VM_pageout_external_iothread");
5034 thread_deallocate(vm_pageout_state.vm_pageout_external_iothread);
5035
5036 thread_mtx_lock(vm_pageout_gc_thread );
5037 thread_start(vm_pageout_gc_thread );
5038 thread_mtx_unlock(vm_pageout_gc_thread);
5039
5040 #if VM_PRESSURE_EVENTS
5041 result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5042 BASEPRI_DEFAULT,
5043 &thread);
5044
5045 if (result != KERN_SUCCESS) {
5046 panic("vm_pressure_thread: create failed");
5047 }
5048
5049 thread_deallocate(thread);
5050 #endif
5051
5052 vm_object_reaper_init();
5053
5054
5055 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5056 vm_compressor_init();
5057 }
5058
5059 #if VM_PRESSURE_EVENTS
5060 vm_pressure_events_enabled = TRUE;
5061 #endif /* VM_PRESSURE_EVENTS */
5062
5063 #if CONFIG_PHANTOM_CACHE
5064 vm_phantom_cache_init();
5065 #endif
5066 #if VM_PAGE_BUCKETS_CHECK
5067 #if VM_PAGE_FAKE_BUCKETS
5068 printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5069 (uint64_t) vm_page_fake_buckets_start,
5070 (uint64_t) vm_page_fake_buckets_end);
5071 pmap_protect(kernel_pmap,
5072 vm_page_fake_buckets_start,
5073 vm_page_fake_buckets_end,
5074 VM_PROT_READ);
5075 // *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
5076 #endif /* VM_PAGE_FAKE_BUCKETS */
5077 #endif /* VM_PAGE_BUCKETS_CHECK */
5078
5079 #if VM_OBJECT_TRACKING
5080 vm_object_tracking_init();
5081 #endif /* VM_OBJECT_TRACKING */
5082
5083 #if __arm64__
5084 // vm_tests();
5085 #endif /* __arm64__ */
5086
5087 vm_pageout_continue();
5088
5089 /*
5090 * Unreached code!
5091 *
5092 * The vm_pageout_continue() call above never returns, so the code below is never
5093 * executed. We take advantage of this to declare several DTrace VM related probe
5094 * points that our kernel doesn't have an analog for. These are probe points that
5095 * exist in Solaris and are in the DTrace documentation, so people may have written
5096 * scripts that use them. Declaring the probe points here means their scripts will
5097 * compile and execute which we want for portability of the scripts, but since this
5098 * section of code is never reached, the probe points will simply never fire. Yes,
5099 * this is basically a hack. The problem is the DTrace probe points were chosen with
5100 * Solaris specific VM events in mind, not portability to different VM implementations.
5101 */
5102
5103 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5104 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5105 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5106 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5107 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5108 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5109 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5110 /*NOTREACHED*/
5111 }
5112
5113
5114
5115 kern_return_t
vm_pageout_internal_start(void)5116 vm_pageout_internal_start(void)
5117 {
5118 kern_return_t result = KERN_SUCCESS;
5119 host_basic_info_data_t hinfo;
5120 vm_offset_t buf, bufsize;
5121
5122 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5123
5124 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5125 #define BSD_HOST 1
5126 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5127
5128 assert(hinfo.max_cpus > 0);
5129
5130 #if !XNU_TARGET_OS_OSX
5131 vm_pageout_state.vm_compressor_thread_count = 1;
5132 #else /* !XNU_TARGET_OS_OSX */
5133 if (hinfo.max_cpus > 4) {
5134 vm_pageout_state.vm_compressor_thread_count = 2;
5135 } else {
5136 vm_pageout_state.vm_compressor_thread_count = 1;
5137 }
5138 #endif /* !XNU_TARGET_OS_OSX */
5139 PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5140 sizeof(vm_pageout_state.vm_compressor_thread_count));
5141
5142 #if __AMP__
5143 PE_parse_boot_argn("vmcomp_ecluster", &vm_compressor_ebound, sizeof(vm_compressor_ebound));
5144 if (vm_compressor_ebound) {
5145 vm_pageout_state.vm_compressor_thread_count = 2;
5146 }
5147 #endif
5148 if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5149 vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5150 }
5151 if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5152 vm_pageout_state.vm_compressor_thread_count = 1;
5153 } else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5154 vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5155 }
5156
5157 vm_pageout_queue_internal.pgo_maxlaundry =
5158 (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5159
5160 PE_parse_boot_argn("vmpgoi_maxlaundry",
5161 &vm_pageout_queue_internal.pgo_maxlaundry,
5162 sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5163
5164 bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5165 if (kernel_memory_allocate(kernel_map, &buf,
5166 bufsize * vm_pageout_state.vm_compressor_thread_count,
5167 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_COMPRESSOR)) {
5168 panic("vm_pageout_internal_start: Unable to allocate %zd bytes",
5169 (size_t)(bufsize * vm_pageout_state.vm_compressor_thread_count));
5170 }
5171
5172 for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5173 ciq[i].id = i;
5174 ciq[i].q = &vm_pageout_queue_internal;
5175 ciq[i].current_chead = NULL;
5176 ciq[i].scratch_buf = (char *)(buf + i * bufsize);
5177
5178 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5179 (void *)&ciq[i], BASEPRI_VM,
5180 &vm_pageout_state.vm_pageout_internal_iothread);
5181
5182 if (result == KERN_SUCCESS) {
5183 thread_deallocate(vm_pageout_state.vm_pageout_internal_iothread);
5184 } else {
5185 break;
5186 }
5187 }
5188 return result;
5189 }
5190
5191 #if CONFIG_IOSCHED
5192 /*
5193 * To support I/O Expedite for compressed files we mark the upls with special flags.
5194 * The way decmpfs works is that we create a big upl which marks all the pages needed to
5195 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5196 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5197 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5198 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5199 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5200 * by the req upl lock (the reverse link doesnt need synch. since we never inspect this link
5201 * unless the real I/O upl is being destroyed).
5202 */
5203
5204
5205 static void
upl_set_decmp_info(upl_t upl,upl_t src_upl)5206 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5207 {
5208 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5209
5210 upl_lock(src_upl);
5211 if (src_upl->decmp_io_upl) {
5212 /*
5213 * If there is already an alive real I/O UPL, ignore this new UPL.
5214 * This case should rarely happen and even if it does, it just means
5215 * that we might issue a spurious expedite which the driver is expected
5216 * to handle.
5217 */
5218 upl_unlock(src_upl);
5219 return;
5220 }
5221 src_upl->decmp_io_upl = (void *)upl;
5222 src_upl->ref_count++;
5223
5224 upl->flags |= UPL_DECMP_REAL_IO;
5225 upl->decmp_io_upl = (void *)src_upl;
5226 upl_unlock(src_upl);
5227 }
5228 #endif /* CONFIG_IOSCHED */
5229
/*
 * UPL debugging switch: 0 unless the kernel is built with UPL_DEBUG,
 * in which case it starts out as 1.
 */
#if !UPL_DEBUG
int     upl_debug_enabled = 0;
#else
int     upl_debug_enabled = 1;
#endif
5235
5236 static upl_t
upl_create(int type,int flags,upl_size_t size)5237 upl_create(int type, int flags, upl_size_t size)
5238 {
5239 upl_t upl;
5240 vm_size_t page_field_size = 0;
5241 int upl_flags = 0;
5242 vm_size_t upl_size = sizeof(struct upl);
5243
5244 assert(page_aligned(size));
5245
5246 size = round_page_32(size);
5247
5248 if (type & UPL_CREATE_LITE) {
5249 page_field_size = (atop(size) + 7) >> 3;
5250 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
5251
5252 upl_flags |= UPL_LITE;
5253 }
5254 if (type & UPL_CREATE_INTERNAL) {
5255 upl_size += sizeof(struct upl_page_info) * atop(size);
5256
5257 upl_flags |= UPL_INTERNAL;
5258 }
5259 upl = (upl_t)kheap_alloc(KHEAP_DEFAULT, upl_size + page_field_size, Z_WAITOK | Z_ZERO);
5260
5261 upl->flags = upl_flags | flags;
5262 upl->ref_count = 1;
5263 upl_lock_init(upl);
5264 #if CONFIG_IOSCHED
5265 if (type & UPL_CREATE_IO_TRACKING) {
5266 upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5267 }
5268
5269 if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5270 /* Only support expedite on internal UPLs */
5271 thread_t curthread = current_thread();
5272 upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * atop(size), Z_WAITOK | Z_ZERO);
5273 upl->flags |= UPL_EXPEDITE_SUPPORTED;
5274 if (curthread->decmp_upl != NULL) {
5275 upl_set_decmp_info(upl, curthread->decmp_upl);
5276 }
5277 }
5278 #endif
5279 #if CONFIG_IOSCHED || UPL_DEBUG
5280 if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5281 upl->upl_creator = current_thread();
5282 upl->flags |= UPL_TRACKED_BY_OBJECT;
5283 }
5284 #endif
5285
5286 #if UPL_DEBUG
5287 (void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
5288 #endif /* UPL_DEBUG */
5289
5290 return upl;
5291 }
5292
5293 static void
upl_destroy(upl_t upl)5294 upl_destroy(upl_t upl)
5295 {
5296 int page_field_size; /* bit field in word size buf */
5297 int size;
5298
5299 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5300
5301 if (upl->ext_ref_count) {
5302 panic("upl(%p) ext_ref_count", upl);
5303 }
5304
5305 #if CONFIG_IOSCHED
5306 if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5307 upl_t src_upl;
5308 src_upl = upl->decmp_io_upl;
5309 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5310 upl_lock(src_upl);
5311 src_upl->decmp_io_upl = NULL;
5312 upl_unlock(src_upl);
5313 upl_deallocate(src_upl);
5314 }
5315 #endif /* CONFIG_IOSCHED */
5316
5317 #if CONFIG_IOSCHED || UPL_DEBUG
5318 if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5319 !(upl->flags & UPL_VECTOR)) {
5320 vm_object_t object;
5321
5322 if (upl->flags & UPL_SHADOWED) {
5323 object = upl->map_object->shadow;
5324 } else {
5325 object = upl->map_object;
5326 }
5327
5328 vm_object_lock(object);
5329 queue_remove(&object->uplq, upl, upl_t, uplq);
5330 vm_object_activity_end(object);
5331 vm_object_collapse(object, 0, TRUE);
5332 vm_object_unlock(object);
5333 }
5334 #endif
5335 /*
5336 * drop a reference on the map_object whether or
5337 * not a pageout object is inserted
5338 */
5339 if (upl->flags & UPL_SHADOWED) {
5340 vm_object_deallocate(upl->map_object);
5341 }
5342
5343 if (upl->flags & UPL_DEVICE_MEMORY) {
5344 size = PAGE_SIZE;
5345 } else {
5346 size = upl_adjusted_size(upl, PAGE_MASK);
5347 }
5348 page_field_size = 0;
5349
5350 if (upl->flags & UPL_LITE) {
5351 page_field_size = ((size / PAGE_SIZE) + 7) >> 3;
5352 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
5353 }
5354 upl_lock_destroy(upl);
5355 upl->vector_upl = (vector_upl_t) 0xfeedbeef;
5356
5357 #if CONFIG_IOSCHED
5358 if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5359 kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * (size / PAGE_SIZE));
5360 }
5361 #endif
5362
5363 if (upl->flags & UPL_INTERNAL) {
5364 kheap_free(KHEAP_DEFAULT, upl,
5365 sizeof(struct upl) +
5366 (sizeof(struct upl_page_info) * (size / PAGE_SIZE))
5367 + page_field_size);
5368 } else {
5369 kheap_free(KHEAP_DEFAULT, upl, sizeof(struct upl) + page_field_size);
5370 }
5371 }
5372
5373 void
upl_deallocate(upl_t upl)5374 upl_deallocate(upl_t upl)
5375 {
5376 upl_lock(upl);
5377
5378 if (--upl->ref_count == 0) {
5379 if (vector_upl_is_valid(upl)) {
5380 vector_upl_deallocate(upl);
5381 }
5382 upl_unlock(upl);
5383
5384 if (upl->upl_iodone) {
5385 upl_callout_iodone(upl);
5386 }
5387
5388 upl_destroy(upl);
5389 } else {
5390 upl_unlock(upl);
5391 }
5392 }
5393
5394 #if CONFIG_IOSCHED
5395 void
upl_mark_decmp(upl_t upl)5396 upl_mark_decmp(upl_t upl)
5397 {
5398 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5399 upl->flags |= UPL_DECMP_REQ;
5400 upl->upl_creator->decmp_upl = (void *)upl;
5401 }
5402 }
5403
5404 void
upl_unmark_decmp(upl_t upl)5405 upl_unmark_decmp(upl_t upl)
5406 {
5407 if (upl && (upl->flags & UPL_DECMP_REQ)) {
5408 upl->upl_creator->decmp_upl = NULL;
5409 }
5410 }
5411
5412 #endif /* CONFIG_IOSCHED */
5413
5414 #define VM_PAGE_Q_BACKING_UP(q) \
5415 ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5416
5417 boolean_t must_throttle_writes(void);
5418
5419 boolean_t
must_throttle_writes()5420 must_throttle_writes()
5421 {
5422 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5423 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5424 return TRUE;
5425 }
5426
5427 return FALSE;
5428 }
5429
5430 #define MIN_DELAYED_WORK_CTX_ALLOCATED (16)
5431 #define MAX_DELAYED_WORK_CTX_ALLOCATED (512)
5432
5433 int vm_page_delayed_work_ctx_needed = 0;
5434 SECURITY_READ_ONLY_LATE(zone_t) dw_ctx_zone;
5435
5436 void
vm_page_delayed_work_init_ctx(void)5437 vm_page_delayed_work_init_ctx(void)
5438 {
5439 size_t elem_size = sizeof(struct vm_page_delayed_work_ctx);
5440
5441 dw_ctx_zone = zone_create_ext("delayed-work-ctx", elem_size,
5442 ZC_NOGC, ZONE_ID_ANY, ^(zone_t z) {
5443 zone_set_exhaustible(z, MAX_DELAYED_WORK_CTX_ALLOCATED);
5444 });
5445
5446 zone_fill_initially(dw_ctx_zone, MIN_DELAYED_WORK_CTX_ALLOCATED);
5447 }
5448
5449 struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)5450 vm_page_delayed_work_get_ctx(void)
5451 {
5452 struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5453
5454 dw_ctx = (struct vm_page_delayed_work_ctx*) zalloc_noblock(dw_ctx_zone);
5455
5456 if (dw_ctx) {
5457 dw_ctx->delayed_owner = current_thread();
5458 } else {
5459 vm_page_delayed_work_ctx_needed++;
5460 }
5461 return dw_ctx ? dw_ctx->dwp : NULL;
5462 }
5463
5464 void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work * dwp)5465 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5466 {
5467 struct vm_page_delayed_work_ctx *ldw_ctx;
5468
5469 ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5470 ldw_ctx->delayed_owner = NULL;
5471
5472 zfree(dw_ctx_zone, ldw_ctx);
5473 }
5474
5475 /*
5476 * Routine: vm_object_upl_request
5477 * Purpose:
5478 * Cause the population of a portion of a vm_object.
 *		Depending on the nature of the request, the pages
 *		returned may contain valid data or be uninitialized.
5481 * A page list structure, listing the physical pages
5482 * will be returned upon request.
5483 * This function is called by the file system or any other
5484 * supplier of backing store to a pager.
5485 * IMPORTANT NOTE: The caller must still respect the relationship
5486 * between the vm_object and its backing memory object. The
5487 * caller MUST NOT substitute changes in the backing file
5488 * without first doing a memory_object_lock_request on the
 *		target range unless it is known that the pages are not
5490 * shared with another entity at the pager level.
5491 * Copy_in_to:
5492 * if a page list structure is present
5493 * return the mapped physical pages, where a
5494 * page is not present, return a non-initialized
5495 * one. If the no_sync bit is turned on, don't
5496 * call the pager unlock to synchronize with other
5497 * possible copies of the page. Leave pages busy
5498 * in the original object, if a page list structure
5499 * was specified. When a commit of the page list
5500 * pages is done, the dirty bit will be set for each one.
5501 * Copy_out_from:
5502 * If a page list structure is present, return
5503 * all mapped pages. Where a page does not exist
5504 * map a zero filled one. Leave pages busy in
5505 * the original object. If a page list structure
5506 * is not specified, this call is a no-op.
5507 *
5508 * Note: access of default pager objects has a rather interesting
5509 * twist. The caller of this routine, presumably the file system
5510 * page cache handling code, will never actually make a request
5511 * against a default pager backed object. Only the default
5512 * pager will make requests on backing store related vm_objects
5513 * In this way the default pager can maintain the relationship
5514 * between backing store files (abstract memory objects) and
5515 * the vm_objects (cache objects), they support.
5516 *
5517 */
5518
5519 __private_extern__ kern_return_t
vm_object_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)5520 vm_object_upl_request(
5521 vm_object_t object,
5522 vm_object_offset_t offset,
5523 upl_size_t size,
5524 upl_t *upl_ptr,
5525 upl_page_info_array_t user_page_list,
5526 unsigned int *page_list_count,
5527 upl_control_flags_t cntrl_flags,
5528 vm_tag_t tag)
5529 {
5530 vm_page_t dst_page = VM_PAGE_NULL;
5531 vm_object_offset_t dst_offset;
5532 upl_size_t xfer_size;
5533 unsigned int size_in_pages;
5534 boolean_t dirty;
5535 boolean_t hw_dirty;
5536 upl_t upl = NULL;
5537 unsigned int entry;
5538 vm_page_t alias_page = NULL;
5539 int refmod_state = 0;
5540 wpl_array_t lite_list = NULL;
5541 vm_object_t last_copy_object;
5542 struct vm_page_delayed_work dw_array;
5543 struct vm_page_delayed_work *dwp, *dwp_start;
5544 bool dwp_finish_ctx = TRUE;
5545 int dw_count;
5546 int dw_limit;
5547 int io_tracking_flag = 0;
5548 int grab_options;
5549 int page_grab_count = 0;
5550 ppnum_t phys_page;
5551 pmap_flush_context pmap_flush_context_storage;
5552 boolean_t pmap_flushes_delayed = FALSE;
5553 #if DEVELOPMENT || DEBUG
5554 task_t task = current_task();
5555 #endif /* DEVELOPMENT || DEBUG */
5556
5557 dwp_start = dwp = NULL;
5558
5559 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5560 /*
5561 * For forward compatibility's sake,
5562 * reject any unknown flag.
5563 */
5564 return KERN_INVALID_VALUE;
5565 }
5566 if ((!object->internal) && (object->paging_offset != 0)) {
5567 panic("vm_object_upl_request: external object with non-zero paging offset");
5568 }
5569 if (object->phys_contiguous) {
5570 panic("vm_object_upl_request: contiguous object specified");
5571 }
5572
5573 assertf(page_aligned(offset) && page_aligned(size),
5574 "offset 0x%llx size 0x%x",
5575 offset, size);
5576
5577 VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5578
5579 dw_count = 0;
5580 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5581 dwp_start = vm_page_delayed_work_get_ctx();
5582 if (dwp_start == NULL) {
5583 dwp_start = &dw_array;
5584 dw_limit = 1;
5585 dwp_finish_ctx = FALSE;
5586 }
5587
5588 dwp = dwp_start;
5589
5590 if (size > MAX_UPL_SIZE_BYTES) {
5591 size = MAX_UPL_SIZE_BYTES;
5592 }
5593
5594 if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5595 *page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5596 }
5597
5598 #if CONFIG_IOSCHED || UPL_DEBUG
5599 if (object->io_tracking || upl_debug_enabled) {
5600 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5601 }
5602 #endif
5603 #if CONFIG_IOSCHED
5604 if (object->io_tracking) {
5605 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5606 }
5607 #endif
5608
5609 if (cntrl_flags & UPL_SET_INTERNAL) {
5610 if (cntrl_flags & UPL_SET_LITE) {
5611 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5612
5613 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5614 lite_list = (wpl_array_t)
5615 (((uintptr_t)user_page_list) +
5616 ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
5617 if (size == 0) {
5618 user_page_list = NULL;
5619 lite_list = NULL;
5620 }
5621 } else {
5622 upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5623
5624 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5625 if (size == 0) {
5626 user_page_list = NULL;
5627 }
5628 }
5629 } else {
5630 if (cntrl_flags & UPL_SET_LITE) {
5631 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5632
5633 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5634 if (size == 0) {
5635 lite_list = NULL;
5636 }
5637 } else {
5638 upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5639 }
5640 }
5641 *upl_ptr = upl;
5642
5643 if (user_page_list) {
5644 user_page_list[0].device = FALSE;
5645 }
5646
5647 if (cntrl_flags & UPL_SET_LITE) {
5648 upl->map_object = object;
5649 } else {
5650 upl->map_object = vm_object_allocate(size);
5651 /*
5652 * No neeed to lock the new object: nobody else knows
5653 * about it yet, so it's all ours so far.
5654 */
5655 upl->map_object->shadow = object;
5656 upl->map_object->pageout = TRUE;
5657 upl->map_object->can_persist = FALSE;
5658 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5659 upl->map_object->vo_shadow_offset = offset;
5660 upl->map_object->wimg_bits = object->wimg_bits;
5661 assertf(page_aligned(upl->map_object->vo_shadow_offset),
5662 "object %p shadow_offset 0x%llx",
5663 upl->map_object, upl->map_object->vo_shadow_offset);
5664
5665 alias_page = vm_page_grab_fictitious(TRUE);
5666
5667 upl->flags |= UPL_SHADOWED;
5668 }
5669 if (cntrl_flags & UPL_FOR_PAGEOUT) {
5670 upl->flags |= UPL_PAGEOUT;
5671 }
5672
5673 vm_object_lock(object);
5674 vm_object_activity_begin(object);
5675
5676 grab_options = 0;
5677 #if CONFIG_SECLUDED_MEMORY
5678 if (object->can_grab_secluded) {
5679 grab_options |= VM_PAGE_GRAB_SECLUDED;
5680 }
5681 #endif /* CONFIG_SECLUDED_MEMORY */
5682
5683 /*
5684 * we can lock in the paging_offset once paging_in_progress is set
5685 */
5686 upl->u_size = size;
5687 upl->u_offset = offset + object->paging_offset;
5688
5689 #if CONFIG_IOSCHED || UPL_DEBUG
5690 if (object->io_tracking || upl_debug_enabled) {
5691 vm_object_activity_begin(object);
5692 queue_enter(&object->uplq, upl, upl_t, uplq);
5693 }
5694 #endif
5695 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
5696 /*
5697 * Honor copy-on-write obligations
5698 *
5699 * The caller is gathering these pages and
5700 * might modify their contents. We need to
5701 * make sure that the copy object has its own
5702 * private copies of these pages before we let
5703 * the caller modify them.
5704 */
5705 vm_object_update(object,
5706 offset,
5707 size,
5708 NULL,
5709 NULL,
5710 FALSE, /* should_return */
5711 MEMORY_OBJECT_COPY_SYNC,
5712 VM_PROT_NO_CHANGE);
5713
5714 VM_PAGEOUT_DEBUG(upl_cow, 1);
5715 VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
5716 }
5717 /*
5718 * remember which copy object we synchronized with
5719 */
5720 last_copy_object = object->copy;
5721 entry = 0;
5722
5723 xfer_size = size;
5724 dst_offset = offset;
5725 size_in_pages = size / PAGE_SIZE;
5726
5727 if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5728 object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
5729 object->scan_collisions = 0;
5730 }
5731
5732 if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5733 boolean_t isSSD = FALSE;
5734
5735 #if !XNU_TARGET_OS_OSX
5736 isSSD = TRUE;
5737 #else /* !XNU_TARGET_OS_OSX */
5738 vnode_pager_get_isSSD(object->pager, &isSSD);
5739 #endif /* !XNU_TARGET_OS_OSX */
5740 vm_object_unlock(object);
5741
5742 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5743
5744 if (isSSD == TRUE) {
5745 delay(1000 * size_in_pages);
5746 } else {
5747 delay(5000 * size_in_pages);
5748 }
5749 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5750
5751 vm_object_lock(object);
5752 }
5753
5754 while (xfer_size) {
5755 dwp->dw_mask = 0;
5756
5757 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
5758 vm_object_unlock(object);
5759 alias_page = vm_page_grab_fictitious(TRUE);
5760 vm_object_lock(object);
5761 }
5762 if (cntrl_flags & UPL_COPYOUT_FROM) {
5763 upl->flags |= UPL_PAGE_SYNC_DONE;
5764
5765 if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
5766 dst_page->vmp_fictitious ||
5767 dst_page->vmp_absent ||
5768 dst_page->vmp_error ||
5769 dst_page->vmp_cleaning ||
5770 (VM_PAGE_WIRED(dst_page))) {
5771 if (user_page_list) {
5772 user_page_list[entry].phys_addr = 0;
5773 }
5774
5775 goto try_next_page;
5776 }
5777 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
5778
5779 /*
5780 * grab this up front...
5781 * a high percentange of the time we're going to
5782 * need the hardware modification state a bit later
5783 * anyway... so we can eliminate an extra call into
5784 * the pmap layer by grabbing it here and recording it
5785 */
5786 if (dst_page->vmp_pmapped) {
5787 refmod_state = pmap_get_refmod(phys_page);
5788 } else {
5789 refmod_state = 0;
5790 }
5791
5792 if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
5793 /*
5794 * page is on inactive list and referenced...
5795 * reactivate it now... this gets it out of the
5796 * way of vm_pageout_scan which would have to
5797 * reactivate it upon tripping over it
5798 */
5799 dwp->dw_mask |= DW_vm_page_activate;
5800 }
5801 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
5802 /*
5803 * we're only asking for DIRTY pages to be returned
5804 */
5805 if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
5806 /*
5807 * if we were the page stolen by vm_pageout_scan to be
5808 * cleaned (as opposed to a buddy being clustered in
5809 * or this request is not being driven by a PAGEOUT cluster
5810 * then we only need to check for the page being dirty or
5811 * precious to decide whether to return it
5812 */
5813 if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
5814 goto check_busy;
5815 }
5816 goto dont_return;
5817 }
5818 /*
5819 * this is a request for a PAGEOUT cluster and this page
5820 * is merely along for the ride as a 'buddy'... not only
5821 * does it have to be dirty to be returned, but it also
5822 * can't have been referenced recently...
5823 */
5824 if ((hibernate_cleaning_in_progress == TRUE ||
5825 (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
5826 (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
5827 ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
5828 goto check_busy;
5829 }
5830 dont_return:
5831 /*
5832 * if we reach here, we're not to return
5833 * the page... go on to the next one
5834 */
5835 if (dst_page->vmp_laundry == TRUE) {
5836 /*
5837 * if we get here, the page is not 'cleaning' (filtered out above).
5838 * since it has been referenced, remove it from the laundry
5839 * so we don't pay the cost of an I/O to clean a page
5840 * we're just going to take back
5841 */
5842 vm_page_lockspin_queues();
5843
5844 vm_pageout_steal_laundry(dst_page, TRUE);
5845 vm_page_activate(dst_page);
5846
5847 vm_page_unlock_queues();
5848 }
5849 if (user_page_list) {
5850 user_page_list[entry].phys_addr = 0;
5851 }
5852
5853 goto try_next_page;
5854 }
5855 check_busy:
5856 if (dst_page->vmp_busy) {
5857 if (cntrl_flags & UPL_NOBLOCK) {
5858 if (user_page_list) {
5859 user_page_list[entry].phys_addr = 0;
5860 }
5861 dwp->dw_mask = 0;
5862
5863 goto try_next_page;
5864 }
5865 /*
5866 * someone else is playing with the
5867 * page. We will have to wait.
5868 */
5869 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5870
5871 continue;
5872 }
5873 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
5874 vm_page_lockspin_queues();
5875
5876 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
5877 /*
5878 * we've buddied up a page for a clustered pageout
5879 * that has already been moved to the pageout
5880 * queue by pageout_scan... we need to remove
5881 * it from the queue and drop the laundry count
5882 * on that queue
5883 */
5884 vm_pageout_throttle_up(dst_page);
5885 }
5886 vm_page_unlock_queues();
5887 }
5888 hw_dirty = refmod_state & VM_MEM_MODIFIED;
5889 dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
5890
5891 if (phys_page > upl->highest_page) {
5892 upl->highest_page = phys_page;
5893 }
5894
5895 assert(!pmap_is_noencrypt(phys_page));
5896
5897 if (cntrl_flags & UPL_SET_LITE) {
5898 unsigned int pg_num;
5899
5900 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
5901 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
5902 lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
5903
5904 if (hw_dirty) {
5905 if (pmap_flushes_delayed == FALSE) {
5906 pmap_flush_context_init(&pmap_flush_context_storage);
5907 pmap_flushes_delayed = TRUE;
5908 }
5909 pmap_clear_refmod_options(phys_page,
5910 VM_MEM_MODIFIED,
5911 PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
5912 &pmap_flush_context_storage);
5913 }
5914
5915 /*
5916 * Mark original page as cleaning
5917 * in place.
5918 */
5919 dst_page->vmp_cleaning = TRUE;
5920 dst_page->vmp_precious = FALSE;
5921 } else {
5922 /*
5923 * use pageclean setup, it is more
5924 * convenient even for the pageout
5925 * cases here
5926 */
5927 vm_object_lock(upl->map_object);
5928 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
5929 vm_object_unlock(upl->map_object);
5930
5931 alias_page->vmp_absent = FALSE;
5932 alias_page = NULL;
5933 }
5934 if (dirty) {
5935 SET_PAGE_DIRTY(dst_page, FALSE);
5936 } else {
5937 dst_page->vmp_dirty = FALSE;
5938 }
5939
5940 if (!dirty) {
5941 dst_page->vmp_precious = TRUE;
5942 }
5943
5944 if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
5945 if (!VM_PAGE_WIRED(dst_page)) {
5946 dst_page->vmp_free_when_done = TRUE;
5947 }
5948 }
5949 } else {
5950 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
5951 /*
5952 * Honor copy-on-write obligations
5953 *
5954 * The copy object has changed since we
5955 * last synchronized for copy-on-write.
5956 * Another copy object might have been
5957 * inserted while we released the object's
5958 * lock. Since someone could have seen the
5959 * original contents of the remaining pages
5960 * through that new object, we have to
5961 * synchronize with it again for the remaining
5962 * pages only. The previous pages are "busy"
5963 * so they can not be seen through the new
5964 * mapping. The new mapping will see our
5965 * upcoming changes for those previous pages,
5966 * but that's OK since they couldn't see what
5967 * was there before. It's just a race anyway
5968 * and there's no guarantee of consistency or
5969 * atomicity. We just don't want new mappings
5970 * to see both the *before* and *after* pages.
5971 */
5972 if (object->copy != VM_OBJECT_NULL) {
5973 vm_object_update(
5974 object,
5975 dst_offset,/* current offset */
5976 xfer_size, /* remaining size */
5977 NULL,
5978 NULL,
5979 FALSE, /* should_return */
5980 MEMORY_OBJECT_COPY_SYNC,
5981 VM_PROT_NO_CHANGE);
5982
5983 VM_PAGEOUT_DEBUG(upl_cow_again, 1);
5984 VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
5985 }
5986 /*
5987 * remember the copy object we synced with
5988 */
5989 last_copy_object = object->copy;
5990 }
5991 dst_page = vm_page_lookup(object, dst_offset);
5992
5993 if (dst_page != VM_PAGE_NULL) {
5994 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
5995 /*
5996 * skip over pages already present in the cache
5997 */
5998 if (user_page_list) {
5999 user_page_list[entry].phys_addr = 0;
6000 }
6001
6002 goto try_next_page;
6003 }
6004 if (dst_page->vmp_fictitious) {
6005 panic("need corner case for fictitious page");
6006 }
6007
6008 if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6009 /*
6010 * someone else is playing with the
6011 * page. We will have to wait.
6012 */
6013 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6014
6015 continue;
6016 }
6017 if (dst_page->vmp_laundry) {
6018 vm_pageout_steal_laundry(dst_page, FALSE);
6019 }
6020 } else {
6021 if (object->private) {
6022 /*
6023 * This is a nasty wrinkle for users
6024 * of upl who encounter device or
6025 * private memory however, it is
6026 * unavoidable, only a fault can
6027 * resolve the actual backing
6028 * physical page by asking the
6029 * backing device.
6030 */
6031 if (user_page_list) {
6032 user_page_list[entry].phys_addr = 0;
6033 }
6034
6035 goto try_next_page;
6036 }
6037 if (object->scan_collisions) {
6038 /*
6039 * the pageout_scan thread is trying to steal
6040 * pages from this object, but has run into our
6041 * lock... grab 2 pages from the head of the object...
6042 * the first is freed on behalf of pageout_scan, the
6043 * 2nd is for our own use... we use vm_object_page_grab
6044 * in both cases to avoid taking pages from the free
6045 * list since we are under memory pressure and our
6046 * lock on this object is getting in the way of
6047 * relieving it
6048 */
6049 dst_page = vm_object_page_grab(object);
6050
6051 if (dst_page != VM_PAGE_NULL) {
6052 vm_page_release(dst_page,
6053 FALSE);
6054 }
6055
6056 dst_page = vm_object_page_grab(object);
6057 }
6058 if (dst_page == VM_PAGE_NULL) {
6059 /*
6060 * need to allocate a page
6061 */
6062 dst_page = vm_page_grab_options(grab_options);
6063 if (dst_page != VM_PAGE_NULL) {
6064 page_grab_count++;
6065 }
6066 }
6067 if (dst_page == VM_PAGE_NULL) {
6068 if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6069 /*
6070 * we don't want to stall waiting for pages to come onto the free list
6071 * while we're already holding absent pages in this UPL
6072 * the caller will deal with the empty slots
6073 */
6074 if (user_page_list) {
6075 user_page_list[entry].phys_addr = 0;
6076 }
6077
6078 goto try_next_page;
6079 }
6080 /*
6081 * no pages available... wait
6082 * then try again for the same
6083 * offset...
6084 */
6085 vm_object_unlock(object);
6086
6087 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6088
6089 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6090
6091 VM_PAGE_WAIT();
6092 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6093
6094 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6095
6096 vm_object_lock(object);
6097
6098 continue;
6099 }
6100 vm_page_insert(dst_page, object, dst_offset);
6101
6102 dst_page->vmp_absent = TRUE;
6103 dst_page->vmp_busy = FALSE;
6104
6105 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6106 /*
6107 * if UPL_RET_ONLY_ABSENT was specified,
6108 * than we're definitely setting up a
6109 * upl for a clustered read/pagein
6110 * operation... mark the pages as clustered
6111 * so upl_commit_range can put them on the
6112 * speculative list
6113 */
6114 dst_page->vmp_clustered = TRUE;
6115
6116 if (!(cntrl_flags & UPL_FILE_IO)) {
6117 counter_inc(&vm_statistics_pageins);
6118 }
6119 }
6120 }
6121 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6122
6123 dst_page->vmp_overwriting = TRUE;
6124
6125 if (dst_page->vmp_pmapped) {
6126 if (!(cntrl_flags & UPL_FILE_IO)) {
6127 /*
6128 * eliminate all mappings from the
6129 * original object and its prodigy
6130 */
6131 refmod_state = pmap_disconnect(phys_page);
6132 } else {
6133 refmod_state = pmap_get_refmod(phys_page);
6134 }
6135 } else {
6136 refmod_state = 0;
6137 }
6138
6139 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6140 dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6141
6142 if (cntrl_flags & UPL_SET_LITE) {
6143 unsigned int pg_num;
6144
6145 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6146 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6147 lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
6148
6149 if (hw_dirty) {
6150 pmap_clear_modify(phys_page);
6151 }
6152
6153 /*
6154 * Mark original page as cleaning
6155 * in place.
6156 */
6157 dst_page->vmp_cleaning = TRUE;
6158 dst_page->vmp_precious = FALSE;
6159 } else {
6160 /*
6161 * use pageclean setup, it is more
6162 * convenient even for the pageout
6163 * cases here
6164 */
6165 vm_object_lock(upl->map_object);
6166 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6167 vm_object_unlock(upl->map_object);
6168
6169 alias_page->vmp_absent = FALSE;
6170 alias_page = NULL;
6171 }
6172
6173 if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6174 upl->flags &= ~UPL_CLEAR_DIRTY;
6175 upl->flags |= UPL_SET_DIRTY;
6176 dirty = TRUE;
6177 /*
6178 * Page belonging to a code-signed object is about to
6179 * be written. Mark it tainted and disconnect it from
6180 * all pmaps so processes have to fault it back in and
6181 * deal with the tainted bit.
6182 */
6183 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6184 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6185 vm_page_upl_tainted++;
6186 if (dst_page->vmp_pmapped) {
6187 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6188 if (refmod_state & VM_MEM_REFERENCED) {
6189 dst_page->vmp_reference = TRUE;
6190 }
6191 }
6192 }
6193 } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6194 /*
6195 * clean in place for read implies
6196 * that a write will be done on all
6197 * the pages that are dirty before
6198 * a upl commit is done. The caller
6199 * is obligated to preserve the
6200 * contents of all pages marked dirty
6201 */
6202 upl->flags |= UPL_CLEAR_DIRTY;
6203 }
6204 dst_page->vmp_dirty = dirty;
6205
6206 if (!dirty) {
6207 dst_page->vmp_precious = TRUE;
6208 }
6209
6210 if (!VM_PAGE_WIRED(dst_page)) {
6211 /*
6212 * deny access to the target page while
6213 * it is being worked on
6214 */
6215 dst_page->vmp_busy = TRUE;
6216 } else {
6217 dwp->dw_mask |= DW_vm_page_wire;
6218 }
6219
6220 /*
6221 * We might be about to satisfy a fault which has been
6222 * requested. So no need for the "restart" bit.
6223 */
6224 dst_page->vmp_restart = FALSE;
6225 if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6226 /*
6227 * expect the page to be used
6228 */
6229 dwp->dw_mask |= DW_set_reference;
6230 }
6231 if (cntrl_flags & UPL_PRECIOUS) {
6232 if (object->internal) {
6233 SET_PAGE_DIRTY(dst_page, FALSE);
6234 dst_page->vmp_precious = FALSE;
6235 } else {
6236 dst_page->vmp_precious = TRUE;
6237 }
6238 } else {
6239 dst_page->vmp_precious = FALSE;
6240 }
6241 }
6242 if (dst_page->vmp_busy) {
6243 upl->flags |= UPL_HAS_BUSY;
6244 }
6245
6246 if (phys_page > upl->highest_page) {
6247 upl->highest_page = phys_page;
6248 }
6249 assert(!pmap_is_noencrypt(phys_page));
6250 if (user_page_list) {
6251 user_page_list[entry].phys_addr = phys_page;
6252 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
6253 user_page_list[entry].absent = dst_page->vmp_absent;
6254 user_page_list[entry].dirty = dst_page->vmp_dirty;
6255 user_page_list[entry].precious = dst_page->vmp_precious;
6256 user_page_list[entry].device = FALSE;
6257 user_page_list[entry].needed = FALSE;
6258 if (dst_page->vmp_clustered == TRUE) {
6259 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6260 } else {
6261 user_page_list[entry].speculative = FALSE;
6262 }
6263 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6264 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6265 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6266 user_page_list[entry].mark = FALSE;
6267 }
6268 /*
6269 * if UPL_RET_ONLY_ABSENT is set, then
6270 * we are working with a fresh page and we've
6271 * just set the clustered flag on it to
6272 * indicate that it was drug in as part of a
6273 * speculative cluster... so leave it alone
6274 */
6275 if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6276 /*
6277 * someone is explicitly grabbing this page...
6278 * update clustered and speculative state
6279 *
6280 */
6281 if (dst_page->vmp_clustered) {
6282 VM_PAGE_CONSUME_CLUSTERED(dst_page);
6283 }
6284 }
6285 try_next_page:
6286 if (dwp->dw_mask) {
6287 if (dwp->dw_mask & DW_vm_page_activate) {
6288 counter_inc(&vm_statistics_reactivations);
6289 }
6290
6291 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6292
6293 if (dw_count >= dw_limit) {
6294 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6295
6296 dwp = dwp_start;
6297 dw_count = 0;
6298 }
6299 }
6300 entry++;
6301 dst_offset += PAGE_SIZE_64;
6302 xfer_size -= PAGE_SIZE;
6303 }
6304 if (dw_count) {
6305 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6306 dwp = dwp_start;
6307 dw_count = 0;
6308 }
6309
6310 if (alias_page != NULL) {
6311 VM_PAGE_FREE(alias_page);
6312 }
6313 if (pmap_flushes_delayed == TRUE) {
6314 pmap_flush(&pmap_flush_context_storage);
6315 }
6316
6317 if (page_list_count != NULL) {
6318 if (upl->flags & UPL_INTERNAL) {
6319 *page_list_count = 0;
6320 } else if (*page_list_count > entry) {
6321 *page_list_count = entry;
6322 }
6323 }
6324 #if UPL_DEBUG
6325 upl->upl_state = 1;
6326 #endif
6327 vm_object_unlock(object);
6328
6329 VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6330 #if DEVELOPMENT || DEBUG
6331 if (task != NULL) {
6332 ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
6333 }
6334 #endif /* DEVELOPMENT || DEBUG */
6335
6336 if (dwp_start && dwp_finish_ctx) {
6337 vm_page_delayed_work_finish_ctx(dwp_start);
6338 dwp_start = dwp = NULL;
6339 }
6340
6341 return KERN_SUCCESS;
6342 }
6343
6344 /*
6345 * Routine: vm_object_super_upl_request
6346 * Purpose:
6347 * Cause the population of a portion of a vm_object
6348 * in much the same way as memory_object_upl_request.
6349 * Depending on the nature of the request, the pages
6350 * returned may be contain valid data or be uninitialized.
6351 * However, the region may be expanded up to the super
6352 * cluster size provided.
6353 */
6354
6355 __private_extern__ kern_return_t
vm_object_super_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_size_t super_cluster,upl_t * upl,upl_page_info_t * user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)6356 vm_object_super_upl_request(
6357 vm_object_t object,
6358 vm_object_offset_t offset,
6359 upl_size_t size,
6360 upl_size_t super_cluster,
6361 upl_t *upl,
6362 upl_page_info_t *user_page_list,
6363 unsigned int *page_list_count,
6364 upl_control_flags_t cntrl_flags,
6365 vm_tag_t tag)
6366 {
6367 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
6368 return KERN_FAILURE;
6369 }
6370
6371 assert(object->paging_in_progress);
6372 offset = offset - object->paging_offset;
6373
6374 if (super_cluster > size) {
6375 vm_object_offset_t base_offset;
6376 upl_size_t super_size;
6377 vm_object_size_t super_size_64;
6378
6379 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6380 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
6381 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6382 super_size = (upl_size_t) super_size_64;
6383 assert(super_size == super_size_64);
6384
6385 if (offset > (base_offset + super_size)) {
6386 panic("vm_object_super_upl_request: Missed target pageout"
6387 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6388 offset, base_offset, super_size, super_cluster,
6389 size, object->paging_offset);
6390 }
6391 /*
6392 * apparently there is a case where the vm requests a
6393 * page to be written out who's offset is beyond the
6394 * object size
6395 */
6396 if ((offset + size) > (base_offset + super_size)) {
6397 super_size_64 = (offset + size) - base_offset;
6398 super_size = (upl_size_t) super_size_64;
6399 assert(super_size == super_size_64);
6400 }
6401
6402 offset = base_offset;
6403 size = super_size;
6404 }
6405 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6406 }
6407
/*
 * Counter incremented by vm_map_create_upl() when it goes on to build a
 * UPL directly from a map entry whose protection includes VM_PROT_EXECUTE.
 */
6408 int cs_executable_create_upl = 0;
/*
 * Defined elsewhere; vm_map_create_upl() uses these under MACH_ASSERT to
 * print the current pid and the name of the current task's bsd_info.
 */
6409 extern int proc_selfpid(void);
6410 extern char *proc_name_address(void *p);
6411
6412 kern_return_t
vm_map_create_upl(vm_map_t map,vm_map_address_t offset,upl_size_t * upl_size,upl_t * upl,upl_page_info_array_t page_list,unsigned int * count,upl_control_flags_t * flags,vm_tag_t tag)6413 vm_map_create_upl(
6414 vm_map_t map,
6415 vm_map_address_t offset,
6416 upl_size_t *upl_size,
6417 upl_t *upl,
6418 upl_page_info_array_t page_list,
6419 unsigned int *count,
6420 upl_control_flags_t *flags,
6421 vm_tag_t tag)
6422 {
6423 vm_map_entry_t entry;
6424 upl_control_flags_t caller_flags;
6425 int force_data_sync;
6426 int sync_cow_data;
6427 vm_object_t local_object;
6428 vm_map_offset_t local_offset;
6429 vm_map_offset_t local_start;
6430 kern_return_t ret;
6431 vm_map_address_t original_offset;
6432 vm_map_size_t original_size, adjusted_size;
6433 vm_map_offset_t local_entry_start;
6434 vm_object_offset_t local_entry_offset;
6435 vm_object_offset_t offset_in_mapped_page;
6436 boolean_t release_map = FALSE;
6437
6438 start_with_map:
6439
6440 original_offset = offset;
6441 original_size = *upl_size;
6442 adjusted_size = original_size;
6443
6444 caller_flags = *flags;
6445
6446 if (caller_flags & ~UPL_VALID_FLAGS) {
6447 /*
6448 * For forward compatibility's sake,
6449 * reject any unknown flag.
6450 */
6451 ret = KERN_INVALID_VALUE;
6452 goto done;
6453 }
6454 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6455 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6456
6457 if (upl == NULL) {
6458 ret = KERN_INVALID_ARGUMENT;
6459 goto done;
6460 }
6461
6462 REDISCOVER_ENTRY:
6463 vm_map_lock_read(map);
6464
6465 if (!vm_map_lookup_entry(map, offset, &entry)) {
6466 vm_map_unlock_read(map);
6467 ret = KERN_FAILURE;
6468 goto done;
6469 }
6470
6471 local_entry_start = entry->vme_start;
6472 local_entry_offset = VME_OFFSET(entry);
6473
6474 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6475 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6476 }
6477
6478 if (entry->vme_end - original_offset < adjusted_size) {
6479 adjusted_size = entry->vme_end - original_offset;
6480 assert(adjusted_size > 0);
6481 *upl_size = (upl_size_t) adjusted_size;
6482 assert(*upl_size == adjusted_size);
6483 }
6484
6485 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6486 *flags = 0;
6487
6488 if (!entry->is_sub_map &&
6489 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6490 if (VME_OBJECT(entry)->private) {
6491 *flags = UPL_DEV_MEMORY;
6492 }
6493
6494 if (VME_OBJECT(entry)->phys_contiguous) {
6495 *flags |= UPL_PHYS_CONTIG;
6496 }
6497 }
6498 vm_map_unlock_read(map);
6499 ret = KERN_SUCCESS;
6500 goto done;
6501 }
6502
6503 offset_in_mapped_page = 0;
6504 if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6505 offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6506 *upl_size = (upl_size_t)
6507 (vm_map_round_page(original_offset + adjusted_size,
6508 VM_MAP_PAGE_MASK(map))
6509 - offset);
6510
6511 offset_in_mapped_page = original_offset - offset;
6512 assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6513
6514 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6515 }
6516
6517 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6518 !VME_OBJECT(entry)->phys_contiguous) {
6519 if (*upl_size > MAX_UPL_SIZE_BYTES) {
6520 *upl_size = MAX_UPL_SIZE_BYTES;
6521 }
6522 }
6523
6524 /*
6525 * Create an object if necessary.
6526 */
6527 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6528 if (vm_map_lock_read_to_write(map)) {
6529 goto REDISCOVER_ENTRY;
6530 }
6531
6532 VME_OBJECT_SET(entry,
6533 vm_object_allocate((vm_size_t)
6534 vm_object_round_page((entry->vme_end - entry->vme_start))));
6535 VME_OFFSET_SET(entry, 0);
6536 assert(entry->use_pmap);
6537
6538 vm_map_lock_write_to_read(map);
6539 }
6540
6541 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6542 !entry->is_sub_map &&
6543 !(entry->protection & VM_PROT_WRITE)) {
6544 vm_map_unlock_read(map);
6545 ret = KERN_PROTECTION_FAILURE;
6546 goto done;
6547 }
6548
6549 #if !XNU_TARGET_OS_OSX
6550 if (map->pmap != kernel_pmap &&
6551 (caller_flags & UPL_COPYOUT_FROM) &&
6552 (entry->protection & VM_PROT_EXECUTE) &&
6553 !(entry->protection & VM_PROT_WRITE)) {
6554 vm_offset_t kaddr;
6555 vm_size_t ksize;
6556
6557 /*
6558 * We're about to create a read-only UPL backed by
6559 * memory from an executable mapping.
6560 * Wiring the pages would result in the pages being copied
6561 * (due to the "MAP_PRIVATE" mapping) and no longer
6562 * code-signed, so no longer eligible for execution.
6563 * Instead, let's copy the data into a kernel buffer and
6564 * create the UPL from this kernel buffer.
6565 * The kernel buffer is then freed, leaving the UPL holding
6566 * the last reference on the VM object, so the memory will
6567 * be released when the UPL is committed.
6568 */
6569
6570 vm_map_unlock_read(map);
6571 entry = VM_MAP_ENTRY_NULL;
6572 /* allocate kernel buffer */
6573 ksize = round_page(*upl_size);
6574 kaddr = 0;
6575 ret = kmem_alloc_pageable(kernel_map,
6576 &kaddr,
6577 ksize,
6578 tag);
6579 if (ret == KERN_SUCCESS) {
6580 /* copyin the user data */
6581 ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6582 }
6583 if (ret == KERN_SUCCESS) {
6584 if (ksize > *upl_size) {
6585 /* zero out the extra space in kernel buffer */
6586 memset((void *)(kaddr + *upl_size),
6587 0,
6588 ksize - *upl_size);
6589 }
6590 /* create the UPL from the kernel buffer */
6591 vm_object_offset_t offset_in_object;
6592 vm_object_offset_t offset_in_object_page;
6593
6594 offset_in_object = offset - local_entry_start + local_entry_offset;
6595 offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6596 assert(offset_in_object_page < PAGE_SIZE);
6597 assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6598 *upl_size -= offset_in_object_page + offset_in_mapped_page;
6599 ret = vm_map_create_upl(kernel_map,
6600 (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6601 upl_size, upl, page_list, count, flags, tag);
6602 }
6603 if (kaddr != 0) {
6604 /* free the kernel buffer */
6605 kmem_free(kernel_map, kaddr, ksize);
6606 kaddr = 0;
6607 ksize = 0;
6608 }
6609 #if DEVELOPMENT || DEBUG
6610 DTRACE_VM4(create_upl_from_executable,
6611 vm_map_t, map,
6612 vm_map_address_t, offset,
6613 upl_size_t, *upl_size,
6614 kern_return_t, ret);
6615 #endif /* DEVELOPMENT || DEBUG */
6616 goto done;
6617 }
6618 #endif /* !XNU_TARGET_OS_OSX */
6619
6620 local_object = VME_OBJECT(entry);
6621 assert(local_object != VM_OBJECT_NULL);
6622
6623 if (!entry->is_sub_map &&
6624 !entry->needs_copy &&
6625 *upl_size != 0 &&
6626 local_object->vo_size > *upl_size && /* partial UPL */
6627 entry->wired_count == 0 && /* No COW for entries that are wired */
6628 (map->pmap != kernel_pmap) && /* alias checks */
6629 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6630 ||
6631 ( /* case 2 */
6632 local_object->internal &&
6633 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6634 local_object->ref_count > 1))) {
6635 vm_prot_t prot;
6636
6637 /*
6638 * Case 1:
6639 * Set up the targeted range for copy-on-write to avoid
6640 * applying true_share/copy_delay to the entire object.
6641 *
6642 * Case 2:
6643 * This map entry covers only part of an internal
6644 * object. There could be other map entries covering
6645 * other areas of this object and some of these map
6646 * entries could be marked as "needs_copy", which
6647 * assumes that the object is COPY_SYMMETRIC.
6648 * To avoid marking this object as COPY_DELAY and
6649 * "true_share", let's shadow it and mark the new
6650 * (smaller) object as "true_share" and COPY_DELAY.
6651 */
6652
6653 if (vm_map_lock_read_to_write(map)) {
6654 goto REDISCOVER_ENTRY;
6655 }
6656 vm_map_lock_assert_exclusive(map);
6657 assert(VME_OBJECT(entry) == local_object);
6658
6659 vm_map_clip_start(map,
6660 entry,
6661 vm_map_trunc_page(offset,
6662 VM_MAP_PAGE_MASK(map)));
6663 vm_map_clip_end(map,
6664 entry,
6665 vm_map_round_page(offset + *upl_size,
6666 VM_MAP_PAGE_MASK(map)));
6667 if ((entry->vme_end - offset) < *upl_size) {
6668 *upl_size = (upl_size_t) (entry->vme_end - offset);
6669 assert(*upl_size == entry->vme_end - offset);
6670 }
6671
6672 prot = entry->protection & ~VM_PROT_WRITE;
6673 if (override_nx(map, VME_ALIAS(entry)) && prot) {
6674 prot |= VM_PROT_EXECUTE;
6675 }
6676 vm_object_pmap_protect(local_object,
6677 VME_OFFSET(entry),
6678 entry->vme_end - entry->vme_start,
6679 ((entry->is_shared ||
6680 map->mapped_in_other_pmaps)
6681 ? PMAP_NULL
6682 : map->pmap),
6683 VM_MAP_PAGE_SIZE(map),
6684 entry->vme_start,
6685 prot);
6686
6687 assert(entry->wired_count == 0);
6688
6689 /*
6690 * Lock the VM object and re-check its status: if it's mapped
6691 * in another address space, we could still be racing with
6692 * another thread holding that other VM map exclusively.
6693 */
6694 vm_object_lock(local_object);
6695 if (local_object->true_share) {
6696 /* object is already in proper state: no COW needed */
6697 assert(local_object->copy_strategy !=
6698 MEMORY_OBJECT_COPY_SYMMETRIC);
6699 } else {
6700 /* not true_share: ask for copy-on-write below */
6701 assert(local_object->copy_strategy ==
6702 MEMORY_OBJECT_COPY_SYMMETRIC);
6703 entry->needs_copy = TRUE;
6704 }
6705 vm_object_unlock(local_object);
6706
6707 vm_map_lock_write_to_read(map);
6708 }
6709
6710 if (entry->needs_copy) {
6711 /*
6712 * Honor copy-on-write for COPY_SYMMETRIC
6713 * strategy.
6714 */
6715 vm_map_t local_map;
6716 vm_object_t object;
6717 vm_object_offset_t new_offset;
6718 vm_prot_t prot;
6719 boolean_t wired;
6720 vm_map_version_t version;
6721 vm_map_t real_map;
6722 vm_prot_t fault_type;
6723
6724 local_map = map;
6725
6726 if (caller_flags & UPL_COPYOUT_FROM) {
6727 fault_type = VM_PROT_READ | VM_PROT_COPY;
6728 vm_counters.create_upl_extra_cow++;
6729 vm_counters.create_upl_extra_cow_pages +=
6730 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6731 } else {
6732 fault_type = VM_PROT_WRITE;
6733 }
6734 if (vm_map_lookup_locked(&local_map,
6735 offset, fault_type,
6736 OBJECT_LOCK_EXCLUSIVE,
6737 &version, &object,
6738 &new_offset, &prot, &wired,
6739 NULL,
6740 &real_map, NULL) != KERN_SUCCESS) {
6741 if (fault_type == VM_PROT_WRITE) {
6742 vm_counters.create_upl_lookup_failure_write++;
6743 } else {
6744 vm_counters.create_upl_lookup_failure_copy++;
6745 }
6746 vm_map_unlock_read(local_map);
6747 ret = KERN_FAILURE;
6748 goto done;
6749 }
6750 if (real_map != local_map) {
6751 vm_map_unlock(real_map);
6752 }
6753 vm_map_unlock_read(local_map);
6754
6755 vm_object_unlock(object);
6756
6757 goto REDISCOVER_ENTRY;
6758 }
6759
6760 if (entry->is_sub_map) {
6761 vm_map_t submap;
6762
6763 submap = VME_SUBMAP(entry);
6764 local_start = entry->vme_start;
6765 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6766
6767 vm_map_reference(submap);
6768 vm_map_unlock_read(map);
6769
6770 DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
6771 offset += offset_in_mapped_page;
6772 *upl_size -= offset_in_mapped_page;
6773
6774 if (release_map) {
6775 vm_map_deallocate(map);
6776 }
6777 map = submap;
6778 release_map = TRUE;
6779 offset = local_offset + (offset - local_start);
6780 goto start_with_map;
6781 }
6782
6783 if (sync_cow_data &&
6784 (VME_OBJECT(entry)->shadow ||
6785 VME_OBJECT(entry)->copy)) {
6786 local_object = VME_OBJECT(entry);
6787 local_start = entry->vme_start;
6788 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6789
6790 vm_object_reference(local_object);
6791 vm_map_unlock_read(map);
6792
6793 if (local_object->shadow && local_object->copy) {
6794 vm_object_lock_request(local_object->shadow,
6795 ((vm_object_offset_t)
6796 ((offset - local_start) +
6797 local_offset) +
6798 local_object->vo_shadow_offset),
6799 *upl_size, FALSE,
6800 MEMORY_OBJECT_DATA_SYNC,
6801 VM_PROT_NO_CHANGE);
6802 }
6803 sync_cow_data = FALSE;
6804 vm_object_deallocate(local_object);
6805
6806 goto REDISCOVER_ENTRY;
6807 }
6808 if (force_data_sync) {
6809 local_object = VME_OBJECT(entry);
6810 local_start = entry->vme_start;
6811 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6812
6813 vm_object_reference(local_object);
6814 vm_map_unlock_read(map);
6815
6816 vm_object_lock_request(local_object,
6817 ((vm_object_offset_t)
6818 ((offset - local_start) +
6819 local_offset)),
6820 (vm_object_size_t)*upl_size,
6821 FALSE,
6822 MEMORY_OBJECT_DATA_SYNC,
6823 VM_PROT_NO_CHANGE);
6824
6825 force_data_sync = FALSE;
6826 vm_object_deallocate(local_object);
6827
6828 goto REDISCOVER_ENTRY;
6829 }
6830 if (VME_OBJECT(entry)->private) {
6831 *flags = UPL_DEV_MEMORY;
6832 } else {
6833 *flags = 0;
6834 }
6835
6836 if (VME_OBJECT(entry)->phys_contiguous) {
6837 *flags |= UPL_PHYS_CONTIG;
6838 }
6839
6840 local_object = VME_OBJECT(entry);
6841 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6842 local_start = entry->vme_start;
6843
6844 /*
6845 * Wiring will copy the pages to the shadow object.
6846 * The shadow object will not be code-signed so
6847 * attempting to execute code from these copied pages
6848 * would trigger a code-signing violation.
6849 */
6850 if (entry->protection & VM_PROT_EXECUTE) {
6851 #if MACH_ASSERT
6852 printf("pid %d[%s] create_upl out of executable range from "
6853 "0x%llx to 0x%llx: side effects may include "
6854 "code-signing violations later on\n",
6855 proc_selfpid(),
6856 (current_task()->bsd_info
6857 ? proc_name_address(current_task()->bsd_info)
6858 : "?"),
6859 (uint64_t) entry->vme_start,
6860 (uint64_t) entry->vme_end);
6861 #endif /* MACH_ASSERT */
6862 DTRACE_VM2(cs_executable_create_upl,
6863 uint64_t, (uint64_t)entry->vme_start,
6864 uint64_t, (uint64_t)entry->vme_end);
6865 cs_executable_create_upl++;
6866 }
6867
6868 vm_object_lock(local_object);
6869
6870 /*
6871 * Ensure that this object is "true_share" and "copy_delay" now,
6872 * while we're still holding the VM map lock. After we unlock the map,
6873 * anything could happen to that mapping, including some copy-on-write
6874 * activity. We need to make sure that the IOPL will point at the
6875 * same memory as the mapping.
6876 */
6877 if (local_object->true_share) {
6878 assert(local_object->copy_strategy !=
6879 MEMORY_OBJECT_COPY_SYMMETRIC);
6880 } else if (local_object != kernel_object &&
6881 local_object != compressor_object &&
6882 !local_object->phys_contiguous) {
6883 #if VM_OBJECT_TRACKING_OP_TRUESHARE
6884 if (!local_object->true_share &&
6885 vm_object_tracking_btlog) {
6886 btlog_record(vm_object_tracking_btlog, local_object,
6887 VM_OBJECT_TRACKING_OP_TRUESHARE,
6888 btref_get(__builtin_frame_address(0), 0));
6889 }
6890 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
6891 local_object->true_share = TRUE;
6892 if (local_object->copy_strategy ==
6893 MEMORY_OBJECT_COPY_SYMMETRIC) {
6894 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6895 }
6896 }
6897
6898 vm_object_reference_locked(local_object);
6899 vm_object_unlock(local_object);
6900
6901 vm_map_unlock_read(map);
6902
6903 offset += offset_in_mapped_page;
6904 assert(*upl_size > offset_in_mapped_page);
6905 *upl_size -= offset_in_mapped_page;
6906
6907 ret = vm_object_iopl_request(local_object,
6908 ((vm_object_offset_t)
6909 ((offset - local_start) + local_offset)),
6910 *upl_size,
6911 upl,
6912 page_list,
6913 count,
6914 caller_flags,
6915 tag);
6916 vm_object_deallocate(local_object);
6917
6918 done:
6919 if (release_map) {
6920 vm_map_deallocate(map);
6921 }
6922
6923 return ret;
6924 }
6925
/*
 *	Internal routine to enter a UPL into a VM map.
 *
 *	JMM - This should just be doable through the standard
 *	vm_map_enter() API.
 *
 *	Maps the pages described by "upl" into "map" (asserted to be
 *	kernel_map) and eagerly enters each resident page into the map's pmap.
 *
 *	Plain UPL:
 *	  - The mapping starts "offset_to_map" bytes into the UPL and covers
 *	    MIN(map-page-adjusted UPL size, size_to_map) bytes.
 *	  - The address is chosen by vm_map_enter(VM_FLAGS_ANYWHERE) and
 *	    returned through "dst_addr", bumped by the distance between the
 *	    UPL's u_offset and its map-page-aligned offset.
 *
 *	Vector UPL:
 *	  - A submap of vector_upl->u_size bytes is carved out of "map" and
 *	    every non-NULL sub-UPL is mapped at a fixed address inside it.
 *	  - "offset_to_map" and "size_to_map" are not used on this path; each
 *	    sub-UPL is mapped in full.
 *	  - "dst_addr" receives the base address of the submap.
 *
 *	Parameters:
 *	  map            target map; must be kernel_map
 *	  upl            UPL (or vector UPL) to map
 *	  offset_to_map  byte offset into the UPL (plain UPLs only)
 *	  size_to_map    upper bound on the mapped size (plain UPLs only)
 *	  prot_to_map    protection handed to vm_map_enter() and PMAP_ENTER()
 *	  dst_addr       out: kernel virtual address of the mapping
 *
 *	Returns:
 *	  KERN_INVALID_ARGUMENT  "upl" is UPL_NULL
 *	  KERN_FAILURE           the UPL (or every sub-UPL) is already mapped
 *	  <vm_map_enter error>   plain UPL only, when vm_map_enter() fails
 *	  KERN_SUCCESS           otherwise
 *
 *	Panics if only some sub-UPLs of a vector UPL are already mapped, if
 *	the vector submap cannot be allocated, if vm_map_enter() fails for a
 *	sub-UPL, or if a page flagged in the lite list is missing from the
 *	backing object.
 *
 *	Locking: takes the UPL lock (the vector UPL's lock for vector UPLs)
 *	and drops it on every return path.
 */
kern_return_t
vm_map_enter_upl_range(
	vm_map_t                map,
	upl_t                   upl,
	vm_object_offset_t      offset_to_map,
	upl_size_t              size_to_map,
	vm_prot_t               prot_to_map,
	vm_map_offset_t         *dst_addr)
{
	vm_map_size_t           size;
	vm_object_offset_t      offset;
	vm_map_offset_t         addr;
	vm_page_t               m;
	kern_return_t           kr;
	int                     isVectorUPL = 0, curr_upl = 0;
	upl_t                   vector_upl = NULL;
	vm_offset_t             vector_upl_dst_addr = 0;
	vm_map_t                vector_upl_submap = NULL;
	upl_offset_t            subupl_offset = 0;
	upl_size_t              subupl_size = 0;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
	assert(map == kernel_map);

	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		int mapped = 0, valid_upls = 0;
		vector_upl = upl;

		upl_lock(vector_upl);
		/*
		 * First pass: count the populated slots and how many of those
		 * sub-UPLs are already mapped.
		 */
		for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
			upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
			if (upl == NULL) {
				continue;
			}
			valid_upls++;
			if (UPL_PAGE_LIST_MAPPED & upl->flags) {
				mapped++;
			}
		}

		/*
		 * A vector UPL must be mapped all-or-nothing: fully mapped is a
		 * plain failure, partially mapped is treated as fatal.
		 */
		if (mapped) {
			if (mapped != valid_upls) {
				panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
			} else {
				upl_unlock(vector_upl);
				return KERN_FAILURE;
			}
		}

		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
			panic("TODO4K: vector UPL not implemented");
		}

		/*
		 * Reserve one submap large enough for the whole vector UPL; from
		 * here on "map" refers to that submap and each sub-UPL is placed
		 * inside it at a fixed address.
		 */
		kr = kmem_suballoc(map, &vector_upl_dst_addr,
		    vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
		    VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE,
		    &vector_upl_submap);
		if (kr != KERN_SUCCESS) {
			panic("Vector UPL submap allocation failed");
		}
		map = vector_upl_submap;
		vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
		curr_upl = 0;
	} else {
		upl_lock(upl);
	}

	/*
	 * For a vector UPL this label is the head of a loop that runs once per
	 * populated slot; for a plain UPL the code below runs exactly once.
	 */
process_upl_to_enter:
	if (isVectorUPL) {
		if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
			/* every slot has been visited: hand back the submap base */
			*dst_addr = vector_upl_dst_addr;
			upl_unlock(vector_upl);
			return KERN_SUCCESS;
		}
		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
		if (upl == NULL) {
			goto process_upl_to_enter;
		}

		/*
		 * Place this sub-UPL inside the submap at the offset reported
		 * by vector_upl_get_iostate().
		 */
		vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
		*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
	} else {
		/*
		 * check to see if already mapped
		 */
		if (UPL_PAGE_LIST_MAPPED & upl->flags) {
			upl_unlock(upl);
			return KERN_FAILURE;
		}
	}

	/*
	 * Build a private shadow object for the mapping when the UPL has not
	 * been shadowed yet and either it holds busy pages, or it is neither
	 * device memory / I/O-wired nor backed by a physically contiguous
	 * object.  The shadow object is filled with alias pages that point
	 * at the same physical pages as the UPL's real pages.
	 */
	if ((!(upl->flags & UPL_SHADOWED)) &&
	    ((upl->flags & UPL_HAS_BUSY) ||
	    !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
		vm_object_t             object;
		vm_page_t               alias_page;
		vm_object_offset_t      new_offset;
		unsigned int            pg_num;
		wpl_array_t             lite_list;

		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
		/*
		 * Locate the lite-list bitmap: for an internal UPL it sits after
		 * the upl structure and the page-info array, otherwise directly
		 * after the upl structure.
		 */
		if (upl->flags & UPL_INTERNAL) {
			lite_list = (wpl_array_t)
			    ((((uintptr_t)upl) + sizeof(struct upl))
			    + ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
		} else {
			lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
		}
		/* swap in a fresh object that shadows the original map_object */
		object = upl->map_object;
		upl->map_object = vm_object_allocate(vm_object_round_page(size));

		vm_object_lock(upl->map_object);

		upl->map_object->shadow = object;
		upl->map_object->pageout = TRUE;
		upl->map_object->can_persist = FALSE;
		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
		upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
		assertf(page_aligned(upl->map_object->vo_shadow_offset),
		    "object %p shadow_offset 0x%llx",
		    upl->map_object,
		    (uint64_t)upl->map_object->vo_shadow_offset);
		upl->map_object->wimg_bits = object->wimg_bits;
		/*
		 * "offset" walks the original object, "new_offset" walks the
		 * new shadow object (which starts at 0).
		 */
		offset = upl->map_object->vo_shadow_offset;
		new_offset = 0;

		upl->flags |= UPL_SHADOWED;

		while (size) {
			pg_num = (unsigned int) (new_offset / PAGE_SIZE);
			assert(pg_num == new_offset / PAGE_SIZE);

			/* only pages whose bit is set in the lite list get an alias */
			if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
				alias_page = vm_page_grab_fictitious(TRUE);

				vm_object_lock(object);

				m = vm_page_lookup(object, offset);
				if (m == VM_PAGE_NULL) {
					panic("vm_upl_map: page missing");
				}

				/*
				 * Convert the fictitious page to a private
				 * shadow of the real page.
				 */
				assert(alias_page->vmp_fictitious);
				alias_page->vmp_fictitious = FALSE;
				alias_page->vmp_private = TRUE;
				alias_page->vmp_free_when_done = TRUE;
				/*
				 * since m is a page in the upl it must
				 * already be wired or BUSY, so it's
				 * safe to assign the underlying physical
				 * page to the alias
				 */
				VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));

				vm_object_unlock(object);

				vm_page_lockspin_queues();
				vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
				vm_page_unlock_queues();

				vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);

				assert(!alias_page->vmp_wanted);
				alias_page->vmp_busy = FALSE;
				alias_page->vmp_absent = FALSE;
			}
			size -= PAGE_SIZE;
			offset += PAGE_SIZE_64;
			new_offset += PAGE_SIZE_64;
		}
		vm_object_unlock(upl->map_object);
	}
	/*
	 * Pick the starting offset within upl->map_object.  A shadowed UPL's
	 * map_object starts at the UPL itself, so only the caller's offset
	 * applies; otherwise translate the UPL's adjusted offset into the
	 * object.  The caller's offset is never applied to vector UPLs.
	 */
	if (upl->flags & UPL_SHADOWED) {
		if (isVectorUPL) {
			offset = 0;
		} else {
			offset = offset_to_map;
		}
	} else {
		offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
		if (!isVectorUPL) {
			offset += offset_to_map;
		}
	}

	/* sub-UPLs are mapped in full; a plain UPL is capped by size_to_map */
	if (isVectorUPL) {
		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
	} else {
		size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
	}

	/*
	 * Extra object reference taken for the vm_map_enter() call below; it
	 * is dropped explicitly only when that call fails.
	 */
	vm_object_reference(upl->map_object);

	if (!isVectorUPL) {
		*dst_addr = 0;
		/*
		 * NEED A UPL_MAP ALIAS
		 */
		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
		    VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_OSFMK,
		    upl->map_object, offset, FALSE,
		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);

		if (kr != KERN_SUCCESS) {
			vm_object_deallocate(upl->map_object);
			upl_unlock(upl);
			return kr;
		}
	} else {
		/* fixed placement inside the vector submap; failure is fatal */
		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
		    VM_FLAGS_FIXED, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_OSFMK,
		    upl->map_object, offset, FALSE,
		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
		if (kr) {
			panic("vm_map_enter failed for a Vector UPL");
		}
	}
	upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
	                                        /* this will have to be an increment rather than */
	                                        /* an assignment. */
	vm_object_lock(upl->map_object);

	/*
	 * Enter every page that is resident in the map object into the pmap
	 * now; offsets with no resident page are simply skipped.
	 */
	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
		m = vm_page_lookup(upl->map_object, offset);

		if (m) {
			m->vmp_pmapped = TRUE;

			/* CODE SIGNING ENFORCEMENT: page has been wpmapped,
			 * but only in kernel space. If this was on a user map,
			 * we'd have to set the wpmapped bit. */
			/* m->vmp_wpmapped = TRUE; */
			assert(map->pmap == kernel_pmap);

			PMAP_ENTER(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE, kr);

			assert(kr == KERN_SUCCESS);
#if KASAN
			kasan_notify_address(addr, PAGE_SIZE_64);
#endif
		}
		offset += PAGE_SIZE_64;
	}
	vm_object_unlock(upl->map_object);

	/*
	 * hold a reference for the mapping
	 */
	upl->ref_count++;
	upl->flags |= UPL_PAGE_LIST_MAPPED;
	upl->kaddr = (vm_offset_t) *dst_addr;
	assert(upl->kaddr == *dst_addr);

	/* vector UPL: move on to the next sub-UPL */
	if (isVectorUPL) {
		goto process_upl_to_enter;
	}

	/*
	 * Only plain UPLs reach this point (vector UPLs looped back above), so
	 * this test always succeeds.  If the UPL does not start on a map-page
	 * boundary, advance the returned address to the UPL's first byte.
	 */
	if (!isVectorUPL) {
		vm_map_offset_t addr_adjustment;

		addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
		if (addr_adjustment) {
			assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
			DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
			*dst_addr += addr_adjustment;
		}
	}

	upl_unlock(upl);

	return KERN_SUCCESS;
}
7212
7213 kern_return_t
vm_map_enter_upl(vm_map_t map,upl_t upl,vm_map_offset_t * dst_addr)7214 vm_map_enter_upl(
7215 vm_map_t map,
7216 upl_t upl,
7217 vm_map_offset_t *dst_addr)
7218 {
7219 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7220 return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7221 }
7222
7223 /*
7224 * Internal routine to remove a UPL mapping from a VM map.
7225 *
7226 * XXX - This should just be doable through a standard
7227 * vm_map_remove() operation. Otherwise, implicit clean-up
7228 * of the target map won't be able to correctly remove
7229 * these (and release the reference on the UPL). Having
7230 * to do this means we can't map these into user-space
7231 * maps yet.
7232 */
7233 kern_return_t
vm_map_remove_upl_range(vm_map_t map,upl_t upl,__unused vm_object_offset_t offset_to_unmap,__unused upl_size_t size_to_unmap)7234 vm_map_remove_upl_range(
7235 vm_map_t map,
7236 upl_t upl,
7237 __unused vm_object_offset_t offset_to_unmap,
7238 __unused upl_size_t size_to_unmap)
7239 {
7240 vm_address_t addr;
7241 upl_size_t size;
7242 int isVectorUPL = 0, curr_upl = 0;
7243 upl_t vector_upl = NULL;
7244
7245 if (upl == UPL_NULL) {
7246 return KERN_INVALID_ARGUMENT;
7247 }
7248
7249 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7250 int unmapped = 0, valid_upls = 0;
7251 vector_upl = upl;
7252 upl_lock(vector_upl);
7253 for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
7254 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7255 if (upl == NULL) {
7256 continue;
7257 }
7258 valid_upls++;
7259 if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7260 unmapped++;
7261 }
7262 }
7263
7264 if (unmapped) {
7265 if (unmapped != valid_upls) {
7266 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
7267 } else {
7268 upl_unlock(vector_upl);
7269 return KERN_FAILURE;
7270 }
7271 }
7272 curr_upl = 0;
7273 } else {
7274 upl_lock(upl);
7275 }
7276
7277 process_upl_to_remove:
7278 if (isVectorUPL) {
7279 if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
7280 vm_map_t v_upl_submap;
7281 vm_offset_t v_upl_submap_dst_addr;
7282 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7283
7284 vm_map_remove(map, v_upl_submap_dst_addr,
7285 v_upl_submap_dst_addr + vector_upl->u_size,
7286 VM_MAP_REMOVE_NO_FLAGS);
7287 vm_map_deallocate(v_upl_submap);
7288 upl_unlock(vector_upl);
7289 return KERN_SUCCESS;
7290 }
7291
7292 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7293 if (upl == NULL) {
7294 goto process_upl_to_remove;
7295 }
7296 }
7297
7298 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7299 addr = upl->kaddr;
7300 size = upl->u_mapped_size;
7301
7302 assert(upl->ref_count > 1);
7303 upl->ref_count--; /* removing mapping ref */
7304
7305 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7306 upl->kaddr = (vm_offset_t) 0;
7307 upl->u_mapped_size = 0;
7308
7309 if (!isVectorUPL) {
7310 upl_unlock(upl);
7311
7312 vm_map_remove(
7313 map,
7314 vm_map_trunc_page(addr,
7315 VM_MAP_PAGE_MASK(map)),
7316 vm_map_round_page(addr + size,
7317 VM_MAP_PAGE_MASK(map)),
7318 VM_MAP_REMOVE_NO_FLAGS);
7319 return KERN_SUCCESS;
7320 } else {
7321 /*
7322 * If it's a Vectored UPL, we'll be removing the entire
7323 * submap anyways, so no need to remove individual UPL
7324 * element mappings from within the submap
7325 */
7326 goto process_upl_to_remove;
7327 }
7328 }
7329 upl_unlock(upl);
7330
7331 return KERN_FAILURE;
7332 }
7333
7334 kern_return_t
vm_map_remove_upl(vm_map_t map,upl_t upl)7335 vm_map_remove_upl(
7336 vm_map_t map,
7337 upl_t upl)
7338 {
7339 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7340 return vm_map_remove_upl_range(map, upl, 0, upl_size);
7341 }
7342
7343 kern_return_t
upl_commit_range(upl_t upl,upl_offset_t offset,upl_size_t size,int flags,upl_page_info_t * page_list,mach_msg_type_number_t count,boolean_t * empty)7344 upl_commit_range(
7345 upl_t upl,
7346 upl_offset_t offset,
7347 upl_size_t size,
7348 int flags,
7349 upl_page_info_t *page_list,
7350 mach_msg_type_number_t count,
7351 boolean_t *empty)
7352 {
7353 upl_size_t xfer_size, subupl_size;
7354 vm_object_t shadow_object;
7355 vm_object_t object;
7356 vm_object_t m_object;
7357 vm_object_offset_t target_offset;
7358 upl_offset_t subupl_offset = offset;
7359 int entry;
7360 wpl_array_t lite_list;
7361 int occupied;
7362 int clear_refmod = 0;
7363 int pgpgout_count = 0;
7364 struct vm_page_delayed_work dw_array;
7365 struct vm_page_delayed_work *dwp, *dwp_start;
7366 bool dwp_finish_ctx = TRUE;
7367 int dw_count;
7368 int dw_limit;
7369 int isVectorUPL = 0;
7370 upl_t vector_upl = NULL;
7371 boolean_t should_be_throttled = FALSE;
7372
7373 vm_page_t nxt_page = VM_PAGE_NULL;
7374 int fast_path_possible = 0;
7375 int fast_path_full_commit = 0;
7376 int throttle_page = 0;
7377 int unwired_count = 0;
7378 int local_queue_count = 0;
7379 vm_page_t first_local, last_local;
7380 vm_object_offset_t obj_start, obj_end, obj_offset;
7381 kern_return_t kr = KERN_SUCCESS;
7382
7383 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7384
7385 dwp_start = dwp = NULL;
7386
7387 subupl_size = size;
7388 *empty = FALSE;
7389
7390 if (upl == UPL_NULL) {
7391 return KERN_INVALID_ARGUMENT;
7392 }
7393
7394 dw_count = 0;
7395 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7396 dwp_start = vm_page_delayed_work_get_ctx();
7397 if (dwp_start == NULL) {
7398 dwp_start = &dw_array;
7399 dw_limit = 1;
7400 dwp_finish_ctx = FALSE;
7401 }
7402
7403 dwp = dwp_start;
7404
7405 if (count == 0) {
7406 page_list = NULL;
7407 }
7408
7409 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7410 vector_upl = upl;
7411 upl_lock(vector_upl);
7412 } else {
7413 upl_lock(upl);
7414 }
7415
7416 process_upl_to_commit:
7417
7418 if (isVectorUPL) {
7419 size = subupl_size;
7420 offset = subupl_offset;
7421 if (size == 0) {
7422 upl_unlock(vector_upl);
7423 kr = KERN_SUCCESS;
7424 goto done;
7425 }
7426 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7427 if (upl == NULL) {
7428 upl_unlock(vector_upl);
7429 kr = KERN_FAILURE;
7430 goto done;
7431 }
7432 page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
7433 subupl_size -= size;
7434 subupl_offset += size;
7435 }
7436
7437 #if UPL_DEBUG
7438 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7439 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7440
7441 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7442 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7443
7444 upl->upl_commit_index++;
7445 }
7446 #endif
7447 if (upl->flags & UPL_DEVICE_MEMORY) {
7448 xfer_size = 0;
7449 } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7450 xfer_size = size;
7451 } else {
7452 if (!isVectorUPL) {
7453 upl_unlock(upl);
7454 } else {
7455 upl_unlock(vector_upl);
7456 }
7457 DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7458 kr = KERN_FAILURE;
7459 goto done;
7460 }
7461 if (upl->flags & UPL_SET_DIRTY) {
7462 flags |= UPL_COMMIT_SET_DIRTY;
7463 }
7464 if (upl->flags & UPL_CLEAR_DIRTY) {
7465 flags |= UPL_COMMIT_CLEAR_DIRTY;
7466 }
7467
7468 if (upl->flags & UPL_INTERNAL) {
7469 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
7470 + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));
7471 } else {
7472 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
7473 }
7474
7475 object = upl->map_object;
7476
7477 if (upl->flags & UPL_SHADOWED) {
7478 vm_object_lock(object);
7479 shadow_object = object->shadow;
7480 } else {
7481 shadow_object = object;
7482 }
7483 entry = offset / PAGE_SIZE;
7484 target_offset = (vm_object_offset_t)offset;
7485
7486 if (upl->flags & UPL_KERNEL_OBJECT) {
7487 vm_object_lock_shared(shadow_object);
7488 } else {
7489 vm_object_lock(shadow_object);
7490 }
7491
7492 VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7493
7494 if (upl->flags & UPL_ACCESS_BLOCKED) {
7495 assert(shadow_object->blocked_access);
7496 shadow_object->blocked_access = FALSE;
7497 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7498 }
7499
7500 if (shadow_object->code_signed) {
7501 /*
7502 * CODE SIGNING:
7503 * If the object is code-signed, do not let this UPL tell
7504 * us if the pages are valid or not. Let the pages be
7505 * validated by VM the normal way (when they get mapped or
7506 * copied).
7507 */
7508 flags &= ~UPL_COMMIT_CS_VALIDATED;
7509 }
7510 if (!page_list) {
7511 /*
7512 * No page list to get the code-signing info from !?
7513 */
7514 flags &= ~UPL_COMMIT_CS_VALIDATED;
7515 }
7516 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7517 should_be_throttled = TRUE;
7518 }
7519
7520 if ((upl->flags & UPL_IO_WIRE) &&
7521 !(flags & UPL_COMMIT_FREE_ABSENT) &&
7522 !isVectorUPL &&
7523 shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7524 shadow_object->purgable != VM_PURGABLE_EMPTY) {
7525 if (!vm_page_queue_empty(&shadow_object->memq)) {
7526 if (size == shadow_object->vo_size) {
7527 nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7528 fast_path_full_commit = 1;
7529 }
7530 fast_path_possible = 1;
7531
7532 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7533 (shadow_object->purgable == VM_PURGABLE_DENY ||
7534 shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7535 shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7536 throttle_page = 1;
7537 }
7538 }
7539 }
7540 first_local = VM_PAGE_NULL;
7541 last_local = VM_PAGE_NULL;
7542
7543 obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7544 obj_end = obj_start + xfer_size;
7545 obj_start = vm_object_trunc_page(obj_start);
7546 obj_end = vm_object_round_page(obj_end);
7547 for (obj_offset = obj_start;
7548 obj_offset < obj_end;
7549 obj_offset += PAGE_SIZE) {
7550 vm_page_t t, m;
7551
7552 dwp->dw_mask = 0;
7553 clear_refmod = 0;
7554
7555 m = VM_PAGE_NULL;
7556
7557 if (upl->flags & UPL_LITE) {
7558 unsigned int pg_num;
7559
7560 if (nxt_page != VM_PAGE_NULL) {
7561 m = nxt_page;
7562 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7563 target_offset = m->vmp_offset;
7564 }
7565 pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7566 assert(pg_num == target_offset / PAGE_SIZE);
7567
7568 if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
7569 lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
7570
7571 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7572 m = vm_page_lookup(shadow_object, obj_offset);
7573 }
7574 } else {
7575 m = NULL;
7576 }
7577 }
7578 if (upl->flags & UPL_SHADOWED) {
7579 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7580 t->vmp_free_when_done = FALSE;
7581
7582 VM_PAGE_FREE(t);
7583
7584 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7585 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7586 }
7587 }
7588 }
7589 if (m == VM_PAGE_NULL) {
7590 goto commit_next_page;
7591 }
7592
7593 m_object = VM_PAGE_OBJECT(m);
7594
7595 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7596 assert(m->vmp_busy);
7597
7598 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7599 goto commit_next_page;
7600 }
7601
7602 if (flags & UPL_COMMIT_CS_VALIDATED) {
7603 /*
7604 * CODE SIGNING:
7605 * Set the code signing bits according to
7606 * what the UPL says they should be.
7607 */
7608 m->vmp_cs_validated |= page_list[entry].cs_validated;
7609 m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7610 m->vmp_cs_nx |= page_list[entry].cs_nx;
7611 }
7612 if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7613 m->vmp_written_by_kernel = TRUE;
7614 }
7615
7616 if (upl->flags & UPL_IO_WIRE) {
7617 if (page_list) {
7618 page_list[entry].phys_addr = 0;
7619 }
7620
7621 if (flags & UPL_COMMIT_SET_DIRTY) {
7622 SET_PAGE_DIRTY(m, FALSE);
7623 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7624 m->vmp_dirty = FALSE;
7625
7626 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7627 m->vmp_cs_validated &&
7628 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7629 /*
7630 * CODE SIGNING:
7631 * This page is no longer dirty
7632 * but could have been modified,
7633 * so it will need to be
7634 * re-validated.
7635 */
7636 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7637
7638 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7639
7640 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7641 }
7642 clear_refmod |= VM_MEM_MODIFIED;
7643 }
7644 if (upl->flags & UPL_ACCESS_BLOCKED) {
7645 /*
7646 * We blocked access to the pages in this UPL.
7647 * Clear the "busy" bit and wake up any waiter
7648 * for this page.
7649 */
7650 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7651 }
7652 if (fast_path_possible) {
7653 assert(m_object->purgable != VM_PURGABLE_EMPTY);
7654 assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7655 if (m->vmp_absent) {
7656 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7657 assert(m->vmp_wire_count == 0);
7658 assert(m->vmp_busy);
7659
7660 m->vmp_absent = FALSE;
7661 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7662 } else {
7663 if (m->vmp_wire_count == 0) {
7664 panic("wire_count == 0, m = %p, obj = %p", m, shadow_object);
7665 }
7666 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7667
7668 /*
7669 * XXX FBDP need to update some other
7670 * counters here (purgeable_wired_count)
7671 * (ledgers), ...
7672 */
7673 assert(m->vmp_wire_count > 0);
7674 m->vmp_wire_count--;
7675
7676 if (m->vmp_wire_count == 0) {
7677 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7678 unwired_count++;
7679 }
7680 }
7681 if (m->vmp_wire_count == 0) {
7682 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7683
7684 if (last_local == VM_PAGE_NULL) {
7685 assert(first_local == VM_PAGE_NULL);
7686
7687 last_local = m;
7688 first_local = m;
7689 } else {
7690 assert(first_local != VM_PAGE_NULL);
7691
7692 m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7693 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7694 first_local = m;
7695 }
7696 local_queue_count++;
7697
7698 if (throttle_page) {
7699 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7700 } else {
7701 if (flags & UPL_COMMIT_INACTIVATE) {
7702 if (shadow_object->internal) {
7703 m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7704 } else {
7705 m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7706 }
7707 } else {
7708 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7709 }
7710 }
7711 }
7712 } else {
7713 if (flags & UPL_COMMIT_INACTIVATE) {
7714 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7715 clear_refmod |= VM_MEM_REFERENCED;
7716 }
7717 if (m->vmp_absent) {
7718 if (flags & UPL_COMMIT_FREE_ABSENT) {
7719 dwp->dw_mask |= DW_vm_page_free;
7720 } else {
7721 m->vmp_absent = FALSE;
7722 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7723
7724 if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7725 dwp->dw_mask |= DW_vm_page_activate;
7726 }
7727 }
7728 } else {
7729 dwp->dw_mask |= DW_vm_page_unwire;
7730 }
7731 }
7732 goto commit_next_page;
7733 }
7734 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7735
7736 if (page_list) {
7737 page_list[entry].phys_addr = 0;
7738 }
7739
7740 /*
7741 * make sure to clear the hardware
7742 * modify or reference bits before
7743 * releasing the BUSY bit on this page
7744 * otherwise we risk losing a legitimate
7745 * change of state
7746 */
7747 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7748 m->vmp_dirty = FALSE;
7749
7750 clear_refmod |= VM_MEM_MODIFIED;
7751 }
7752 if (m->vmp_laundry) {
7753 dwp->dw_mask |= DW_vm_pageout_throttle_up;
7754 }
7755
7756 if (VM_PAGE_WIRED(m)) {
7757 m->vmp_free_when_done = FALSE;
7758 }
7759
7760 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7761 m->vmp_cs_validated &&
7762 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7763 /*
7764 * CODE SIGNING:
7765 * This page is no longer dirty
7766 * but could have been modified,
7767 * so it will need to be
7768 * re-validated.
7769 */
7770 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7771
7772 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7773
7774 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7775 }
7776 if (m->vmp_overwriting) {
7777 /*
7778 * the (COPY_OUT_FROM == FALSE) request_page_list case
7779 */
7780 if (m->vmp_busy) {
7781 #if CONFIG_PHANTOM_CACHE
7782 if (m->vmp_absent && !m_object->internal) {
7783 dwp->dw_mask |= DW_vm_phantom_cache_update;
7784 }
7785 #endif
7786 m->vmp_absent = FALSE;
7787
7788 dwp->dw_mask |= DW_clear_busy;
7789 } else {
7790 /*
7791 * alternate (COPY_OUT_FROM == FALSE) page_list case
7792 * Occurs when the original page was wired
7793 * at the time of the list request
7794 */
7795 assert(VM_PAGE_WIRED(m));
7796
7797 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
7798 }
7799 m->vmp_overwriting = FALSE;
7800 }
7801 m->vmp_cleaning = FALSE;
7802
7803 if (m->vmp_free_when_done) {
7804 /*
7805 * With the clean queue enabled, UPL_PAGEOUT should
7806 * no longer set the pageout bit. Its pages now go
7807 * to the clean queue.
7808 *
7809 * We don't use the cleaned Q anymore and so this
7810 * assert isn't correct. The code for the clean Q
7811 * still exists and might be used in the future. If we
7812 * go back to the cleaned Q, we will re-enable this
7813 * assert.
7814 *
7815 * assert(!(upl->flags & UPL_PAGEOUT));
7816 */
7817 assert(!m_object->internal);
7818
7819 m->vmp_free_when_done = FALSE;
7820
7821 if ((flags & UPL_COMMIT_SET_DIRTY) ||
7822 (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
7823 /*
7824 * page was re-dirtied after we started
7825 * the pageout... reactivate it since
7826 * we don't know whether the on-disk
7827 * copy matches what is now in memory
7828 */
7829 SET_PAGE_DIRTY(m, FALSE);
7830
7831 dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
7832
7833 if (upl->flags & UPL_PAGEOUT) {
7834 counter_inc(&vm_statistics_reactivations);
7835 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
7836 }
7837 } else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) {
7838 /*
7839 * Someone else might still be handling this
7840 * page (vm_fault() for example), so let's not
7841 * free it or "un-busy" it!
7842 * Put that page in the "speculative" queue
7843 * for now (since we would otherwise have freed
7844 * it) and let whoever is keeping the page
7845 * "busy" move it if needed when they're done
7846 * with it.
7847 */
7848 dwp->dw_mask |= DW_vm_page_speculate;
7849 } else {
7850 /*
7851 * page has been successfully cleaned
7852 * go ahead and free it for other use
7853 */
7854 if (m_object->internal) {
7855 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
7856 } else {
7857 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
7858 }
7859 m->vmp_dirty = FALSE;
7860 if (!(upl->flags & UPL_HAS_BUSY)) {
7861 assert(!m->vmp_busy);
7862 }
7863 m->vmp_busy = TRUE;
7864
7865 dwp->dw_mask |= DW_vm_page_free;
7866 }
7867 goto commit_next_page;
7868 }
7869 /*
7870 * It is a part of the semantic of COPYOUT_FROM
7871 * UPLs that a commit implies cache sync
7872 * between the vm page and the backing store
7873 * this can be used to strip the precious bit
7874 * as well as clean
7875 */
7876 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
7877 m->vmp_precious = FALSE;
7878 }
7879
7880 if (flags & UPL_COMMIT_SET_DIRTY) {
7881 SET_PAGE_DIRTY(m, FALSE);
7882 } else {
7883 m->vmp_dirty = FALSE;
7884 }
7885
7886 /* with the clean queue on, move *all* cleaned pages to the clean queue */
7887 if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
7888 pgpgout_count++;
7889
7890 counter_inc(&vm_statistics_pageouts);
7891 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
7892
7893 dwp->dw_mask |= DW_enqueue_cleaned;
7894 } else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
7895 /*
7896 * page coming back in from being 'frozen'...
7897 * it was dirty before it was frozen, so keep it so
7898 * the vm_page_activate will notice that it really belongs
7899 * on the throttle queue and put it there
7900 */
7901 SET_PAGE_DIRTY(m, FALSE);
7902 dwp->dw_mask |= DW_vm_page_activate;
7903 } else {
7904 if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
7905 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7906 clear_refmod |= VM_MEM_REFERENCED;
7907 } else if (!VM_PAGE_PAGEABLE(m)) {
7908 if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
7909 dwp->dw_mask |= DW_vm_page_speculate;
7910 } else if (m->vmp_reference) {
7911 dwp->dw_mask |= DW_vm_page_activate;
7912 } else {
7913 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7914 clear_refmod |= VM_MEM_REFERENCED;
7915 }
7916 }
7917 }
7918 if (upl->flags & UPL_ACCESS_BLOCKED) {
7919 /*
7920 * We blocked access to the pages in this URL.
7921 * Clear the "busy" bit on this page before we
7922 * wake up any waiter.
7923 */
7924 dwp->dw_mask |= DW_clear_busy;
7925 }
7926 /*
7927 * Wakeup any thread waiting for the page to be un-cleaning.
7928 */
7929 dwp->dw_mask |= DW_PAGE_WAKEUP;
7930
7931 commit_next_page:
7932 if (clear_refmod) {
7933 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
7934 }
7935
7936 target_offset += PAGE_SIZE_64;
7937 xfer_size -= PAGE_SIZE;
7938 entry++;
7939
7940 if (dwp->dw_mask) {
7941 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
7942 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
7943
7944 if (dw_count >= dw_limit) {
7945 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
7946
7947 dwp = dwp_start;
7948 dw_count = 0;
7949 }
7950 } else {
7951 if (dwp->dw_mask & DW_clear_busy) {
7952 m->vmp_busy = FALSE;
7953 }
7954
7955 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
7956 PAGE_WAKEUP(m);
7957 }
7958 }
7959 }
7960 }
7961 if (dw_count) {
7962 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
7963 dwp = dwp_start;
7964 dw_count = 0;
7965 }
7966
7967 if (fast_path_possible) {
7968 assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
7969 assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
7970
7971 if (local_queue_count || unwired_count) {
7972 if (local_queue_count) {
7973 vm_page_t first_target;
7974 vm_page_queue_head_t *target_queue;
7975
7976 if (throttle_page) {
7977 target_queue = &vm_page_queue_throttled;
7978 } else {
7979 if (flags & UPL_COMMIT_INACTIVATE) {
7980 if (shadow_object->internal) {
7981 target_queue = &vm_page_queue_anonymous;
7982 } else {
7983 target_queue = &vm_page_queue_inactive;
7984 }
7985 } else {
7986 target_queue = &vm_page_queue_active;
7987 }
7988 }
7989 /*
7990 * Transfer the entire local queue to a regular LRU page queues.
7991 */
7992 vm_page_lockspin_queues();
7993
7994 first_target = (vm_page_t) vm_page_queue_first(target_queue);
7995
7996 if (vm_page_queue_empty(target_queue)) {
7997 target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
7998 } else {
7999 first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8000 }
8001
8002 target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
8003 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
8004 last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
8005
8006 /*
8007 * Adjust the global page counts.
8008 */
8009 if (throttle_page) {
8010 vm_page_throttled_count += local_queue_count;
8011 } else {
8012 if (flags & UPL_COMMIT_INACTIVATE) {
8013 if (shadow_object->internal) {
8014 vm_page_anonymous_count += local_queue_count;
8015 }
8016 vm_page_inactive_count += local_queue_count;
8017
8018 token_new_pagecount += local_queue_count;
8019 } else {
8020 vm_page_active_count += local_queue_count;
8021 }
8022
8023 if (shadow_object->internal) {
8024 vm_page_pageable_internal_count += local_queue_count;
8025 } else {
8026 vm_page_pageable_external_count += local_queue_count;
8027 }
8028 }
8029 } else {
8030 vm_page_lockspin_queues();
8031 }
8032 if (unwired_count) {
8033 vm_page_wire_count -= unwired_count;
8034 VM_CHECK_MEMORYSTATUS;
8035 }
8036 vm_page_unlock_queues();
8037
8038 VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
8039 }
8040 }
8041 occupied = 1;
8042
8043 if (upl->flags & UPL_DEVICE_MEMORY) {
8044 occupied = 0;
8045 } else if (upl->flags & UPL_LITE) {
8046 int pg_num;
8047 int i;
8048
8049 occupied = 0;
8050
8051 if (!fast_path_full_commit) {
8052 pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8053 pg_num = (pg_num + 31) >> 5;
8054
8055 for (i = 0; i < pg_num; i++) {
8056 if (lite_list[i] != 0) {
8057 occupied = 1;
8058 break;
8059 }
8060 }
8061 }
8062 } else {
8063 if (vm_page_queue_empty(&upl->map_object->memq)) {
8064 occupied = 0;
8065 }
8066 }
8067 if (occupied == 0) {
8068 /*
8069 * If this UPL element belongs to a Vector UPL and is
8070 * empty, then this is the right function to deallocate
8071 * it. So go ahead set the *empty variable. The flag
8072 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8073 * should be considered relevant for the Vector UPL and not
8074 * the internal UPLs.
8075 */
8076 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8077 *empty = TRUE;
8078 }
8079
8080 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8081 /*
8082 * this is not a paging object
8083 * so we need to drop the paging reference
8084 * that was taken when we created the UPL
8085 * against this object
8086 */
8087 vm_object_activity_end(shadow_object);
8088 vm_object_collapse(shadow_object, 0, TRUE);
8089 } else {
8090 /*
8091 * we dontated the paging reference to
8092 * the map object... vm_pageout_object_terminate
8093 * will drop this reference
8094 */
8095 }
8096 }
8097 VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
8098 vm_object_unlock(shadow_object);
8099 if (object != shadow_object) {
8100 vm_object_unlock(object);
8101 }
8102
8103 if (!isVectorUPL) {
8104 upl_unlock(upl);
8105 } else {
8106 /*
8107 * If we completed our operations on an UPL that is
8108 * part of a Vectored UPL and if empty is TRUE, then
8109 * we should go ahead and deallocate this UPL element.
8110 * Then we check if this was the last of the UPL elements
8111 * within that Vectored UPL. If so, set empty to TRUE
8112 * so that in ubc_upl_commit_range or ubc_upl_commit, we
8113 * can go ahead and deallocate the Vector UPL too.
8114 */
8115 if (*empty == TRUE) {
8116 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
8117 upl_deallocate(upl);
8118 }
8119 goto process_upl_to_commit;
8120 }
8121 if (pgpgout_count) {
8122 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
8123 }
8124
8125 kr = KERN_SUCCESS;
8126 done:
8127 if (dwp_start && dwp_finish_ctx) {
8128 vm_page_delayed_work_finish_ctx(dwp_start);
8129 dwp_start = dwp = NULL;
8130 }
8131
8132 return kr;
8133 }
8134
8135 kern_return_t
upl_abort_range(upl_t upl,upl_offset_t offset,upl_size_t size,int error,boolean_t * empty)8136 upl_abort_range(
8137 upl_t upl,
8138 upl_offset_t offset,
8139 upl_size_t size,
8140 int error,
8141 boolean_t *empty)
8142 {
8143 upl_page_info_t *user_page_list = NULL;
8144 upl_size_t xfer_size, subupl_size;
8145 vm_object_t shadow_object;
8146 vm_object_t object;
8147 vm_object_offset_t target_offset;
8148 upl_offset_t subupl_offset = offset;
8149 int entry;
8150 wpl_array_t lite_list;
8151 int occupied;
8152 struct vm_page_delayed_work dw_array;
8153 struct vm_page_delayed_work *dwp, *dwp_start;
8154 bool dwp_finish_ctx = TRUE;
8155 int dw_count;
8156 int dw_limit;
8157 int isVectorUPL = 0;
8158 upl_t vector_upl = NULL;
8159 vm_object_offset_t obj_start, obj_end, obj_offset;
8160 kern_return_t kr = KERN_SUCCESS;
8161
8162 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);
8163
8164 dwp_start = dwp = NULL;
8165
8166 subupl_size = size;
8167 *empty = FALSE;
8168
8169 if (upl == UPL_NULL) {
8170 return KERN_INVALID_ARGUMENT;
8171 }
8172
8173 if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
8174 return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
8175 }
8176
8177 dw_count = 0;
8178 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8179 dwp_start = vm_page_delayed_work_get_ctx();
8180 if (dwp_start == NULL) {
8181 dwp_start = &dw_array;
8182 dw_limit = 1;
8183 dwp_finish_ctx = FALSE;
8184 }
8185
8186 dwp = dwp_start;
8187
8188 if ((isVectorUPL = vector_upl_is_valid(upl))) {
8189 vector_upl = upl;
8190 upl_lock(vector_upl);
8191 } else {
8192 upl_lock(upl);
8193 }
8194
8195 process_upl_to_abort:
8196 if (isVectorUPL) {
8197 size = subupl_size;
8198 offset = subupl_offset;
8199 if (size == 0) {
8200 upl_unlock(vector_upl);
8201 kr = KERN_SUCCESS;
8202 goto done;
8203 }
8204 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
8205 if (upl == NULL) {
8206 upl_unlock(vector_upl);
8207 kr = KERN_FAILURE;
8208 goto done;
8209 }
8210 subupl_size -= size;
8211 subupl_offset += size;
8212 }
8213
8214 *empty = FALSE;
8215
8216 #if UPL_DEBUG
8217 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
8218 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
8219
8220 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
8221 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
8222 upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
8223
8224 upl->upl_commit_index++;
8225 }
8226 #endif
8227 if (upl->flags & UPL_DEVICE_MEMORY) {
8228 xfer_size = 0;
8229 } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
8230 xfer_size = size;
8231 } else {
8232 if (!isVectorUPL) {
8233 upl_unlock(upl);
8234 } else {
8235 upl_unlock(vector_upl);
8236 }
8237 DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
8238 kr = KERN_FAILURE;
8239 goto done;
8240 }
8241 if (upl->flags & UPL_INTERNAL) {
8242 lite_list = (wpl_array_t)
8243 ((((uintptr_t)upl) + sizeof(struct upl))
8244 + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));
8245
8246 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
8247 } else {
8248 lite_list = (wpl_array_t)
8249 (((uintptr_t)upl) + sizeof(struct upl));
8250 }
8251 object = upl->map_object;
8252
8253 if (upl->flags & UPL_SHADOWED) {
8254 vm_object_lock(object);
8255 shadow_object = object->shadow;
8256 } else {
8257 shadow_object = object;
8258 }
8259
8260 entry = offset / PAGE_SIZE;
8261 target_offset = (vm_object_offset_t)offset;
8262
8263 if (upl->flags & UPL_KERNEL_OBJECT) {
8264 vm_object_lock_shared(shadow_object);
8265 } else {
8266 vm_object_lock(shadow_object);
8267 }
8268
8269 if (upl->flags & UPL_ACCESS_BLOCKED) {
8270 assert(shadow_object->blocked_access);
8271 shadow_object->blocked_access = FALSE;
8272 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
8273 }
8274
8275 if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
8276 panic("upl_abort_range: kernel_object being DUMPED");
8277 }
8278
8279 obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
8280 obj_end = obj_start + xfer_size;
8281 obj_start = vm_object_trunc_page(obj_start);
8282 obj_end = vm_object_round_page(obj_end);
8283 for (obj_offset = obj_start;
8284 obj_offset < obj_end;
8285 obj_offset += PAGE_SIZE) {
8286 vm_page_t t, m;
8287 unsigned int pg_num;
8288 boolean_t needed;
8289
8290 pg_num = (unsigned int) (target_offset / PAGE_SIZE);
8291 assert(pg_num == target_offset / PAGE_SIZE);
8292
8293 needed = FALSE;
8294
8295 if (user_page_list) {
8296 needed = user_page_list[pg_num].needed;
8297 }
8298
8299 dwp->dw_mask = 0;
8300 m = VM_PAGE_NULL;
8301
8302 if (upl->flags & UPL_LITE) {
8303 if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
8304 lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
8305
8306 if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8307 m = vm_page_lookup(shadow_object, obj_offset);
8308 }
8309 }
8310 }
8311 if (upl->flags & UPL_SHADOWED) {
8312 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
8313 t->vmp_free_when_done = FALSE;
8314
8315 VM_PAGE_FREE(t);
8316
8317 if (m == VM_PAGE_NULL) {
8318 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
8319 }
8320 }
8321 }
8322 if ((upl->flags & UPL_KERNEL_OBJECT)) {
8323 goto abort_next_page;
8324 }
8325
8326 if (m != VM_PAGE_NULL) {
8327 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
8328
8329 if (m->vmp_absent) {
8330 boolean_t must_free = TRUE;
8331
8332 /*
8333 * COPYOUT = FALSE case
8334 * check for error conditions which must
8335 * be passed back to the pages customer
8336 */
8337 if (error & UPL_ABORT_RESTART) {
8338 m->vmp_restart = TRUE;
8339 m->vmp_absent = FALSE;
8340 m->vmp_unusual = TRUE;
8341 must_free = FALSE;
8342 } else if (error & UPL_ABORT_UNAVAILABLE) {
8343 m->vmp_restart = FALSE;
8344 m->vmp_unusual = TRUE;
8345 must_free = FALSE;
8346 } else if (error & UPL_ABORT_ERROR) {
8347 m->vmp_restart = FALSE;
8348 m->vmp_absent = FALSE;
8349 m->vmp_error = TRUE;
8350 m->vmp_unusual = TRUE;
8351 must_free = FALSE;
8352 }
8353 if (m->vmp_clustered && needed == FALSE) {
8354 /*
8355 * This page was a part of a speculative
8356 * read-ahead initiated by the kernel
8357 * itself. No one is expecting this
8358 * page and no one will clean up its
8359 * error state if it ever becomes valid
8360 * in the future.
8361 * We have to free it here.
8362 */
8363 must_free = TRUE;
8364 }
8365 m->vmp_cleaning = FALSE;
8366
8367 if (m->vmp_overwriting && !m->vmp_busy) {
8368 /*
8369 * this shouldn't happen since
8370 * this is an 'absent' page, but
8371 * it doesn't hurt to check for
8372 * the 'alternate' method of
8373 * stabilizing the page...
8374 * we will mark 'busy' to be cleared
8375 * in the following code which will
8376 * take care of the primary stabilzation
8377 * method (i.e. setting 'busy' to TRUE)
8378 */
8379 dwp->dw_mask |= DW_vm_page_unwire;
8380 }
8381 m->vmp_overwriting = FALSE;
8382
8383 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
8384
8385 if (must_free == TRUE) {
8386 dwp->dw_mask |= DW_vm_page_free;
8387 } else {
8388 dwp->dw_mask |= DW_vm_page_activate;
8389 }
8390 } else {
8391 /*
8392 * Handle the trusted pager throttle.
8393 */
8394 if (m->vmp_laundry) {
8395 dwp->dw_mask |= DW_vm_pageout_throttle_up;
8396 }
8397
8398 if (upl->flags & UPL_ACCESS_BLOCKED) {
8399 /*
8400 * We blocked access to the pages in this UPL.
8401 * Clear the "busy" bit and wake up any waiter
8402 * for this page.
8403 */
8404 dwp->dw_mask |= DW_clear_busy;
8405 }
8406 if (m->vmp_overwriting) {
8407 if (m->vmp_busy) {
8408 dwp->dw_mask |= DW_clear_busy;
8409 } else {
8410 /*
8411 * deal with the 'alternate' method
8412 * of stabilizing the page...
8413 * we will either free the page
8414 * or mark 'busy' to be cleared
8415 * in the following code which will
8416 * take care of the primary stabilzation
8417 * method (i.e. setting 'busy' to TRUE)
8418 */
8419 dwp->dw_mask |= DW_vm_page_unwire;
8420 }
8421 m->vmp_overwriting = FALSE;
8422 }
8423 m->vmp_free_when_done = FALSE;
8424 m->vmp_cleaning = FALSE;
8425
8426 if (error & UPL_ABORT_DUMP_PAGES) {
8427 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8428
8429 dwp->dw_mask |= DW_vm_page_free;
8430 } else {
8431 if (!(dwp->dw_mask & DW_vm_page_unwire)) {
8432 if (error & UPL_ABORT_REFERENCE) {
8433 /*
8434 * we've been told to explictly
8435 * reference this page... for
8436 * file I/O, this is done by
8437 * implementing an LRU on the inactive q
8438 */
8439 dwp->dw_mask |= DW_vm_page_lru;
8440 } else if (!VM_PAGE_PAGEABLE(m)) {
8441 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8442 }
8443 }
8444 dwp->dw_mask |= DW_PAGE_WAKEUP;
8445 }
8446 }
8447 }
8448 abort_next_page:
8449 target_offset += PAGE_SIZE_64;
8450 xfer_size -= PAGE_SIZE;
8451 entry++;
8452
8453 if (dwp->dw_mask) {
8454 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8455 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8456
8457 if (dw_count >= dw_limit) {
8458 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8459
8460 dwp = dwp_start;
8461 dw_count = 0;
8462 }
8463 } else {
8464 if (dwp->dw_mask & DW_clear_busy) {
8465 m->vmp_busy = FALSE;
8466 }
8467
8468 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8469 PAGE_WAKEUP(m);
8470 }
8471 }
8472 }
8473 }
8474 if (dw_count) {
8475 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8476 dwp = dwp_start;
8477 dw_count = 0;
8478 }
8479
8480 occupied = 1;
8481
8482 if (upl->flags & UPL_DEVICE_MEMORY) {
8483 occupied = 0;
8484 } else if (upl->flags & UPL_LITE) {
8485 int pg_num;
8486 int i;
8487
8488 pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8489 pg_num = (pg_num + 31) >> 5;
8490 occupied = 0;
8491
8492 for (i = 0; i < pg_num; i++) {
8493 if (lite_list[i] != 0) {
8494 occupied = 1;
8495 break;
8496 }
8497 }
8498 } else {
8499 if (vm_page_queue_empty(&upl->map_object->memq)) {
8500 occupied = 0;
8501 }
8502 }
8503 if (occupied == 0) {
8504 /*
8505 * If this UPL element belongs to a Vector UPL and is
8506 * empty, then this is the right function to deallocate
8507 * it. So go ahead set the *empty variable. The flag
8508 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8509 * should be considered relevant for the Vector UPL and
8510 * not the internal UPLs.
8511 */
8512 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8513 *empty = TRUE;
8514 }
8515
8516 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8517 /*
8518 * this is not a paging object
8519 * so we need to drop the paging reference
8520 * that was taken when we created the UPL
8521 * against this object
8522 */
8523 vm_object_activity_end(shadow_object);
8524 vm_object_collapse(shadow_object, 0, TRUE);
8525 } else {
8526 /*
8527 * we dontated the paging reference to
8528 * the map object... vm_pageout_object_terminate
8529 * will drop this reference
8530 */
8531 }
8532 }
8533 vm_object_unlock(shadow_object);
8534 if (object != shadow_object) {
8535 vm_object_unlock(object);
8536 }
8537
8538 if (!isVectorUPL) {
8539 upl_unlock(upl);
8540 } else {
8541 /*
8542 * If we completed our operations on an UPL that is
8543 * part of a Vectored UPL and if empty is TRUE, then
8544 * we should go ahead and deallocate this UPL element.
8545 * Then we check if this was the last of the UPL elements
8546 * within that Vectored UPL. If so, set empty to TRUE
8547 * so that in ubc_upl_abort_range or ubc_upl_abort, we
8548 * can go ahead and deallocate the Vector UPL too.
8549 */
8550 if (*empty == TRUE) {
8551 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
8552 upl_deallocate(upl);
8553 }
8554 goto process_upl_to_abort;
8555 }
8556
8557 kr = KERN_SUCCESS;
8558
8559 done:
8560 if (dwp_start && dwp_finish_ctx) {
8561 vm_page_delayed_work_finish_ctx(dwp_start);
8562 dwp_start = dwp = NULL;
8563 }
8564
8565 return kr;
8566 }
8567
8568
8569 kern_return_t
upl_abort(upl_t upl,int error)8570 upl_abort(
8571 upl_t upl,
8572 int error)
8573 {
8574 boolean_t empty;
8575
8576 if (upl == UPL_NULL) {
8577 return KERN_INVALID_ARGUMENT;
8578 }
8579
8580 return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8581 }
8582
8583
8584 /* an option on commit should be wire */
8585 kern_return_t
upl_commit(upl_t upl,upl_page_info_t * page_list,mach_msg_type_number_t count)8586 upl_commit(
8587 upl_t upl,
8588 upl_page_info_t *page_list,
8589 mach_msg_type_number_t count)
8590 {
8591 boolean_t empty;
8592
8593 if (upl == UPL_NULL) {
8594 return KERN_INVALID_ARGUMENT;
8595 }
8596
8597 return upl_commit_range(upl, 0, upl->u_size, 0,
8598 page_list, count, &empty);
8599 }
8600
8601
8602 void
iopl_valid_data(upl_t upl,vm_tag_t tag)8603 iopl_valid_data(
8604 upl_t upl,
8605 vm_tag_t tag)
8606 {
8607 vm_object_t object;
8608 vm_offset_t offset;
8609 vm_page_t m, nxt_page = VM_PAGE_NULL;
8610 upl_size_t size;
8611 int wired_count = 0;
8612
8613 if (upl == NULL) {
8614 panic("iopl_valid_data: NULL upl");
8615 }
8616 if (vector_upl_is_valid(upl)) {
8617 panic("iopl_valid_data: vector upl");
8618 }
8619 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
8620 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
8621 }
8622
8623 object = upl->map_object;
8624
8625 if (object == kernel_object || object == compressor_object) {
8626 panic("iopl_valid_data: object == kernel or compressor");
8627 }
8628
8629 if (object->purgable == VM_PURGABLE_VOLATILE ||
8630 object->purgable == VM_PURGABLE_EMPTY) {
8631 panic("iopl_valid_data: object %p purgable %d",
8632 object, object->purgable);
8633 }
8634
8635 size = upl_adjusted_size(upl, PAGE_MASK);
8636
8637 vm_object_lock(object);
8638 VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
8639
8640 bool whole_object;
8641
8642 if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
8643 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
8644 whole_object = true;
8645 } else {
8646 offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
8647 whole_object = false;
8648 }
8649
8650 while (size) {
8651 if (whole_object) {
8652 if (nxt_page != VM_PAGE_NULL) {
8653 m = nxt_page;
8654 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
8655 }
8656 } else {
8657 m = vm_page_lookup(object, offset);
8658 offset += PAGE_SIZE;
8659
8660 if (m == VM_PAGE_NULL) {
8661 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
8662 }
8663 }
8664 if (m->vmp_busy) {
8665 if (!m->vmp_absent) {
8666 panic("iopl_valid_data: busy page w/o absent");
8667 }
8668
8669 if (m->vmp_pageq.next || m->vmp_pageq.prev) {
8670 panic("iopl_valid_data: busy+absent page on page queue");
8671 }
8672 if (m->vmp_reusable) {
8673 panic("iopl_valid_data: %p is reusable", m);
8674 }
8675
8676 m->vmp_absent = FALSE;
8677 m->vmp_dirty = TRUE;
8678 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
8679 assert(m->vmp_wire_count == 0);
8680 m->vmp_wire_count++;
8681 assert(m->vmp_wire_count);
8682 if (m->vmp_wire_count == 1) {
8683 m->vmp_q_state = VM_PAGE_IS_WIRED;
8684 wired_count++;
8685 } else {
8686 panic("iopl_valid_data: %p already wired", m);
8687 }
8688
8689 PAGE_WAKEUP_DONE(m);
8690 }
8691 size -= PAGE_SIZE;
8692 }
8693 if (wired_count) {
8694 VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
8695 assert(object->resident_page_count >= object->wired_page_count);
8696
8697 /* no need to adjust purgeable accounting for this object: */
8698 assert(object->purgable != VM_PURGABLE_VOLATILE);
8699 assert(object->purgable != VM_PURGABLE_EMPTY);
8700
8701 vm_page_lockspin_queues();
8702 vm_page_wire_count += wired_count;
8703 vm_page_unlock_queues();
8704 }
8705 VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
8706 vm_object_unlock(object);
8707 }
8708
8709
8710 void
vm_object_set_pmap_cache_attr(vm_object_t object,upl_page_info_array_t user_page_list,unsigned int num_pages,boolean_t batch_pmap_op)8711 vm_object_set_pmap_cache_attr(
8712 vm_object_t object,
8713 upl_page_info_array_t user_page_list,
8714 unsigned int num_pages,
8715 boolean_t batch_pmap_op)
8716 {
8717 unsigned int cache_attr = 0;
8718
8719 cache_attr = object->wimg_bits & VM_WIMG_MASK;
8720 assert(user_page_list);
8721 if (cache_attr != VM_WIMG_USE_DEFAULT) {
8722 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8723 }
8724 }
8725
8726
8727 boolean_t vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t);
8728 kern_return_t vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t, vm_object_offset_t *, int, int*);
8729
8730
8731
8732 boolean_t
vm_object_iopl_wire_full(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,wpl_array_t lite_list,upl_control_flags_t cntrl_flags,vm_tag_t tag)8733 vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8734 wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag)
8735 {
8736 vm_page_t dst_page;
8737 unsigned int entry;
8738 int page_count;
8739 int delayed_unlock = 0;
8740 boolean_t retval = TRUE;
8741 ppnum_t phys_page;
8742
8743 vm_object_lock_assert_exclusive(object);
8744 assert(object->purgable != VM_PURGABLE_VOLATILE);
8745 assert(object->purgable != VM_PURGABLE_EMPTY);
8746 assert(object->pager == NULL);
8747 assert(object->copy == NULL);
8748 assert(object->shadow == NULL);
8749
8750 page_count = object->resident_page_count;
8751 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
8752
8753 vm_page_lock_queues();
8754
8755 while (page_count--) {
8756 if (dst_page->vmp_busy ||
8757 dst_page->vmp_fictitious ||
8758 dst_page->vmp_absent ||
8759 dst_page->vmp_error ||
8760 dst_page->vmp_cleaning ||
8761 dst_page->vmp_restart ||
8762 dst_page->vmp_laundry) {
8763 retval = FALSE;
8764 goto done;
8765 }
8766 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8767 retval = FALSE;
8768 goto done;
8769 }
8770 dst_page->vmp_reference = TRUE;
8771
8772 vm_page_wire(dst_page, tag, FALSE);
8773
8774 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8775 SET_PAGE_DIRTY(dst_page, FALSE);
8776 }
8777 entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
8778 assert(entry >= 0 && entry < object->resident_page_count);
8779 lite_list[entry >> 5] |= 1U << (entry & 31);
8780
8781 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8782
8783 if (phys_page > upl->highest_page) {
8784 upl->highest_page = phys_page;
8785 }
8786
8787 if (user_page_list) {
8788 user_page_list[entry].phys_addr = phys_page;
8789 user_page_list[entry].absent = dst_page->vmp_absent;
8790 user_page_list[entry].dirty = dst_page->vmp_dirty;
8791 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
8792 user_page_list[entry].precious = dst_page->vmp_precious;
8793 user_page_list[entry].device = FALSE;
8794 user_page_list[entry].speculative = FALSE;
8795 user_page_list[entry].cs_validated = FALSE;
8796 user_page_list[entry].cs_tainted = FALSE;
8797 user_page_list[entry].cs_nx = FALSE;
8798 user_page_list[entry].needed = FALSE;
8799 user_page_list[entry].mark = FALSE;
8800 }
8801 if (delayed_unlock++ > 256) {
8802 delayed_unlock = 0;
8803 lck_mtx_yield(&vm_page_queue_lock);
8804
8805 VM_CHECK_MEMORYSTATUS;
8806 }
8807 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
8808 }
8809 done:
8810 vm_page_unlock_queues();
8811
8812 VM_CHECK_MEMORYSTATUS;
8813
8814 return retval;
8815 }
8816
8817
8818 kern_return_t
vm_object_iopl_wire_empty(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,wpl_array_t lite_list,upl_control_flags_t cntrl_flags,vm_tag_t tag,vm_object_offset_t * dst_offset,int page_count,int * page_grab_count)8819 vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8820 wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag, vm_object_offset_t *dst_offset,
8821 int page_count, int* page_grab_count)
8822 {
8823 vm_page_t dst_page;
8824 boolean_t no_zero_fill = FALSE;
8825 int interruptible;
8826 int pages_wired = 0;
8827 int pages_inserted = 0;
8828 int entry = 0;
8829 uint64_t delayed_ledger_update = 0;
8830 kern_return_t ret = KERN_SUCCESS;
8831 int grab_options;
8832 ppnum_t phys_page;
8833
8834 vm_object_lock_assert_exclusive(object);
8835 assert(object->purgable != VM_PURGABLE_VOLATILE);
8836 assert(object->purgable != VM_PURGABLE_EMPTY);
8837 assert(object->pager == NULL);
8838 assert(object->copy == NULL);
8839 assert(object->shadow == NULL);
8840
8841 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
8842 interruptible = THREAD_ABORTSAFE;
8843 } else {
8844 interruptible = THREAD_UNINT;
8845 }
8846
8847 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8848 no_zero_fill = TRUE;
8849 }
8850
8851 grab_options = 0;
8852 #if CONFIG_SECLUDED_MEMORY
8853 if (object->can_grab_secluded) {
8854 grab_options |= VM_PAGE_GRAB_SECLUDED;
8855 }
8856 #endif /* CONFIG_SECLUDED_MEMORY */
8857
8858 while (page_count--) {
8859 while ((dst_page = vm_page_grab_options(grab_options))
8860 == VM_PAGE_NULL) {
8861 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
8862
8863 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8864
8865 if (vm_page_wait(interruptible) == FALSE) {
8866 /*
8867 * interrupted case
8868 */
8869 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8870
8871 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8872
8873 ret = MACH_SEND_INTERRUPTED;
8874 goto done;
8875 }
8876 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8877
8878 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8879 }
8880 if (no_zero_fill == FALSE) {
8881 vm_page_zero_fill(dst_page);
8882 } else {
8883 dst_page->vmp_absent = TRUE;
8884 }
8885
8886 dst_page->vmp_reference = TRUE;
8887
8888 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8889 SET_PAGE_DIRTY(dst_page, FALSE);
8890 }
8891 if (dst_page->vmp_absent == FALSE) {
8892 assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
8893 assert(dst_page->vmp_wire_count == 0);
8894 dst_page->vmp_wire_count++;
8895 dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
8896 assert(dst_page->vmp_wire_count);
8897 pages_wired++;
8898 PAGE_WAKEUP_DONE(dst_page);
8899 }
8900 pages_inserted++;
8901
8902 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
8903
8904 lite_list[entry >> 5] |= 1U << (entry & 31);
8905
8906 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8907
8908 if (phys_page > upl->highest_page) {
8909 upl->highest_page = phys_page;
8910 }
8911
8912 if (user_page_list) {
8913 user_page_list[entry].phys_addr = phys_page;
8914 user_page_list[entry].absent = dst_page->vmp_absent;
8915 user_page_list[entry].dirty = dst_page->vmp_dirty;
8916 user_page_list[entry].free_when_done = FALSE;
8917 user_page_list[entry].precious = FALSE;
8918 user_page_list[entry].device = FALSE;
8919 user_page_list[entry].speculative = FALSE;
8920 user_page_list[entry].cs_validated = FALSE;
8921 user_page_list[entry].cs_tainted = FALSE;
8922 user_page_list[entry].cs_nx = FALSE;
8923 user_page_list[entry].needed = FALSE;
8924 user_page_list[entry].mark = FALSE;
8925 }
8926 entry++;
8927 *dst_offset += PAGE_SIZE_64;
8928 }
8929 done:
8930 if (pages_wired) {
8931 vm_page_lockspin_queues();
8932 vm_page_wire_count += pages_wired;
8933 vm_page_unlock_queues();
8934 }
8935 if (pages_inserted) {
8936 if (object->internal) {
8937 OSAddAtomic(pages_inserted, &vm_page_internal_count);
8938 } else {
8939 OSAddAtomic(pages_inserted, &vm_page_external_count);
8940 }
8941 }
8942 if (delayed_ledger_update) {
8943 task_t owner;
8944 int ledger_idx_volatile;
8945 int ledger_idx_nonvolatile;
8946 int ledger_idx_volatile_compressed;
8947 int ledger_idx_nonvolatile_compressed;
8948 boolean_t do_footprint;
8949
8950 owner = VM_OBJECT_OWNER(object);
8951 assert(owner);
8952
8953 vm_object_ledger_tag_ledgers(object,
8954 &ledger_idx_volatile,
8955 &ledger_idx_nonvolatile,
8956 &ledger_idx_volatile_compressed,
8957 &ledger_idx_nonvolatile_compressed,
8958 &do_footprint);
8959
8960 /* more non-volatile bytes */
8961 ledger_credit(owner->ledger,
8962 ledger_idx_nonvolatile,
8963 delayed_ledger_update);
8964 if (do_footprint) {
8965 /* more footprint */
8966 ledger_credit(owner->ledger,
8967 task_ledgers.phys_footprint,
8968 delayed_ledger_update);
8969 }
8970 }
8971
8972 assert(page_grab_count);
8973 *page_grab_count = pages_inserted;
8974
8975 return ret;
8976 }
8977
8978
8979
8980 kern_return_t
vm_object_iopl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_t * upl_ptr,upl_page_info_array_t user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)8981 vm_object_iopl_request(
8982 vm_object_t object,
8983 vm_object_offset_t offset,
8984 upl_size_t size,
8985 upl_t *upl_ptr,
8986 upl_page_info_array_t user_page_list,
8987 unsigned int *page_list_count,
8988 upl_control_flags_t cntrl_flags,
8989 vm_tag_t tag)
8990 {
8991 vm_page_t dst_page;
8992 vm_object_offset_t dst_offset;
8993 upl_size_t xfer_size;
8994 upl_t upl = NULL;
8995 unsigned int entry;
8996 wpl_array_t lite_list = NULL;
8997 int no_zero_fill = FALSE;
8998 unsigned int size_in_pages;
8999 int page_grab_count = 0;
9000 u_int32_t psize;
9001 kern_return_t ret;
9002 vm_prot_t prot;
9003 struct vm_object_fault_info fault_info = {};
9004 struct vm_page_delayed_work dw_array;
9005 struct vm_page_delayed_work *dwp, *dwp_start;
9006 bool dwp_finish_ctx = TRUE;
9007 int dw_count;
9008 int dw_limit;
9009 int dw_index;
9010 boolean_t caller_lookup;
9011 int io_tracking_flag = 0;
9012 int interruptible;
9013 ppnum_t phys_page;
9014
9015 boolean_t set_cache_attr_needed = FALSE;
9016 boolean_t free_wired_pages = FALSE;
9017 boolean_t fast_path_empty_req = FALSE;
9018 boolean_t fast_path_full_req = FALSE;
9019
9020 #if DEVELOPMENT || DEBUG
9021 task_t task = current_task();
9022 #endif /* DEVELOPMENT || DEBUG */
9023
9024 dwp_start = dwp = NULL;
9025
9026 vm_object_offset_t original_offset = offset;
9027 upl_size_t original_size = size;
9028
9029 // DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
9030
9031 size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
9032 offset = vm_object_trunc_page(offset);
9033 if (size != original_size || offset != original_offset) {
9034 DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
9035 }
9036
9037 if (cntrl_flags & ~UPL_VALID_FLAGS) {
9038 /*
9039 * For forward compatibility's sake,
9040 * reject any unknown flag.
9041 */
9042 return KERN_INVALID_VALUE;
9043 }
9044 if (vm_lopage_needed == FALSE) {
9045 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
9046 }
9047
9048 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
9049 if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
9050 return KERN_INVALID_VALUE;
9051 }
9052
9053 if (object->phys_contiguous) {
9054 if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
9055 return KERN_INVALID_ADDRESS;
9056 }
9057
9058 if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
9059 return KERN_INVALID_ADDRESS;
9060 }
9061 }
9062 }
9063 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9064 no_zero_fill = TRUE;
9065 }
9066
9067 if (cntrl_flags & UPL_COPYOUT_FROM) {
9068 prot = VM_PROT_READ;
9069 } else {
9070 prot = VM_PROT_READ | VM_PROT_WRITE;
9071 }
9072
9073 if ((!object->internal) && (object->paging_offset != 0)) {
9074 panic("vm_object_iopl_request: external object with non-zero paging offset");
9075 }
9076
9077
9078 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
9079
9080 #if CONFIG_IOSCHED || UPL_DEBUG
9081 if ((object->io_tracking && object != kernel_object) || upl_debug_enabled) {
9082 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
9083 }
9084 #endif
9085
9086 #if CONFIG_IOSCHED
9087 if (object->io_tracking) {
9088 /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
9089 if (object != kernel_object) {
9090 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
9091 }
9092 }
9093 #endif
9094
9095 if (object->phys_contiguous) {
9096 psize = PAGE_SIZE;
9097 } else {
9098 psize = size;
9099
9100 dw_count = 0;
9101 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
9102 dwp_start = vm_page_delayed_work_get_ctx();
9103 if (dwp_start == NULL) {
9104 dwp_start = &dw_array;
9105 dw_limit = 1;
9106 dwp_finish_ctx = FALSE;
9107 }
9108
9109 dwp = dwp_start;
9110 }
9111
9112 if (cntrl_flags & UPL_SET_INTERNAL) {
9113 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9114
9115 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
9116 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
9117 ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
9118 if (size == 0) {
9119 user_page_list = NULL;
9120 lite_list = NULL;
9121 }
9122 } else {
9123 upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9124
9125 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
9126 if (size == 0) {
9127 lite_list = NULL;
9128 }
9129 }
9130 if (user_page_list) {
9131 user_page_list[0].device = FALSE;
9132 }
9133 *upl_ptr = upl;
9134
9135 if (cntrl_flags & UPL_NOZEROFILLIO) {
9136 DTRACE_VM4(upl_nozerofillio,
9137 vm_object_t, object,
9138 vm_object_offset_t, offset,
9139 upl_size_t, size,
9140 upl_t, upl);
9141 }
9142
9143 upl->map_object = object;
9144 upl->u_offset = original_offset;
9145 upl->u_size = original_size;
9146
9147 size_in_pages = size / PAGE_SIZE;
9148
9149 if (object == kernel_object &&
9150 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
9151 upl->flags |= UPL_KERNEL_OBJECT;
9152 #if UPL_DEBUG
9153 vm_object_lock(object);
9154 #else
9155 vm_object_lock_shared(object);
9156 #endif
9157 } else {
9158 vm_object_lock(object);
9159 vm_object_activity_begin(object);
9160 }
9161 /*
9162 * paging in progress also protects the paging_offset
9163 */
9164 upl->u_offset = original_offset + object->paging_offset;
9165
9166 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9167 /*
9168 * The user requested that access to the pages in this UPL
9169 * be blocked until the UPL is commited or aborted.
9170 */
9171 upl->flags |= UPL_ACCESS_BLOCKED;
9172 }
9173
9174 #if CONFIG_IOSCHED || UPL_DEBUG
9175 if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9176 vm_object_activity_begin(object);
9177 queue_enter(&object->uplq, upl, upl_t, uplq);
9178 }
9179 #endif
9180
9181 if (object->phys_contiguous) {
9182 if (upl->flags & UPL_ACCESS_BLOCKED) {
9183 assert(!object->blocked_access);
9184 object->blocked_access = TRUE;
9185 }
9186
9187 vm_object_unlock(object);
9188
9189 /*
9190 * don't need any shadow mappings for this one
9191 * since it is already I/O memory
9192 */
9193 upl->flags |= UPL_DEVICE_MEMORY;
9194
9195 upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
9196
9197 if (user_page_list) {
9198 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
9199 user_page_list[0].device = TRUE;
9200 }
9201 if (page_list_count != NULL) {
9202 if (upl->flags & UPL_INTERNAL) {
9203 *page_list_count = 0;
9204 } else {
9205 *page_list_count = 1;
9206 }
9207 }
9208
9209 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9210 #if DEVELOPMENT || DEBUG
9211 if (task != NULL) {
9212 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9213 }
9214 #endif /* DEVELOPMENT || DEBUG */
9215 return KERN_SUCCESS;
9216 }
9217 if (object != kernel_object && object != compressor_object) {
9218 /*
9219 * Protect user space from future COW operations
9220 */
9221 #if VM_OBJECT_TRACKING_OP_TRUESHARE
9222 if (!object->true_share &&
9223 vm_object_tracking_btlog) {
9224 btlog_record(vm_object_tracking_btlog, object,
9225 VM_OBJECT_TRACKING_OP_TRUESHARE,
9226 btref_get(__builtin_frame_address(0), 0));
9227 }
9228 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
9229
9230 vm_object_lock_assert_exclusive(object);
9231 object->true_share = TRUE;
9232
9233 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
9234 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
9235 }
9236 }
9237
9238 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
9239 object->copy != VM_OBJECT_NULL) {
9240 /*
9241 * Honor copy-on-write obligations
9242 *
9243 * The caller is gathering these pages and
9244 * might modify their contents. We need to
9245 * make sure that the copy object has its own
9246 * private copies of these pages before we let
9247 * the caller modify them.
9248 *
9249 * NOTE: someone else could map the original object
9250 * after we've done this copy-on-write here, and they
9251 * could then see an inconsistent picture of the memory
9252 * while it's being modified via the UPL. To prevent this,
9253 * we would have to block access to these pages until the
9254 * UPL is released. We could use the UPL_BLOCK_ACCESS
9255 * code path for that...
9256 */
9257 vm_object_update(object,
9258 offset,
9259 size,
9260 NULL,
9261 NULL,
9262 FALSE, /* should_return */
9263 MEMORY_OBJECT_COPY_SYNC,
9264 VM_PROT_NO_CHANGE);
9265 VM_PAGEOUT_DEBUG(iopl_cow, 1);
9266 VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
9267 }
9268 if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
9269 object->purgable != VM_PURGABLE_VOLATILE &&
9270 object->purgable != VM_PURGABLE_EMPTY &&
9271 object->copy == NULL &&
9272 size == object->vo_size &&
9273 offset == 0 &&
9274 object->shadow == NULL &&
9275 object->pager == NULL) {
9276 if (object->resident_page_count == size_in_pages) {
9277 assert(object != compressor_object);
9278 assert(object != kernel_object);
9279 fast_path_full_req = TRUE;
9280 } else if (object->resident_page_count == 0) {
9281 assert(object != compressor_object);
9282 assert(object != kernel_object);
9283 fast_path_empty_req = TRUE;
9284 set_cache_attr_needed = TRUE;
9285 }
9286 }
9287
9288 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9289 interruptible = THREAD_ABORTSAFE;
9290 } else {
9291 interruptible = THREAD_UNINT;
9292 }
9293
9294 entry = 0;
9295
9296 xfer_size = size;
9297 dst_offset = offset;
9298
9299 if (fast_path_full_req) {
9300 if (vm_object_iopl_wire_full(object, upl, user_page_list, lite_list, cntrl_flags, tag) == TRUE) {
9301 goto finish;
9302 }
9303 /*
9304 * we couldn't complete the processing of this request on the fast path
9305 * so fall through to the slow path and finish up
9306 */
9307 } else if (fast_path_empty_req) {
9308 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9309 ret = KERN_MEMORY_ERROR;
9310 goto return_err;
9311 }
9312 ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
9313
9314 if (ret) {
9315 free_wired_pages = TRUE;
9316 goto return_err;
9317 }
9318 goto finish;
9319 }
9320
9321 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
9322 fault_info.lo_offset = offset;
9323 fault_info.hi_offset = offset + xfer_size;
9324 fault_info.mark_zf_absent = TRUE;
9325 fault_info.interruptible = interruptible;
9326 fault_info.batch_pmap_op = TRUE;
9327
9328 while (xfer_size) {
9329 vm_fault_return_t result;
9330
9331 dwp->dw_mask = 0;
9332
9333 if (fast_path_full_req) {
9334 /*
9335 * if we get here, it means that we ran into a page
9336 * state we couldn't handle in the fast path and
9337 * bailed out to the slow path... since the order
9338 * we look at pages is different between the 2 paths,
9339 * the following check is needed to determine whether
9340 * this page was already processed in the fast path
9341 */
9342 if (lite_list[entry >> 5] & (1 << (entry & 31))) {
9343 goto skip_page;
9344 }
9345 }
9346 dst_page = vm_page_lookup(object, dst_offset);
9347
9348 if (dst_page == VM_PAGE_NULL ||
9349 dst_page->vmp_busy ||
9350 dst_page->vmp_error ||
9351 dst_page->vmp_restart ||
9352 dst_page->vmp_absent ||
9353 dst_page->vmp_fictitious) {
9354 if (object == kernel_object) {
9355 panic("vm_object_iopl_request: missing/bad page in kernel object");
9356 }
9357 if (object == compressor_object) {
9358 panic("vm_object_iopl_request: missing/bad page in compressor object");
9359 }
9360
9361 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9362 ret = KERN_MEMORY_ERROR;
9363 goto return_err;
9364 }
9365 set_cache_attr_needed = TRUE;
9366
9367 /*
9368 * We just looked up the page and the result remains valid
9369 * until the object lock is release, so send it to
9370 * vm_fault_page() (as "dst_page"), to avoid having to
9371 * look it up again there.
9372 */
9373 caller_lookup = TRUE;
9374
9375 do {
9376 vm_page_t top_page;
9377 kern_return_t error_code;
9378
9379 fault_info.cluster_size = xfer_size;
9380
9381 vm_object_paging_begin(object);
9382
9383 result = vm_fault_page(object, dst_offset,
9384 prot | VM_PROT_WRITE, FALSE,
9385 caller_lookup,
9386 &prot, &dst_page, &top_page,
9387 (int *)0,
9388 &error_code, no_zero_fill,
9389 FALSE, &fault_info);
9390
9391 /* our lookup is no longer valid at this point */
9392 caller_lookup = FALSE;
9393
9394 switch (result) {
9395 case VM_FAULT_SUCCESS:
9396 page_grab_count++;
9397
9398 if (!dst_page->vmp_absent) {
9399 PAGE_WAKEUP_DONE(dst_page);
9400 } else {
9401 /*
9402 * we only get back an absent page if we
9403 * requested that it not be zero-filled
9404 * because we are about to fill it via I/O
9405 *
9406 * absent pages should be left BUSY
9407 * to prevent them from being faulted
9408 * into an address space before we've
9409 * had a chance to complete the I/O on
9410 * them since they may contain info that
9411 * shouldn't be seen by the faulting task
9412 */
9413 }
9414 /*
9415 * Release paging references and
9416 * top-level placeholder page, if any.
9417 */
9418 if (top_page != VM_PAGE_NULL) {
9419 vm_object_t local_object;
9420
9421 local_object = VM_PAGE_OBJECT(top_page);
9422
9423 /*
9424 * comparing 2 packed pointers
9425 */
9426 if (top_page->vmp_object != dst_page->vmp_object) {
9427 vm_object_lock(local_object);
9428 VM_PAGE_FREE(top_page);
9429 vm_object_paging_end(local_object);
9430 vm_object_unlock(local_object);
9431 } else {
9432 VM_PAGE_FREE(top_page);
9433 vm_object_paging_end(local_object);
9434 }
9435 }
9436 vm_object_paging_end(object);
9437 break;
9438
9439 case VM_FAULT_RETRY:
9440 vm_object_lock(object);
9441 break;
9442
9443 case VM_FAULT_MEMORY_SHORTAGE:
9444 OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
9445
9446 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9447
9448 if (vm_page_wait(interruptible)) {
9449 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9450
9451 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9452 vm_object_lock(object);
9453
9454 break;
9455 }
9456 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9457
9458 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9459
9460 OS_FALLTHROUGH;
9461
9462 case VM_FAULT_INTERRUPTED:
9463 error_code = MACH_SEND_INTERRUPTED;
9464 OS_FALLTHROUGH;
9465 case VM_FAULT_MEMORY_ERROR:
9466 memory_error:
9467 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
9468
9469 vm_object_lock(object);
9470 goto return_err;
9471
9472 case VM_FAULT_SUCCESS_NO_VM_PAGE:
9473 /* success but no page: fail */
9474 vm_object_paging_end(object);
9475 vm_object_unlock(object);
9476 goto memory_error;
9477
9478 default:
9479 panic("vm_object_iopl_request: unexpected error"
9480 " 0x%x from vm_fault_page()\n", result);
9481 }
9482 } while (result != VM_FAULT_SUCCESS);
9483 }
9484 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9485
9486 if (upl->flags & UPL_KERNEL_OBJECT) {
9487 goto record_phys_addr;
9488 }
9489
9490 if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
9491 dst_page->vmp_busy = TRUE;
9492 goto record_phys_addr;
9493 }
9494
9495 if (dst_page->vmp_cleaning) {
9496 /*
9497 * Someone else is cleaning this page in place.
9498 * In theory, we should be able to proceed and use this
9499 * page but they'll probably end up clearing the "busy"
9500 * bit on it in upl_commit_range() but they didn't set
9501 * it, so they would clear our "busy" bit and open
9502 * us to race conditions.
9503 * We'd better wait for the cleaning to complete and
9504 * then try again.
9505 */
9506 VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
9507 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
9508 continue;
9509 }
9510 if (dst_page->vmp_laundry) {
9511 vm_pageout_steal_laundry(dst_page, FALSE);
9512 }
9513
9514 if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
9515 phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
9516 vm_page_t low_page;
9517 int refmod;
9518
9519 /*
9520 * support devices that can't DMA above 32 bits
9521 * by substituting pages from a pool of low address
9522 * memory for any pages we find above the 4G mark
9523 * can't substitute if the page is already wired because
9524 * we don't know whether that physical address has been
9525 * handed out to some other 64 bit capable DMA device to use
9526 */
9527 if (VM_PAGE_WIRED(dst_page)) {
9528 ret = KERN_PROTECTION_FAILURE;
9529 goto return_err;
9530 }
9531 low_page = vm_page_grablo();
9532
9533 if (low_page == VM_PAGE_NULL) {
9534 ret = KERN_RESOURCE_SHORTAGE;
9535 goto return_err;
9536 }
9537 /*
9538 * from here until the vm_page_replace completes
9539 * we musn't drop the object lock... we don't
9540 * want anyone refaulting this page in and using
9541 * it after we disconnect it... we want the fault
9542 * to find the new page being substituted.
9543 */
9544 if (dst_page->vmp_pmapped) {
9545 refmod = pmap_disconnect(phys_page);
9546 } else {
9547 refmod = 0;
9548 }
9549
9550 if (!dst_page->vmp_absent) {
9551 vm_page_copy(dst_page, low_page);
9552 }
9553
9554 low_page->vmp_reference = dst_page->vmp_reference;
9555 low_page->vmp_dirty = dst_page->vmp_dirty;
9556 low_page->vmp_absent = dst_page->vmp_absent;
9557
9558 if (refmod & VM_MEM_REFERENCED) {
9559 low_page->vmp_reference = TRUE;
9560 }
9561 if (refmod & VM_MEM_MODIFIED) {
9562 SET_PAGE_DIRTY(low_page, FALSE);
9563 }
9564
9565 vm_page_replace(low_page, object, dst_offset);
9566
9567 dst_page = low_page;
9568 /*
9569 * vm_page_grablo returned the page marked
9570 * BUSY... we don't need a PAGE_WAKEUP_DONE
9571 * here, because we've never dropped the object lock
9572 */
9573 if (!dst_page->vmp_absent) {
9574 dst_page->vmp_busy = FALSE;
9575 }
9576
9577 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9578 }
9579 if (!dst_page->vmp_busy) {
9580 dwp->dw_mask |= DW_vm_page_wire;
9581 }
9582
9583 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9584 /*
9585 * Mark the page "busy" to block any future page fault
9586 * on this page in addition to wiring it.
9587 * We'll also remove the mapping
9588 * of all these pages before leaving this routine.
9589 */
9590 assert(!dst_page->vmp_fictitious);
9591 dst_page->vmp_busy = TRUE;
9592 }
9593 /*
9594 * expect the page to be used
9595 * page queues lock must be held to set 'reference'
9596 */
9597 dwp->dw_mask |= DW_set_reference;
9598
9599 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9600 SET_PAGE_DIRTY(dst_page, TRUE);
9601 /*
9602 * Page belonging to a code-signed object is about to
9603 * be written. Mark it tainted and disconnect it from
9604 * all pmaps so processes have to fault it back in and
9605 * deal with the tainted bit.
9606 */
9607 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
9608 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
9609 vm_page_iopl_tainted++;
9610 if (dst_page->vmp_pmapped) {
9611 int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
9612 if (refmod & VM_MEM_REFERENCED) {
9613 dst_page->vmp_reference = TRUE;
9614 }
9615 }
9616 }
9617 }
9618 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
9619 pmap_sync_page_attributes_phys(phys_page);
9620 dst_page->vmp_written_by_kernel = FALSE;
9621 }
9622
9623 record_phys_addr:
9624 if (dst_page->vmp_busy) {
9625 upl->flags |= UPL_HAS_BUSY;
9626 }
9627
9628 lite_list[entry >> 5] |= 1U << (entry & 31);
9629
9630 if (phys_page > upl->highest_page) {
9631 upl->highest_page = phys_page;
9632 }
9633
9634 if (user_page_list) {
9635 user_page_list[entry].phys_addr = phys_page;
9636 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
9637 user_page_list[entry].absent = dst_page->vmp_absent;
9638 user_page_list[entry].dirty = dst_page->vmp_dirty;
9639 user_page_list[entry].precious = dst_page->vmp_precious;
9640 user_page_list[entry].device = FALSE;
9641 user_page_list[entry].needed = FALSE;
9642 if (dst_page->vmp_clustered == TRUE) {
9643 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
9644 } else {
9645 user_page_list[entry].speculative = FALSE;
9646 }
9647 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
9648 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
9649 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
9650 user_page_list[entry].mark = FALSE;
9651 }
9652 if (object != kernel_object && object != compressor_object) {
9653 /*
9654 * someone is explicitly grabbing this page...
9655 * update clustered and speculative state
9656 *
9657 */
9658 if (dst_page->vmp_clustered) {
9659 VM_PAGE_CONSUME_CLUSTERED(dst_page);
9660 }
9661 }
9662 skip_page:
9663 entry++;
9664 dst_offset += PAGE_SIZE_64;
9665 xfer_size -= PAGE_SIZE;
9666
9667 if (dwp->dw_mask) {
9668 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
9669
9670 if (dw_count >= dw_limit) {
9671 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9672
9673 dwp = dwp_start;
9674 dw_count = 0;
9675 }
9676 }
9677 }
9678 assert(entry == size_in_pages);
9679
9680 if (dw_count) {
9681 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9682 dwp = dwp_start;
9683 dw_count = 0;
9684 }
9685 finish:
9686 if (user_page_list && set_cache_attr_needed == TRUE) {
9687 vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
9688 }
9689
9690 if (page_list_count != NULL) {
9691 if (upl->flags & UPL_INTERNAL) {
9692 *page_list_count = 0;
9693 } else if (*page_list_count > size_in_pages) {
9694 *page_list_count = size_in_pages;
9695 }
9696 }
9697 vm_object_unlock(object);
9698
9699 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9700 /*
9701 * We've marked all the pages "busy" so that future
9702 * page faults will block.
9703 * Now remove the mapping for these pages, so that they
9704 * can't be accessed without causing a page fault.
9705 */
9706 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
9707 PMAP_NULL,
9708 PAGE_SIZE,
9709 0, VM_PROT_NONE);
9710 assert(!object->blocked_access);
9711 object->blocked_access = TRUE;
9712 }
9713
9714 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9715 #if DEVELOPMENT || DEBUG
9716 if (task != NULL) {
9717 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9718 }
9719 #endif /* DEVELOPMENT || DEBUG */
9720
9721 if (dwp_start && dwp_finish_ctx) {
9722 vm_page_delayed_work_finish_ctx(dwp_start);
9723 dwp_start = dwp = NULL;
9724 }
9725
9726 return KERN_SUCCESS;
9727
9728 return_err:
9729 dw_index = 0;
9730
9731 for (; offset < dst_offset; offset += PAGE_SIZE) {
9732 boolean_t need_unwire;
9733
9734 dst_page = vm_page_lookup(object, offset);
9735
9736 if (dst_page == VM_PAGE_NULL) {
9737 panic("vm_object_iopl_request: Wired page missing.");
9738 }
9739
9740 /*
9741 * if we've already processed this page in an earlier
9742 * dw_do_work, we need to undo the wiring... we will
9743 * leave the dirty and reference bits on if they
9744 * were set, since we don't have a good way of knowing
9745 * what the previous state was and we won't get here
9746 * under any normal circumstances... we will always
9747 * clear BUSY and wakeup any waiters via vm_page_free
9748 * or PAGE_WAKEUP_DONE
9749 */
9750 need_unwire = TRUE;
9751
9752 if (dw_count) {
9753 if ((dwp_start)[dw_index].dw_m == dst_page) {
9754 /*
9755 * still in the deferred work list
9756 * which means we haven't yet called
9757 * vm_page_wire on this page
9758 */
9759 need_unwire = FALSE;
9760
9761 dw_index++;
9762 dw_count--;
9763 }
9764 }
9765 vm_page_lock_queues();
9766
9767 if (dst_page->vmp_absent || free_wired_pages == TRUE) {
9768 vm_page_free(dst_page);
9769
9770 need_unwire = FALSE;
9771 } else {
9772 if (need_unwire == TRUE) {
9773 vm_page_unwire(dst_page, TRUE);
9774 }
9775
9776 PAGE_WAKEUP_DONE(dst_page);
9777 }
9778 vm_page_unlock_queues();
9779
9780 if (need_unwire == TRUE) {
9781 counter_inc(&vm_statistics_reactivations);
9782 }
9783 }
9784 #if UPL_DEBUG
9785 upl->upl_state = 2;
9786 #endif
9787 if (!(upl->flags & UPL_KERNEL_OBJECT)) {
9788 vm_object_activity_end(object);
9789 vm_object_collapse(object, 0, TRUE);
9790 }
9791 vm_object_unlock(object);
9792 upl_destroy(upl);
9793
9794 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
9795 #if DEVELOPMENT || DEBUG
9796 if (task != NULL) {
9797 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9798 }
9799 #endif /* DEVELOPMENT || DEBUG */
9800
9801 if (dwp_start && dwp_finish_ctx) {
9802 vm_page_delayed_work_finish_ctx(dwp_start);
9803 dwp_start = dwp = NULL;
9804 }
9805 return ret;
9806 }
9807
9808 kern_return_t
upl_transpose(upl_t upl1,upl_t upl2)9809 upl_transpose(
9810 upl_t upl1,
9811 upl_t upl2)
9812 {
9813 kern_return_t retval;
9814 boolean_t upls_locked;
9815 vm_object_t object1, object2;
9816
9817 /* LD: Should mapped UPLs be eligible for a transpose? */
9818 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
9819 return KERN_INVALID_ARGUMENT;
9820 }
9821
9822 upls_locked = FALSE;
9823
9824 /*
9825 * Since we need to lock both UPLs at the same time,
9826 * avoid deadlocks by always taking locks in the same order.
9827 */
9828 if (upl1 < upl2) {
9829 upl_lock(upl1);
9830 upl_lock(upl2);
9831 } else {
9832 upl_lock(upl2);
9833 upl_lock(upl1);
9834 }
9835 upls_locked = TRUE; /* the UPLs will need to be unlocked */
9836
9837 object1 = upl1->map_object;
9838 object2 = upl2->map_object;
9839
9840 if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
9841 upl1->u_size != upl2->u_size) {
9842 /*
9843 * We deal only with full objects, not subsets.
9844 * That's because we exchange the entire backing store info
9845 * for the objects: pager, resident pages, etc... We can't do
9846 * only part of it.
9847 */
9848 retval = KERN_INVALID_VALUE;
9849 goto done;
9850 }
9851
9852 /*
9853 * Tranpose the VM objects' backing store.
9854 */
9855 retval = vm_object_transpose(object1, object2,
9856 upl_adjusted_size(upl1, PAGE_MASK));
9857
9858 if (retval == KERN_SUCCESS) {
9859 /*
9860 * Make each UPL point to the correct VM object, i.e. the
9861 * object holding the pages that the UPL refers to...
9862 */
9863 #if CONFIG_IOSCHED || UPL_DEBUG
9864 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9865 vm_object_lock(object1);
9866 vm_object_lock(object2);
9867 }
9868 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9869 queue_remove(&object1->uplq, upl1, upl_t, uplq);
9870 }
9871 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9872 queue_remove(&object2->uplq, upl2, upl_t, uplq);
9873 }
9874 #endif
9875 upl1->map_object = object2;
9876 upl2->map_object = object1;
9877
9878 #if CONFIG_IOSCHED || UPL_DEBUG
9879 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9880 queue_enter(&object2->uplq, upl1, upl_t, uplq);
9881 }
9882 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9883 queue_enter(&object1->uplq, upl2, upl_t, uplq);
9884 }
9885 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9886 vm_object_unlock(object2);
9887 vm_object_unlock(object1);
9888 }
9889 #endif
9890 }
9891
9892 done:
9893 /*
9894 * Cleanup.
9895 */
9896 if (upls_locked) {
9897 upl_unlock(upl1);
9898 upl_unlock(upl2);
9899 upls_locked = FALSE;
9900 }
9901
9902 return retval;
9903 }
9904
9905 void
upl_range_needed(upl_t upl,int index,int count)9906 upl_range_needed(
9907 upl_t upl,
9908 int index,
9909 int count)
9910 {
9911 upl_page_info_t *user_page_list;
9912 int size_in_pages;
9913
9914 if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
9915 return;
9916 }
9917
9918 size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
9919
9920 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
9921
9922 while (count-- && index < size_in_pages) {
9923 user_page_list[index++].needed = TRUE;
9924 }
9925 }
9926
9927
9928 /*
9929 * Reserve of virtual addresses in the kernel address space.
9930 * We need to map the physical pages in the kernel, so that we
9931 * can call the code-signing or slide routines with a kernel
9932 * virtual address. We keep this pool of pre-allocated kernel
9933 * virtual addresses so that we don't have to scan the kernel's
9934 * virtual address space each time we need to work with
9935 * a physical page.
9936 */
9937 SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
9938 #define VM_PAGING_NUM_PAGES 64
9939 vm_map_offset_t vm_paging_base_address = 0;
9940 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
9941 int vm_paging_max_index = 0;
9942 int vm_paging_page_waiter = 0;
9943 int vm_paging_page_waiter_total = 0;
9944
9945 unsigned long vm_paging_no_kernel_page = 0;
9946 unsigned long vm_paging_objects_mapped = 0;
9947 unsigned long vm_paging_pages_mapped = 0;
9948 unsigned long vm_paging_objects_mapped_slow = 0;
9949 unsigned long vm_paging_pages_mapped_slow = 0;
9950
9951 __startup_func
9952 void
vm_paging_map_init(void)9953 vm_paging_map_init(void)
9954 {
9955 kern_return_t kr;
9956 vm_map_offset_t page_map_offset;
9957 vm_map_entry_t map_entry;
9958
9959 assert(vm_paging_base_address == 0);
9960
9961 /*
9962 * Initialize our pool of pre-allocated kernel
9963 * virtual addresses.
9964 */
9965 page_map_offset = 0;
9966 kr = vm_map_find_space(kernel_map,
9967 &page_map_offset,
9968 VM_PAGING_NUM_PAGES * PAGE_SIZE,
9969 0,
9970 VM_MAP_KERNEL_FLAGS_NONE,
9971 VM_KERN_MEMORY_NONE,
9972 &map_entry);
9973 if (kr != KERN_SUCCESS) {
9974 panic("vm_paging_map_init: kernel_map full");
9975 }
9976 VME_OBJECT_SET(map_entry, kernel_object);
9977 VME_OFFSET_SET(map_entry, page_map_offset);
9978 map_entry->protection = VM_PROT_NONE;
9979 map_entry->max_protection = VM_PROT_NONE;
9980 map_entry->permanent = TRUE;
9981 vm_object_reference(kernel_object);
9982 vm_map_unlock(kernel_map);
9983
9984 assert(vm_paging_base_address == 0);
9985 vm_paging_base_address = page_map_offset;
9986 }
9987
9988 /*
9989 * vm_paging_map_object:
9990 * Maps part of a VM object's pages in the kernel
9991 * virtual address space, using the pre-allocated
9992 * kernel virtual addresses, if possible.
9993 * Context:
9994 * The VM object is locked. This lock will get
9995 * dropped and re-acquired though, so the caller
9996 * must make sure the VM object is kept alive
9997 * (by holding a VM map that has a reference
9998 * on it, for example, or taking an extra reference).
9999 * The page should also be kept busy to prevent
10000 * it from being reclaimed.
10001 */
10002 kern_return_t
vm_paging_map_object(vm_page_t page,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection,boolean_t can_unlock_object,vm_map_size_t * size,vm_map_offset_t * address,boolean_t * need_unmap)10003 vm_paging_map_object(
10004 vm_page_t page,
10005 vm_object_t object,
10006 vm_object_offset_t offset,
10007 vm_prot_t protection,
10008 boolean_t can_unlock_object,
10009 vm_map_size_t *size, /* IN/OUT */
10010 vm_map_offset_t *address, /* OUT */
10011 boolean_t *need_unmap) /* OUT */
10012 {
10013 kern_return_t kr;
10014 vm_map_offset_t page_map_offset;
10015 vm_map_size_t map_size;
10016 vm_object_offset_t object_offset;
10017 int i;
10018
10019 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
10020 /* use permanent 1-to-1 kernel mapping of physical memory ? */
10021 *address = (vm_map_offset_t)
10022 phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
10023 *need_unmap = FALSE;
10024 return KERN_SUCCESS;
10025
10026 assert(page->vmp_busy);
10027 /*
10028 * Use one of the pre-allocated kernel virtual addresses
10029 * and just enter the VM page in the kernel address space
10030 * at that virtual address.
10031 */
10032 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10033
10034 /*
10035 * Try and find an available kernel virtual address
10036 * from our pre-allocated pool.
10037 */
10038 page_map_offset = 0;
10039 for (;;) {
10040 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
10041 if (vm_paging_page_inuse[i] == FALSE) {
10042 page_map_offset =
10043 vm_paging_base_address +
10044 (i * PAGE_SIZE);
10045 break;
10046 }
10047 }
10048 if (page_map_offset != 0) {
10049 /* found a space to map our page ! */
10050 break;
10051 }
10052
10053 if (can_unlock_object) {
10054 /*
10055 * If we can afford to unlock the VM object,
10056 * let's take the slow path now...
10057 */
10058 break;
10059 }
10060 /*
10061 * We can't afford to unlock the VM object, so
10062 * let's wait for a space to become available...
10063 */
10064 vm_paging_page_waiter_total++;
10065 vm_paging_page_waiter++;
10066 kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
10067 if (kr == THREAD_WAITING) {
10068 simple_unlock(&vm_paging_lock);
10069 kr = thread_block(THREAD_CONTINUE_NULL);
10070 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10071 }
10072 vm_paging_page_waiter--;
10073 /* ... and try again */
10074 }
10075
10076 if (page_map_offset != 0) {
10077 /*
10078 * We found a kernel virtual address;
10079 * map the physical page to that virtual address.
10080 */
10081 if (i > vm_paging_max_index) {
10082 vm_paging_max_index = i;
10083 }
10084 vm_paging_page_inuse[i] = TRUE;
10085 simple_unlock(&vm_paging_lock);
10086
10087 page->vmp_pmapped = TRUE;
10088
10089 /*
10090 * Keep the VM object locked over the PMAP_ENTER
10091 * and the actual use of the page by the kernel,
10092 * or this pmap mapping might get undone by a
10093 * vm_object_pmap_protect() call...
10094 */
10095 PMAP_ENTER(kernel_pmap,
10096 page_map_offset,
10097 page,
10098 protection,
10099 VM_PROT_NONE,
10100 0,
10101 TRUE,
10102 kr);
10103 assert(kr == KERN_SUCCESS);
10104 vm_paging_objects_mapped++;
10105 vm_paging_pages_mapped++;
10106 *address = page_map_offset;
10107 *need_unmap = TRUE;
10108
10109 #if KASAN
10110 kasan_notify_address(page_map_offset, PAGE_SIZE);
10111 #endif
10112
10113 /* all done and mapped, ready to use ! */
10114 return KERN_SUCCESS;
10115 }
10116
10117 /*
10118 * We ran out of pre-allocated kernel virtual
10119 * addresses. Just map the page in the kernel
10120 * the slow and regular way.
10121 */
10122 vm_paging_no_kernel_page++;
10123 simple_unlock(&vm_paging_lock);
10124 }
10125
10126 if (!can_unlock_object) {
10127 *address = 0;
10128 *size = 0;
10129 *need_unmap = FALSE;
10130 return KERN_NOT_SUPPORTED;
10131 }
10132
10133 object_offset = vm_object_trunc_page(offset);
10134 map_size = vm_map_round_page(*size,
10135 VM_MAP_PAGE_MASK(kernel_map));
10136
10137 /*
10138 * Try and map the required range of the object
10139 * in the kernel_map
10140 */
10141
10142 vm_object_reference_locked(object); /* for the map entry */
10143 vm_object_unlock(object);
10144
10145 kr = vm_map_enter(kernel_map,
10146 address,
10147 map_size,
10148 0,
10149 VM_FLAGS_ANYWHERE,
10150 VM_MAP_KERNEL_FLAGS_NONE,
10151 VM_KERN_MEMORY_NONE,
10152 object,
10153 object_offset,
10154 FALSE,
10155 protection,
10156 VM_PROT_ALL,
10157 VM_INHERIT_NONE);
10158 if (kr != KERN_SUCCESS) {
10159 *address = 0;
10160 *size = 0;
10161 *need_unmap = FALSE;
10162 vm_object_deallocate(object); /* for the map entry */
10163 vm_object_lock(object);
10164 return kr;
10165 }
10166
10167 *size = map_size;
10168
10169 /*
10170 * Enter the mapped pages in the page table now.
10171 */
10172 vm_object_lock(object);
10173 /*
10174 * VM object must be kept locked from before PMAP_ENTER()
10175 * until after the kernel is done accessing the page(s).
10176 * Otherwise, the pmap mappings in the kernel could be
10177 * undone by a call to vm_object_pmap_protect().
10178 */
10179
10180 for (page_map_offset = 0;
10181 map_size != 0;
10182 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
10183 page = vm_page_lookup(object, offset + page_map_offset);
10184 if (page == VM_PAGE_NULL) {
10185 printf("vm_paging_map_object: no page !?");
10186 vm_object_unlock(object);
10187 kr = vm_map_remove(kernel_map, *address, *size,
10188 VM_MAP_REMOVE_NO_FLAGS);
10189 assert(kr == KERN_SUCCESS);
10190 *address = 0;
10191 *size = 0;
10192 *need_unmap = FALSE;
10193 vm_object_lock(object);
10194 return KERN_MEMORY_ERROR;
10195 }
10196 page->vmp_pmapped = TRUE;
10197
10198 PMAP_ENTER(kernel_pmap,
10199 *address + page_map_offset,
10200 page,
10201 protection,
10202 VM_PROT_NONE,
10203 0,
10204 TRUE,
10205 kr);
10206 assert(kr == KERN_SUCCESS);
10207 #if KASAN
10208 kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
10209 #endif
10210 }
10211
10212 vm_paging_objects_mapped_slow++;
10213 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
10214
10215 *need_unmap = TRUE;
10216
10217 return KERN_SUCCESS;
10218 }
10219
10220 /*
10221 * vm_paging_unmap_object:
10222 * Unmaps part of a VM object's pages from the kernel
10223 * virtual address space.
10224 * Context:
10225 * The VM object is locked. This lock will get
10226 * dropped and re-acquired though.
10227 */
10228 void
vm_paging_unmap_object(vm_object_t object,vm_map_offset_t start,vm_map_offset_t end)10229 vm_paging_unmap_object(
10230 vm_object_t object,
10231 vm_map_offset_t start,
10232 vm_map_offset_t end)
10233 {
10234 kern_return_t kr;
10235 int i;
10236
10237 if ((vm_paging_base_address == 0) ||
10238 (start < vm_paging_base_address) ||
10239 (end > (vm_paging_base_address
10240 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
10241 /*
10242 * We didn't use our pre-allocated pool of
10243 * kernel virtual address. Deallocate the
10244 * virtual memory.
10245 */
10246 if (object != VM_OBJECT_NULL) {
10247 vm_object_unlock(object);
10248 }
10249 kr = vm_map_remove(kernel_map, start, end,
10250 VM_MAP_REMOVE_NO_FLAGS);
10251 if (object != VM_OBJECT_NULL) {
10252 vm_object_lock(object);
10253 }
10254 assert(kr == KERN_SUCCESS);
10255 } else {
10256 /*
10257 * We used a kernel virtual address from our
10258 * pre-allocated pool. Put it back in the pool
10259 * for next time.
10260 */
10261 assert(end - start == PAGE_SIZE);
10262 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
10263 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
10264
10265 /* undo the pmap mapping */
10266 pmap_remove(kernel_pmap, start, end);
10267
10268 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10269 vm_paging_page_inuse[i] = FALSE;
10270 if (vm_paging_page_waiter) {
10271 thread_wakeup(&vm_paging_page_waiter);
10272 }
10273 simple_unlock(&vm_paging_lock);
10274 }
10275 }
10276
10277
10278 /*
10279 * page->vmp_object must be locked
10280 */
10281 void
vm_pageout_steal_laundry(vm_page_t page,boolean_t queues_locked)10282 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10283 {
10284 if (!queues_locked) {
10285 vm_page_lockspin_queues();
10286 }
10287
10288 page->vmp_free_when_done = FALSE;
10289 /*
10290 * need to drop the laundry count...
10291 * we may also need to remove it
10292 * from the I/O paging queue...
10293 * vm_pageout_throttle_up handles both cases
10294 *
10295 * the laundry and pageout_queue flags are cleared...
10296 */
10297 vm_pageout_throttle_up(page);
10298
10299 if (!queues_locked) {
10300 vm_page_unlock_queues();
10301 }
10302 }
10303
10304 upl_t
vector_upl_create(vm_offset_t upl_offset)10305 vector_upl_create(vm_offset_t upl_offset)
10306 {
10307 int i = 0;
10308 upl_t upl;
10309 vector_upl_t vector_upl = kalloc_type(struct _vector_upl, Z_WAITOK);
10310
10311 upl = upl_create(0, UPL_VECTOR, 0);
10312 upl->vector_upl = vector_upl;
10313 upl->u_offset = upl_offset;
10314 vector_upl->size = 0;
10315 vector_upl->offset = upl_offset;
10316 vector_upl->invalid_upls = 0;
10317 vector_upl->num_upls = 0;
10318 vector_upl->pagelist = NULL;
10319
10320 for (i = 0; i < MAX_VECTOR_UPL_ELEMENTS; i++) {
10321 vector_upl->upl_iostates[i].size = 0;
10322 vector_upl->upl_iostates[i].offset = 0;
10323 }
10324 return upl;
10325 }
10326
10327 void
vector_upl_deallocate(upl_t upl)10328 vector_upl_deallocate(upl_t upl)
10329 {
10330 if (upl) {
10331 vector_upl_t vector_upl = upl->vector_upl;
10332 if (vector_upl) {
10333 if (vector_upl->invalid_upls != vector_upl->num_upls) {
10334 panic("Deallocating non-empty Vectored UPL");
10335 }
10336 kfree_data(vector_upl->pagelist, sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE));
10337 vector_upl->invalid_upls = 0;
10338 vector_upl->num_upls = 0;
10339 vector_upl->pagelist = NULL;
10340 vector_upl->size = 0;
10341 vector_upl->offset = 0;
10342 kfree_type(struct _vector_upl, vector_upl);
10343 vector_upl = (vector_upl_t)0xfeedfeed;
10344 } else {
10345 panic("vector_upl_deallocate was passed a non-vectored upl");
10346 }
10347 } else {
10348 panic("vector_upl_deallocate was passed a NULL upl");
10349 }
10350 }
10351
10352 boolean_t
vector_upl_is_valid(upl_t upl)10353 vector_upl_is_valid(upl_t upl)
10354 {
10355 if (upl && ((upl->flags & UPL_VECTOR) == UPL_VECTOR)) {
10356 vector_upl_t vector_upl = upl->vector_upl;
10357 if (vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef) {
10358 return FALSE;
10359 } else {
10360 return TRUE;
10361 }
10362 }
10363 return FALSE;
10364 }
10365
10366 boolean_t
vector_upl_set_subupl(upl_t upl,upl_t subupl,uint32_t io_size)10367 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
10368 {
10369 if (vector_upl_is_valid(upl)) {
10370 vector_upl_t vector_upl = upl->vector_upl;
10371
10372 if (vector_upl) {
10373 if (subupl) {
10374 if (io_size) {
10375 if (io_size < PAGE_SIZE) {
10376 io_size = PAGE_SIZE;
10377 }
10378 subupl->vector_upl = (void*)vector_upl;
10379 vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
10380 vector_upl->size += io_size;
10381 upl->u_size += io_size;
10382 } else {
10383 uint32_t i = 0, invalid_upls = 0;
10384 for (i = 0; i < vector_upl->num_upls; i++) {
10385 if (vector_upl->upl_elems[i] == subupl) {
10386 break;
10387 }
10388 }
10389 if (i == vector_upl->num_upls) {
10390 panic("Trying to remove sub-upl when none exists");
10391 }
10392
10393 vector_upl->upl_elems[i] = NULL;
10394 invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
10395 relaxed);
10396 if (invalid_upls == vector_upl->num_upls) {
10397 return TRUE;
10398 } else {
10399 return FALSE;
10400 }
10401 }
10402 } else {
10403 panic("vector_upl_set_subupl was passed a NULL upl element");
10404 }
10405 } else {
10406 panic("vector_upl_set_subupl was passed a non-vectored upl");
10407 }
10408 } else {
10409 panic("vector_upl_set_subupl was passed a NULL upl");
10410 }
10411
10412 return FALSE;
10413 }
10414
10415 void
vector_upl_set_pagelist(upl_t upl)10416 vector_upl_set_pagelist(upl_t upl)
10417 {
10418 if (vector_upl_is_valid(upl)) {
10419 uint32_t i = 0;
10420 vector_upl_t vector_upl = upl->vector_upl;
10421
10422 if (vector_upl) {
10423 vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
10424
10425 vector_upl->pagelist = kalloc_data(sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE), Z_WAITOK);
10426
10427 for (i = 0; i < vector_upl->num_upls; i++) {
10428 cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upl_elems[i], PAGE_MASK) / PAGE_SIZE;
10429 bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10430 pagelist_size += cur_upl_pagelist_size;
10431 if (vector_upl->upl_elems[i]->highest_page > upl->highest_page) {
10432 upl->highest_page = vector_upl->upl_elems[i]->highest_page;
10433 }
10434 }
10435 assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10436 } else {
10437 panic("vector_upl_set_pagelist was passed a non-vectored upl");
10438 }
10439 } else {
10440 panic("vector_upl_set_pagelist was passed a NULL upl");
10441 }
10442 }
10443
10444 upl_t
vector_upl_subupl_byindex(upl_t upl,uint32_t index)10445 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10446 {
10447 if (vector_upl_is_valid(upl)) {
10448 vector_upl_t vector_upl = upl->vector_upl;
10449 if (vector_upl) {
10450 if (index < vector_upl->num_upls) {
10451 return vector_upl->upl_elems[index];
10452 }
10453 } else {
10454 panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10455 }
10456 }
10457 return NULL;
10458 }
10459
10460 upl_t
vector_upl_subupl_byoffset(upl_t upl,upl_offset_t * upl_offset,upl_size_t * upl_size)10461 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
10462 {
10463 if (vector_upl_is_valid(upl)) {
10464 uint32_t i = 0;
10465 vector_upl_t vector_upl = upl->vector_upl;
10466
10467 if (vector_upl) {
10468 upl_t subupl = NULL;
10469 vector_upl_iostates_t subupl_state;
10470
10471 for (i = 0; i < vector_upl->num_upls; i++) {
10472 subupl = vector_upl->upl_elems[i];
10473 subupl_state = vector_upl->upl_iostates[i];
10474 if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
10475 /* We could have been passed an offset/size pair that belongs
10476 * to an UPL element that has already been committed/aborted.
10477 * If so, return NULL.
10478 */
10479 if (subupl == NULL) {
10480 return NULL;
10481 }
10482 if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
10483 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
10484 if (*upl_size > subupl_state.size) {
10485 *upl_size = subupl_state.size;
10486 }
10487 }
10488 if (*upl_offset >= subupl_state.offset) {
10489 *upl_offset -= subupl_state.offset;
10490 } else if (i) {
10491 panic("Vector UPL offset miscalculation");
10492 }
10493 return subupl;
10494 }
10495 }
10496 } else {
10497 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
10498 }
10499 }
10500 return NULL;
10501 }
10502
10503 void
vector_upl_get_submap(upl_t upl,vm_map_t * v_upl_submap,vm_offset_t * submap_dst_addr)10504 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10505 {
10506 *v_upl_submap = NULL;
10507
10508 if (vector_upl_is_valid(upl)) {
10509 vector_upl_t vector_upl = upl->vector_upl;
10510 if (vector_upl) {
10511 *v_upl_submap = vector_upl->submap;
10512 *submap_dst_addr = vector_upl->submap_dst_addr;
10513 } else {
10514 panic("vector_upl_get_submap was passed a non-vectored UPL");
10515 }
10516 } else {
10517 panic("vector_upl_get_submap was passed a null UPL");
10518 }
10519 }
10520
10521 void
vector_upl_set_submap(upl_t upl,vm_map_t submap,vm_offset_t submap_dst_addr)10522 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10523 {
10524 if (vector_upl_is_valid(upl)) {
10525 vector_upl_t vector_upl = upl->vector_upl;
10526 if (vector_upl) {
10527 vector_upl->submap = submap;
10528 vector_upl->submap_dst_addr = submap_dst_addr;
10529 } else {
10530 panic("vector_upl_get_submap was passed a non-vectored UPL");
10531 }
10532 } else {
10533 panic("vector_upl_get_submap was passed a NULL UPL");
10534 }
10535 }
10536
10537 void
vector_upl_set_iostate(upl_t upl,upl_t subupl,upl_offset_t offset,upl_size_t size)10538 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10539 {
10540 if (vector_upl_is_valid(upl)) {
10541 uint32_t i = 0;
10542 vector_upl_t vector_upl = upl->vector_upl;
10543
10544 if (vector_upl) {
10545 for (i = 0; i < vector_upl->num_upls; i++) {
10546 if (vector_upl->upl_elems[i] == subupl) {
10547 break;
10548 }
10549 }
10550
10551 if (i == vector_upl->num_upls) {
10552 panic("setting sub-upl iostate when none exists");
10553 }
10554
10555 vector_upl->upl_iostates[i].offset = offset;
10556 if (size < PAGE_SIZE) {
10557 size = PAGE_SIZE;
10558 }
10559 vector_upl->upl_iostates[i].size = size;
10560 } else {
10561 panic("vector_upl_set_iostate was passed a non-vectored UPL");
10562 }
10563 } else {
10564 panic("vector_upl_set_iostate was passed a NULL UPL");
10565 }
10566 }
10567
10568 void
vector_upl_get_iostate(upl_t upl,upl_t subupl,upl_offset_t * offset,upl_size_t * size)10569 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10570 {
10571 if (vector_upl_is_valid(upl)) {
10572 uint32_t i = 0;
10573 vector_upl_t vector_upl = upl->vector_upl;
10574
10575 if (vector_upl) {
10576 for (i = 0; i < vector_upl->num_upls; i++) {
10577 if (vector_upl->upl_elems[i] == subupl) {
10578 break;
10579 }
10580 }
10581
10582 if (i == vector_upl->num_upls) {
10583 panic("getting sub-upl iostate when none exists");
10584 }
10585
10586 *offset = vector_upl->upl_iostates[i].offset;
10587 *size = vector_upl->upl_iostates[i].size;
10588 } else {
10589 panic("vector_upl_get_iostate was passed a non-vectored UPL");
10590 }
10591 } else {
10592 panic("vector_upl_get_iostate was passed a NULL UPL");
10593 }
10594 }
10595
10596 void
vector_upl_get_iostate_byindex(upl_t upl,uint32_t index,upl_offset_t * offset,upl_size_t * size)10597 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10598 {
10599 if (vector_upl_is_valid(upl)) {
10600 vector_upl_t vector_upl = upl->vector_upl;
10601 if (vector_upl) {
10602 if (index < vector_upl->num_upls) {
10603 *offset = vector_upl->upl_iostates[index].offset;
10604 *size = vector_upl->upl_iostates[index].size;
10605 } else {
10606 *offset = *size = 0;
10607 }
10608 } else {
10609 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
10610 }
10611 } else {
10612 panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10613 }
10614 }
10615
10616 upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)10617 upl_get_internal_vectorupl_pagelist(upl_t upl)
10618 {
10619 return ((vector_upl_t)(upl->vector_upl))->pagelist;
10620 }
10621
10622 void *
upl_get_internal_vectorupl(upl_t upl)10623 upl_get_internal_vectorupl(upl_t upl)
10624 {
10625 return upl->vector_upl;
10626 }
10627
10628 vm_size_t
upl_get_internal_pagelist_offset(void)10629 upl_get_internal_pagelist_offset(void)
10630 {
10631 return sizeof(struct upl);
10632 }
10633
10634 void
upl_clear_dirty(upl_t upl,boolean_t value)10635 upl_clear_dirty(
10636 upl_t upl,
10637 boolean_t value)
10638 {
10639 if (value) {
10640 upl->flags |= UPL_CLEAR_DIRTY;
10641 } else {
10642 upl->flags &= ~UPL_CLEAR_DIRTY;
10643 }
10644 }
10645
10646 void
upl_set_referenced(upl_t upl,boolean_t value)10647 upl_set_referenced(
10648 upl_t upl,
10649 boolean_t value)
10650 {
10651 upl_lock(upl);
10652 if (value) {
10653 upl->ext_ref_count++;
10654 } else {
10655 if (!upl->ext_ref_count) {
10656 panic("upl_set_referenced not %p", upl);
10657 }
10658 upl->ext_ref_count--;
10659 }
10660 upl_unlock(upl);
10661 }
10662
#if CONFIG_IOSCHED
/*
 * upl_set_blkno:
 *	Records "blkno" and "io_size" as reprioritization info for
 *	each page of the UPL covered by "io_size" bytes starting at
 *	"upl_offset".  Does nothing unless the UPL has
 *	UPL_EXPEDITE_SUPPORTED set.
 */
void
upl_set_blkno(
	upl_t           upl,
	vm_offset_t     upl_offset,
	int             io_size,
	int64_t         blkno)
{
	int page_idx;
	int done;

	if (!(upl->flags & UPL_EXPEDITE_SUPPORTED)) {
		return;
	}

	assert(upl->upl_reprio_info != 0);

	/* walk the range one page at a time, starting at the page holding upl_offset */
	page_idx = (int)(upl_offset / PAGE_SIZE);
	for (done = 0; done < io_size; done += PAGE_SIZE) {
		UPL_SET_REPRIO_INFO(upl, page_idx, blkno, io_size);
		page_idx++;
	}
}
#endif
10682
10683 void inline
memoryshot(unsigned int event,unsigned int control)10684 memoryshot(unsigned int event, unsigned int control)
10685 {
10686 if (vm_debug_events) {
10687 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10688 vm_page_active_count, vm_page_inactive_count,
10689 vm_page_free_count, vm_page_speculative_count,
10690 vm_page_throttled_count);
10691 } else {
10692 (void) event;
10693 (void) control;
10694 }
10695 }
10696
10697 #ifdef MACH_BSD
10698
10699 boolean_t
upl_device_page(upl_page_info_t * upl)10700 upl_device_page(upl_page_info_t *upl)
10701 {
10702 return UPL_DEVICE_PAGE(upl);
10703 }
10704 boolean_t
upl_page_present(upl_page_info_t * upl,int index)10705 upl_page_present(upl_page_info_t *upl, int index)
10706 {
10707 return UPL_PAGE_PRESENT(upl, index);
10708 }
10709 boolean_t
upl_speculative_page(upl_page_info_t * upl,int index)10710 upl_speculative_page(upl_page_info_t *upl, int index)
10711 {
10712 return UPL_SPECULATIVE_PAGE(upl, index);
10713 }
10714 boolean_t
upl_dirty_page(upl_page_info_t * upl,int index)10715 upl_dirty_page(upl_page_info_t *upl, int index)
10716 {
10717 return UPL_DIRTY_PAGE(upl, index);
10718 }
10719 boolean_t
upl_valid_page(upl_page_info_t * upl,int index)10720 upl_valid_page(upl_page_info_t *upl, int index)
10721 {
10722 return UPL_VALID_PAGE(upl, index);
10723 }
10724 ppnum_t
upl_phys_page(upl_page_info_t * upl,int index)10725 upl_phys_page(upl_page_info_t *upl, int index)
10726 {
10727 return UPL_PHYS_PAGE(upl, index);
10728 }
10729
10730 void
upl_page_set_mark(upl_page_info_t * upl,int index,boolean_t v)10731 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
10732 {
10733 upl[index].mark = v;
10734 }
10735
10736 boolean_t
upl_page_get_mark(upl_page_info_t * upl,int index)10737 upl_page_get_mark(upl_page_info_t *upl, int index)
10738 {
10739 return upl[index].mark;
10740 }
10741
10742 void
vm_countdirtypages(void)10743 vm_countdirtypages(void)
10744 {
10745 vm_page_t m;
10746 int dpages;
10747 int pgopages;
10748 int precpages;
10749
10750
10751 dpages = 0;
10752 pgopages = 0;
10753 precpages = 0;
10754
10755 vm_page_lock_queues();
10756 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10757 do {
10758 if (m == (vm_page_t)0) {
10759 break;
10760 }
10761
10762 if (m->vmp_dirty) {
10763 dpages++;
10764 }
10765 if (m->vmp_free_when_done) {
10766 pgopages++;
10767 }
10768 if (m->vmp_precious) {
10769 precpages++;
10770 }
10771
10772 assert(VM_PAGE_OBJECT(m) != kernel_object);
10773 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10774 if (m == (vm_page_t)0) {
10775 break;
10776 }
10777 } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
10778 vm_page_unlock_queues();
10779
10780 vm_page_lock_queues();
10781 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
10782 do {
10783 if (m == (vm_page_t)0) {
10784 break;
10785 }
10786
10787 dpages++;
10788 assert(m->vmp_dirty);
10789 assert(!m->vmp_free_when_done);
10790 assert(VM_PAGE_OBJECT(m) != kernel_object);
10791 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10792 if (m == (vm_page_t)0) {
10793 break;
10794 }
10795 } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
10796 vm_page_unlock_queues();
10797
10798 vm_page_lock_queues();
10799 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10800 do {
10801 if (m == (vm_page_t)0) {
10802 break;
10803 }
10804
10805 if (m->vmp_dirty) {
10806 dpages++;
10807 }
10808 if (m->vmp_free_when_done) {
10809 pgopages++;
10810 }
10811 if (m->vmp_precious) {
10812 precpages++;
10813 }
10814
10815 assert(VM_PAGE_OBJECT(m) != kernel_object);
10816 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10817 if (m == (vm_page_t)0) {
10818 break;
10819 }
10820 } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
10821 vm_page_unlock_queues();
10822
10823 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10824
10825 dpages = 0;
10826 pgopages = 0;
10827 precpages = 0;
10828
10829 vm_page_lock_queues();
10830 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10831
10832 do {
10833 if (m == (vm_page_t)0) {
10834 break;
10835 }
10836 if (m->vmp_dirty) {
10837 dpages++;
10838 }
10839 if (m->vmp_free_when_done) {
10840 pgopages++;
10841 }
10842 if (m->vmp_precious) {
10843 precpages++;
10844 }
10845
10846 assert(VM_PAGE_OBJECT(m) != kernel_object);
10847 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10848 if (m == (vm_page_t)0) {
10849 break;
10850 }
10851 } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
10852 vm_page_unlock_queues();
10853
10854 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
10855 }
10856 #endif /* MACH_BSD */
10857
10858
#if CONFIG_IOSCHED
/*
 * upl_get_cached_tier:
 *	Returns the priority stored in the UPL if the UPL has
 *	UPL_TRACKED_BY_OBJECT set, -1 otherwise.
 */
int
upl_get_cached_tier(upl_t upl)
{
	assert(upl);

	if (!(upl->flags & UPL_TRACKED_BY_OBJECT)) {
		return -1;
	}
	return upl->upl_priority;
}
#endif /* CONFIG_IOSCHED */
10870
10871
10872 void
upl_callout_iodone(upl_t upl)10873 upl_callout_iodone(upl_t upl)
10874 {
10875 struct upl_io_completion *upl_ctx = upl->upl_iodone;
10876
10877 if (upl_ctx) {
10878 void (*iodone_func)(void *, int) = upl_ctx->io_done;
10879
10880 assert(upl_ctx->io_done);
10881
10882 (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
10883 }
10884 }
10885
10886 void
upl_set_iodone(upl_t upl,void * upl_iodone)10887 upl_set_iodone(upl_t upl, void *upl_iodone)
10888 {
10889 upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
10890 }
10891
10892 void
upl_set_iodone_error(upl_t upl,int error)10893 upl_set_iodone_error(upl_t upl, int error)
10894 {
10895 struct upl_io_completion *upl_ctx = upl->upl_iodone;
10896
10897 if (upl_ctx) {
10898 upl_ctx->io_error = error;
10899 }
10900 }
10901
10902
10903 ppnum_t
upl_get_highest_page(upl_t upl)10904 upl_get_highest_page(
10905 upl_t upl)
10906 {
10907 return upl->highest_page;
10908 }
10909
10910 upl_size_t
upl_get_size(upl_t upl)10911 upl_get_size(
10912 upl_t upl)
10913 {
10914 return upl_adjusted_size(upl, PAGE_MASK);
10915 }
10916
10917 upl_size_t
upl_adjusted_size(upl_t upl,vm_map_offset_t pgmask)10918 upl_adjusted_size(
10919 upl_t upl,
10920 vm_map_offset_t pgmask)
10921 {
10922 vm_object_offset_t start_offset, end_offset;
10923
10924 start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
10925 end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
10926
10927 return (upl_size_t)(end_offset - start_offset);
10928 }
10929
10930 vm_object_offset_t
upl_adjusted_offset(upl_t upl,vm_map_offset_t pgmask)10931 upl_adjusted_offset(
10932 upl_t upl,
10933 vm_map_offset_t pgmask)
10934 {
10935 return trunc_page_mask_64(upl->u_offset, pgmask);
10936 }
10937
10938 vm_object_offset_t
upl_get_data_offset(upl_t upl)10939 upl_get_data_offset(
10940 upl_t upl)
10941 {
10942 return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
10943 }
10944
10945 upl_t
upl_associated_upl(upl_t upl)10946 upl_associated_upl(upl_t upl)
10947 {
10948 return upl->associated_upl;
10949 }
10950
10951 void
upl_set_associated_upl(upl_t upl,upl_t associated_upl)10952 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
10953 {
10954 upl->associated_upl = associated_upl;
10955 }
10956
10957 struct vnode *
upl_lookup_vnode(upl_t upl)10958 upl_lookup_vnode(upl_t upl)
10959 {
10960 if (!upl->map_object->internal) {
10961 return vnode_pager_lookup_vnode(upl->map_object->pager);
10962 } else {
10963 return NULL;
10964 }
10965 }
10966
10967 #if UPL_DEBUG
10968 kern_return_t
upl_ubc_alias_set(upl_t upl,uintptr_t alias1,uintptr_t alias2)10969 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
10970 {
10971 upl->ubc_alias1 = alias1;
10972 upl->ubc_alias2 = alias2;
10973 return KERN_SUCCESS;
10974 }
10975 int
upl_ubc_alias_get(upl_t upl,uintptr_t * al,uintptr_t * al2)10976 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
10977 {
10978 if (al) {
10979 *al = upl->ubc_alias1;
10980 }
10981 if (al2) {
10982 *al2 = upl->ubc_alias2;
10983 }
10984 return KERN_SUCCESS;
10985 }
10986 #endif /* UPL_DEBUG */
10987
10988 #if VM_PRESSURE_EVENTS
10989 /*
10990 * Upward trajectory.
10991 */
10992 extern boolean_t vm_compressor_low_on_space(void);
10993
10994 boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)10995 VM_PRESSURE_NORMAL_TO_WARNING(void)
10996 {
10997 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10998 /* Available pages below our threshold */
10999 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11000 /* No frozen processes to kill */
11001 if (memorystatus_frozen_count == 0) {
11002 /* Not enough suspended processes available. */
11003 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11004 return TRUE;
11005 }
11006 }
11007 }
11008 return FALSE;
11009 } else {
11010 return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
11011 }
11012 }
11013
11014 boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)11015 VM_PRESSURE_WARNING_TO_CRITICAL(void)
11016 {
11017 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11018 /* Available pages below our threshold */
11019 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11020 return TRUE;
11021 }
11022 return FALSE;
11023 } else {
11024 return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11025 }
11026 }
11027
11028 /*
11029 * Downward trajectory.
11030 */
11031 boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)11032 VM_PRESSURE_WARNING_TO_NORMAL(void)
11033 {
11034 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11035 /* Available pages above our threshold */
11036 unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
11037 if (memorystatus_available_pages > target_threshold) {
11038 return TRUE;
11039 }
11040 return FALSE;
11041 } else {
11042 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
11043 }
11044 }
11045
11046 boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)11047 VM_PRESSURE_CRITICAL_TO_WARNING(void)
11048 {
11049 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11050 /* Available pages above our threshold */
11051 unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
11052 if (memorystatus_available_pages > target_threshold) {
11053 return TRUE;
11054 }
11055 return FALSE;
11056 } else {
11057 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11058 }
11059 }
11060 #endif /* VM_PRESSURE_EVENTS */
11061