1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67 #include <ptrauth.h>
68
69 #include <debug.h>
70
71 #include <mach/mach_types.h>
72 #include <mach/memory_object.h>
73 #include <mach/mach_host_server.h>
74 #include <mach/upl.h>
75 #include <mach/vm_map.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/sdt.h>
79
80 #include <kern/kern_types.h>
81 #include <kern/counter.h>
82 #include <kern/host_statistics.h>
83 #include <kern/machine.h>
84 #include <kern/misc_protos.h>
85 #include <kern/sched.h>
86 #include <kern/thread.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/policy_internal.h>
90 #include <kern/thread_group.h>
91
92 #include <os/log.h>
93
94 #include <sys/kdebug_triage.h>
95
96 #include <machine/vm_tuning.h>
97 #include <machine/commpage.h>
98
99 #include <vm/pmap.h>
100 #include <vm/vm_compressor_pager.h>
101 #include <vm/vm_fault.h>
102 #include <vm/vm_map_internal.h>
103 #include <vm/vm_object.h>
104 #include <vm/vm_page.h>
105 #include <vm/vm_pageout.h>
106 #include <vm/vm_protos.h> /* must be last */
107 #include <vm/memory_object.h>
108 #include <vm/vm_purgeable_internal.h>
109 #include <vm/vm_shared_region.h>
110 #include <vm/vm_compressor.h>
111
112 #include <san/kasan.h>
113
114 #if CONFIG_PHANTOM_CACHE
115 #include <vm/vm_phantom_cache.h>
116 #endif
117
118 #if UPL_DEBUG
119 #include <libkern/OSDebug.h>
120 #endif
121
/*
 * Symbols defined outside this file and declared here directly rather
 * than through a header.
 */
extern int cs_debug;

extern void mbuf_drain(boolean_t);

#if VM_PRESSURE_EVENTS
/*
 * The available-page counters are declared with a different width
 * depending on CONFIG_JETSAM; the declaration here has to match the
 * definition used by that configuration.
 */
#if CONFIG_JETSAM
extern unsigned int memorystatus_available_pages;
extern unsigned int memorystatus_available_pages_pressure;
extern unsigned int memorystatus_available_pages_critical;
#else /* CONFIG_JETSAM */
extern uint64_t memorystatus_available_pages;
extern uint64_t memorystatus_available_pages_pressure;
extern uint64_t memorystatus_available_pages_critical;
#endif /* CONFIG_JETSAM */

extern unsigned int memorystatus_frozen_count;
extern unsigned int memorystatus_suspended_count;
extern vm_pressure_level_t memorystatus_vm_pressure_level;

extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
extern uint32_t memorystatus_jetsam_fg_band_waiters;

/* vm_pressure_response() carries no "extern"; consider_vm_pressure_events() does. */
void vm_pressure_response(void);
extern void consider_vm_pressure_events(void);

/* NOTE(review): presumably compared against memorystatus_suspended_count; the use is outside this section. */
#define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
#endif /* VM_PRESSURE_EVENTS */
149
/* Thread handles for the pageout scan and gc threads. */
SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
boolean_t vps_dynamic_priority_enabled = FALSE;
boolean_t vps_yield_for_pgqlockwaiters = TRUE;

/*
 * Every tunable below is wrapped in #ifndef so that a definition supplied
 * earlier (for example by a machine-dependent header) takes precedence.
 */
#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
#if !XNU_TARGET_OS_OSX
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
#endif /* !XNU_TARGET_OS_OSX */
#endif

#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
#define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
#endif

#ifndef VM_PAGE_LAUNDRY_MAX
#define VM_PAGE_LAUNDRY_MAX     128UL   /* maximum pageouts on a given pageout queue */
#endif  /* VM_PAGE_LAUNDRY_MAX */

#ifndef VM_PAGEOUT_BURST_WAIT
#define VM_PAGEOUT_BURST_WAIT   1       /* milliseconds */
#endif  /* VM_PAGEOUT_BURST_WAIT */

#ifndef VM_PAGEOUT_EMPTY_WAIT
#define VM_PAGEOUT_EMPTY_WAIT   50      /* milliseconds */
#endif  /* VM_PAGEOUT_EMPTY_WAIT */

#ifndef VM_PAGEOUT_DEADLOCK_WAIT
#define VM_PAGEOUT_DEADLOCK_WAIT 100    /* milliseconds */
#endif  /* VM_PAGEOUT_DEADLOCK_WAIT */

#ifndef VM_PAGEOUT_IDLE_WAIT
#define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
#endif  /* VM_PAGEOUT_IDLE_WAIT */

#ifndef VM_PAGEOUT_SWAP_WAIT
#define VM_PAGEOUT_SWAP_WAIT    10      /* milliseconds */
#endif  /* VM_PAGEOUT_SWAP_WAIT */


/*
 * Speculative-queue target: total / (100 / percentage).
 * NOTE(review): assuming vm_page_speculative_percentage is an integer,
 * the inner division truncates, so a percentage of 0 or above 100 makes
 * the divisor zero -- confirm the setter keeps it within 1..100.
 */
#ifndef VM_PAGE_SPECULATIVE_TARGET
#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
#endif /* VM_PAGE_SPECULATIVE_TARGET */
195
196
/*
 *	To obtain a reasonable LRU approximation, the inactive queue
 *	needs to be large enough to give pages on it a chance to be
 *	referenced a second time.  This macro defines the fraction
 *	of active+inactive pages that should be inactive (one half).
 *	The pageout daemon uses it to update vm_page_inactive_target.
 *
 *	If vm_page_free_count falls below vm_page_free_target and
 *	vm_page_inactive_count is below vm_page_inactive_target,
 *	then the pageout daemon starts running.
 */

#ifndef VM_PAGE_INACTIVE_TARGET
#define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
#endif  /* VM_PAGE_INACTIVE_TARGET */

/*
 *	Once the pageout daemon starts running, it keeps going
 *	until vm_page_free_count meets or exceeds vm_page_free_target.
 *	The target is 15 pages plus 1/100 (non-macOS) or 1/80 (macOS)
 *	of the argument.
 */

#ifndef VM_PAGE_FREE_TARGET
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
#endif /* !XNU_TARGET_OS_OSX */
#endif  /* VM_PAGE_FREE_TARGET */


/*
 *	The pageout daemon always starts running once vm_page_free_count
 *	falls below vm_page_free_min.
 *	The minimum is 10 pages plus 1/200 (non-macOS) or 1/100 (macOS)
 *	of the argument.
 */

#ifndef VM_PAGE_FREE_MIN
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
#endif /* !XNU_TARGET_OS_OSX */
#endif  /* VM_PAGE_FREE_MIN */

/*
 * NOTE(review): presumably caps applied to the reserved/min/target values
 * computed from the macros in this section; their use is outside it.
 */
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_RESERVED_LIMIT     100
#define VM_PAGE_FREE_MIN_LIMIT          1500
#define VM_PAGE_FREE_TARGET_LIMIT       2000
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_RESERVED_LIMIT     1700
#define VM_PAGE_FREE_MIN_LIMIT          3500
#define VM_PAGE_FREE_TARGET_LIMIT       4000
#endif /* !XNU_TARGET_OS_OSX */

/*
 *	When vm_page_free_count falls below vm_page_free_reserved,
 *	only vm-privileged threads can allocate pages.  vm-privilege
 *	allows the pageout daemon and default pager (and any other
 *	associated threads needed for default pageout) to continue
 *	operation by dipping into the reserved pool of pages.
 *
 *	The reserve is six times VM_PAGE_LAUNDRY_MAX plus the argument.
 */

#ifndef VM_PAGE_FREE_RESERVED
#define VM_PAGE_FREE_RESERVED(n)        \
	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
#endif  /* VM_PAGE_FREE_RESERVED */

/*
 *	When we dequeue pages from the inactive list, they are
 *	reactivated (ie, put back on the active queue) if referenced.
 *	However, it is possible to starve the free list if other
 *	processors are referencing pages faster than we can turn off
 *	the referenced bit.  So we limit the number of reactivations
 *	we will make per call of vm_pageout_scan().
 *
 *	NOTE(review): on macOS the limit is MAX(avail / 20,
 *	VM_PAGE_REACTIVATE_LIMIT_MAX), so the constant acts as a floor
 *	within this macro despite its "_MAX" name -- confirm whether the
 *	caller applies a separate cap.
 */
#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000

#ifndef VM_PAGE_REACTIVATE_LIMIT
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
#endif /* !XNU_TARGET_OS_OSX */
#endif  /* VM_PAGE_REACTIVATE_LIMIT */
#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
281
int vm_pageout_protect_realtime = true;

extern boolean_t hibernate_cleaning_in_progress;

/*
 * Per-thread state for the pageout I/O threads.  Element 0 of the
 * internal array is the thread woken when an internal page is queued;
 * the single external state is woken for everything else (see
 * vm_pageout_cluster_to_queue()).
 */
struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
struct pgo_iothread_state pgo_iothread_external_state;

#if VM_PRESSURE_EVENTS
void vm_pressure_thread(void);

/* Pressure-level transition predicates; declared here, defined outside this section. */
boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);

boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
#endif

/* Forward declarations of file-local routines defined outside this section. */
static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);

extern void vm_pageout_continue(void);
extern void vm_pageout_scan(void);

boolean_t vm_pageout_running = FALSE;

uint32_t vm_page_upl_tainted = 0;
uint32_t vm_page_iopl_tainted = 0;

#if XNU_TARGET_OS_OSX
static boolean_t vm_pageout_waiter = FALSE;
#endif /* XNU_TARGET_OS_OSX */


#if DEVELOPMENT || DEBUG
struct vm_pageout_debug vm_pageout_debug;
#endif
struct vm_pageout_vminfo vm_pageout_vminfo;
struct vm_pageout_state vm_pageout_state;
struct vm_config vm_config;

/*
 * Pageout queues.  vm_pageout_cluster() routes pages of internal objects
 * to the internal queue and all others to the external queue.  The
 * benchmark queue exists only in DEVELOPMENT/DEBUG builds.
 */
struct  vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
struct  vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
#if DEVELOPMENT || DEBUG
struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
#endif /* DEVELOPMENT || DEBUG */

int         vm_upl_wait_for_pages = 0;
vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;

/* Optional hook, NULL until something installs it; the pointer itself is volatile. */
boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;

int     vm_debug_events = 0;

LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");

#if CONFIG_MEMORYSTATUS
extern boolean_t memorystatus_kill_on_VM_page_shortage(void);

/* NOTE(review): "nr"/"dr" presumably stand for numerator/denominator of a ratio used outside this section. */
uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;

#endif

#if __AMP__

// bind compressor threads to e-cores
#define VM_COMPRESSOR_EBOUND_DEFAULT 1

/* Tunable registered under the name "vmcomp_ecluster"; defaults to VM_COMPRESSOR_EBOUND_DEFAULT. */
TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
int vm_pgo_pbound = 0;
extern void thread_bind_cluster_type(thread_t, char, bool);

#endif /* __AMP__ */
357
358 /*
359 * Routine: vm_pageout_object_terminate
360 * Purpose:
361 * Destroy the pageout_object, and perform all of the
362 * required cleanup actions.
363 *
364 * In/Out conditions:
365 * The object must be locked, and will be returned locked.
366 */
367 void
vm_pageout_object_terminate(vm_object_t object)368 vm_pageout_object_terminate(
369 vm_object_t object)
370 {
371 vm_object_t shadow_object;
372
373 /*
374 * Deal with the deallocation (last reference) of a pageout object
375 * (used for cleaning-in-place) by dropping the paging references/
376 * freeing pages in the original object.
377 */
378
379 assert(object->pageout);
380 shadow_object = object->shadow;
381 vm_object_lock(shadow_object);
382
383 while (!vm_page_queue_empty(&object->memq)) {
384 vm_page_t p, m;
385 vm_object_offset_t offset;
386
387 p = (vm_page_t) vm_page_queue_first(&object->memq);
388
389 assert(p->vmp_private);
390 assert(p->vmp_free_when_done);
391 p->vmp_free_when_done = FALSE;
392 assert(!p->vmp_cleaning);
393 assert(!p->vmp_laundry);
394
395 offset = p->vmp_offset;
396 VM_PAGE_FREE(p);
397 p = VM_PAGE_NULL;
398
399 m = vm_page_lookup(shadow_object,
400 offset + object->vo_shadow_offset);
401
402 if (m == VM_PAGE_NULL) {
403 continue;
404 }
405
406 assert((m->vmp_dirty) || (m->vmp_precious) ||
407 (m->vmp_busy && m->vmp_cleaning));
408
409 /*
410 * Handle the trusted pager throttle.
411 * Also decrement the burst throttle (if external).
412 */
413 vm_page_lock_queues();
414 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
415 vm_pageout_throttle_up(m);
416 }
417
418 /*
419 * Handle the "target" page(s). These pages are to be freed if
420 * successfully cleaned. Target pages are always busy, and are
421 * wired exactly once. The initial target pages are not mapped,
422 * (so cannot be referenced or modified) but converted target
423 * pages may have been modified between the selection as an
424 * adjacent page and conversion to a target.
425 */
426 if (m->vmp_free_when_done) {
427 assert(m->vmp_busy);
428 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
429 assert(m->vmp_wire_count == 1);
430 m->vmp_cleaning = FALSE;
431 m->vmp_free_when_done = FALSE;
432 /*
433 * Revoke all access to the page. Since the object is
434 * locked, and the page is busy, this prevents the page
435 * from being dirtied after the pmap_disconnect() call
436 * returns.
437 *
438 * Since the page is left "dirty" but "not modifed", we
439 * can detect whether the page was redirtied during
440 * pageout by checking the modify state.
441 */
442 if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
443 SET_PAGE_DIRTY(m, FALSE);
444 } else {
445 m->vmp_dirty = FALSE;
446 }
447
448 if (m->vmp_dirty) {
449 vm_page_unwire(m, TRUE); /* reactivates */
450 counter_inc(&vm_statistics_reactivations);
451 PAGE_WAKEUP_DONE(m);
452 } else {
453 vm_page_free(m); /* clears busy, etc. */
454 }
455 vm_page_unlock_queues();
456 continue;
457 }
458 /*
459 * Handle the "adjacent" pages. These pages were cleaned in
460 * place, and should be left alone.
461 * If prep_pin_count is nonzero, then someone is using the
462 * page, so make it active.
463 */
464 if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
465 if (m->vmp_reference) {
466 vm_page_activate(m);
467 } else {
468 vm_page_deactivate(m);
469 }
470 }
471 if (m->vmp_overwriting) {
472 /*
473 * the (COPY_OUT_FROM == FALSE) request_page_list case
474 */
475 if (m->vmp_busy) {
476 /*
477 * We do not re-set m->vmp_dirty !
478 * The page was busy so no extraneous activity
479 * could have occurred. COPY_INTO is a read into the
480 * new pages. CLEAN_IN_PLACE does actually write
481 * out the pages but handling outside of this code
482 * will take care of resetting dirty. We clear the
483 * modify however for the Programmed I/O case.
484 */
485 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
486
487 m->vmp_busy = FALSE;
488 m->vmp_absent = FALSE;
489 } else {
490 /*
491 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
492 * Occurs when the original page was wired
493 * at the time of the list request
494 */
495 assert(VM_PAGE_WIRED(m));
496 vm_page_unwire(m, TRUE); /* reactivates */
497 }
498 m->vmp_overwriting = FALSE;
499 } else {
500 m->vmp_dirty = FALSE;
501 }
502 m->vmp_cleaning = FALSE;
503
504 /*
505 * Wakeup any thread waiting for the page to be un-cleaning.
506 */
507 PAGE_WAKEUP(m);
508 vm_page_unlock_queues();
509 }
510 /*
511 * Account for the paging reference taken in vm_paging_object_allocate.
512 */
513 vm_object_activity_end(shadow_object);
514 vm_object_unlock(shadow_object);
515
516 assert(object->ref_count == 0);
517 assert(object->paging_in_progress == 0);
518 assert(object->activity_in_progress == 0);
519 assert(object->resident_page_count == 0);
520 return;
521 }
522
523 /*
524 * Routine: vm_pageclean_setup
525 *
526 * Purpose: setup a page to be cleaned (made non-dirty), but not
527 * necessarily flushed from the VM page cache.
528 * This is accomplished by cleaning in place.
529 *
530 * The page must not be busy, and new_object
531 * must be locked.
532 *
533 */
534 static void
vm_pageclean_setup(vm_page_t m,vm_page_t new_m,vm_object_t new_object,vm_object_offset_t new_offset)535 vm_pageclean_setup(
536 vm_page_t m,
537 vm_page_t new_m,
538 vm_object_t new_object,
539 vm_object_offset_t new_offset)
540 {
541 assert(!m->vmp_busy);
542 #if 0
543 assert(!m->vmp_cleaning);
544 #endif
545
546 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
547
548 /*
549 * Mark original page as cleaning in place.
550 */
551 m->vmp_cleaning = TRUE;
552 SET_PAGE_DIRTY(m, FALSE);
553 m->vmp_precious = FALSE;
554
555 /*
556 * Convert the fictitious page to a private shadow of
557 * the real page.
558 */
559 assert(new_m->vmp_fictitious);
560 assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
561 new_m->vmp_fictitious = FALSE;
562 new_m->vmp_private = TRUE;
563 new_m->vmp_free_when_done = TRUE;
564 VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
565
566 vm_page_lockspin_queues();
567 vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
568 vm_page_unlock_queues();
569
570 vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
571 assert(!new_m->vmp_wanted);
572 new_m->vmp_busy = FALSE;
573 }
574
575 /*
576 * Routine: vm_pageout_initialize_page
577 * Purpose:
578 * Causes the specified page to be initialized in
579 * the appropriate memory object. This routine is used to push
580 * pages into a copy-object when they are modified in the
581 * permanent object.
582 *
583 * The page is moved to a temporary object and paged out.
584 *
585 * In/out conditions:
586 * The page in question must not be on any pageout queues.
587 * The object to which it belongs must be locked.
588 * The page must be busy, but not hold a paging reference.
589 *
590 * Implementation:
591 * Move this page to a completely new object.
592 */
593 void
vm_pageout_initialize_page(vm_page_t m)594 vm_pageout_initialize_page(
595 vm_page_t m)
596 {
597 vm_object_t object;
598 vm_object_offset_t paging_offset;
599 memory_object_t pager;
600
601 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
602
603 object = VM_PAGE_OBJECT(m);
604
605 assert(m->vmp_busy);
606 assert(object->internal);
607
608 /*
609 * Verify that we really want to clean this page
610 */
611 assert(!m->vmp_absent);
612 assert(m->vmp_dirty);
613
614 /*
615 * Create a paging reference to let us play with the object.
616 */
617 paging_offset = m->vmp_offset + object->paging_offset;
618
619 if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
620 panic("reservation without pageout?"); /* alan */
621
622 VM_PAGE_FREE(m);
623 vm_object_unlock(object);
624
625 return;
626 }
627
628 /*
629 * If there's no pager, then we can't clean the page. This should
630 * never happen since this should be a copy object and therefore not
631 * an external object, so the pager should always be there.
632 */
633
634 pager = object->pager;
635
636 if (pager == MEMORY_OBJECT_NULL) {
637 panic("missing pager for copy object");
638
639 VM_PAGE_FREE(m);
640 return;
641 }
642
643 /*
644 * set the page for future call to vm_fault_list_request
645 */
646 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
647 SET_PAGE_DIRTY(m, FALSE);
648
649 /*
650 * keep the object from collapsing or terminating
651 */
652 vm_object_paging_begin(object);
653 vm_object_unlock(object);
654
655 /*
656 * Write the data to its pager.
657 * Note that the data is passed by naming the new object,
658 * not a virtual address; the pager interface has been
659 * manipulated to use the "internal memory" data type.
660 * [The object reference from its allocation is donated
661 * to the eventual recipient.]
662 */
663 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
664
665 vm_object_lock(object);
666 vm_object_paging_end(object);
667 }
668
669
670 /*
671 * vm_pageout_cluster:
672 *
673 * Given a page, queue it to the appropriate I/O thread,
674 * which will page it out and attempt to clean adjacent pages
675 * in the same operation.
676 *
677 * The object and queues must be locked. We will take a
678 * paging reference to prevent deallocation or collapse when we
679 * release the object lock back at the call site. The I/O thread
680 * is responsible for consuming this reference
681 *
682 * The page must not be on any pageout queue.
683 */
#if DEVELOPMENT || DEBUG
/*
 * Compressor-thread bookkeeping, compiled only into DEVELOPMENT/DEBUG
 * kernels.  The readers and writers are outside this section.
 */
vmct_stats_t vmct_stats;

int32_t vmct_active = 0;
uint64_t vm_compressor_epoch_start = 0;
uint64_t vm_compressor_epoch_stop = 0;

/* One state slot per possible compressor thread. */
typedef enum vmct_state_t {
	VMCT_IDLE,
	VMCT_AWAKENED,
	VMCT_ACTIVE,
} vmct_state_t;
vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
#endif
698
699
700
701 static void
vm_pageout_cluster_to_queue(vm_page_t m,struct vm_pageout_queue * q)702 vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
703 {
704 vm_object_t object = VM_PAGE_OBJECT(m);
705
706 VM_PAGE_CHECK(m);
707 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
708 vm_object_lock_assert_exclusive(object);
709
710 /*
711 * Make sure it's OK to page this out.
712 */
713 assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
714 assert(!m->vmp_cleaning && !m->vmp_laundry);
715 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
716
717 /*
718 * protect the object from collapse or termination
719 */
720 vm_object_activity_begin(object);
721
722
723 /*
724 * pgo_laundry count is tied to the laundry bit
725 */
726 m->vmp_laundry = TRUE;
727 q->pgo_laundry++;
728
729 m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
730 vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
731
732 // the benchmark queue will be woken up independently by the benchmark itself
733 if (
734 object->internal == TRUE
735 #if DEVELOPMENT || DEBUG
736 && q != &vm_pageout_queue_benchmark
737 #endif
738 ) {
739 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
740 m->vmp_busy = TRUE;
741 // Wake up the first compressor thread. It will wake subsequent threads if necessary.
742 sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup, pgo_iothread_internal_state[0].pgo_iothread);
743 } else {
744 sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
745 }
746 VM_PAGE_CHECK(m);
747 }
748
749 void
vm_pageout_cluster(vm_page_t m)750 vm_pageout_cluster(vm_page_t m)
751 {
752 struct vm_pageout_queue *q;
753 vm_object_t object = VM_PAGE_OBJECT(m);
754 if (object->internal) {
755 q = &vm_pageout_queue_internal;
756 } else {
757 q = &vm_pageout_queue_external;
758 }
759 vm_pageout_cluster_to_queue(m, q);
760 }
761
762
763 /*
764 * A page is back from laundry or we are stealing it back from
765 * the laundering state. See if there are some pages waiting to
766 * go to laundry and if we can let some of them go now.
767 *
768 * Object and page queues must be locked.
769 */
770 void
vm_pageout_throttle_up(vm_page_t m)771 vm_pageout_throttle_up(
772 vm_page_t m)
773 {
774 struct vm_pageout_queue *q;
775 vm_object_t m_object;
776
777 m_object = VM_PAGE_OBJECT(m);
778
779 assert(m_object != VM_OBJECT_NULL);
780 assert(m_object != kernel_object);
781
782 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
783 vm_object_lock_assert_exclusive(m_object);
784
785 if (m_object->internal == TRUE) {
786 q = &vm_pageout_queue_internal;
787 } else {
788 q = &vm_pageout_queue_external;
789 }
790
791 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
792 vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
793 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
794
795 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
796
797 vm_object_activity_end(m_object);
798
799 VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
800 }
801 if (m->vmp_laundry == TRUE) {
802 m->vmp_laundry = FALSE;
803 q->pgo_laundry--;
804
805 if (q->pgo_throttled == TRUE) {
806 q->pgo_throttled = FALSE;
807 thread_wakeup((event_t) &q->pgo_laundry);
808 }
809 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
810 q->pgo_draining = FALSE;
811 thread_wakeup((event_t) (&q->pgo_laundry + 1));
812 }
813 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
814 }
815 }
816
817
818 static void
vm_pageout_throttle_up_batch(struct vm_pageout_queue * q,int batch_cnt)819 vm_pageout_throttle_up_batch(
820 struct vm_pageout_queue *q,
821 int batch_cnt)
822 {
823 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
824
825 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
826
827 q->pgo_laundry -= batch_cnt;
828
829 if (q->pgo_throttled == TRUE) {
830 q->pgo_throttled = FALSE;
831 thread_wakeup((event_t) &q->pgo_laundry);
832 }
833 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
834 q->pgo_draining = FALSE;
835 thread_wakeup((event_t) (&q->pgo_laundry + 1));
836 }
837 }
838
839
840
/*
 * VM memory pressure monitoring.
 *
 * Pageout statistics are accumulated in the currently active bucket,
 * vm_pageout_stats[vm_pageout_stat_now].
 *
 * record_memory_pressure() moves "vm_pageout_stat_now" forward, to start
 * accumulating the number of reclaimed pages in a new vm_pageout_stats[]
 * bucket.  The ring holds 8 buckets per second of history, plus the one
 * currently being filled.
 *
 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
 * The caller provides the number of seconds ("nsecs") worth of statistics
 * it wants, up to 30 seconds on DEVELOPMENT/DEBUG kernels and 1 second
 * otherwise.
 * It computes the number of pages reclaimed in the past "nsecs" seconds and
 * also returns the number of pages the system still needs to reclaim at this
 * moment in time.
 */
/*
 * The size expression is fully parenthesized so that the macro can be
 * used inside a larger expression (multiplication, modulo, ...) without
 * operator precedence tearing it apart.
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif
struct vm_pageout_stat {
	/* queue and memory counters */
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	unsigned int pages_grabbed;
	unsigned int pages_freed;

	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	/* the four freed_* fields are what the pressure monitor sums as "reclaimed" */
	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;

	unsigned int vm_page_realtime_count;
	unsigned int forcereclaimed_sharedcache;
	unsigned int forcereclaimed_realtime;
	unsigned int protected_sharedcache;
	unsigned int protected_realtime;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];

/* index of the bucket currently being filled */
unsigned int vm_pageout_stat_now = 0;

/* ring-index helpers: previous / next bucket, wrapping at the ends */
#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
928
929 #if VM_PAGE_BUCKETS_CHECK
930 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
931 #endif /* VM_PAGE_BUCKETS_CHECK */
932
933
934 void
935 record_memory_pressure(void);
936 void
record_memory_pressure(void)937 record_memory_pressure(void)
938 {
939 unsigned int vm_pageout_next;
940
941 #if VM_PAGE_BUCKETS_CHECK
942 /* check the consistency of VM page buckets at regular interval */
943 static int counter = 0;
944 if ((++counter % vm_page_buckets_check_interval) == 0) {
945 vm_page_buckets_check();
946 }
947 #endif /* VM_PAGE_BUCKETS_CHECK */
948
949 vm_pageout_state.vm_memory_pressure =
950 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
951 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
952 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
953 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
954
955 commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
956
957 /* move "now" forward */
958 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
959
960 bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
961
962 vm_pageout_stat_now = vm_pageout_next;
963 }
964
965
966 /*
967 * IMPORTANT
968 * mach_vm_ctl_page_free_wanted() is called indirectly, via
969 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
970 * it must be safe in the restricted stackshot context. Locks and/or
971 * blocking are not allowable.
972 */
973 unsigned int
mach_vm_ctl_page_free_wanted(void)974 mach_vm_ctl_page_free_wanted(void)
975 {
976 unsigned int page_free_target, page_free_count, page_free_wanted;
977
978 page_free_target = vm_page_free_target;
979 page_free_count = vm_page_free_count;
980 if (page_free_target > page_free_count) {
981 page_free_wanted = page_free_target - page_free_count;
982 } else {
983 page_free_wanted = 0;
984 }
985
986 return page_free_wanted;
987 }
988
989
990 /*
991 * IMPORTANT:
992 * mach_vm_pressure_monitor() is called when taking a stackshot, with
993 * wait_for_pressure FALSE, so that code path must remain safe in the
994 * restricted stackshot context. No blocking or locks are allowable.
995 * on that code path.
996 */
997
998 kern_return_t
mach_vm_pressure_monitor(boolean_t wait_for_pressure,unsigned int nsecs_monitored,unsigned int * pages_reclaimed_p,unsigned int * pages_wanted_p)999 mach_vm_pressure_monitor(
1000 boolean_t wait_for_pressure,
1001 unsigned int nsecs_monitored,
1002 unsigned int *pages_reclaimed_p,
1003 unsigned int *pages_wanted_p)
1004 {
1005 wait_result_t wr;
1006 unsigned int vm_pageout_then, vm_pageout_now;
1007 unsigned int pages_reclaimed;
1008 unsigned int units_of_monitor;
1009
1010 units_of_monitor = 8 * nsecs_monitored;
1011 /*
1012 * We don't take the vm_page_queue_lock here because we don't want
1013 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1014 * thread when it's trying to reclaim memory. We don't need fully
1015 * accurate monitoring anyway...
1016 */
1017
1018 if (wait_for_pressure) {
1019 /* wait until there's memory pressure */
1020 while (vm_page_free_count >= vm_page_free_target) {
1021 wr = assert_wait((event_t) &vm_page_free_wanted,
1022 THREAD_INTERRUPTIBLE);
1023 if (wr == THREAD_WAITING) {
1024 wr = thread_block(THREAD_CONTINUE_NULL);
1025 }
1026 if (wr == THREAD_INTERRUPTED) {
1027 return KERN_ABORTED;
1028 }
1029 if (wr == THREAD_AWAKENED) {
1030 /*
1031 * The memory pressure might have already
1032 * been relieved but let's not block again
1033 * and let's report that there was memory
1034 * pressure at some point.
1035 */
1036 break;
1037 }
1038 }
1039 }
1040
1041 /* provide the number of pages the system wants to reclaim */
1042 if (pages_wanted_p != NULL) {
1043 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1044 }
1045
1046 if (pages_reclaimed_p == NULL) {
1047 return KERN_SUCCESS;
1048 }
1049
1050 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1051 vm_pageout_now = vm_pageout_stat_now;
1052 pages_reclaimed = 0;
1053 for (vm_pageout_then =
1054 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1055 vm_pageout_then != vm_pageout_now &&
1056 units_of_monitor-- != 0;
1057 vm_pageout_then =
1058 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1059 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1060 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1061 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1062 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1063 }
1064 *pages_reclaimed_p = pages_reclaimed;
1065
1066 return KERN_SUCCESS;
1067 }
1068
1069
1070
1071 #if DEVELOPMENT || DEBUG
1072
1073 static void
1074 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1075
/*
 * flag used to make sure there is
 * only a single sweep going on at a time
 */
1080 boolean_t vm_pageout_disconnect_all_pages_active = FALSE;
1081
1082
1083 void
vm_pageout_disconnect_all_pages()1084 vm_pageout_disconnect_all_pages()
1085 {
1086 vm_page_lock_queues();
1087
1088 if (vm_pageout_disconnect_all_pages_active == TRUE) {
1089 vm_page_unlock_queues();
1090 return;
1091 }
1092 vm_pageout_disconnect_all_pages_active = TRUE;
1093 vm_page_unlock_queues();
1094
1095 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1096 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1097 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);
1098
1099 vm_pageout_disconnect_all_pages_active = FALSE;
1100 }
1101
1102
1103 void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t * q,int qcount)1104 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1105 {
1106 vm_page_t m;
1107 vm_object_t t_object = NULL;
1108 vm_object_t l_object = NULL;
1109 vm_object_t m_object = NULL;
1110 int delayed_unlock = 0;
1111 int try_failed_count = 0;
1112 int disconnected_count = 0;
1113 int paused_count = 0;
1114 int object_locked_count = 0;
1115
1116 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
1117 q, qcount, 0, 0, 0);
1118
1119 vm_page_lock_queues();
1120
1121 while (qcount && !vm_page_queue_empty(q)) {
1122 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1123
1124 m = (vm_page_t) vm_page_queue_first(q);
1125 m_object = VM_PAGE_OBJECT(m);
1126
1127 /*
1128 * check to see if we currently are working
1129 * with the same object... if so, we've
1130 * already got the lock
1131 */
1132 if (m_object != l_object) {
1133 /*
1134 * the object associated with candidate page is
1135 * different from the one we were just working
1136 * with... dump the lock if we still own it
1137 */
1138 if (l_object != NULL) {
1139 vm_object_unlock(l_object);
1140 l_object = NULL;
1141 }
1142 if (m_object != t_object) {
1143 try_failed_count = 0;
1144 }
1145
1146 /*
1147 * Try to lock object; since we've alread got the
1148 * page queues lock, we can only 'try' for this one.
1149 * if the 'try' fails, we need to do a mutex_pause
1150 * to allow the owner of the object lock a chance to
1151 * run...
1152 */
1153 if (!vm_object_lock_try_scan(m_object)) {
1154 if (try_failed_count > 20) {
1155 goto reenter_pg_on_q;
1156 }
1157 vm_page_unlock_queues();
1158 mutex_pause(try_failed_count++);
1159 vm_page_lock_queues();
1160 delayed_unlock = 0;
1161
1162 paused_count++;
1163
1164 t_object = m_object;
1165 continue;
1166 }
1167 object_locked_count++;
1168
1169 l_object = m_object;
1170 }
1171 if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
1172 /*
1173 * put it back on the head of its queue
1174 */
1175 goto reenter_pg_on_q;
1176 }
1177 if (m->vmp_pmapped == TRUE) {
1178 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1179
1180 disconnected_count++;
1181 }
1182 reenter_pg_on_q:
1183 vm_page_queue_remove(q, m, vmp_pageq);
1184 vm_page_queue_enter(q, m, vmp_pageq);
1185
1186 qcount--;
1187 try_failed_count = 0;
1188
1189 if (delayed_unlock++ > 128) {
1190 if (l_object != NULL) {
1191 vm_object_unlock(l_object);
1192 l_object = NULL;
1193 }
1194 lck_mtx_yield(&vm_page_queue_lock);
1195 delayed_unlock = 0;
1196 }
1197 }
1198 if (l_object != NULL) {
1199 vm_object_unlock(l_object);
1200 l_object = NULL;
1201 }
1202 vm_page_unlock_queues();
1203
1204 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
1205 q, disconnected_count, object_locked_count, paused_count, 0);
1206 }
1207
1208 extern char* proc_best_name(struct proc* proc);
1209
1210 int
vm_toggle_task_selfdonate_pages(task_t task)1211 vm_toggle_task_selfdonate_pages(task_t task)
1212 {
1213 int state = 0;
1214 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1215 printf("VM Donation mode is OFF on the system\n");
1216 return state;
1217 }
1218 if (task != kernel_task) {
1219 task_lock(task);
1220 if (!task->donates_own_pages) {
1221 printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1222 task->donates_own_pages = true;
1223 state = 1;
1224 } else if (task->donates_own_pages) {
1225 printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1226 task->donates_own_pages = false;
1227 state = 0;
1228 }
1229 task_unlock(task);
1230 }
1231 return state;
1232 }
1233 #endif /* DEVELOPMENT || DEBUG */
1234
1235 void
vm_task_set_selfdonate_pages(task_t task,bool donate)1236 vm_task_set_selfdonate_pages(task_t task, bool donate)
1237 {
1238 assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
1239 assert(task != kernel_task);
1240
1241 task_lock(task);
1242 task->donates_own_pages = donate;
1243 task_unlock(task);
1244 }
1245
1246
1247
1248 static size_t
1249 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1250
/*
 * flag used to make sure there is
 * only a single sweep going on at a time
 */
1255 boolean_t vm_pageout_anonymous_pages_active = FALSE;
1256
1257
1258 void
vm_pageout_anonymous_pages()1259 vm_pageout_anonymous_pages()
1260 {
1261 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1262 vm_page_lock_queues();
1263
1264 if (vm_pageout_anonymous_pages_active == TRUE) {
1265 vm_page_unlock_queues();
1266 return;
1267 }
1268 vm_pageout_anonymous_pages_active = TRUE;
1269 vm_page_unlock_queues();
1270
1271 vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1272 vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1273 vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1274
1275 if (VM_CONFIG_SWAP_IS_PRESENT) {
1276 vm_consider_swapping();
1277 }
1278
1279 vm_page_lock_queues();
1280 vm_pageout_anonymous_pages_active = FALSE;
1281 vm_page_unlock_queues();
1282 }
1283 }
1284
1285
1286 size_t
vm_pageout_page_queue(vm_page_queue_head_t * q,size_t qcount,bool perf_test)1287 vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
1288 {
1289 vm_page_t m;
1290 vm_object_t t_object = NULL;
1291 vm_object_t l_object = NULL;
1292 vm_object_t m_object = NULL;
1293 int delayed_unlock = 0;
1294 int try_failed_count = 0;
1295 int refmod_state;
1296 int pmap_options;
1297 struct vm_pageout_queue *iq;
1298 ppnum_t phys_page;
1299 size_t pages_moved = 0;
1300
1301
1302 iq = &vm_pageout_queue_internal;
1303
1304 vm_page_lock_queues();
1305
1306 #if DEVELOPMENT || DEBUG
1307 if (perf_test) {
1308 iq = &vm_pageout_queue_benchmark;
1309 // ensure the benchmark queue isn't throttled
1310 iq->pgo_maxlaundry = (unsigned int) qcount;
1311 }
1312 #endif /* DEVELOPMENT ||DEBUG */
1313
1314 while (qcount && !vm_page_queue_empty(q)) {
1315 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1316
1317 if (VM_PAGE_Q_THROTTLED(iq)) {
1318 if (l_object != NULL) {
1319 vm_object_unlock(l_object);
1320 l_object = NULL;
1321 }
1322 iq->pgo_draining = TRUE;
1323
1324 assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1325 vm_page_unlock_queues();
1326
1327 thread_block(THREAD_CONTINUE_NULL);
1328
1329 vm_page_lock_queues();
1330 delayed_unlock = 0;
1331 continue;
1332 }
1333 m = (vm_page_t) vm_page_queue_first(q);
1334 m_object = VM_PAGE_OBJECT(m);
1335
1336 /*
1337 * check to see if we currently are working
1338 * with the same object... if so, we've
1339 * already got the lock
1340 */
1341 if (m_object != l_object) {
1342 if (!m_object->internal) {
1343 goto reenter_pg_on_q;
1344 }
1345
1346 /*
1347 * the object associated with candidate page is
1348 * different from the one we were just working
1349 * with... dump the lock if we still own it
1350 */
1351 if (l_object != NULL) {
1352 vm_object_unlock(l_object);
1353 l_object = NULL;
1354 }
1355 if (m_object != t_object) {
1356 try_failed_count = 0;
1357 }
1358
1359 /*
1360 * Try to lock object; since we've alread got the
1361 * page queues lock, we can only 'try' for this one.
1362 * if the 'try' fails, we need to do a mutex_pause
1363 * to allow the owner of the object lock a chance to
1364 * run...
1365 */
1366 if (!vm_object_lock_try_scan(m_object)) {
1367 if (try_failed_count > 20) {
1368 goto reenter_pg_on_q;
1369 }
1370 vm_page_unlock_queues();
1371 mutex_pause(try_failed_count++);
1372 vm_page_lock_queues();
1373 delayed_unlock = 0;
1374
1375 t_object = m_object;
1376 continue;
1377 }
1378 l_object = m_object;
1379 }
1380 if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
1381 /*
1382 * page is not to be cleaned
1383 * put it back on the head of its queue
1384 */
1385 goto reenter_pg_on_q;
1386 }
1387 phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1388
1389 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
1390 refmod_state = pmap_get_refmod(phys_page);
1391
1392 if (refmod_state & VM_MEM_REFERENCED) {
1393 m->vmp_reference = TRUE;
1394 }
1395 if (refmod_state & VM_MEM_MODIFIED) {
1396 SET_PAGE_DIRTY(m, FALSE);
1397 }
1398 }
1399 if (m->vmp_reference == TRUE) {
1400 m->vmp_reference = FALSE;
1401 pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1402 goto reenter_pg_on_q;
1403 }
1404 if (m->vmp_pmapped == TRUE) {
1405 if (m->vmp_dirty || m->vmp_precious) {
1406 pmap_options = PMAP_OPTIONS_COMPRESSOR;
1407 } else {
1408 pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1409 }
1410 refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1411 if (refmod_state & VM_MEM_MODIFIED) {
1412 SET_PAGE_DIRTY(m, FALSE);
1413 }
1414 }
1415
1416 if (!m->vmp_dirty && !m->vmp_precious) {
1417 vm_page_unlock_queues();
1418 VM_PAGE_FREE(m);
1419 vm_page_lock_queues();
1420 delayed_unlock = 0;
1421
1422 goto next_pg;
1423 }
1424 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1425 if (!m_object->pager_initialized) {
1426 vm_page_unlock_queues();
1427
1428 vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1429
1430 if (!m_object->pager_initialized) {
1431 vm_object_compressor_pager_create(m_object);
1432 }
1433
1434 vm_page_lock_queues();
1435 delayed_unlock = 0;
1436 }
1437 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1438 goto reenter_pg_on_q;
1439 }
1440 /*
1441 * vm_object_compressor_pager_create will drop the object lock
1442 * which means 'm' may no longer be valid to use
1443 */
1444 continue;
1445 }
1446
1447 if (!perf_test) {
1448 /*
1449 * we've already factored out pages in the laundry which
1450 * means this page can't be on the pageout queue so it's
1451 * safe to do the vm_page_queues_remove
1452 */
1453 bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
1454 vm_page_queues_remove(m, TRUE);
1455 if (donate) {
1456 /*
1457 * The compressor needs to see this bit to know
1458 * where this page needs to land. Also if stolen,
1459 * this bit helps put the page back in the right
1460 * special queue where it belongs.
1461 */
1462 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
1463 }
1464 } else {
1465 vm_page_queue_remove(q, m, vmp_pageq);
1466 }
1467
1468 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1469
1470 vm_pageout_cluster_to_queue(m, iq);
1471
1472 pages_moved++;
1473 goto next_pg;
1474
1475 reenter_pg_on_q:
1476 vm_page_queue_remove(q, m, vmp_pageq);
1477 vm_page_queue_enter(q, m, vmp_pageq);
1478 next_pg:
1479 qcount--;
1480 try_failed_count = 0;
1481
1482 if (delayed_unlock++ > 128) {
1483 if (l_object != NULL) {
1484 vm_object_unlock(l_object);
1485 l_object = NULL;
1486 }
1487 lck_mtx_yield(&vm_page_queue_lock);
1488 delayed_unlock = 0;
1489 }
1490 }
1491 if (l_object != NULL) {
1492 vm_object_unlock(l_object);
1493 l_object = NULL;
1494 }
1495 vm_page_unlock_queues();
1496 return pages_moved;
1497 }
1498
1499
1500
1501 /*
1502 * function in BSD to apply I/O throttle to the pageout thread
1503 */
1504 extern void vm_pageout_io_throttle(void);
1505
/*
 * VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj):
 *
 * If a "reusable" page somehow made it back into the active queue, it's
 * been re-used and is not quite re-usable.  If the VM object was
 * "all_reusable", consider it as "all re-used" instead of converting it
 * to "partially re-used", which could be expensive.
 *
 * In either case vm_object_reuse_pages() is called on the one-page range
 * starting at the page's offset.  'obj' must be the VM object owning
 * page 'm' (asserted).
 */
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
	MACRO_BEGIN                                                     \
	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
	if ((m)->vmp_reusable || (obj)->all_reusable) {                 \
	        vm_object_reuse_pages((obj),                            \
	            (m)->vmp_offset,                                    \
	            (m)->vmp_offset + PAGE_SIZE_64,                     \
	            FALSE);                                             \
	}                                                               \
	MACRO_END
1525
1526
1527 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
1528 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024
1529
1530 #define FCS_IDLE 0
1531 #define FCS_DELAYED 1
1532 #define FCS_DEADLOCK_DETECTED 2
1533
1534 struct flow_control {
1535 int state;
1536 mach_timespec_t ts;
1537 };
1538
1539
1540 uint64_t vm_pageout_rejected_bq_internal = 0;
1541 uint64_t vm_pageout_rejected_bq_external = 0;
1542 uint64_t vm_pageout_skipped_bq_internal = 0;
1543 uint64_t vm_pageout_skipped_bq_external = 0;
1544
1545 #define ANONS_GRABBED_LIMIT 2
1546
1547
1548 #if 0
1549 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1550 #endif
1551 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1552
1553 #define VM_PAGEOUT_PB_NO_ACTION 0
1554 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1555 #define VM_PAGEOUT_PB_THREAD_YIELD 2
1556
1557
#if 0
/*
 * vm_pageout_delayed_unlock:
 *
 * NOTE(review): compiled out by the enclosing "#if 0"; candidate for
 * removal together with its (also compiled-out) forward declaration.
 *
 * With a non-empty local free list: drops the page-queues lock, frees the
 * list via vm_page_free_list(), resets *local_freeq / *local_freed and
 * retakes the lock.  With an empty list: yields the page-queues lock.
 * Either way *delayed_unlock is set to 1.
 */
static void
vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
{
	if (!*local_freeq) {
		/* nothing batched up: just give others a shot at the lock */
		lck_mtx_yield(&vm_page_queue_lock);
		*delayed_unlock = 1;
		return;
	}

	vm_page_unlock_queues();

	VM_DEBUG_CONSTANT_EVENT(
		vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
		vm_page_free_count, 0, 0, 1);

	vm_page_free_list(*local_freeq, TRUE);

	VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
	    vm_page_free_count, *local_freed, 0, 1);

	*local_freeq = NULL;
	*local_freed = 0;

	vm_page_lock_queues();

	*delayed_unlock = 1;
}
#endif
1584
1585
1586 static void
vm_pageout_prepare_to_block(vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int action)1587 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1588 vm_page_t *local_freeq, int *local_freed, int action)
1589 {
1590 vm_page_unlock_queues();
1591
1592 if (*object != NULL) {
1593 vm_object_unlock(*object);
1594 *object = NULL;
1595 }
1596 if (*local_freeq) {
1597 vm_page_free_list(*local_freeq, TRUE);
1598
1599 *local_freeq = NULL;
1600 *local_freed = 0;
1601 }
1602 *delayed_unlock = 1;
1603
1604 switch (action) {
1605 case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1606 vm_consider_waking_compactor_swapper();
1607 break;
1608 case VM_PAGEOUT_PB_THREAD_YIELD:
1609 thread_yield_internal(1);
1610 break;
1611 case VM_PAGEOUT_PB_NO_ACTION:
1612 default:
1613 break;
1614 }
1615 vm_page_lock_queues();
1616 }
1617
1618
1619 static struct vm_pageout_vminfo last;
1620
1621 uint64_t last_vm_page_pages_grabbed = 0;
1622
1623 extern uint32_t c_segment_pages_compressed;
1624
1625 extern uint64_t shared_region_pager_reclaimed;
1626 extern struct memory_object_pager_ops shared_region_pager_ops;
1627
1628 void
update_vm_info(void)1629 update_vm_info(void)
1630 {
1631 unsigned long tmp;
1632 uint64_t tmp64;
1633
1634 vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1635 vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1636 vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1637 vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1638
1639 vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1640 vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1641 vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1642
1643 vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1644 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1645 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1646 vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1647 vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1648
1649 tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1650 vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1651 last.vm_pageout_considered_page = tmp;
1652
1653 tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1654 vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1655 last.vm_pageout_compressions = tmp64;
1656
1657 tmp = vm_pageout_vminfo.vm_compressor_failed;
1658 vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1659 last.vm_compressor_failed = tmp;
1660
1661 tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1662 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1663 last.vm_compressor_pages_grabbed = tmp64;
1664
1665 tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1666 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1667 last.vm_phantom_cache_found_ghost = tmp;
1668
1669 tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1670 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1671 last.vm_phantom_cache_added_ghost = tmp;
1672
1673 tmp64 = counter_load(&vm_page_grab_count);
1674 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1675 last_vm_page_pages_grabbed = tmp64;
1676
1677 tmp = vm_pageout_vminfo.vm_page_pages_freed;
1678 vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1679 last.vm_page_pages_freed = tmp;
1680
1681 if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1682 tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1683 vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1684 last.vm_pageout_pages_evicted = tmp;
1685
1686 tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1687 vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1688 last.vm_pageout_pages_purged = tmp;
1689
1690 tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1691 vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1692 last.vm_pageout_freed_speculative = tmp;
1693
1694 tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1695 vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1696 last.vm_pageout_freed_external = tmp;
1697
1698 tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1699 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1700 last.vm_pageout_inactive_referenced = tmp;
1701
1702 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1703 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1704 last.vm_pageout_scan_inactive_throttled_external = tmp;
1705
1706 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1707 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1708 last.vm_pageout_inactive_dirty_external = tmp;
1709
1710 tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1711 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1712 last.vm_pageout_freed_cleaned = tmp;
1713
1714 tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1715 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1716 last.vm_pageout_inactive_nolock = tmp;
1717
1718 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1719 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1720 last.vm_pageout_scan_inactive_throttled_internal = tmp;
1721
1722 tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1723 vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1724 last.vm_pageout_skipped_external = tmp;
1725
1726 tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1727 vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1728 last.vm_pageout_skipped_internal = tmp;
1729
1730 tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1731 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1732 last.vm_pageout_reactivation_limit_exceeded = tmp;
1733
1734 tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1735 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1736 last.vm_pageout_inactive_force_reclaim = tmp;
1737
1738 tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1739 vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1740 last.vm_pageout_freed_internal = tmp;
1741
1742 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1743 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1744 last.vm_pageout_considered_bq_internal = tmp;
1745
1746 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1747 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1748 last.vm_pageout_considered_bq_external = tmp;
1749
1750 tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1751 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1752 last.vm_pageout_filecache_min_reactivated = tmp;
1753
1754 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1755 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1756 last.vm_pageout_inactive_dirty_internal = tmp;
1757
1758 tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1759 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1760 last.vm_pageout_forcereclaimed_sharedcache = tmp;
1761
1762 tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1763 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1764 last.vm_pageout_forcereclaimed_realtime = tmp;
1765
1766 tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1767 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1768 last.vm_pageout_protected_sharedcache = tmp;
1769
1770 tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1771 vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1772 last.vm_pageout_protected_realtime = tmp;
1773 }
1774
1775 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
1776 vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1777 vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1778 vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1779 vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
1780 0);
1781
1782 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
1783 vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1784 vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1785 vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
1786 0,
1787 0);
1788
1789 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
1790 vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1791 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1792 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1793 vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
1794 0);
1795
1796 if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1797 vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1798 vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1799 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
1800 vm_pageout_stats[vm_pageout_stat_now].considered,
1801 vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1802 vm_pageout_stats[vm_pageout_stat_now].freed_external,
1803 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
1804 0);
1805
1806 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
1807 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1808 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1809 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1810 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
1811 0);
1812
1813 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
1814 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1815 vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1816 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1817 vm_pageout_stats[vm_pageout_stat_now].skipped_external,
1818 0);
1819
1820 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
1821 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1822 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1823 vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1824 vm_pageout_stats[vm_pageout_stat_now].freed_internal,
1825 0);
1826
1827 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
1828 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1829 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1830 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1831 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
1832 0);
1833
1834 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO10)) | DBG_FUNC_NONE,
1835 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1836 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1837 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1838 vm_pageout_stats[vm_pageout_stat_now].protected_realtime,
1839 0);
1840 }
1841 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
1842 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1843 vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1844 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1845 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
1846 0);
1847
1848 record_memory_pressure();
1849 }
1850
1851 extern boolean_t hibernation_vmqueues_inspection;
1852
1853 /*
1854 * Return values for functions called by vm_pageout_scan
1855 * that control its flow.
1856 *
1857 * PROCEED -- vm_pageout_scan will keep making forward progress.
1858 * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1859 * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1860 */
1861
1862 #define VM_PAGEOUT_SCAN_PROCEED (0)
1863 #define VM_PAGEOUT_SCAN_DONE_RETURN (1)
1864 #define VM_PAGEOUT_SCAN_NEXT_ITERATION (2)
1865
1866 /*
1867 * This function is called only from vm_pageout_scan and
1868 * it moves overflow secluded pages (one-at-a-time) to the
1869 * batched 'local' free Q or active Q.
1870 */
1871 static void
vps_deal_with_secluded_page_overflow(vm_page_t * local_freeq,int * local_freed)1872 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1873 {
1874 #if CONFIG_SECLUDED_MEMORY
1875 /*
1876 * Deal with secluded_q overflow.
1877 */
1878 if (vm_page_secluded_count > vm_page_secluded_target) {
1879 vm_page_t secluded_page;
1880
1881 /*
1882 * SECLUDED_AGING_BEFORE_ACTIVE:
1883 * Excess secluded pages go to the active queue and
1884 * will later go to the inactive queue.
1885 */
1886 assert((vm_page_secluded_count_free +
1887 vm_page_secluded_count_inuse) ==
1888 vm_page_secluded_count);
1889 secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1890 assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1891
1892 vm_page_queues_remove(secluded_page, FALSE);
1893 assert(!secluded_page->vmp_fictitious);
1894 assert(!VM_PAGE_WIRED(secluded_page));
1895
1896 if (secluded_page->vmp_object == 0) {
1897 /* transfer to free queue */
1898 assert(secluded_page->vmp_busy);
1899 secluded_page->vmp_snext = *local_freeq;
1900 *local_freeq = secluded_page;
1901 *local_freed += 1;
1902 } else {
1903 /* transfer to head of active queue */
1904 vm_page_enqueue_active(secluded_page, FALSE);
1905 secluded_page = VM_PAGE_NULL;
1906 }
1907 }
1908 #else /* CONFIG_SECLUDED_MEMORY */
1909
1910 #pragma unused(local_freeq)
1911 #pragma unused(local_freed)
1912
1913 return;
1914
1915 #endif /* CONFIG_SECLUDED_MEMORY */
1916 }
1917
1918 /*
1919 * This function is called only from vm_pageout_scan and
1920 * it initializes the loop targets for vm_pageout_scan().
1921 */
1922 static void
vps_init_page_targets(void)1923 vps_init_page_targets(void)
1924 {
1925 /*
1926 * LD TODO: Other page targets should be calculated here too.
1927 */
1928 vm_page_anonymous_min = vm_page_inactive_target / 20;
1929
1930 if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1931 vm_pageout_state.vm_page_speculative_percentage = 50;
1932 } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1933 vm_pageout_state.vm_page_speculative_percentage = 1;
1934 }
1935
1936 vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1937 vm_page_inactive_count);
1938 }
1939
1940 /*
1941 * This function is called only from vm_pageout_scan and
1942 * it purges a single VM object at-a-time and will either
1943 * make vm_pageout_scan() restart the loop or keeping moving forward.
1944 */
1945 static int
vps_purge_object()1946 vps_purge_object()
1947 {
1948 int force_purge;
1949
1950 assert(available_for_purge >= 0);
1951 force_purge = 0; /* no force-purging */
1952
1953 #if VM_PRESSURE_EVENTS
1954 vm_pressure_level_t pressure_level;
1955
1956 pressure_level = memorystatus_vm_pressure_level;
1957
1958 if (pressure_level > kVMPressureNormal) {
1959 if (pressure_level >= kVMPressureCritical) {
1960 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1961 } else if (pressure_level >= kVMPressureUrgent) {
1962 force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
1963 } else if (pressure_level >= kVMPressureWarning) {
1964 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1965 }
1966 }
1967 #endif /* VM_PRESSURE_EVENTS */
1968
1969 if (available_for_purge || force_purge) {
1970 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
1971
1972 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1973 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
1974 VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
1975 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1976 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1977
1978 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1979 }
1980 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
1981 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1982 }
1983
1984 return VM_PAGEOUT_SCAN_PROCEED;
1985 }
1986
1987 /*
1988 * This function is called only from vm_pageout_scan and
1989 * it will try to age the next speculative Q if the oldest
1990 * one is empty.
1991 */
1992 static int
vps_age_speculative_queue(boolean_t force_speculative_aging)1993 vps_age_speculative_queue(boolean_t force_speculative_aging)
1994 {
1995 #define DELAY_SPECULATIVE_AGE 1000
1996
1997 /*
1998 * try to pull pages from the aging bins...
1999 * see vm_page.h for an explanation of how
2000 * this mechanism works
2001 */
2002 boolean_t can_steal = FALSE;
2003 int num_scanned_queues;
2004 static int delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/
2005 mach_timespec_t ts;
2006 struct vm_speculative_age_q *aq;
2007 struct vm_speculative_age_q *sq;
2008
2009 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2010
2011 aq = &vm_page_queue_speculative[speculative_steal_index];
2012
2013 num_scanned_queues = 0;
2014 while (vm_page_queue_empty(&aq->age_q) &&
2015 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2016 speculative_steal_index++;
2017
2018 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2019 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2020 }
2021
2022 aq = &vm_page_queue_speculative[speculative_steal_index];
2023 }
2024
2025 if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
2026 /*
2027 * XXX We've scanned all the speculative
2028 * queues but still haven't found one
2029 * that is not empty, even though
2030 * vm_page_speculative_count is not 0.
2031 */
2032 if (!vm_page_queue_empty(&sq->age_q)) {
2033 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2034 }
2035 #if DEVELOPMENT || DEBUG
2036 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2037 #endif
2038 /* readjust... */
2039 vm_page_speculative_count = 0;
2040 /* ... and continue */
2041 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2042 }
2043
2044 if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
2045 can_steal = TRUE;
2046 } else {
2047 if (!delay_speculative_age) {
2048 mach_timespec_t ts_fully_aged;
2049
2050 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
2051 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
2052 * 1000 * NSEC_PER_USEC;
2053
2054 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2055
2056 clock_sec_t sec;
2057 clock_nsec_t nsec;
2058 clock_get_system_nanotime(&sec, &nsec);
2059 ts.tv_sec = (unsigned int) sec;
2060 ts.tv_nsec = nsec;
2061
2062 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
2063 can_steal = TRUE;
2064 } else {
2065 delay_speculative_age++;
2066 }
2067 } else {
2068 delay_speculative_age++;
2069 if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
2070 delay_speculative_age = 0;
2071 }
2072 }
2073 }
2074 if (can_steal == TRUE) {
2075 vm_page_speculate_ageit(aq);
2076 }
2077
2078 return VM_PAGEOUT_SCAN_PROCEED;
2079 }
2080
2081 /*
2082 * This function is called only from vm_pageout_scan and
2083 * it evicts a single VM object from the cache.
2084 */
2085 static int inline
vps_object_cache_evict(vm_object_t * object_to_unlock)2086 vps_object_cache_evict(vm_object_t *object_to_unlock)
2087 {
2088 static int cache_evict_throttle = 0;
2089 struct vm_speculative_age_q *sq;
2090
2091 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2092
2093 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2094 int pages_evicted;
2095
2096 if (*object_to_unlock != NULL) {
2097 vm_object_unlock(*object_to_unlock);
2098 *object_to_unlock = NULL;
2099 }
2100 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
2101
2102 pages_evicted = vm_object_cache_evict(100, 10);
2103
2104 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
2105
2106 if (pages_evicted) {
2107 vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2108
2109 VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2110 vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2111 memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2112
2113 /*
2114 * we just freed up to 100 pages,
2115 * so go back to the top of the main loop
2116 * and re-evaulate the memory situation
2117 */
2118 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2119 } else {
2120 cache_evict_throttle = 1000;
2121 }
2122 }
2123 if (cache_evict_throttle) {
2124 cache_evict_throttle--;
2125 }
2126
2127 return VM_PAGEOUT_SCAN_PROCEED;
2128 }
2129
2130
2131 /*
2132 * This function is called only from vm_pageout_scan and
2133 * it calculates the filecache min. that needs to be maintained
2134 * as we start to steal pages.
2135 */
2136 static void
vps_calculate_filecache_min(void)2137 vps_calculate_filecache_min(void)
2138 {
2139 int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2140
2141 #if CONFIG_JETSAM
2142 /*
2143 * don't let the filecache_min fall below 15% of available memory
2144 * on systems with an active compressor that isn't nearing its
2145 * limits w/r to accepting new data
2146 *
2147 * on systems w/o the compressor/swapper, the filecache is always
2148 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2149 * since most (if not all) of the anonymous pages are in the
2150 * throttled queue (which isn't counted as available) which
2151 * effectively disables this filter
2152 */
2153 if (vm_compressor_low_on_space() || divisor == 0) {
2154 vm_pageout_state.vm_page_filecache_min = 0;
2155 } else {
2156 vm_pageout_state.vm_page_filecache_min =
2157 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2158 }
2159 #else
2160 if (vm_compressor_out_of_space() || divisor == 0) {
2161 vm_pageout_state.vm_page_filecache_min = 0;
2162 } else {
2163 /*
2164 * don't let the filecache_min fall below the specified critical level
2165 */
2166 vm_pageout_state.vm_page_filecache_min =
2167 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2168 }
2169 #endif
2170 if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2171 vm_pageout_state.vm_page_filecache_min = 0;
2172 }
2173 }
2174
2175 /*
2176 * This function is called only from vm_pageout_scan and
2177 * it updates the flow control time to detect if VM pageoutscan
2178 * isn't making progress.
2179 */
2180 static void
vps_flow_control_reset_deadlock_timer(struct flow_control * flow_control)2181 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2182 {
2183 mach_timespec_t ts;
2184 clock_sec_t sec;
2185 clock_nsec_t nsec;
2186
2187 ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2188 ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2189 clock_get_system_nanotime(&sec, &nsec);
2190 flow_control->ts.tv_sec = (unsigned int) sec;
2191 flow_control->ts.tv_nsec = nsec;
2192 ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2193
2194 flow_control->state = FCS_DELAYED;
2195
2196 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2197 }
2198
2199 /*
2200 * This function is called only from vm_pageout_scan and
2201 * it is the flow control logic of VM pageout scan which
2202 * controls if it should block and for how long.
2203 * Any blocking of vm_pageout_scan happens ONLY in this function.
2204 */
2205 static int
vps_flow_control(struct flow_control * flow_control,int * anons_grabbed,vm_object_t * object,int * delayed_unlock,vm_page_t * local_freeq,int * local_freed,int * vm_pageout_deadlock_target,unsigned int inactive_burst_count)2206 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2207 vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2208 {
2209 boolean_t exceeded_burst_throttle = FALSE;
2210 unsigned int msecs = 0;
2211 uint32_t inactive_external_count;
2212 mach_timespec_t ts;
2213 struct vm_pageout_queue *iq;
2214 struct vm_pageout_queue *eq;
2215 struct vm_speculative_age_q *sq;
2216
2217 iq = &vm_pageout_queue_internal;
2218 eq = &vm_pageout_queue_external;
2219 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2220
2221 /*
2222 * Sometimes we have to pause:
2223 * 1) No inactive pages - nothing to do.
2224 * 2) Loop control - no acceptable pages found on the inactive queue
2225 * within the last vm_pageout_burst_inactive_throttle iterations
2226 * 3) Flow control - default pageout queue is full
2227 */
2228 if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2229 vm_page_queue_empty(&vm_page_queue_anonymous) &&
2230 vm_page_queue_empty(&vm_page_queue_cleaned) &&
2231 vm_page_queue_empty(&sq->age_q)) {
2232 VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2233 msecs = vm_pageout_state.vm_pageout_empty_wait;
2234 } else if (inactive_burst_count >=
2235 MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2236 (vm_page_inactive_count +
2237 vm_page_speculative_count))) {
2238 VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2239 msecs = vm_pageout_state.vm_pageout_burst_wait;
2240
2241 exceeded_burst_throttle = TRUE;
2242 } else if (VM_PAGE_Q_THROTTLED(iq) &&
2243 VM_DYNAMIC_PAGING_ENABLED()) {
2244 clock_sec_t sec;
2245 clock_nsec_t nsec;
2246
2247 switch (flow_control->state) {
2248 case FCS_IDLE:
2249 if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2250 vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2251 /*
2252 * since the compressor is running independently of vm_pageout_scan
2253 * let's not wait for it just yet... as long as we have a healthy supply
2254 * of filecache pages to work with, let's keep stealing those.
2255 */
2256 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2257
2258 if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2259 (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2260 *anons_grabbed = ANONS_GRABBED_LIMIT;
2261 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2262 return VM_PAGEOUT_SCAN_PROCEED;
2263 }
2264 }
2265
2266 vps_flow_control_reset_deadlock_timer(flow_control);
2267 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2268
2269 break;
2270
2271 case FCS_DELAYED:
2272 clock_get_system_nanotime(&sec, &nsec);
2273 ts.tv_sec = (unsigned int) sec;
2274 ts.tv_nsec = nsec;
2275
2276 if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2277 /*
2278 * the pageout thread for the default pager is potentially
2279 * deadlocked since the
2280 * default pager queue has been throttled for more than the
2281 * allowable time... we need to move some clean pages or dirty
2282 * pages belonging to the external pagers if they aren't throttled
2283 * vm_page_free_wanted represents the number of threads currently
2284 * blocked waiting for pages... we'll move one page for each of
2285 * these plus a fixed amount to break the logjam... once we're done
2286 * moving this number of pages, we'll re-enter the FSC_DELAYED state
2287 * with a new timeout target since we have no way of knowing
2288 * whether we've broken the deadlock except through observation
2289 * of the queue associated with the default pager... we need to
2290 * stop moving pages and allow the system to run to see what
2291 * state it settles into.
2292 */
2293
2294 *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2295 vm_page_free_wanted + vm_page_free_wanted_privileged;
2296 VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2297 flow_control->state = FCS_DEADLOCK_DETECTED;
2298 thread_wakeup(VM_PAGEOUT_GC_EVENT);
2299 return VM_PAGEOUT_SCAN_PROCEED;
2300 }
2301 /*
2302 * just resniff instead of trying
2303 * to compute a new delay time... we're going to be
2304 * awakened immediately upon a laundry completion,
2305 * so we won't wait any longer than necessary
2306 */
2307 msecs = vm_pageout_state.vm_pageout_idle_wait;
2308 break;
2309
2310 case FCS_DEADLOCK_DETECTED:
2311 if (*vm_pageout_deadlock_target) {
2312 return VM_PAGEOUT_SCAN_PROCEED;
2313 }
2314
2315 vps_flow_control_reset_deadlock_timer(flow_control);
2316 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2317
2318 break;
2319 }
2320 } else {
2321 /*
2322 * No need to pause...
2323 */
2324 return VM_PAGEOUT_SCAN_PROCEED;
2325 }
2326
2327 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2328
2329 vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2330 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2331
2332 if (vm_page_free_count >= vm_page_free_target) {
2333 /*
2334 * we're here because
2335 * 1) someone else freed up some pages while we had
2336 * the queues unlocked above
2337 * and we've hit one of the 3 conditions that
2338 * cause us to pause the pageout scan thread
2339 *
2340 * since we already have enough free pages,
2341 * let's avoid stalling and return normally
2342 *
2343 * before we return, make sure the pageout I/O threads
2344 * are running throttled in case there are still requests
2345 * in the laundry... since we have enough free pages
2346 * we don't need the laundry to be cleaned in a timely
2347 * fashion... so let's avoid interfering with foreground
2348 * activity
2349 *
2350 * we don't want to hold vm_page_queue_free_lock when
2351 * calling vm_pageout_adjust_eq_iothrottle (since it
2352 * may cause other locks to be taken), we do the intitial
2353 * check outside of the lock. Once we take the lock,
2354 * we recheck the condition since it may have changed.
2355 * if it has, no problem, we will make the threads
2356 * non-throttled before actually blocking
2357 */
2358 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
2359 }
2360 vm_free_page_lock();
2361
2362 if (vm_page_free_count >= vm_page_free_target &&
2363 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2364 return VM_PAGEOUT_SCAN_DONE_RETURN;
2365 }
2366 vm_free_page_unlock();
2367
2368 if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2369 /*
2370 * we're most likely about to block due to one of
2371 * the 3 conditions that cause vm_pageout_scan to
2372 * not be able to make forward progress w/r
2373 * to providing new pages to the free queue,
2374 * so unthrottle the I/O threads in case we
2375 * have laundry to be cleaned... it needs
2376 * to be completed ASAP.
2377 *
2378 * even if we don't block, we want the io threads
2379 * running unthrottled since the sum of free +
2380 * clean pages is still under our free target
2381 */
2382 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2383 }
2384 if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2385 /*
2386 * if we get here we're below our free target and
2387 * we're stalling due to a full laundry queue or
2388 * we don't have any inactive pages other then
2389 * those in the clean queue...
2390 * however, we have pages on the clean queue that
2391 * can be moved to the free queue, so let's not
2392 * stall the pageout scan
2393 */
2394 flow_control->state = FCS_IDLE;
2395 return VM_PAGEOUT_SCAN_PROCEED;
2396 }
2397 if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2398 flow_control->state = FCS_IDLE;
2399 return VM_PAGEOUT_SCAN_PROCEED;
2400 }
2401
2402 VM_CHECK_MEMORYSTATUS;
2403
2404 if (flow_control->state != FCS_IDLE) {
2405 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2406 }
2407
2408 iq->pgo_throttled = TRUE;
2409 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2410
2411 vm_page_unlock_queues();
2412
2413 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2414
2415 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2416 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2417 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2418
2419 thread_block(THREAD_CONTINUE_NULL);
2420
2421 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2422 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2423 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2424
2425 vm_page_lock_queues();
2426
2427 iq->pgo_throttled = FALSE;
2428
2429 vps_init_page_targets();
2430
2431 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2432 }
2433
2434 extern boolean_t vm_darkwake_mode;
2435 /*
2436 * This function is called only from vm_pageout_scan and
2437 * it will find and return the most appropriate page to be
2438 * reclaimed.
2439 */
2440 static int
vps_choose_victim_page(vm_page_t * victim_page,int * anons_grabbed,boolean_t * grab_anonymous,boolean_t force_anonymous,boolean_t * is_page_from_bg_q,unsigned int * reactivated_this_call)2441 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2442 boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2443 {
2444 vm_page_t m = NULL;
2445 vm_object_t m_object = VM_OBJECT_NULL;
2446 uint32_t inactive_external_count;
2447 struct vm_speculative_age_q *sq;
2448 struct vm_pageout_queue *iq;
2449 int retval = VM_PAGEOUT_SCAN_PROCEED;
2450
2451 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2452 iq = &vm_pageout_queue_internal;
2453
2454 *is_page_from_bg_q = FALSE;
2455
2456 m = NULL;
2457 m_object = VM_OBJECT_NULL;
2458
2459 if (VM_DYNAMIC_PAGING_ENABLED()) {
2460 assert(vm_page_throttled_count == 0);
2461 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2462 }
2463
2464 /*
2465 * Try for a clean-queue inactive page.
2466 * These are pages that vm_pageout_scan tried to steal earlier, but
2467 * were dirty and had to be cleaned. Pick them up now that they are clean.
2468 */
2469 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2470 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2471
2472 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2473
2474 goto found_page;
2475 }
2476
2477 /*
2478 * The next most eligible pages are ones we paged in speculatively,
2479 * but which have not yet been touched and have been aged out.
2480 */
2481 if (!vm_page_queue_empty(&sq->age_q)) {
2482 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2483
2484 assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2485
2486 if (!m->vmp_dirty || force_anonymous == FALSE) {
2487 goto found_page;
2488 } else {
2489 m = NULL;
2490 }
2491 }
2492
2493 #if !CONFIG_JETSAM
2494 if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2495 if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2496 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2497 assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2498 goto found_page;
2499 }
2500 }
2501 #endif /* !CONFIG_JETSAM */
2502
2503 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2504 vm_object_t bg_m_object = NULL;
2505
2506 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2507
2508 bg_m_object = VM_PAGE_OBJECT(m);
2509
2510 if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2511 /*
2512 * This page is on the background queue
2513 * but not on a pageable queue OR is busy during
2514 * darkwake mode when the target is artificially lowered.
2515 * If it is busy during darkwake mode, and we don't skip it,
2516 * we will just swing back around and try again with the same
2517 * queue and might hit the same page or its neighbor in a
2518 * similar state. Both of these are transient states and will
2519 * get resolved, but, at this point let's ignore this page.
2520 */
2521 if (vm_darkwake_mode && m->vmp_busy) {
2522 if (bg_m_object->internal) {
2523 vm_pageout_skipped_bq_internal++;
2524 } else {
2525 vm_pageout_skipped_bq_external++;
2526 }
2527 }
2528 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2529 if (bg_m_object->internal &&
2530 (VM_PAGE_Q_THROTTLED(iq) ||
2531 vm_compressor_out_of_space() == TRUE ||
2532 vm_page_free_count < (vm_page_free_reserved / 4))) {
2533 vm_pageout_skipped_bq_internal++;
2534 } else {
2535 *is_page_from_bg_q = TRUE;
2536
2537 if (bg_m_object->internal) {
2538 vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2539 } else {
2540 vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2541 }
2542 goto found_page;
2543 }
2544 }
2545 }
2546
2547 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2548
2549 if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2550 (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2551 *grab_anonymous = TRUE;
2552 *anons_grabbed = 0;
2553
2554 if (VM_CONFIG_SWAP_IS_ACTIVE) {
2555 vm_pageout_vminfo.vm_pageout_skipped_external++;
2556 } else {
2557 if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2558 /*
2559 * No swap and we are in dangerously low levels of free memory.
2560 * If we keep going ahead with anonymous pages, we are going to run into a situation
2561 * where the compressor will be stuck waiting for free pages (if it isn't already).
2562 *
2563 * So, pick a file backed page...
2564 */
2565 *grab_anonymous = FALSE;
2566 *anons_grabbed = ANONS_GRABBED_LIMIT;
2567 vm_pageout_vminfo.vm_pageout_skipped_internal++;
2568 }
2569 }
2570 goto want_anonymous;
2571 }
2572 *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2573
2574 #if CONFIG_JETSAM
2575 /* If the file-backed pool has accumulated
2576 * significantly more pages than the jetsam
2577 * threshold, prefer to reclaim those
2578 * inline to minimise compute overhead of reclaiming
2579 * anonymous pages.
2580 * This calculation does not account for the CPU local
2581 * external page queues, as those are expected to be
2582 * much smaller relative to the global pools.
2583 */
2584
2585 struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2586
2587 if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2588 if (vm_page_pageable_external_count >
2589 vm_pageout_state.vm_page_filecache_min) {
2590 if ((vm_page_pageable_external_count *
2591 vm_pageout_memorystatus_fb_factor_dr) >
2592 (memorystatus_available_pages_critical *
2593 vm_pageout_memorystatus_fb_factor_nr)) {
2594 *grab_anonymous = FALSE;
2595
2596 VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2597 }
2598 }
2599 if (*grab_anonymous) {
2600 VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2601 }
2602 }
2603 #endif /* CONFIG_JETSAM */
2604
2605 want_anonymous:
2606 if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2607 if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2608 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2609
2610 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2611 *anons_grabbed = 0;
2612
2613 if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2614 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2615 if ((++(*reactivated_this_call) % 100)) {
2616 vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2617
2618 vm_page_activate(m);
2619 counter_inc(&vm_statistics_reactivations);
2620 #if DEVELOPMENT || DEBUG
2621 if (*is_page_from_bg_q == TRUE) {
2622 if (m_object->internal) {
2623 vm_pageout_rejected_bq_internal++;
2624 } else {
2625 vm_pageout_rejected_bq_external++;
2626 }
2627 }
2628 #endif /* DEVELOPMENT || DEBUG */
2629 vm_pageout_state.vm_pageout_inactive_used++;
2630
2631 m = NULL;
2632 retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2633
2634 goto found_page;
2635 }
2636
2637 /*
2638 * steal 1 of the file backed pages even if
2639 * we are under the limit that has been set
2640 * for a healthy filecache
2641 */
2642 }
2643 }
2644 goto found_page;
2645 }
2646 }
2647 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2648 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2649
2650 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2651 *anons_grabbed += 1;
2652
2653 goto found_page;
2654 }
2655
2656 m = NULL;
2657
2658 found_page:
2659 *victim_page = m;
2660
2661 return retval;
2662 }
2663
2664 /*
2665 * This function is called only from vm_pageout_scan and
2666 * it will put a page back on the active/inactive queue
2667 * if we can't reclaim it for some reason.
2668 */
2669 static void
vps_requeue_page(vm_page_t m,int page_prev_q_state,__unused boolean_t page_from_bg_q)2670 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2671 {
2672 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2673 vm_page_enqueue_inactive(m, FALSE);
2674 } else {
2675 vm_page_activate(m);
2676 }
2677
2678 #if DEVELOPMENT || DEBUG
2679 vm_object_t m_object = VM_PAGE_OBJECT(m);
2680
2681 if (page_from_bg_q == TRUE) {
2682 if (m_object->internal) {
2683 vm_pageout_rejected_bq_internal++;
2684 } else {
2685 vm_pageout_rejected_bq_external++;
2686 }
2687 }
2688 #endif /* DEVELOPMENT || DEBUG */
2689 }
2690
2691 /*
2692 * This function is called only from vm_pageout_scan and
2693 * it will try to grab the victim page's VM object (m_object)
2694 * which differs from the previous victim page's object (object).
2695 */
2696 static int
vps_switch_object(vm_page_t m,vm_object_t m_object,vm_object_t * object,int page_prev_q_state,boolean_t avoid_anon_pages,boolean_t page_from_bg_q)2697 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2698 {
2699 struct vm_speculative_age_q *sq;
2700
2701 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2702
2703 /*
2704 * the object associated with candidate page is
2705 * different from the one we were just working
2706 * with... dump the lock if we still own it
2707 */
2708 if (*object != NULL) {
2709 vm_object_unlock(*object);
2710 *object = NULL;
2711 }
2712 /*
2713 * Try to lock object; since we've alread got the
2714 * page queues lock, we can only 'try' for this one.
2715 * if the 'try' fails, we need to do a mutex_pause
2716 * to allow the owner of the object lock a chance to
2717 * run... otherwise, we're likely to trip over this
2718 * object in the same state as we work our way through
2719 * the queue... clumps of pages associated with the same
2720 * object are fairly typical on the inactive and active queues
2721 */
2722 if (!vm_object_lock_try_scan(m_object)) {
2723 vm_page_t m_want = NULL;
2724
2725 vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2726
2727 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2728 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2729 }
2730
2731 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2732
2733 m->vmp_reference = FALSE;
2734
2735 if (!m_object->object_is_shared_cache) {
2736 /*
2737 * don't apply this optimization if this is the shared cache
2738 * object, it's too easy to get rid of very hot and important
2739 * pages...
2740 * m->vmp_object must be stable since we hold the page queues lock...
2741 * we can update the scan_collisions field sans the object lock
2742 * since it is a separate field and this is the only spot that does
2743 * a read-modify-write operation and it is never executed concurrently...
2744 * we can asynchronously set this field to 0 when creating a UPL, so it
2745 * is possible for the value to be a bit non-determistic, but that's ok
2746 * since it's only used as a hint
2747 */
2748 m_object->scan_collisions = 1;
2749 }
2750 if (page_from_bg_q) {
2751 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2752 } else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2753 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2754 } else if (!vm_page_queue_empty(&sq->age_q)) {
2755 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2756 } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2757 !vm_page_queue_empty(&vm_page_queue_inactive)) {
2758 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2759 } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2760 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2761 }
2762
2763 /*
2764 * this is the next object we're going to be interested in
2765 * try to make sure its available after the mutex_pause
2766 * returns control
2767 */
2768 if (m_want) {
2769 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2770 }
2771
2772 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2773
2774 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2775 } else {
2776 *object = m_object;
2777 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2778 }
2779
2780 return VM_PAGEOUT_SCAN_PROCEED;
2781 }
2782
2783 /*
2784 * This function is called only from vm_pageout_scan and
2785 * it notices that pageout scan may be rendered ineffective
2786 * due to a FS deadlock and will jetsam a process if possible.
2787 * If jetsam isn't supported, it'll move the page to the active
2788 * queue to try and get some different pages pushed onwards so
2789 * we can try to get out of this scenario.
2790 */
2791 static void
vps_deal_with_throttled_queues(vm_page_t m,vm_object_t * object,uint32_t * vm_pageout_inactive_external_forced_reactivate_limit,int * delayed_unlock,boolean_t * force_anonymous,__unused boolean_t is_page_from_bg_q)2792 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2793 int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2794 {
2795 struct vm_pageout_queue *eq;
2796 vm_object_t cur_object = VM_OBJECT_NULL;
2797
2798 cur_object = *object;
2799
2800 eq = &vm_pageout_queue_external;
2801
2802 if (cur_object->internal == FALSE) {
2803 /*
2804 * we need to break up the following potential deadlock case...
2805 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2806 * b) The thread doing the writing is waiting for pages while holding the truncate lock
2807 * c) Most of the pages in the inactive queue belong to this file.
2808 *
2809 * we are potentially in this deadlock because...
2810 * a) the external pageout queue is throttled
2811 * b) we're done with the active queue and moved on to the inactive queue
2812 * c) we've got a dirty external page
2813 *
2814 * since we don't know the reason for the external pageout queue being throttled we
2815 * must suspect that we are deadlocked, so move the current page onto the active queue
2816 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2817 *
2818 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2819 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2820 * pool the next time we select a victim page... if we can make enough new free pages,
2821 * the deadlock will break, the external pageout queue will empty and it will no longer
2822 * be throttled
2823 *
2824 * if we have jetsam configured, keep a count of the pages reactivated this way so
2825 * that we can try to find clean pages in the active/inactive queues before
2826 * deciding to jetsam a process
2827 */
2828 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2829
2830 vm_page_check_pageable_safe(m);
2831 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2832 vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2833 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2834 vm_page_active_count++;
2835 vm_page_pageable_external_count++;
2836
2837 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2838
2839 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2840
2841 #pragma unused(force_anonymous)
2842
2843 *vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2844
2845 if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2846 *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2847 /*
2848 * Possible deadlock scenario so request jetsam action
2849 */
2850
2851 assert(cur_object);
2852 vm_object_unlock(cur_object);
2853
2854 cur_object = VM_OBJECT_NULL;
2855
2856 /*
2857 * VM pageout scan needs to know we have dropped this lock and so set the
2858 * object variable we got passed in to NULL.
2859 */
2860 *object = VM_OBJECT_NULL;
2861
2862 vm_page_unlock_queues();
2863
2864 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
2865 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2866
2867 /* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
2868 if (memorystatus_kill_on_VM_page_shortage() == TRUE) {
2869 VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
2870 }
2871
2872 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
2873 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2874
2875 vm_page_lock_queues();
2876 *delayed_unlock = 1;
2877 }
2878 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2879
2880 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2881 #pragma unused(delayed_unlock)
2882
2883 *force_anonymous = TRUE;
2884 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2885 } else {
2886 vm_page_activate(m);
2887 counter_inc(&vm_statistics_reactivations);
2888
2889 #if DEVELOPMENT || DEBUG
2890 if (is_page_from_bg_q == TRUE) {
2891 if (cur_object->internal) {
2892 vm_pageout_rejected_bq_internal++;
2893 } else {
2894 vm_pageout_rejected_bq_external++;
2895 }
2896 }
2897 #endif /* DEVELOPMENT || DEBUG */
2898
2899 vm_pageout_state.vm_pageout_inactive_used++;
2900 }
2901 }
2902
2903
2904 void
vm_page_balance_inactive(int max_to_move)2905 vm_page_balance_inactive(int max_to_move)
2906 {
2907 vm_page_t m;
2908
2909 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2910
2911 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2912 /*
2913 * It is likely that the hibernation code path is
2914 * dealing with these very queues as we are about
2915 * to move pages around in/from them and completely
2916 * change the linkage of the pages.
2917 *
2918 * And so we skip the rebalancing of these queues.
2919 */
2920 return;
2921 }
2922 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2923 vm_page_inactive_count +
2924 vm_page_speculative_count);
2925
2926 while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2927 VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2928
2929 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2930
2931 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2932 assert(!m->vmp_laundry);
2933 assert(VM_PAGE_OBJECT(m) != kernel_object);
2934 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2935
2936 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2937
2938 /*
2939 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2940 *
2941 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2942 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2943 * new reference happens. If no futher references happen on the page after that remote TLB flushes
2944 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2945 * by pageout_scan, which is just fine since the last reference would have happened quite far
2946 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2947 * have happened before we moved the page
2948 */
2949 if (m->vmp_pmapped == TRUE) {
2950 /*
2951 * We might be holding the page queue lock as a
2952 * spin lock and clearing the "referenced" bit could
2953 * take a while if there are lots of mappings of
2954 * that page, so make sure we acquire the lock as
2955 * as mutex to avoid a spinlock timeout.
2956 */
2957 vm_page_lockconvert_queues();
2958 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2959 }
2960
2961 /*
2962 * The page might be absent or busy,
2963 * but vm_page_deactivate can handle that.
2964 * FALSE indicates that we don't want a H/W clear reference
2965 */
2966 vm_page_deactivate_internal(m, FALSE);
2967 }
2968 }
2969
2970 /*
2971 * vm_pageout_scan does the dirty work for the pageout daemon.
2972 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2973 * held and vm_page_free_wanted == 0.
2974 */
2975 void
vm_pageout_scan(void)2976 vm_pageout_scan(void)
2977 {
2978 unsigned int loop_count = 0;
2979 unsigned int inactive_burst_count = 0;
2980 unsigned int reactivated_this_call;
2981 unsigned int reactivate_limit;
2982 vm_page_t local_freeq = NULL;
2983 int local_freed = 0;
2984 int delayed_unlock;
2985 int delayed_unlock_limit = 0;
2986 int refmod_state = 0;
2987 int vm_pageout_deadlock_target = 0;
2988 struct vm_pageout_queue *iq;
2989 struct vm_pageout_queue *eq;
2990 struct vm_speculative_age_q *sq;
2991 struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
2992 boolean_t inactive_throttled = FALSE;
2993 vm_object_t object = NULL;
2994 uint32_t inactive_reclaim_run;
2995 boolean_t grab_anonymous = FALSE;
2996 boolean_t force_anonymous = FALSE;
2997 boolean_t force_speculative_aging = FALSE;
2998 int anons_grabbed = 0;
2999 int page_prev_q_state = 0;
3000 boolean_t page_from_bg_q = FALSE;
3001 uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
3002 vm_object_t m_object = VM_OBJECT_NULL;
3003 int retval = 0;
3004 boolean_t lock_yield_check = FALSE;
3005
3006
3007 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
3008 vm_pageout_vminfo.vm_pageout_freed_speculative,
3009 vm_pageout_state.vm_pageout_inactive_clean,
3010 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3011 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3012
3013 flow_control.state = FCS_IDLE;
3014 iq = &vm_pageout_queue_internal;
3015 eq = &vm_pageout_queue_external;
3016 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3017
3018 /* Ask the pmap layer to return any pages it no longer needs. */
3019 pmap_release_pages_fast();
3020
3021 vm_page_lock_queues();
3022
3023 delayed_unlock = 1;
3024
3025 /*
3026 * Calculate the max number of referenced pages on the inactive
3027 * queue that we will reactivate.
3028 */
3029 reactivated_this_call = 0;
3030 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3031 vm_page_inactive_count);
3032 inactive_reclaim_run = 0;
3033
3034 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3035
3036 /*
3037 * We must limit the rate at which we send pages to the pagers
3038 * so that we don't tie up too many pages in the I/O queues.
3039 * We implement a throttling mechanism using the laundry count
3040 * to limit the number of pages outstanding to the default
3041 * and external pagers. We can bypass the throttles and look
3042 * for clean pages if the pageout queues don't drain in a timely
3043 * fashion since this may indicate that the pageout paths are
3044 * stalled waiting for memory, which only we can provide.
3045 */
3046
3047 vps_init_page_targets();
3048 assert(object == NULL);
3049 assert(delayed_unlock != 0);
3050
3051 for (;;) {
3052 vm_page_t m;
3053
3054 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3055
3056 if (lock_yield_check) {
3057 lock_yield_check = FALSE;
3058
3059 if (delayed_unlock++ > delayed_unlock_limit) {
3060 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3061 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3062 } else if (vm_pageout_scan_wants_object) {
3063 vm_page_unlock_queues();
3064 mutex_pause(0);
3065 vm_page_lock_queues();
3066 } else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3067 VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3068 }
3069 }
3070
3071 if (vm_upl_wait_for_pages < 0) {
3072 vm_upl_wait_for_pages = 0;
3073 }
3074
3075 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3076
3077 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3078 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3079 }
3080
3081 vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3082
3083 assert(delayed_unlock);
3084
3085 /*
3086 * maintain our balance
3087 */
3088 vm_page_balance_inactive(1);
3089
3090
3091 /**********************************************************************
3092 * above this point we're playing with the active and secluded queues
3093 * below this point we're playing with the throttling mechanisms
3094 * and the inactive queue
3095 **********************************************************************/
3096
3097 if (vm_page_free_count + local_freed >= vm_page_free_target) {
3098 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3099
3100 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3101 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3102 /*
3103 * make sure the pageout I/O threads are running
3104 * throttled in case there are still requests
3105 * in the laundry... since we have met our targets
3106 * we don't need the laundry to be cleaned in a timely
3107 * fashion... so let's avoid interfering with foreground
3108 * activity
3109 */
3110 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3111
3112 vm_free_page_lock();
3113
3114 if ((vm_page_free_count >= vm_page_free_target) &&
3115 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3116 /*
3117 * done - we have met our target *and*
3118 * there is no one waiting for a page.
3119 */
3120 return_from_scan:
3121 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3122
3123 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3124 vm_pageout_state.vm_pageout_inactive,
3125 vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3126 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
3127 vm_pageout_vminfo.vm_pageout_freed_speculative,
3128 vm_pageout_state.vm_pageout_inactive_clean,
3129 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3130 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3131
3132 return;
3133 }
3134 vm_free_page_unlock();
3135 }
3136
3137 /*
3138 * Before anything, we check if we have any ripe volatile
3139 * objects around. If so, try to purge the first object.
3140 * If the purge fails, fall through to reclaim a page instead.
3141 * If the purge succeeds, go back to the top and reevaluate
3142 * the new memory situation.
3143 */
3144 retval = vps_purge_object();
3145
3146 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3147 /*
3148 * Success
3149 */
3150 if (object != NULL) {
3151 vm_object_unlock(object);
3152 object = NULL;
3153 }
3154
3155 lock_yield_check = FALSE;
3156 continue;
3157 }
3158
3159 /*
3160 * If our 'aged' queue is empty and we have some speculative pages
3161 * in the other queues, let's go through and see if we need to age
3162 * them.
3163 *
3164 * If we succeeded in aging a speculative Q or just that everything
3165 * looks normal w.r.t queue age and queue counts, we keep going onward.
3166 *
3167 * If, for some reason, we seem to have a mismatch between the spec.
3168 * page count and the page queues, we reset those variables and
3169 * restart the loop (LD TODO: Track this better?).
3170 */
3171 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3172 retval = vps_age_speculative_queue(force_speculative_aging);
3173
3174 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3175 lock_yield_check = FALSE;
3176 continue;
3177 }
3178 }
3179 force_speculative_aging = FALSE;
3180
3181 /*
3182 * Check to see if we need to evict objects from the cache.
3183 *
3184 * Note: 'object' here doesn't have anything to do with
3185 * the eviction part. We just need to make sure we have dropped
3186 * any object lock we might be holding if we need to go down
3187 * into the eviction logic.
3188 */
3189 retval = vps_object_cache_evict(&object);
3190
3191 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3192 lock_yield_check = FALSE;
3193 continue;
3194 }
3195
3196
3197 /*
3198 * Calculate our filecache_min that will affect the loop
3199 * going forward.
3200 */
3201 vps_calculate_filecache_min();
3202
3203 /*
3204 * LD TODO: Use a structure to hold all state variables for a single
3205 * vm_pageout_scan iteration and pass that structure to this function instead.
3206 */
3207 retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3208 &delayed_unlock, &local_freeq, &local_freed,
3209 &vm_pageout_deadlock_target, inactive_burst_count);
3210
3211 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3212 if (loop_count >= vm_page_inactive_count) {
3213 loop_count = 0;
3214 }
3215
3216 inactive_burst_count = 0;
3217
3218 assert(object == NULL);
3219 assert(delayed_unlock != 0);
3220
3221 lock_yield_check = FALSE;
3222 continue;
3223 } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3224 goto return_from_scan;
3225 }
3226
3227 flow_control.state = FCS_IDLE;
3228
3229 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3230 vm_pageout_inactive_external_forced_reactivate_limit);
3231 loop_count++;
3232 inactive_burst_count++;
3233 vm_pageout_state.vm_pageout_inactive++;
3234
3235 /*
3236 * Choose a victim.
3237 */
3238
3239 m = NULL;
3240 retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3241
3242 if (m == NULL) {
3243 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3244 inactive_burst_count = 0;
3245
3246 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3247 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3248 }
3249
3250 lock_yield_check = TRUE;
3251 continue;
3252 }
3253
3254 /*
3255 * if we've gotten here, we have no victim page.
3256 * check to see if we've not finished balancing the queues
3257 * or we have a page on the aged speculative queue that we
3258 * skipped due to force_anonymous == TRUE.. or we have
3259 * speculative pages that we can prematurely age... if
3260 * one of these cases we'll keep going, else panic
3261 */
3262 force_anonymous = FALSE;
3263 VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3264
3265 if (!vm_page_queue_empty(&sq->age_q)) {
3266 lock_yield_check = TRUE;
3267 continue;
3268 }
3269
3270 if (vm_page_speculative_count) {
3271 force_speculative_aging = TRUE;
3272 lock_yield_check = TRUE;
3273 continue;
3274 }
3275 panic("vm_pageout: no victim");
3276
3277 /* NOTREACHED */
3278 }
3279
3280 assert(VM_PAGE_PAGEABLE(m));
3281 m_object = VM_PAGE_OBJECT(m);
3282 force_anonymous = FALSE;
3283
3284 page_prev_q_state = m->vmp_q_state;
3285 /*
3286 * we just found this page on one of our queues...
3287 * it can't also be on the pageout queue, so safe
3288 * to call vm_page_queues_remove
3289 */
3290 bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3291 vm_page_queues_remove(m, TRUE);
3292 if (donate) {
3293 /*
3294 * The compressor needs to see this bit to know
3295 * where this page needs to land. Also if stolen,
3296 * this bit helps put the page back in the right
3297 * special queue where it belongs.
3298 */
3299 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3300 }
3301
3302 assert(!m->vmp_laundry);
3303 assert(!m->vmp_private);
3304 assert(!m->vmp_fictitious);
3305 assert(m_object != kernel_object);
3306 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
3307
3308 vm_pageout_vminfo.vm_pageout_considered_page++;
3309
3310 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3311
3312 /*
3313 * check to see if we currently are working
3314 * with the same object... if so, we've
3315 * already got the lock
3316 */
3317 if (m_object != object) {
3318 boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3319
3320 /*
3321 * vps_switch_object() will always drop the 'object' lock first
3322 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3323 * either 'm_object' or NULL.
3324 */
3325 retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3326
3327 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3328 lock_yield_check = TRUE;
3329 continue;
3330 }
3331 }
3332 assert(m_object == object);
3333 assert(VM_PAGE_OBJECT(m) == m_object);
3334
3335 if (m->vmp_busy) {
3336 /*
3337 * Somebody is already playing with this page.
3338 * Put it back on the appropriate queue
3339 *
3340 */
3341 VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3342
3343 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3344 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3345 }
3346
3347 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3348
3349 lock_yield_check = TRUE;
3350 continue;
3351 }
3352
3353 /*
3354 * if (m->vmp_cleaning && !m->vmp_free_when_done)
3355 * If already cleaning this page in place
3356 * just leave it off the paging queues.
3357 * We can leave the page mapped, and upl_commit_range
3358 * will put it on the clean queue.
3359 *
3360 * if (m->vmp_free_when_done && !m->vmp_cleaning)
3361 * an msync INVALIDATE is in progress...
3362 * this page has been marked for destruction
3363 * after it has been cleaned,
3364 * but not yet gathered into a UPL
3365 * where 'cleaning' will be set...
3366 * just leave it off the paging queues
3367 *
3368 * if (m->vmp_free_when_done && m->vmp_cleaning)
3369 * an msync INVALIDATE is in progress
3370 * and the UPL has already gathered this page...
3371 * just leave it off the paging queues
3372 */
3373 if (m->vmp_free_when_done || m->vmp_cleaning) {
3374 lock_yield_check = TRUE;
3375 continue;
3376 }
3377
3378
3379 /*
3380 * If it's absent, in error or the object is no longer alive,
3381 * we can reclaim the page... in the no longer alive case,
3382 * there are 2 states the page can be in that preclude us
3383 * from reclaiming it - busy or cleaning - that we've already
3384 * dealt with
3385 */
3386 if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3387 (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3388 if (m->vmp_absent) {
3389 VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3390 } else if (!object->alive ||
3391 (!object->internal &&
3392 object->pager == MEMORY_OBJECT_NULL)) {
3393 VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3394 } else {
3395 VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3396 }
3397 reclaim_page:
3398 if (vm_pageout_deadlock_target) {
3399 VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3400 vm_pageout_deadlock_target--;
3401 }
3402
3403 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3404
3405 if (object->internal) {
3406 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3407 } else {
3408 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3409 }
3410 assert(!m->vmp_cleaning);
3411 assert(!m->vmp_laundry);
3412
3413 if (!object->internal &&
3414 object->pager != NULL &&
3415 object->pager->mo_pager_ops == &shared_region_pager_ops) {
3416 shared_region_pager_reclaimed++;
3417 }
3418
3419 m->vmp_busy = TRUE;
3420
3421 /*
3422 * remove page from object here since we're already
3423 * behind the object lock... defer the rest of the work
3424 * we'd normally do in vm_page_free_prepare_object
3425 * until 'vm_page_free_list' is called
3426 */
3427 if (m->vmp_tabled) {
3428 vm_page_remove(m, TRUE);
3429 }
3430
3431 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3432 m->vmp_snext = local_freeq;
3433 local_freeq = m;
3434 local_freed++;
3435
3436 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3437 vm_pageout_vminfo.vm_pageout_freed_speculative++;
3438 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3439 vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3440 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3441 vm_pageout_vminfo.vm_pageout_freed_internal++;
3442 } else {
3443 vm_pageout_vminfo.vm_pageout_freed_external++;
3444 }
3445
3446 inactive_burst_count = 0;
3447
3448 lock_yield_check = TRUE;
3449 continue;
3450 }
3451 if (object->copy == VM_OBJECT_NULL) {
3452 /*
3453 * No one else can have any interest in this page.
3454 * If this is an empty purgable object, the page can be
3455 * reclaimed even if dirty.
3456 * If the page belongs to a volatile purgable object, we
3457 * reactivate it if the compressor isn't active.
3458 */
3459 if (object->purgable == VM_PURGABLE_EMPTY) {
3460 if (m->vmp_pmapped == TRUE) {
3461 /* unmap the page */
3462 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3463 if (refmod_state & VM_MEM_MODIFIED) {
3464 SET_PAGE_DIRTY(m, FALSE);
3465 }
3466 }
3467 if (m->vmp_dirty || m->vmp_precious) {
3468 /* we saved the cost of cleaning this page ! */
3469 vm_page_purged_count++;
3470 }
3471 goto reclaim_page;
3472 }
3473
3474 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3475 /*
3476 * With the VM compressor, the cost of
3477 * reclaiming a page is much lower (no I/O),
3478 * so if we find a "volatile" page, it's better
3479 * to let it get compressed rather than letting
3480 * it occupy a full page until it gets purged.
3481 * So no need to check for "volatile" here.
3482 */
3483 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
3484 /*
3485 * Avoid cleaning a "volatile" page which might
3486 * be purged soon.
3487 */
3488
3489 /* if it's wired, we can't put it on our queue */
3490 assert(!VM_PAGE_WIRED(m));
3491
3492 /* just stick it back on! */
3493 reactivated_this_call++;
3494
3495 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3496 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3497 }
3498
3499 goto reactivate_page;
3500 }
3501 }
3502 /*
3503 * If it's being used, reactivate.
3504 * (Fictitious pages are either busy or absent.)
3505 * First, update the reference and dirty bits
3506 * to make sure the page is unreferenced.
3507 */
3508 refmod_state = -1;
3509
3510 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3511 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3512
3513 if (refmod_state & VM_MEM_REFERENCED) {
3514 m->vmp_reference = TRUE;
3515 }
3516 if (refmod_state & VM_MEM_MODIFIED) {
3517 SET_PAGE_DIRTY(m, FALSE);
3518 }
3519 }
3520
3521 if (m->vmp_reference || m->vmp_dirty) {
3522 /* deal with a rogue "reusable" page */
3523 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3524 }
3525
3526 if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3527 vm_pageout_state.vm_page_xpmapped_min = 0;
3528 } else {
3529 vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
3530 }
3531
3532 if (!m->vmp_no_cache &&
3533 page_from_bg_q == FALSE &&
3534 (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3535 (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3536 /*
3537 * The page we pulled off the inactive list has
3538 * been referenced. It is possible for other
3539 * processors to be touching pages faster than we
3540 * can clear the referenced bit and traverse the
3541 * inactive queue, so we limit the number of
3542 * reactivations.
3543 */
3544 if (++reactivated_this_call >= reactivate_limit &&
3545 !object->object_is_shared_cache &&
3546 !((m->vmp_realtime ||
3547 object->for_realtime) &&
3548 vm_pageout_protect_realtime)) {
3549 vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3550 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3551 vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3552 if (object->object_is_shared_cache) {
3553 vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3554 } else if (m->vmp_realtime ||
3555 object->for_realtime) {
3556 vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3557 }
3558 } else {
3559 uint32_t isinuse;
3560
3561 if (reactivated_this_call >= reactivate_limit) {
3562 if (object->object_is_shared_cache) {
3563 vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3564 } else if ((m->vmp_realtime ||
3565 object->for_realtime) &&
3566 vm_pageout_protect_realtime) {
3567 vm_pageout_vminfo.vm_pageout_protected_realtime++;
3568 }
3569 }
3570 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3571 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3572 }
3573
3574 vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3575 reactivate_page:
3576 if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3577 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3578 /*
3579 * no explicit mappings of this object exist
3580 * and it's not open via the filesystem
3581 */
3582 vm_page_deactivate(m);
3583 VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3584 } else {
3585 /*
3586 * The page was/is being used, so put back on active list.
3587 */
3588 vm_page_activate(m);
3589 counter_inc(&vm_statistics_reactivations);
3590 inactive_burst_count = 0;
3591 }
3592 #if DEVELOPMENT || DEBUG
3593 if (page_from_bg_q == TRUE) {
3594 if (m_object->internal) {
3595 vm_pageout_rejected_bq_internal++;
3596 } else {
3597 vm_pageout_rejected_bq_external++;
3598 }
3599 }
3600 #endif /* DEVELOPMENT || DEBUG */
3601
3602 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3603 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3604 }
3605 vm_pageout_state.vm_pageout_inactive_used++;
3606
3607 lock_yield_check = TRUE;
3608 continue;
3609 }
3610 /*
3611 * Make sure we call pmap_get_refmod() if it
3612 * wasn't already called just above, to update
3613 * the dirty bit.
3614 */
3615 if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3616 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3617 if (refmod_state & VM_MEM_MODIFIED) {
3618 SET_PAGE_DIRTY(m, FALSE);
3619 }
3620 }
3621 }
3622
3623 /*
3624 * we've got a candidate page to steal...
3625 *
3626 * m->vmp_dirty is up to date courtesy of the
3627 * preceding check for m->vmp_reference... if
3628 * we get here, then m->vmp_reference had to be
3629 * FALSE (or possibly "reactivate_limit" was
3630 * exceeded), but in either case we called
3631 * pmap_get_refmod() and updated both
3632 * m->vmp_reference and m->vmp_dirty
3633 *
3634 * if it's dirty or precious we need to
3635 * see if the target queue is throttled
3636 * if it is, we need to skip over it by moving it back
3637 * to the end of the inactive queue
3638 */
3639
3640 inactive_throttled = FALSE;
3641
3642 if (m->vmp_dirty || m->vmp_precious) {
3643 if (object->internal) {
3644 if (VM_PAGE_Q_THROTTLED(iq)) {
3645 inactive_throttled = TRUE;
3646 }
3647 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3648 inactive_throttled = TRUE;
3649 }
3650 }
3651 throttle_inactive:
3652 if (!VM_DYNAMIC_PAGING_ENABLED() &&
3653 object->internal && m->vmp_dirty &&
3654 (object->purgable == VM_PURGABLE_DENY ||
3655 object->purgable == VM_PURGABLE_NONVOLATILE ||
3656 object->purgable == VM_PURGABLE_VOLATILE)) {
3657 vm_page_check_pageable_safe(m);
3658 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3659 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3660 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3661 vm_page_throttled_count++;
3662
3663 VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3664
3665 inactive_burst_count = 0;
3666
3667 lock_yield_check = TRUE;
3668 continue;
3669 }
3670 if (inactive_throttled == TRUE) {
3671 vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3672 &delayed_unlock, &force_anonymous, page_from_bg_q);
3673
3674 inactive_burst_count = 0;
3675
3676 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3677 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3678 }
3679
3680 lock_yield_check = TRUE;
3681 continue;
3682 }
3683
3684 /*
3685 * we've got a page that we can steal...
3686 * eliminate all mappings and make sure
3687 * we have the up-to-date modified state
3688 *
3689 * if we need to do a pmap_disconnect then we
3690 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3691 * provides the true state atomically... the
3692 * page was still mapped up to the pmap_disconnect
3693 * and may have been dirtied at the last microsecond
3694 *
3695 * Note that if 'pmapped' is FALSE then the page is not
3696 * and has not been in any map, so there is no point calling
3697 * pmap_disconnect(). m->vmp_dirty could have been set in anticipation
3698 * of likely usage of the page.
3699 */
3700 if (m->vmp_pmapped == TRUE) {
3701 int pmap_options;
3702
3703 /*
3704 * Don't count this page as going into the compressor
3705 * if any of these are true:
3706 * 1) compressed pager isn't enabled
3707 * 2) Freezer enabled device with compressed pager
3708 * backend (exclusive use) i.e. most of the VM system
3709 * (including vm_pageout_scan) has no knowledge of
3710 * the compressor
3711 * 3) This page belongs to a file and hence will not be
3712 * sent into the compressor
3713 */
3714 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3715 object->internal == FALSE) {
3716 pmap_options = 0;
3717 } else if (m->vmp_dirty || m->vmp_precious) {
3718 /*
3719 * VM knows that this page is dirty (or
3720 * precious) and needs to be compressed
3721 * rather than freed.
3722 * Tell the pmap layer to count this page
3723 * as "compressed".
3724 */
3725 pmap_options = PMAP_OPTIONS_COMPRESSOR;
3726 } else {
3727 /*
3728 * VM does not know if the page needs to
3729 * be preserved but the pmap layer might tell
3730 * us if any mapping has "modified" it.
3731 * Let's the pmap layer to count this page
3732 * as compressed if and only if it has been
3733 * modified.
3734 */
3735 pmap_options =
3736 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3737 }
3738 refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3739 pmap_options,
3740 NULL);
3741 if (refmod_state & VM_MEM_MODIFIED) {
3742 SET_PAGE_DIRTY(m, FALSE);
3743 }
3744 }
3745
3746 /*
3747 * reset our count of pages that have been reclaimed
3748 * since the last page was 'stolen'
3749 */
3750 inactive_reclaim_run = 0;
3751
3752 /*
3753 * If it's clean and not precious, we can free the page.
3754 */
3755 if (!m->vmp_dirty && !m->vmp_precious) {
3756 vm_pageout_state.vm_pageout_inactive_clean++;
3757
3758 /*
3759 * OK, at this point we have found a page we are going to free.
3760 */
3761 #if CONFIG_PHANTOM_CACHE
3762 if (!object->internal) {
3763 vm_phantom_cache_add_ghost(m);
3764 }
3765 #endif
3766 goto reclaim_page;
3767 }
3768
3769 /*
3770 * The page may have been dirtied since the last check
3771 * for a throttled target queue (which may have been skipped
3772 * if the page was clean then). With the dirty page
3773 * disconnected here, we can make one final check.
3774 */
3775 if (object->internal) {
3776 if (VM_PAGE_Q_THROTTLED(iq)) {
3777 inactive_throttled = TRUE;
3778 }
3779 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3780 inactive_throttled = TRUE;
3781 }
3782
3783 if (inactive_throttled == TRUE) {
3784 goto throttle_inactive;
3785 }
3786
3787 #if VM_PRESSURE_EVENTS
3788 #if CONFIG_JETSAM
3789
3790 /*
3791 * If Jetsam is enabled, then the sending
3792 * of memory pressure notifications is handled
3793 * from the same thread that takes care of high-water
3794 * and other jetsams i.e. the memorystatus_thread.
3795 */
3796
3797 #else /* CONFIG_JETSAM */
3798
3799 vm_pressure_response();
3800
3801 #endif /* CONFIG_JETSAM */
3802 #endif /* VM_PRESSURE_EVENTS */
3803
3804 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3805 VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3806 }
3807
3808 if (object->internal) {
3809 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3810 } else {
3811 vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3812 }
3813
3814 /*
3815 * internal pages will go to the compressor...
3816 * external pages will go to the appropriate pager to be cleaned
3817 * and upon completion will end up on 'vm_page_queue_cleaned' which
3818 * is a preferred queue to steal from
3819 */
3820 vm_pageout_cluster(m);
3821 inactive_burst_count = 0;
3822
3823 /*
3824 * back to top of pageout scan loop
3825 */
3826 }
3827 }
3828
3829
3830 void
vm_page_free_reserve(int pages)3831 vm_page_free_reserve(
3832 int pages)
3833 {
3834 int free_after_reserve;
3835
3836 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3837 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3838 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3839 } else {
3840 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3841 }
3842 } else {
3843 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3844 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3845 } else {
3846 vm_page_free_reserved += pages;
3847 }
3848 }
3849 free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3850
3851 vm_page_free_min = vm_page_free_reserved +
3852 VM_PAGE_FREE_MIN(free_after_reserve);
3853
3854 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3855 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3856 }
3857
3858 vm_page_free_target = vm_page_free_reserved +
3859 VM_PAGE_FREE_TARGET(free_after_reserve);
3860
3861 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3862 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3863 }
3864
3865 if (vm_page_free_target < vm_page_free_min + 5) {
3866 vm_page_free_target = vm_page_free_min + 5;
3867 }
3868
3869 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3870 }
3871
3872 /*
3873 * vm_pageout is the high level pageout daemon.
3874 */
3875
3876 void
vm_pageout_continue(void)3877 vm_pageout_continue(void)
3878 {
3879 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3880 VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3881
3882 vm_free_page_lock();
3883 vm_pageout_running = TRUE;
3884 vm_free_page_unlock();
3885
3886 vm_pageout_scan();
3887 /*
3888 * we hold both the vm_page_queue_free_lock
3889 * and the vm_page_queues_lock at this point
3890 */
3891 assert(vm_page_free_wanted == 0);
3892 assert(vm_page_free_wanted_privileged == 0);
3893 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3894
3895 vm_pageout_running = FALSE;
3896 #if XNU_TARGET_OS_OSX
3897 if (vm_pageout_waiter) {
3898 vm_pageout_waiter = FALSE;
3899 thread_wakeup((event_t)&vm_pageout_waiter);
3900 }
3901 #endif /* XNU_TARGET_OS_OSX */
3902
3903 vm_free_page_unlock();
3904 vm_page_unlock_queues();
3905
3906 thread_block((thread_continue_t)vm_pageout_continue);
3907 /*NOTREACHED*/
3908 }
3909
#if XNU_TARGET_OS_OSX
/*
 *	vm_pageout_wait:
 *
 *	Sleep until the pageout daemon is no longer marked as running
 *	(vm_pageout_running is clear) or until "deadline" passes.
 *
 *	Returns KERN_SUCCESS once vm_pageout_running is observed clear,
 *	or KERN_OPERATION_TIMED_OUT if a sleep ends with anything other
 *	than THREAD_AWAKENED.
 */
kern_return_t
vm_pageout_wait(uint64_t deadline)
{
	kern_return_t   result = KERN_SUCCESS;

	vm_free_page_lock();

	while (vm_pageout_running && (result == KERN_SUCCESS)) {
		/* ask vm_pageout_continue() to wake us when it finishes */
		vm_pageout_waiter = TRUE;

		if (lck_mtx_sleep_deadline(
			    &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
			    (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline) != THREAD_AWAKENED) {
			result = KERN_OPERATION_TIMED_OUT;
		}
	}

	vm_free_page_unlock();

	return result;
}
#endif /* XNU_TARGET_OS_OSX */
3930
3931 OS_NORETURN
3932 static void
vm_pageout_iothread_external_continue(struct pgo_iothread_state * ethr,__unused wait_result_t w)3933 vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
3934 {
3935 vm_page_t m = NULL;
3936 vm_object_t object;
3937 vm_object_offset_t offset;
3938 memory_object_t pager;
3939 struct vm_pageout_queue *q = ethr->q;
3940
3941 /* On systems with a compressor, the external IO thread clears its
3942 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3943 * creation)
3944 */
3945 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3946 current_thread()->options &= ~TH_OPT_VMPRIV;
3947 }
3948
3949 sched_cond_ack(&(ethr->pgo_wakeup));
3950
3951 while (true) {
3952 vm_page_lockspin_queues();
3953
3954 while (!vm_page_queue_empty(&q->pgo_pending)) {
3955 q->pgo_busy = TRUE;
3956 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3957
3958 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3959 VM_PAGE_CHECK(m);
3960 /*
3961 * grab a snapshot of the object and offset this
3962 * page is tabled in so that we can relookup this
3963 * page after we've taken the object lock - these
3964 * fields are stable while we hold the page queues lock
3965 * but as soon as we drop it, there is nothing to keep
3966 * this page in this object... we hold an activity_in_progress
3967 * on this object which will keep it from terminating
3968 */
3969 object = VM_PAGE_OBJECT(m);
3970 offset = m->vmp_offset;
3971
3972 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3973 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3974
3975 vm_page_unlock_queues();
3976
3977 vm_object_lock(object);
3978
3979 m = vm_page_lookup(object, offset);
3980
3981 if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
3982 !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
3983 /*
3984 * it's either the same page that someone else has
3985 * started cleaning (or it's finished cleaning or
3986 * been put back on the pageout queue), or
3987 * the page has been freed or we have found a
3988 * new page at this offset... in all of these cases
3989 * we merely need to release the activity_in_progress
3990 * we took when we put the page on the pageout queue
3991 */
3992 vm_object_activity_end(object);
3993 vm_object_unlock(object);
3994
3995 vm_page_lockspin_queues();
3996 continue;
3997 }
3998 pager = object->pager;
3999
4000 if (pager == MEMORY_OBJECT_NULL) {
4001 /*
4002 * This pager has been destroyed by either
4003 * memory_object_destroy or vm_object_destroy, and
4004 * so there is nowhere for the page to go.
4005 */
4006 if (m->vmp_free_when_done) {
4007 /*
4008 * Just free the page... VM_PAGE_FREE takes
4009 * care of cleaning up all the state...
4010 * including doing the vm_pageout_throttle_up
4011 */
4012 VM_PAGE_FREE(m);
4013 } else {
4014 vm_page_lockspin_queues();
4015
4016 vm_pageout_throttle_up(m);
4017 vm_page_activate(m);
4018
4019 vm_page_unlock_queues();
4020
4021 /*
4022 * And we are done with it.
4023 */
4024 }
4025 vm_object_activity_end(object);
4026 vm_object_unlock(object);
4027
4028 vm_page_lockspin_queues();
4029 continue;
4030 }
4031 #if 0
4032 /*
4033 * we don't hold the page queue lock
4034 * so this check isn't safe to make
4035 */
4036 VM_PAGE_CHECK(m);
4037 #endif
4038 /*
4039 * give back the activity_in_progress reference we
4040 * took when we queued up this page and replace it
4041 * it with a paging_in_progress reference that will
4042 * also hold the paging offset from changing and
4043 * prevent the object from terminating
4044 */
4045 vm_object_activity_end(object);
4046 vm_object_paging_begin(object);
4047 vm_object_unlock(object);
4048
4049 /*
4050 * Send the data to the pager.
4051 * any pageout clustering happens there
4052 */
4053 memory_object_data_return(pager,
4054 m->vmp_offset + object->paging_offset,
4055 PAGE_SIZE,
4056 NULL,
4057 NULL,
4058 FALSE,
4059 FALSE,
4060 0);
4061
4062 vm_object_lock(object);
4063 vm_object_paging_end(object);
4064 vm_object_unlock(object);
4065
4066 vm_pageout_io_throttle();
4067
4068 vm_page_lockspin_queues();
4069 }
4070 q->pgo_busy = FALSE;
4071
4072 vm_page_unlock_queues();
4073 sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
4074 }
4075 /*NOTREACHED*/
4076 }
4077
4078
/* Number of compressed pages gathered before they are freed as one list. */
#define MAX_FREE_BATCH          32

/*
 * Set via sysctl to record time accrued by
 * this thread.
 */
uint32_t vm_compressor_time_thread;
4083
4084
4085 OS_NORETURN
4086 static void
vm_pageout_iothread_internal_continue(struct pgo_iothread_state * cq,__unused wait_result_t w)4087 vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
4088 {
4089 struct vm_pageout_queue *q;
4090 vm_page_t m = NULL;
4091 boolean_t pgo_draining;
4092 vm_page_t local_q;
4093 int local_cnt;
4094 vm_page_t local_freeq = NULL;
4095 int local_freed = 0;
4096 int local_batch_size;
4097 #if DEVELOPMENT || DEBUG
4098 int ncomps = 0;
4099 boolean_t marked_active = FALSE;
4100 int num_pages_processed = 0;
4101 #endif
4102 void *chead = NULL;
4103
4104 KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
4105
4106 sched_cond_ack(&(cq->pgo_wakeup));
4107
4108 q = cq->q;
4109
4110 while (true) {
4111 #if DEVELOPMENT || DEBUG
4112 bool benchmark_accounting = false;
4113 /*
4114 * If we're running the compressor perf test, only process the benchmark pages.
4115 * We'll get back to our regular queue once the benchmark is done
4116 */
4117 if (compressor_running_perf_test) {
4118 q = cq->benchmark_q;
4119 if (!vm_page_queue_empty(&q->pgo_pending)) {
4120 benchmark_accounting = true;
4121 } else {
4122 q = cq->q;
4123 benchmark_accounting = false;
4124 }
4125 }
4126 #endif /* DEVELOPMENT || DEBUG */
4127
4128 #if __AMP__
4129 if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
4130 local_batch_size = (q->pgo_maxlaundry >> 3);
4131 local_batch_size = MAX(local_batch_size, 16);
4132 } else {
4133 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4134 }
4135 #else
4136 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4137 #endif
4138
4139 #if RECORD_THE_COMPRESSED_DATA
4140 if (q->pgo_laundry) {
4141 c_compressed_record_init();
4142 }
4143 #endif
4144 while (true) {
4145 int pages_left_on_q = 0;
4146
4147 local_cnt = 0;
4148 local_q = NULL;
4149
4150 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
4151
4152 vm_page_lock_queues();
4153 #if DEVELOPMENT || DEBUG
4154 if (marked_active == FALSE) {
4155 vmct_active++;
4156 vmct_state[cq->id] = VMCT_ACTIVE;
4157 marked_active = TRUE;
4158 if (vmct_active == 1) {
4159 vm_compressor_epoch_start = mach_absolute_time();
4160 }
4161 }
4162 #endif
4163 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4164
4165 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
4166
4167 while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4168 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4169 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4170 VM_PAGE_CHECK(m);
4171
4172 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4173 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4174 m->vmp_laundry = FALSE;
4175
4176 m->vmp_snext = local_q;
4177 local_q = m;
4178 local_cnt++;
4179 }
4180 if (local_q == NULL) {
4181 break;
4182 }
4183
4184 q->pgo_busy = TRUE;
4185
4186 if ((pgo_draining = q->pgo_draining) == FALSE) {
4187 vm_pageout_throttle_up_batch(q, local_cnt);
4188 pages_left_on_q = q->pgo_laundry;
4189 } else {
4190 pages_left_on_q = q->pgo_laundry - local_cnt;
4191 }
4192
4193 vm_page_unlock_queues();
4194
4195 #if !RECORD_THE_COMPRESSED_DATA
4196 if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4197 // wake up the next compressor thread
4198 sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
4199 pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4200 }
4201 #endif
4202 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
4203
4204 while (local_q) {
4205 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
4206
4207 m = local_q;
4208 local_q = m->vmp_snext;
4209 m->vmp_snext = NULL;
4210
4211 /*
4212 * Technically we need the pageq locks to manipulate this field.
4213 * However, this page has been removed from all queues and is only
4214 * known to this compressor thread dealing with this local queue.
4215 *
4216 * TODO LIONEL: Add a second localq that is the early localq and
4217 * put special pages like this one on that queue in the block above
4218 * under the pageq lock to avoid this 'works but not clean' logic.
4219 */
4220 void *donate_queue_head;
4221 #if XNU_TARGET_OS_OSX
4222 donate_queue_head = &cq->current_early_swapout_chead;
4223 #else /* XNU_TARGET_OS_OSX */
4224 donate_queue_head = &cq->current_late_swapout_chead;
4225 #endif /* XNU_TARGET_OS_OSX */
4226 if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4227 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4228 chead = donate_queue_head;
4229 } else {
4230 chead = &cq->current_regular_swapout_chead;
4231 }
4232
4233 if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4234 #if DEVELOPMENT || DEBUG
4235 ncomps++;
4236 #endif
4237 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);
4238
4239 m->vmp_snext = local_freeq;
4240 local_freeq = m;
4241 local_freed++;
4242
4243 if (local_freed >= MAX_FREE_BATCH) {
4244 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4245
4246 vm_page_free_list(local_freeq, TRUE);
4247
4248 local_freeq = NULL;
4249 local_freed = 0;
4250 }
4251 }
4252 #if DEVELOPMENT || DEBUG
4253 num_pages_processed++;
4254 #endif /* DEVELOPMENT || DEBUG */
4255 #if !CONFIG_JETSAM
4256 while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4257 kern_return_t wait_result;
4258 int need_wakeup = 0;
4259
4260 if (local_freeq) {
4261 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4262
4263 vm_page_free_list(local_freeq, TRUE);
4264 local_freeq = NULL;
4265 local_freed = 0;
4266
4267 continue;
4268 }
4269 vm_free_page_lock_spin();
4270
4271 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4272 if (vm_page_free_wanted_privileged++ == 0) {
4273 need_wakeup = 1;
4274 }
4275 wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4276
4277 vm_free_page_unlock();
4278
4279 if (need_wakeup) {
4280 thread_wakeup((event_t)&vm_page_free_wanted);
4281 }
4282
4283 if (wait_result == THREAD_WAITING) {
4284 thread_block(THREAD_CONTINUE_NULL);
4285 }
4286 } else {
4287 vm_free_page_unlock();
4288 }
4289 }
4290 #endif
4291 }
4292 if (local_freeq) {
4293 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4294
4295 vm_page_free_list(local_freeq, TRUE);
4296 local_freeq = NULL;
4297 local_freed = 0;
4298 }
4299 if (pgo_draining == TRUE) {
4300 vm_page_lockspin_queues();
4301 vm_pageout_throttle_up_batch(q, local_cnt);
4302 vm_page_unlock_queues();
4303 }
4304 }
4305 KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
4306
4307 /*
4308 * queue lock is held and our q is empty
4309 */
4310 q->pgo_busy = FALSE;
4311 #if DEVELOPMENT || DEBUG
4312 if (marked_active == TRUE) {
4313 vmct_active--;
4314 vmct_state[cq->id] = VMCT_IDLE;
4315
4316 if (vmct_active == 0) {
4317 vm_compressor_epoch_stop = mach_absolute_time();
4318 assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4319 "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4320 vm_compressor_epoch_start, vm_compressor_epoch_stop);
4321 /* This interval includes intervals where one or more
4322 * compressor threads were pre-empted
4323 */
4324 vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4325 }
4326 }
4327 if (compressor_running_perf_test && benchmark_accounting) {
4328 /*
4329 * We could turn ON compressor_running_perf_test while still processing
4330 * regular non-benchmark pages. We shouldn't count them here else we
4331 * could overshoot. We might also still be populating that benchmark Q
4332 * and be under pressure. So we will go back to the regular queues. And
4333 * benchmark accounting will be off for that case too.
4334 */
4335 compressor_perf_test_pages_processed += num_pages_processed;
4336 thread_wakeup(&compressor_perf_test_pages_processed);
4337 }
4338 #endif
4339 vm_page_unlock_queues();
4340 #if DEVELOPMENT || DEBUG
4341 if (__improbable(vm_compressor_time_thread)) {
4342 vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
4343 vmct_stats.vmct_pages[cq->id] += ncomps;
4344 vmct_stats.vmct_iterations[cq->id]++;
4345 if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
4346 vmct_stats.vmct_maxpages[cq->id] = ncomps;
4347 }
4348 if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
4349 vmct_stats.vmct_minpages[cq->id] = ncomps;
4350 }
4351 }
4352 #endif
4353
4354 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4355 #if DEVELOPMENT || DEBUG
4356 if (compressor_running_perf_test && benchmark_accounting) {
4357 /*
4358 * We've been exclusively compressing pages from the benchmark queue,
4359 * do 1 pass over the internal queue before blocking.
4360 */
4361 continue;
4362 }
4363 #endif
4364
4365 sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4366 }
4367 /*NOTREACHED*/
4368 }
4369
4370
4371 kern_return_t
vm_pageout_compress_page(void ** current_chead,char * scratch_buf,vm_page_t m)4372 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4373 {
4374 vm_object_t object;
4375 memory_object_t pager;
4376 int compressed_count_delta;
4377 kern_return_t retval;
4378
4379 object = VM_PAGE_OBJECT(m);
4380
4381 assert(!m->vmp_free_when_done);
4382 assert(!m->vmp_laundry);
4383
4384 pager = object->pager;
4385
4386 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4387 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
4388
4389 vm_object_lock(object);
4390
4391 /*
4392 * If there is no memory object for the page, create
4393 * one and hand it to the compression pager.
4394 */
4395
4396 if (!object->pager_initialized) {
4397 vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4398 }
4399 if (!object->pager_initialized) {
4400 vm_object_compressor_pager_create(object);
4401 }
4402
4403 pager = object->pager;
4404
4405 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4406 /*
4407 * Still no pager for the object,
4408 * or the pager has been destroyed.
4409 * Reactivate the page.
4410 *
4411 * Should only happen if there is no
4412 * compression pager
4413 */
4414 PAGE_WAKEUP_DONE(m);
4415
4416 vm_page_lockspin_queues();
4417 vm_page_activate(m);
4418 VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4419 vm_page_unlock_queues();
4420
4421 /*
4422 * And we are done with it.
4423 */
4424 vm_object_activity_end(object);
4425 vm_object_unlock(object);
4426
4427 return KERN_FAILURE;
4428 }
4429 vm_object_unlock(object);
4430
4431 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
4432 }
4433 assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4434 assert(object->activity_in_progress > 0);
4435
4436 retval = vm_compressor_pager_put(
4437 pager,
4438 m->vmp_offset + object->paging_offset,
4439 VM_PAGE_GET_PHYS_PAGE(m),
4440 current_chead,
4441 scratch_buf,
4442 &compressed_count_delta);
4443
4444 vm_object_lock(object);
4445
4446 assert(object->activity_in_progress > 0);
4447 assert(VM_PAGE_OBJECT(m) == object);
4448 assert( !VM_PAGE_WIRED(m));
4449
4450 vm_compressor_pager_count(pager,
4451 compressed_count_delta,
4452 FALSE, /* shared_lock */
4453 object);
4454
4455 if (retval == KERN_SUCCESS) {
4456 /*
4457 * If the object is purgeable, its owner's
4458 * purgeable ledgers will be updated in
4459 * vm_page_remove() but the page still
4460 * contributes to the owner's memory footprint,
4461 * so account for it as such.
4462 */
4463 if ((object->purgable != VM_PURGABLE_DENY ||
4464 object->vo_ledger_tag) &&
4465 object->vo_owner != NULL) {
4466 /* one more compressed purgeable/tagged page */
4467 vm_object_owner_compressed_update(object,
4468 +1);
4469 }
4470 counter_inc(&vm_statistics_compressions);
4471
4472 if (m->vmp_tabled) {
4473 vm_page_remove(m, TRUE);
4474 }
4475 } else {
4476 PAGE_WAKEUP_DONE(m);
4477
4478 vm_page_lockspin_queues();
4479
4480 vm_page_activate(m);
4481 vm_pageout_vminfo.vm_compressor_failed++;
4482
4483 vm_page_unlock_queues();
4484 }
4485 vm_object_activity_end(object);
4486 vm_object_unlock(object);
4487
4488 return retval;
4489 }
4490
4491
4492 static void
vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state * ethr,boolean_t req_lowpriority)4493 vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4494 {
4495 uint32_t policy;
4496
4497 if (hibernate_cleaning_in_progress == TRUE) {
4498 req_lowpriority = FALSE;
4499 }
4500
4501 if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4502 vm_page_unlock_queues();
4503
4504 if (req_lowpriority == TRUE) {
4505 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4506 DTRACE_VM(laundrythrottle);
4507 } else {
4508 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4509 DTRACE_VM(laundryunthrottle);
4510 }
4511 proc_set_thread_policy(ethr->pgo_iothread,
4512 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4513
4514 vm_page_lock_queues();
4515 ethr->q->pgo_lowpriority = req_lowpriority;
4516 }
4517 }
4518
4519 OS_NORETURN
4520 static void
vm_pageout_iothread_external(struct pgo_iothread_state * ethr,__unused wait_result_t w)4521 vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
4522 {
4523 thread_t self = current_thread();
4524
4525 self->options |= TH_OPT_VMPRIV;
4526
4527 DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4528
4529 proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4530 TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4531
4532 vm_page_lock_queues();
4533
4534 vm_pageout_queue_external.pgo_lowpriority = TRUE;
4535 vm_pageout_queue_external.pgo_inited = TRUE;
4536
4537 vm_page_unlock_queues();
4538
4539 #if CONFIG_THREAD_GROUPS
4540 thread_group_vm_add();
4541 #endif /* CONFIG_THREAD_GROUPS */
4542
4543 vm_pageout_iothread_external_continue(ethr, 0);
4544 /*NOTREACHED*/
4545 }
4546
4547
4548 OS_NORETURN
4549 static void
vm_pageout_iothread_internal(struct pgo_iothread_state * cthr,__unused wait_result_t w)4550 vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
4551 {
4552 thread_t self = current_thread();
4553
4554 self->options |= TH_OPT_VMPRIV;
4555
4556 vm_page_lock_queues();
4557
4558 vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4559 vm_pageout_queue_internal.pgo_inited = TRUE;
4560
4561 #if DEVELOPMENT || DEBUG
4562 vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
4563 vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
4564 vm_pageout_queue_benchmark.pgo_busy = FALSE;
4565 #endif /* DEVELOPMENT || DEBUG */
4566
4567 vm_page_unlock_queues();
4568
4569 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4570 thread_vm_bind_group_add();
4571 }
4572
4573 #if CONFIG_THREAD_GROUPS
4574 thread_group_vm_add();
4575 #endif /* CONFIG_THREAD_GROUPS */
4576
4577 #if __AMP__
4578 if (vm_compressor_ebound) {
4579 /*
4580 * Use the soft bound option for vm_compressor to allow it to run on
4581 * P-cores if E-cluster is unavailable.
4582 */
4583 thread_bind_cluster_type(self, 'E', true);
4584 }
4585 #endif /* __AMP__ */
4586
4587 thread_set_thread_name(current_thread(), "VM_compressor");
4588 #if DEVELOPMENT || DEBUG
4589 vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
4590 #endif
4591 vm_pageout_iothread_internal_continue(cthr, 0);
4592
4593 /*NOTREACHED*/
4594 }
4595
4596 kern_return_t
vm_set_buffer_cleanup_callout(boolean_t (* func)(int))4597 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4598 {
4599 if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4600 return KERN_SUCCESS;
4601 } else {
4602 return KERN_FAILURE; /* Already set */
4603 }
4604 }
4605
4606 extern boolean_t memorystatus_manual_testing_on;
4607 extern unsigned int memorystatus_level;
4608
4609
4610 #if VM_PRESSURE_EVENTS
4611
4612 boolean_t vm_pressure_events_enabled = FALSE;
4613
4614 extern uint64_t next_warning_notification_sent_at_ts;
4615 extern uint64_t next_critical_notification_sent_at_ts;
4616
4617 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */
4618
4619 /*
4620 * The last time there was change in pressure level OR we forced a check
4621 * because the system is stuck in a non-normal pressure level.
4622 */
4623 uint64_t vm_pressure_last_level_transition_abs = 0;
4624
4625 /*
4626 * This is how the long the system waits 'stuck' in an unchanged non-normal pressure
4627 * level before resending out notifications for that level again.
4628 */
4629 int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4630
4631 void
vm_pressure_response(void)4632 vm_pressure_response(void)
4633 {
4634 vm_pressure_level_t old_level = kVMPressureNormal;
4635 int new_level = -1;
4636 unsigned int total_pages;
4637 uint64_t available_memory = 0;
4638 uint64_t curr_ts, abs_time_since_level_transition, time_in_ns;
4639 bool force_check = false;
4640 int time_in_mins;
4641
4642
4643 if (vm_pressure_events_enabled == FALSE) {
4644 return;
4645 }
4646
4647 #if !XNU_TARGET_OS_OSX
4648
4649 available_memory = (uint64_t) memorystatus_available_pages;
4650
4651 #else /* !XNU_TARGET_OS_OSX */
4652
4653 available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4654 memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4655
4656 #endif /* !XNU_TARGET_OS_OSX */
4657
4658 total_pages = (unsigned int) atop_64(max_mem);
4659 #if CONFIG_SECLUDED_MEMORY
4660 total_pages -= vm_page_secluded_count;
4661 #endif /* CONFIG_SECLUDED_MEMORY */
4662 memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4663
4664 if (memorystatus_manual_testing_on) {
4665 return;
4666 }
4667
4668 curr_ts = mach_absolute_time();
4669 abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4670
4671 absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4672 time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4673 force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
4674
4675 old_level = memorystatus_vm_pressure_level;
4676
4677 switch (memorystatus_vm_pressure_level) {
4678 case kVMPressureNormal:
4679 {
4680 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4681 new_level = kVMPressureCritical;
4682 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4683 new_level = kVMPressureWarning;
4684 }
4685 break;
4686 }
4687
4688 case kVMPressureWarning:
4689 case kVMPressureUrgent:
4690 {
4691 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4692 new_level = kVMPressureNormal;
4693 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4694 new_level = kVMPressureCritical;
4695 } else if (force_check) {
4696 new_level = kVMPressureWarning;
4697 next_warning_notification_sent_at_ts = curr_ts;
4698 }
4699 break;
4700 }
4701
4702 case kVMPressureCritical:
4703 {
4704 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4705 new_level = kVMPressureNormal;
4706 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4707 new_level = kVMPressureWarning;
4708 } else if (force_check) {
4709 new_level = kVMPressureCritical;
4710 next_critical_notification_sent_at_ts = curr_ts;
4711 }
4712 break;
4713 }
4714
4715 default:
4716 return;
4717 }
4718
4719 if (new_level != -1 || force_check) {
4720 if (new_level != -1) {
4721 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4722
4723 if (new_level != (int) old_level) {
4724 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4725 new_level, old_level, 0, 0);
4726 }
4727 } else {
4728 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4729 new_level, old_level, force_check, 0);
4730 }
4731
4732 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
4733 /*
4734 * We don't want to schedule a wakeup while hibernation is in progress
4735 * because that could collide with checks for non-monotonicity in the scheduler.
4736 * We do however do all the updates to memorystatus_vm_pressure_level because
4737 * we _might_ want to use that for decisions regarding which pages or how
4738 * many pages we want to dump in hibernation.
4739 */
4740 return;
4741 }
4742
4743 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4744 if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4745 thread_wakeup(&vm_pressure_thread);
4746 }
4747
4748 if (old_level != memorystatus_vm_pressure_level) {
4749 thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4750 }
4751 vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4752 }
4753 }
4754 }
4755 #endif /* VM_PRESSURE_EVENTS */
4756
4757 /*
4758 * Function called by a kernel thread to either get the current pressure level or
4759 * wait until memory pressure changes from a given level.
4760 */
4761 kern_return_t
mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure,__unused unsigned int * pressure_level)4762 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
4763 {
4764 #if !VM_PRESSURE_EVENTS
4765
4766 return KERN_FAILURE;
4767
4768 #else /* VM_PRESSURE_EVENTS */
4769
4770 wait_result_t wr = 0;
4771 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4772
4773 if (pressure_level == NULL) {
4774 return KERN_INVALID_ARGUMENT;
4775 }
4776
4777 if (*pressure_level == kVMPressureJetsam) {
4778 if (!wait_for_pressure) {
4779 return KERN_INVALID_ARGUMENT;
4780 }
4781
4782 lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
4783 wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
4784 THREAD_INTERRUPTIBLE);
4785 if (wr == THREAD_WAITING) {
4786 ++memorystatus_jetsam_fg_band_waiters;
4787 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4788 wr = thread_block(THREAD_CONTINUE_NULL);
4789 } else {
4790 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4791 }
4792 if (wr != THREAD_AWAKENED) {
4793 return KERN_ABORTED;
4794 }
4795 *pressure_level = kVMPressureJetsam;
4796 return KERN_SUCCESS;
4797 }
4798
4799 if (wait_for_pressure == TRUE) {
4800 while (old_level == *pressure_level) {
4801 wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4802 THREAD_INTERRUPTIBLE);
4803 if (wr == THREAD_WAITING) {
4804 wr = thread_block(THREAD_CONTINUE_NULL);
4805 }
4806 if (wr == THREAD_INTERRUPTED) {
4807 return KERN_ABORTED;
4808 }
4809
4810 if (wr == THREAD_AWAKENED) {
4811 old_level = memorystatus_vm_pressure_level;
4812 }
4813 }
4814 }
4815
4816 *pressure_level = old_level;
4817 return KERN_SUCCESS;
4818 #endif /* VM_PRESSURE_EVENTS */
4819 }
4820
4821 #if VM_PRESSURE_EVENTS
4822 void
vm_pressure_thread(void)4823 vm_pressure_thread(void)
4824 {
4825 static boolean_t thread_initialized = FALSE;
4826
4827 if (thread_initialized == TRUE) {
4828 vm_pageout_state.vm_pressure_thread_running = TRUE;
4829 consider_vm_pressure_events();
4830 vm_pageout_state.vm_pressure_thread_running = FALSE;
4831 }
4832
4833 #if CONFIG_THREAD_GROUPS
4834 thread_group_vm_add();
4835 #endif /* CONFIG_THREAD_GROUPS */
4836
4837 thread_set_thread_name(current_thread(), "VM_pressure");
4838 thread_initialized = TRUE;
4839 assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4840 thread_block((thread_continue_t)vm_pressure_thread);
4841 }
4842 #endif /* VM_PRESSURE_EVENTS */
4843
4844
4845 /*
4846 * called once per-second via "compute_averages"
4847 */
4848 void
compute_pageout_gc_throttle(__unused void * arg)4849 compute_pageout_gc_throttle(__unused void *arg)
4850 {
4851 if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4852 vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4853
4854 thread_wakeup(VM_PAGEOUT_GC_EVENT);
4855 }
4856 }
4857
4858 /*
4859 * vm_pageout_garbage_collect can also be called when the zone allocator needs
4860 * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4861 * jetsams. We need to check if the zone map size is above its jetsam limit to
4862 * decide if this was indeed the case.
4863 *
4864 * We need to do this on a different thread because of the following reasons:
4865 *
4866 * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4867 * itself causing the system to hang. We perform synchronous jetsams if we're
4868 * leaking in the VM map entries zone, so the leaking process could be doing a
4869 * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4870 * jetsam itself. We also need the vm_map lock on the process termination path,
4871 * which would now lead the dying process to deadlock against itself.
4872 *
4873 * 2. The jetsam path might need to allocate zone memory itself. We could try
4874 * using the non-blocking variant of zalloc for this path, but we can still
4875 * end up trying to do a kmem_alloc when the zone maps are almost full.
4876 */
4877 __dead2
4878 void
vm_pageout_garbage_collect(void * step,wait_result_t wr __unused)4879 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4880 {
4881 assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4882
4883 if (step == VM_PAGEOUT_GC_INIT) {
4884 /* first time being called is not about GC */
4885 #if CONFIG_THREAD_GROUPS
4886 thread_group_vm_add();
4887 #endif /* CONFIG_THREAD_GROUPS */
4888 } else if (zone_map_nearing_exhaustion()) {
4889 /*
4890 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4891 *
4892 * Bail out after calling zone_gc (which triggers the
4893 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4894 * operations that clear out a bunch of caches might allocate zone
4895 * memory themselves (for eg. vm_map operations would need VM map
4896 * entries). Since the zone map is almost full at this point, we
4897 * could end up with a panic. We just need to quickly jetsam a
4898 * process and exit here.
4899 *
4900 * It could so happen that we were woken up to relieve memory
4901 * pressure and the zone map also happened to be near its limit at
4902 * the time, in which case we'll skip out early. But that should be
4903 * ok; if memory pressure persists, the thread will simply be woken
4904 * up again.
4905 */
4906 zone_gc(ZONE_GC_JETSAM);
4907 } else {
4908 /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4909 boolean_t buf_large_zfree = FALSE;
4910 boolean_t first_try = TRUE;
4911
4912 stack_collect();
4913
4914 consider_machine_collect();
4915 mbuf_drain(FALSE);
4916
4917 do {
4918 if (consider_buffer_cache_collect != NULL) {
4919 buf_large_zfree = (*consider_buffer_cache_collect)(0);
4920 }
4921 if (first_try == TRUE || buf_large_zfree == TRUE) {
4922 /*
4923 * zone_gc should be last, because the other operations
4924 * might return memory to zones.
4925 */
4926 zone_gc(ZONE_GC_TRIM);
4927 }
4928 first_try = FALSE;
4929 } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
4930
4931 consider_machine_adjust();
4932 }
4933
4934 assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);
4935
4936 thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
4937 __builtin_unreachable();
4938 }
4939
4940
4941 #if VM_PAGE_BUCKETS_CHECK
4942 #if VM_PAGE_FAKE_BUCKETS
4943 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4944 #endif /* VM_PAGE_FAKE_BUCKETS */
4945 #endif /* VM_PAGE_BUCKETS_CHECK */
4946
4947
4948
4949 void
vm_set_restrictions(unsigned int num_cpus)4950 vm_set_restrictions(unsigned int num_cpus)
4951 {
4952 int vm_restricted_to_single_processor = 0;
4953
4954 if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
4955 kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
4956 vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
4957 } else {
4958 assert(num_cpus > 0);
4959
4960 if (num_cpus <= 3) {
4961 /*
4962 * on systems with a limited number of CPUS, bind the
4963 * 4 major threads that can free memory and that tend to use
4964 * a fair bit of CPU under pressured conditions to a single processor.
4965 * This insures that these threads don't hog all of the available CPUs
4966 * (important for camera launch), while allowing them to run independently
4967 * w/r to locks... the 4 threads are
4968 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
4969 * vm_compressor_swap_trigger_thread (minor and major compactions),
4970 * memorystatus_thread (jetsams).
4971 *
4972 * the first time the thread is run, it is responsible for checking the
4973 * state of vm_restricted_to_single_processor, and if TRUE it calls
4974 * thread_bind_master... someday this should be replaced with a group
4975 * scheduling mechanism and KPI.
4976 */
4977 vm_pageout_state.vm_restricted_to_single_processor = TRUE;
4978 } else {
4979 vm_pageout_state.vm_restricted_to_single_processor = FALSE;
4980 }
4981 }
4982 }
4983
4984 /*
4985 * Set up vm_config based on the vm_compressor_mode.
4986 * Must run BEFORE the pageout thread starts up.
4987 */
4988 __startup_func
4989 void
vm_config_init(void)4990 vm_config_init(void)
4991 {
4992 bzero(&vm_config, sizeof(vm_config));
4993
4994 switch (vm_compressor_mode) {
4995 case VM_PAGER_DEFAULT:
4996 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
4997 OS_FALLTHROUGH;
4998
4999 case VM_PAGER_COMPRESSOR_WITH_SWAP:
5000 vm_config.compressor_is_present = TRUE;
5001 vm_config.swap_is_present = TRUE;
5002 vm_config.compressor_is_active = TRUE;
5003 vm_config.swap_is_active = TRUE;
5004 break;
5005
5006 case VM_PAGER_COMPRESSOR_NO_SWAP:
5007 vm_config.compressor_is_present = TRUE;
5008 vm_config.swap_is_present = TRUE;
5009 vm_config.compressor_is_active = TRUE;
5010 break;
5011
5012 case VM_PAGER_FREEZER_DEFAULT:
5013 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5014 OS_FALLTHROUGH;
5015
5016 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5017 vm_config.compressor_is_present = TRUE;
5018 vm_config.swap_is_present = TRUE;
5019 break;
5020
5021 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5022 vm_config.compressor_is_present = TRUE;
5023 vm_config.swap_is_present = TRUE;
5024 vm_config.compressor_is_active = TRUE;
5025 vm_config.freezer_swap_is_active = TRUE;
5026 break;
5027
5028 case VM_PAGER_NOT_CONFIGURED:
5029 break;
5030
5031 default:
5032 printf("unknown compressor mode - %x\n", vm_compressor_mode);
5033 break;
5034 }
5035 }
5036
5037 __startup_func
5038 static void
vm_pageout_create_gc_thread(void)5039 vm_pageout_create_gc_thread(void)
5040 {
5041 thread_t thread;
5042
5043 if (kernel_thread_create(vm_pageout_garbage_collect,
5044 VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
5045 panic("vm_pageout_garbage_collect: create failed");
5046 }
5047 thread_set_thread_name(thread, "VM_pageout_garbage_collect");
5048 if (thread->reserved_stack == 0) {
5049 assert(thread->kernel_stack);
5050 thread->reserved_stack = thread->kernel_stack;
5051 }
5052
5053 /* thread is started in vm_pageout() */
5054 vm_pageout_gc_thread = thread;
5055 }
5056 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5057
5058 void
vm_pageout(void)5059 vm_pageout(void)
5060 {
5061 thread_t self = current_thread();
5062 thread_t thread;
5063 kern_return_t result;
5064 spl_t s;
5065
5066 /*
5067 * Set thread privileges.
5068 */
5069 s = splsched();
5070
5071 #if CONFIG_VPS_DYNAMIC_PRIO
5072
5073 int vps_dynprio_bootarg = 0;
5074
5075 if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) {
5076 vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? TRUE : FALSE);
5077 kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled);
5078 } else {
5079 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
5080 vps_dynamic_priority_enabled = TRUE;
5081 } else {
5082 vps_dynamic_priority_enabled = FALSE;
5083 }
5084 }
5085
5086 if (vps_dynamic_priority_enabled) {
5087 sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
5088 thread_set_eager_preempt(self);
5089 } else {
5090 sched_set_kernel_thread_priority(self, BASEPRI_VM);
5091 }
5092
5093 #else /* CONFIG_VPS_DYNAMIC_PRIO */
5094
5095 vps_dynamic_priority_enabled = FALSE;
5096 sched_set_kernel_thread_priority(self, BASEPRI_VM);
5097
5098 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
5099
5100 thread_lock(self);
5101 self->options |= TH_OPT_VMPRIV;
5102 thread_unlock(self);
5103
5104 if (!self->reserved_stack) {
5105 self->reserved_stack = self->kernel_stack;
5106 }
5107
5108 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
5109 vps_dynamic_priority_enabled == FALSE) {
5110 thread_vm_bind_group_add();
5111 }
5112
5113
5114 #if CONFIG_THREAD_GROUPS
5115 thread_group_vm_add();
5116 #endif /* CONFIG_THREAD_GROUPS */
5117
5118 #if __AMP__
5119 PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
5120 if (vm_pgo_pbound) {
5121 /*
5122 * Use the soft bound option for vm pageout to allow it to run on
5123 * E-cores if P-cluster is unavailable.
5124 */
5125 thread_bind_cluster_type(self, 'P', true);
5126 }
5127 #endif /* __AMP__ */
5128
5129 PE_parse_boot_argn("vmpgo_protect_realtime",
5130 &vm_pageout_protect_realtime,
5131 sizeof(vm_pageout_protect_realtime));
5132 splx(s);
5133
5134 thread_set_thread_name(current_thread(), "VM_pageout_scan");
5135
5136 /*
5137 * Initialize some paging parameters.
5138 */
5139
5140 vm_pageout_state.vm_pressure_thread_running = FALSE;
5141 vm_pageout_state.vm_pressure_changed = FALSE;
5142 vm_pageout_state.memorystatus_purge_on_warning = 2;
5143 vm_pageout_state.memorystatus_purge_on_urgent = 5;
5144 vm_pageout_state.memorystatus_purge_on_critical = 8;
5145 vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
5146 vm_pageout_state.vm_page_speculative_percentage = 5;
5147 vm_pageout_state.vm_page_speculative_target = 0;
5148
5149 vm_pageout_state.vm_pageout_swap_wait = 0;
5150 vm_pageout_state.vm_pageout_idle_wait = 0;
5151 vm_pageout_state.vm_pageout_empty_wait = 0;
5152 vm_pageout_state.vm_pageout_burst_wait = 0;
5153 vm_pageout_state.vm_pageout_deadlock_wait = 0;
5154 vm_pageout_state.vm_pageout_deadlock_relief = 0;
5155 vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
5156
5157 vm_pageout_state.vm_pageout_inactive = 0;
5158 vm_pageout_state.vm_pageout_inactive_used = 0;
5159 vm_pageout_state.vm_pageout_inactive_clean = 0;
5160
5161 vm_pageout_state.vm_memory_pressure = 0;
5162 vm_pageout_state.vm_page_filecache_min = 0;
5163 #if CONFIG_JETSAM
5164 vm_pageout_state.vm_page_filecache_min_divisor = 70;
5165 vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
5166 #else
5167 vm_pageout_state.vm_page_filecache_min_divisor = 27;
5168 vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
5169 #endif
5170 vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
5171
5172 vm_pageout_state.vm_pageout_considered_page_last = 0;
5173
5174 if (vm_pageout_state.vm_pageout_swap_wait == 0) {
5175 vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
5176 }
5177
5178 if (vm_pageout_state.vm_pageout_idle_wait == 0) {
5179 vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
5180 }
5181
5182 if (vm_pageout_state.vm_pageout_burst_wait == 0) {
5183 vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
5184 }
5185
5186 if (vm_pageout_state.vm_pageout_empty_wait == 0) {
5187 vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
5188 }
5189
5190 if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
5191 vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
5192 }
5193
5194 if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
5195 vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
5196 }
5197
5198 if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
5199 vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
5200 }
5201 /*
5202 * even if we've already called vm_page_free_reserve
5203 * call it again here to insure that the targets are
5204 * accurately calculated (it uses vm_page_free_count_init)
5205 * calling it with an arg of 0 will not change the reserve
5206 * but will re-calculate free_min and free_target
5207 */
5208 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
5209 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
5210 } else {
5211 vm_page_free_reserve(0);
5212 }
5213
5214 bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
5215 bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5216
5217 vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5218 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5219
5220 vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5221
5222 #if DEVELOPMENT || DEBUG
5223 bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
5224 vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
5225 #endif /* DEVELOPMENT || DEBUG */
5226
5227
5228 /* internal pageout thread started when default pager registered first time */
5229 /* external pageout and garbage collection threads started here */
5230 struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
5231 ethr->id = 0;
5232 ethr->q = &vm_pageout_queue_external;
5233 ethr->current_early_swapout_chead = NULL;
5234 ethr->current_regular_swapout_chead = NULL;
5235 ethr->current_late_swapout_chead = NULL;
5236 ethr->scratch_buf = NULL;
5237 #if DEVELOPMENT || DEBUG
5238 ethr->benchmark_q = NULL;
5239 #endif /* DEVELOPMENT || DEBUG */
5240 sched_cond_init(&(ethr->pgo_wakeup));
5241
5242 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
5243 (void *)ethr, BASEPRI_VM,
5244 &(ethr->pgo_iothread));
5245 if (result != KERN_SUCCESS) {
5246 panic("vm_pageout: Unable to create external thread (%d)\n", result);
5247 }
5248 thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");
5249
5250 thread_mtx_lock(vm_pageout_gc_thread );
5251 thread_start(vm_pageout_gc_thread );
5252 thread_mtx_unlock(vm_pageout_gc_thread);
5253
5254 #if VM_PRESSURE_EVENTS
5255 result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5256 BASEPRI_DEFAULT,
5257 &thread);
5258
5259 if (result != KERN_SUCCESS) {
5260 panic("vm_pressure_thread: create failed");
5261 }
5262
5263 thread_deallocate(thread);
5264 #endif
5265
5266 vm_object_reaper_init();
5267
5268
5269 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5270 vm_compressor_init();
5271 }
5272
5273 #if VM_PRESSURE_EVENTS
5274 vm_pressure_events_enabled = TRUE;
5275 #endif /* VM_PRESSURE_EVENTS */
5276
5277 #if CONFIG_PHANTOM_CACHE
5278 vm_phantom_cache_init();
5279 #endif
5280 #if VM_PAGE_BUCKETS_CHECK
5281 #if VM_PAGE_FAKE_BUCKETS
5282 printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5283 (uint64_t) vm_page_fake_buckets_start,
5284 (uint64_t) vm_page_fake_buckets_end);
5285 pmap_protect(kernel_pmap,
5286 vm_page_fake_buckets_start,
5287 vm_page_fake_buckets_end,
5288 VM_PROT_READ);
5289 // *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
5290 #endif /* VM_PAGE_FAKE_BUCKETS */
5291 #endif /* VM_PAGE_BUCKETS_CHECK */
5292
5293 #if VM_OBJECT_TRACKING
5294 vm_object_tracking_init();
5295 #endif /* VM_OBJECT_TRACKING */
5296
5297 #if __arm64__
5298 // vm_tests();
5299 #endif /* __arm64__ */
5300
5301 vm_pageout_continue();
5302
5303 /*
5304 * Unreached code!
5305 *
5306 * The vm_pageout_continue() call above never returns, so the code below is never
5307 * executed. We take advantage of this to declare several DTrace VM related probe
5308 * points that our kernel doesn't have an analog for. These are probe points that
5309 * exist in Solaris and are in the DTrace documentation, so people may have written
5310 * scripts that use them. Declaring the probe points here means their scripts will
5311 * compile and execute which we want for portability of the scripts, but since this
5312 * section of code is never reached, the probe points will simply never fire. Yes,
5313 * this is basically a hack. The problem is the DTrace probe points were chosen with
5314 * Solaris specific VM events in mind, not portability to different VM implementations.
5315 */
5316
5317 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5318 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5319 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5320 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5321 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5322 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5323 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5324 /*NOTREACHED*/
5325 }
5326
5327
5328
5329 kern_return_t
vm_pageout_internal_start(void)5330 vm_pageout_internal_start(void)
5331 {
5332 kern_return_t result = KERN_SUCCESS;
5333 host_basic_info_data_t hinfo;
5334 vm_offset_t buf, bufsize;
5335
5336 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5337
5338 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5339 #define BSD_HOST 1
5340 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5341
5342 assert(hinfo.max_cpus > 0);
5343
5344 #if !XNU_TARGET_OS_OSX
5345 vm_pageout_state.vm_compressor_thread_count = 1;
5346 #else /* !XNU_TARGET_OS_OSX */
5347 if (hinfo.max_cpus > 4) {
5348 vm_pageout_state.vm_compressor_thread_count = 2;
5349 } else {
5350 vm_pageout_state.vm_compressor_thread_count = 1;
5351 }
5352 #endif /* !XNU_TARGET_OS_OSX */
5353 #if __AMP__
5354 if (vm_compressor_ebound) {
5355 vm_pageout_state.vm_compressor_thread_count = 2;
5356 }
5357 #endif
5358 PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5359 sizeof(vm_pageout_state.vm_compressor_thread_count));
5360
5361 if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5362 vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5363 }
5364 if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5365 vm_pageout_state.vm_compressor_thread_count = 1;
5366 } else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5367 vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5368 }
5369
5370 vm_pageout_queue_internal.pgo_maxlaundry =
5371 (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5372
5373 PE_parse_boot_argn("vmpgoi_maxlaundry",
5374 &vm_pageout_queue_internal.pgo_maxlaundry,
5375 sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5376
5377 #if DEVELOPMENT || DEBUG
5378 // Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
5379 vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
5380 #endif /* DEVELOPMENT || DEBUG */
5381
5382 bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5383
5384 kmem_alloc(kernel_map, &buf,
5385 bufsize * vm_pageout_state.vm_compressor_thread_count,
5386 KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
5387 VM_KERN_MEMORY_COMPRESSOR);
5388
5389 for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5390 struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
5391 iq->id = i;
5392 iq->q = &vm_pageout_queue_internal;
5393 iq->current_early_swapout_chead = NULL;
5394 iq->current_regular_swapout_chead = NULL;
5395 iq->current_late_swapout_chead = NULL;
5396 iq->scratch_buf = (char *)(buf + i * bufsize);
5397 #if DEVELOPMENT || DEBUG
5398 iq->benchmark_q = &vm_pageout_queue_benchmark;
5399 #endif /* DEVELOPMENT || DEBUG */
5400 sched_cond_init(&(iq->pgo_wakeup));
5401 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5402 (void *)iq, BASEPRI_VM,
5403 &(iq->pgo_iothread));
5404
5405 if (result != KERN_SUCCESS) {
5406 panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
5407 }
5408 }
5409 return result;
5410 }
5411
5412 #if CONFIG_IOSCHED
5413 /*
5414 * To support I/O Expedite for compressed files we mark the upls with special flags.
5415 * The way decmpfs works is that we create a big upl which marks all the pages needed to
5416 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5417 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5418 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5419 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5420 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
 * by the req upl lock (the reverse link doesn't need synchronization since we never inspect this link
5422 * unless the real I/O upl is being destroyed).
5423 */
5424
5425
5426 static void
upl_set_decmp_info(upl_t upl,upl_t src_upl)5427 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5428 {
5429 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5430
5431 upl_lock(src_upl);
5432 if (src_upl->decmp_io_upl) {
5433 /*
5434 * If there is already an alive real I/O UPL, ignore this new UPL.
5435 * This case should rarely happen and even if it does, it just means
5436 * that we might issue a spurious expedite which the driver is expected
5437 * to handle.
5438 */
5439 upl_unlock(src_upl);
5440 return;
5441 }
5442 src_upl->decmp_io_upl = (void *)upl;
5443 src_upl->ref_count++;
5444
5445 upl->flags |= UPL_DECMP_REAL_IO;
5446 upl->decmp_io_upl = (void *)src_upl;
5447 upl_unlock(src_upl);
5448 }
5449 #endif /* CONFIG_IOSCHED */
5450
5451 #if UPL_DEBUG
5452 int upl_debug_enabled = 1;
5453 #else
5454 int upl_debug_enabled = 0;
5455 #endif
5456
5457 static upl_t
upl_create(int type,int flags,upl_size_t size)5458 upl_create(int type, int flags, upl_size_t size)
5459 {
5460 uint32_t pages = (uint32_t)atop(round_page_32(size));
5461 upl_t upl;
5462
5463 assert(page_aligned(size));
5464
5465 /*
5466 * FIXME: this code assumes the allocation always succeeds,
5467 * however `pages` can be up to MAX_UPL_SIZE.
5468 *
5469 * The allocation size is above 32k (resp. 128k)
5470 * on 16k pages (resp. 4k), which kalloc might fail
5471 * to allocate.
5472 */
5473 upl = kalloc_type(struct upl, struct upl_page_info,
5474 (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
5475 if (type & UPL_CREATE_INTERNAL) {
5476 flags |= UPL_INTERNAL;
5477 }
5478
5479 if (type & UPL_CREATE_LITE) {
5480 flags |= UPL_LITE;
5481 if (pages) {
5482 upl->lite_list = bitmap_alloc(pages);
5483 }
5484 }
5485
5486 upl->flags = flags;
5487 upl->ref_count = 1;
5488 upl_lock_init(upl);
5489 #if CONFIG_IOSCHED
5490 if (type & UPL_CREATE_IO_TRACKING) {
5491 upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5492 }
5493
5494 if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5495 /* Only support expedite on internal UPLs */
5496 thread_t curthread = current_thread();
5497 upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
5498 Z_WAITOK | Z_ZERO);
5499 upl->flags |= UPL_EXPEDITE_SUPPORTED;
5500 if (curthread->decmp_upl != NULL) {
5501 upl_set_decmp_info(upl, curthread->decmp_upl);
5502 }
5503 }
5504 #endif
5505 #if CONFIG_IOSCHED || UPL_DEBUG
5506 if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5507 upl->upl_creator = current_thread();
5508 upl->flags |= UPL_TRACKED_BY_OBJECT;
5509 }
5510 #endif
5511
5512 #if UPL_DEBUG
5513 upl->uple_create_btref = btref_get(__builtin_frame_address(0), 0);
5514 #endif /* UPL_DEBUG */
5515
5516 return upl;
5517 }
5518
5519 static void
upl_destroy(upl_t upl)5520 upl_destroy(upl_t upl)
5521 {
5522 uint32_t pages;
5523
5524 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5525
5526 if (upl->ext_ref_count) {
5527 panic("upl(%p) ext_ref_count", upl);
5528 }
5529
5530 #if CONFIG_IOSCHED
5531 if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5532 upl_t src_upl;
5533 src_upl = upl->decmp_io_upl;
5534 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5535 upl_lock(src_upl);
5536 src_upl->decmp_io_upl = NULL;
5537 upl_unlock(src_upl);
5538 upl_deallocate(src_upl);
5539 }
5540 #endif /* CONFIG_IOSCHED */
5541
5542 #if CONFIG_IOSCHED || UPL_DEBUG
5543 if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5544 !(upl->flags & UPL_VECTOR)) {
5545 vm_object_t object;
5546
5547 if (upl->flags & UPL_SHADOWED) {
5548 object = upl->map_object->shadow;
5549 } else {
5550 object = upl->map_object;
5551 }
5552
5553 vm_object_lock(object);
5554 queue_remove(&object->uplq, upl, upl_t, uplq);
5555 vm_object_activity_end(object);
5556 vm_object_collapse(object, 0, TRUE);
5557 vm_object_unlock(object);
5558 }
5559 #endif
5560 /*
5561 * drop a reference on the map_object whether or
5562 * not a pageout object is inserted
5563 */
5564 if (upl->flags & UPL_SHADOWED) {
5565 vm_object_deallocate(upl->map_object);
5566 }
5567
5568 if (upl->flags & UPL_DEVICE_MEMORY) {
5569 pages = 1;
5570 } else {
5571 pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
5572 }
5573
5574 upl_lock_destroy(upl);
5575
5576 #if CONFIG_IOSCHED
5577 if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5578 kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
5579 }
5580 #endif
5581
5582 #if UPL_DEBUG
5583 for (int i = 0; i < upl->upl_commit_index; i++) {
5584 btref_put(upl->upl_commit_records[i].c_btref);
5585 }
5586 btref_put(upl->uple_create_btref);
5587 #endif /* UPL_DEBUG */
5588
5589 if ((upl->flags & UPL_LITE) && pages) {
5590 bitmap_free(upl->lite_list, pages);
5591 }
5592 kfree_type(struct upl, struct upl_page_info,
5593 (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
5594 }
5595
5596 void
upl_deallocate(upl_t upl)5597 upl_deallocate(upl_t upl)
5598 {
5599 upl_lock(upl);
5600
5601 if (--upl->ref_count == 0) {
5602 if (vector_upl_is_valid(upl)) {
5603 vector_upl_deallocate(upl);
5604 }
5605 upl_unlock(upl);
5606
5607 if (upl->upl_iodone) {
5608 upl_callout_iodone(upl);
5609 }
5610
5611 upl_destroy(upl);
5612 } else {
5613 upl_unlock(upl);
5614 }
5615 }
5616
5617 #if CONFIG_IOSCHED
5618 void
upl_mark_decmp(upl_t upl)5619 upl_mark_decmp(upl_t upl)
5620 {
5621 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5622 upl->flags |= UPL_DECMP_REQ;
5623 upl->upl_creator->decmp_upl = (void *)upl;
5624 }
5625 }
5626
5627 void
upl_unmark_decmp(upl_t upl)5628 upl_unmark_decmp(upl_t upl)
5629 {
5630 if (upl && (upl->flags & UPL_DECMP_REQ)) {
5631 upl->upl_creator->decmp_upl = NULL;
5632 }
5633 }
5634
5635 #endif /* CONFIG_IOSCHED */
5636
5637 #define VM_PAGE_Q_BACKING_UP(q) \
5638 ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5639
5640 boolean_t must_throttle_writes(void);
5641
5642 boolean_t
must_throttle_writes()5643 must_throttle_writes()
5644 {
5645 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5646 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5647 return TRUE;
5648 }
5649
5650 return FALSE;
5651 }
5652
5653 int vm_page_delayed_work_ctx_needed = 0;
5654 KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5655
5656 __startup_func
5657 static void
vm_page_delayed_work_init_ctx(void)5658 vm_page_delayed_work_init_ctx(void)
5659 {
5660 uint16_t min_delayed_work_ctx_allocated = 16;
5661
5662 /*
5663 * try really hard to always keep NCPU elements around in the zone
5664 * in order for the UPL code to almost always get an element.
5665 */
5666 if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5667 min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5668 }
5669
5670 zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5671 }
5672 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5673
5674 struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)5675 vm_page_delayed_work_get_ctx(void)
5676 {
5677 struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5678
5679 dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5680
5681 if (__probable(dw_ctx)) {
5682 dw_ctx->delayed_owner = current_thread();
5683 } else {
5684 vm_page_delayed_work_ctx_needed++;
5685 }
5686 return dw_ctx ? dw_ctx->dwp : NULL;
5687 }
5688
5689 void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work * dwp)5690 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5691 {
5692 struct vm_page_delayed_work_ctx *ldw_ctx;
5693
5694 ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5695 ldw_ctx->delayed_owner = NULL;
5696
5697 zfree(dw_ctx_zone, ldw_ctx);
5698 }
5699
5700 /*
5701 * Routine: vm_object_upl_request
5702 * Purpose:
5703 * Cause the population of a portion of a vm_object.
5704 * Depending on the nature of the request, the pages
 *		returned may contain valid data or be uninitialized.
5706 * A page list structure, listing the physical pages
5707 * will be returned upon request.
5708 * This function is called by the file system or any other
5709 * supplier of backing store to a pager.
5710 * IMPORTANT NOTE: The caller must still respect the relationship
5711 * between the vm_object and its backing memory object. The
5712 * caller MUST NOT substitute changes in the backing file
5713 * without first doing a memory_object_lock_request on the
 *		target range unless it is known that the pages are not
5715 * shared with another entity at the pager level.
5716 * Copy_in_to:
5717 * if a page list structure is present
5718 * return the mapped physical pages, where a
5719 * page is not present, return a non-initialized
5720 * one. If the no_sync bit is turned on, don't
5721 * call the pager unlock to synchronize with other
5722 * possible copies of the page. Leave pages busy
5723 * in the original object, if a page list structure
5724 * was specified. When a commit of the page list
5725 * pages is done, the dirty bit will be set for each one.
5726 * Copy_out_from:
5727 * If a page list structure is present, return
5728 * all mapped pages. Where a page does not exist
5729 * map a zero filled one. Leave pages busy in
5730 * the original object. If a page list structure
5731 * is not specified, this call is a no-op.
5732 *
5733 * Note: access of default pager objects has a rather interesting
5734 * twist. The caller of this routine, presumably the file system
5735 * page cache handling code, will never actually make a request
5736 * against a default pager backed object. Only the default
5737 * pager will make requests on backing store related vm_objects
5738 * In this way the default pager can maintain the relationship
5739 * between backing store files (abstract memory objects) and
5740 * the vm_objects (cache objects), they support.
5741 *
5742 */
5743
/*
 * vm_object_upl_request:
 *
 * Build a UPL describing [offset, offset + size) of "object".
 *
 * Parameters:
 *	object		object to gather pages from; must not be
 *			phys_contiguous, and an external object must have a
 *			zero paging_offset (both conditions panic).
 *	offset, size	page-aligned range (asserted); size is silently
 *			clamped to MAX_UPL_SIZE_BYTES.
 *	upl_ptr		out: receives the newly created UPL.
 *	user_page_list	optional per-page info array to fill in; ignored and
 *			replaced by the UPL's own page_list when
 *			UPL_SET_INTERNAL is passed.
 *	page_list_count	optional in/out count; see the end of the routine
 *			for how it is updated.
 *	cntrl_flags	UPL_* control flags; any bit outside UPL_VALID_FLAGS
 *			is rejected.
 *	tag		passed through to vm_page_do_delayed_work().
 *
 * Returns:
 *	KERN_INVALID_VALUE if cntrl_flags contains an unknown flag,
 *	KERN_SUCCESS otherwise.
 *
 * Locking:
 *	The object lock is taken and released inside this routine.  It is
 *	temporarily dropped while throttling writers, while grabbing a
 *	fictitious alias page and while waiting for free pages.
 */
__private_extern__ kern_return_t
vm_object_upl_request(
	vm_object_t object,
	vm_object_offset_t offset,
	upl_size_t size,
	upl_t *upl_ptr,
	upl_page_info_array_t user_page_list,
	unsigned int *page_list_count,
	upl_control_flags_t cntrl_flags,
	vm_tag_t tag)
{
	vm_page_t dst_page = VM_PAGE_NULL;
	vm_object_offset_t dst_offset;
	upl_size_t xfer_size;
	unsigned int size_in_pages;
	boolean_t dirty;
	boolean_t hw_dirty;
	upl_t upl = NULL;
	unsigned int entry;
	vm_page_t alias_page = NULL;
	int refmod_state = 0;
	vm_object_t last_copy_object;
	struct vm_page_delayed_work dw_array;
	struct vm_page_delayed_work *dwp, *dwp_start;
	bool dwp_finish_ctx = TRUE;
	int dw_count;
	int dw_limit;
	int io_tracking_flag = 0;
	int grab_options;
	int page_grab_count = 0;
	ppnum_t phys_page;
	pmap_flush_context pmap_flush_context_storage;
	boolean_t pmap_flushes_delayed = FALSE;
#if DEVELOPMENT || DEBUG
	task_t task = current_task();
#endif /* DEVELOPMENT || DEBUG */

	dwp_start = dwp = NULL;

	if (cntrl_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		return KERN_INVALID_VALUE;
	}
	if ((!object->internal) && (object->paging_offset != 0)) {
		panic("vm_object_upl_request: external object with non-zero paging offset");
	}
	if (object->phys_contiguous) {
		panic("vm_object_upl_request: contiguous object specified");
	}

	assertf(page_aligned(offset) && page_aligned(size),
	    "offset 0x%llx size 0x%x",
	    offset, size);

	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);

	dw_count = 0;
	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
	dwp_start = vm_page_delayed_work_get_ctx();
	if (dwp_start == NULL) {
		/*
		 * No context could be allocated: fall back to the single
		 * on-stack entry.  With dw_limit == 1 the delayed work is
		 * flushed after every page, and the stack entry must not
		 * be handed to vm_page_delayed_work_finish_ctx() on exit.
		 */
		dwp_start = &dw_array;
		dw_limit = 1;
		dwp_finish_ctx = FALSE;
	}

	dwp = dwp_start;

	if (size > MAX_UPL_SIZE_BYTES) {
		size = MAX_UPL_SIZE_BYTES;
	}

	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	}

#if CONFIG_IOSCHED || UPL_DEBUG
	if (object->io_tracking || upl_debug_enabled) {
		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
	}
#endif
#if CONFIG_IOSCHED
	if (object->io_tracking) {
		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
	}
#endif

	/*
	 * Create the UPL.  For an internal UPL the page list lives inside
	 * the UPL itself and overrides whatever the caller passed in.
	 */
	if (cntrl_flags & UPL_SET_INTERNAL) {
		if (cntrl_flags & UPL_SET_LITE) {
			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
		} else {
			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
		}
		user_page_list = size ? upl->page_list : NULL;
	} else {
		if (cntrl_flags & UPL_SET_LITE) {
			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
		} else {
			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
		}
	}
	*upl_ptr = upl;

	if (user_page_list) {
		user_page_list[0].device = FALSE;
	}

	if (cntrl_flags & UPL_SET_LITE) {
		/* a "lite" UPL refers to the pages of the original object directly */
		upl->map_object = object;
	} else {
		upl->map_object = vm_object_allocate(size);
		/*
		 * No need to lock the new object: nobody else knows
		 * about it yet, so it's all ours so far.
		 */
		upl->map_object->shadow = object;
		upl->map_object->pageout = TRUE;
		upl->map_object->can_persist = FALSE;
		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
		upl->map_object->vo_shadow_offset = offset;
		upl->map_object->wimg_bits = object->wimg_bits;
		assertf(page_aligned(upl->map_object->vo_shadow_offset),
		    "object %p shadow_offset 0x%llx",
		    upl->map_object, upl->map_object->vo_shadow_offset);

		alias_page = vm_page_grab_fictitious(TRUE);

		upl->flags |= UPL_SHADOWED;
	}
	if (cntrl_flags & UPL_FOR_PAGEOUT) {
		upl->flags |= UPL_PAGEOUT;
	}

	vm_object_lock(object);
	vm_object_activity_begin(object);

	grab_options = 0;
#if CONFIG_SECLUDED_MEMORY
	if (object->can_grab_secluded) {
		grab_options |= VM_PAGE_GRAB_SECLUDED;
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * we can lock in the paging_offset once paging_in_progress is set
	 */
	upl->u_size = size;
	upl->u_offset = offset + object->paging_offset;

#if CONFIG_IOSCHED || UPL_DEBUG
	if (object->io_tracking || upl_debug_enabled) {
		vm_object_activity_begin(object);
		queue_enter(&object->uplq, upl, upl_t, uplq);
	}
#endif
	if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
		/*
		 * Honor copy-on-write obligations
		 *
		 * The caller is gathering these pages and
		 * might modify their contents.  We need to
		 * make sure that the copy object has its own
		 * private copies of these pages before we let
		 * the caller modify them.
		 */
		vm_object_update(object,
		    offset,
		    size,
		    NULL,
		    NULL,
		    FALSE,              /* should_return */
		    MEMORY_OBJECT_COPY_SYNC,
		    VM_PROT_NO_CHANGE);

		VM_PAGEOUT_DEBUG(upl_cow, 1);
		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
	}
	/*
	 * remember which copy object we synchronized with
	 */
	last_copy_object = object->copy;
	entry = 0;

	xfer_size = size;
	dst_offset = offset;
	size_in_pages = size / PAGE_SIZE;

	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
		object->scan_collisions = 0;
	}

	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
		/*
		 * Writers are being throttled: drop the object lock and
		 * stall for a time proportional to the request size (a
		 * shorter per-page delay when the backing store is an SSD),
		 * advertising the stall through vm_upl_wait_for_pages.
		 */
		boolean_t isSSD = FALSE;

#if !XNU_TARGET_OS_OSX
		isSSD = TRUE;
#else /* !XNU_TARGET_OS_OSX */
		vnode_pager_get_isSSD(object->pager, &isSSD);
#endif /* !XNU_TARGET_OS_OSX */
		vm_object_unlock(object);

		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);

		if (isSSD == TRUE) {
			delay(1000 * size_in_pages);
		} else {
			delay(5000 * size_in_pages);
		}
		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);

		vm_object_lock(object);
	}

	/*
	 * Walk the range one page at a time.  "entry" indexes the page
	 * list, dst_offset is the current object offset and xfer_size is
	 * the number of bytes still to process.  A "continue" retries the
	 * same offset (after sleeping); "goto try_next_page" advances.
	 */
	while (xfer_size) {
		dwp->dw_mask = 0;

		/*
		 * A shadowed (non-LITE) UPL consumes one fictitious alias
		 * page per page it returns (see vm_pageclean_setup below);
		 * replenish it here with the object unlocked.
		 */
		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
			vm_object_unlock(object);
			alias_page = vm_page_grab_fictitious(TRUE);
			vm_object_lock(object);
		}
		if (cntrl_flags & UPL_COPYOUT_FROM) {
			upl->flags |= UPL_PAGE_SYNC_DONE;

			/*
			 * Only resident, real, error-free pages that are
			 * neither being cleaned nor wired are eligible;
			 * anything else yields an empty slot.
			 */
			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
			    dst_page->vmp_fictitious ||
			    dst_page->vmp_absent ||
			    VMP_ERROR_GET(dst_page) ||
			    dst_page->vmp_cleaning ||
			    (VM_PAGE_WIRED(dst_page))) {
				if (user_page_list) {
					user_page_list[entry].phys_addr = 0;
				}

				goto try_next_page;
			}
			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

			/*
			 * grab this up front...
			 * a high percentage of the time we're going to
			 * need the hardware modification state a bit later
			 * anyway... so we can eliminate an extra call into
			 * the pmap layer by grabbing it here and recording it
			 */
			if (dst_page->vmp_pmapped) {
				refmod_state = pmap_get_refmod(phys_page);
			} else {
				refmod_state = 0;
			}

			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
				/*
				 * page is on inactive list and referenced...
				 * reactivate it now... this gets it out of the
				 * way of vm_pageout_scan which would have to
				 * reactivate it upon tripping over it
				 */
				dwp->dw_mask |= DW_vm_page_activate;
			}
			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
				/*
				 * we're only asking for DIRTY pages to be returned
				 */
				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
					/*
					 * if we were the page stolen by vm_pageout_scan to be
					 * cleaned (as opposed to a buddy being clustered in
					 * or this request is not being driven by a PAGEOUT cluster
					 * then we only need to check for the page being dirty or
					 * precious to decide whether to return it
					 */
					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
						goto check_busy;
					}
					goto dont_return;
				}
				/*
				 * this is a request for a PAGEOUT cluster and this page
				 * is merely along for the ride as a 'buddy'... not only
				 * does it have to be dirty to be returned, but it also
				 * can't have been referenced recently...
				 */
				if ((hibernate_cleaning_in_progress == TRUE ||
				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
					goto check_busy;
				}
dont_return:
				/*
				 * if we reach here, we're not to return
				 * the page... go on to the next one
				 */
				if (dst_page->vmp_laundry == TRUE) {
					/*
					 * if we get here, the page is not 'cleaning' (filtered out above).
					 * since it has been referenced, remove it from the laundry
					 * so we don't pay the cost of an I/O to clean a page
					 * we're just going to take back
					 */
					vm_page_lockspin_queues();

					vm_pageout_steal_laundry(dst_page, TRUE);
					vm_page_activate(dst_page);

					vm_page_unlock_queues();
				}
				if (user_page_list) {
					user_page_list[entry].phys_addr = 0;
				}

				goto try_next_page;
			}
check_busy:
			if (dst_page->vmp_busy) {
				if (cntrl_flags & UPL_NOBLOCK) {
					/*
					 * caller refuses to block: leave the slot
					 * empty and drop any delayed work recorded
					 * for this page
					 */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}
					dwp->dw_mask = 0;

					goto try_next_page;
				}
				/*
				 * someone else is playing with the
				 * page.  We will have to wait.
				 */
				PAGE_SLEEP(object, dst_page, THREAD_UNINT);

				continue;
			}
			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
				vm_page_lockspin_queues();

				/* re-check the queue state now that the queues lock is held */
				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
					/*
					 * we've buddied up a page for a clustered pageout
					 * that has already been moved to the pageout
					 * queue by pageout_scan... we need to remove
					 * it from the queue and drop the laundry count
					 * on that queue
					 */
					vm_pageout_throttle_up(dst_page);
				}
				vm_page_unlock_queues();
			}
			hw_dirty = refmod_state & VM_MEM_MODIFIED;
			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;

			if (phys_page > upl->highest_page) {
				upl->highest_page = phys_page;
			}

			assert(!pmap_is_noencrypt(phys_page));

			if (cntrl_flags & UPL_SET_LITE) {
				unsigned int    pg_num;

				/* record this page in the UPL's lite bitmap */
				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
				bitmap_set(upl->lite_list, pg_num);

				if (hw_dirty) {
					/*
					 * clear the hardware modified state without
					 * flushing now; the accumulated flushes are
					 * issued by pmap_flush() after the loop
					 */
					if (pmap_flushes_delayed == FALSE) {
						pmap_flush_context_init(&pmap_flush_context_storage);
						pmap_flushes_delayed = TRUE;
					}
					pmap_clear_refmod_options(phys_page,
					    VM_MEM_MODIFIED,
					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
					    &pmap_flush_context_storage);
				}

				/*
				 * Mark original page as cleaning
				 * in place.
				 */
				dst_page->vmp_cleaning = TRUE;
				dst_page->vmp_precious = FALSE;
			} else {
				/*
				 * use pageclean setup, it is more
				 * convenient even for the pageout
				 * cases here
				 */
				vm_object_lock(upl->map_object);
				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
				vm_object_unlock(upl->map_object);

				/* the alias page is now owned by the UPL's map object */
				alias_page->vmp_absent = FALSE;
				alias_page = NULL;
			}
			if (dirty) {
				SET_PAGE_DIRTY(dst_page, FALSE);
			} else {
				dst_page->vmp_dirty = FALSE;
			}

			if (!dirty) {
				dst_page->vmp_precious = TRUE;
			}

			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
				if (!VM_PAGE_WIRED(dst_page)) {
					dst_page->vmp_free_when_done = TRUE;
				}
			}
		} else {
			if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
				/*
				 * Honor copy-on-write obligations
				 *
				 * The copy object has changed since we
				 * last synchronized for copy-on-write.
				 * Another copy object might have been
				 * inserted while we released the object's
				 * lock.  Since someone could have seen the
				 * original contents of the remaining pages
				 * through that new object, we have to
				 * synchronize with it again for the remaining
				 * pages only.  The previous pages are "busy"
				 * so they can not be seen through the new
				 * mapping.  The new mapping will see our
				 * upcoming changes for those previous pages,
				 * but that's OK since they couldn't see what
				 * was there before.  It's just a race anyway
				 * and there's no guarantee of consistency or
				 * atomicity.  We just don't want new mappings
				 * to see both the *before* and *after* pages.
				 */
				if (object->copy != VM_OBJECT_NULL) {
					vm_object_update(
						object,
						dst_offset,/* current offset */
						xfer_size, /* remaining size */
						NULL,
						NULL,
						FALSE,     /* should_return */
						MEMORY_OBJECT_COPY_SYNC,
						VM_PROT_NO_CHANGE);

					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
				}
				/*
				 * remember the copy object we synced with
				 */
				last_copy_object = object->copy;
			}
			dst_page = vm_page_lookup(object, dst_offset);

			if (dst_page != VM_PAGE_NULL) {
				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
					/*
					 * skip over pages already present in the cache
					 */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}

					goto try_next_page;
				}
				if (dst_page->vmp_fictitious) {
					panic("need corner case for fictitious page");
				}

				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
					/*
					 * someone else is playing with the
					 * page.  We will have to wait.
					 */
					PAGE_SLEEP(object, dst_page, THREAD_UNINT);

					continue;
				}
				if (dst_page->vmp_laundry) {
					vm_pageout_steal_laundry(dst_page, FALSE);
				}
			} else {
				if (object->private) {
					/*
					 * This is a nasty wrinkle for users
					 * of upl who encounter device or
					 * private memory however, it is
					 * unavoidable, only a fault can
					 * resolve the actual backing
					 * physical page by asking the
					 * backing device.
					 */
					if (user_page_list) {
						user_page_list[entry].phys_addr = 0;
					}

					goto try_next_page;
				}
				if (object->scan_collisions) {
					/*
					 * the pageout_scan thread is trying to steal
					 * pages from this object, but has run into our
					 * lock... grab 2 pages from the head of the object...
					 * the first is freed on behalf of pageout_scan, the
					 * 2nd is for our own use... we use vm_object_page_grab
					 * in both cases to avoid taking pages from the free
					 * list since we are under memory pressure and our
					 * lock on this object is getting in the way of
					 * relieving it
					 */
					dst_page = vm_object_page_grab(object);

					if (dst_page != VM_PAGE_NULL) {
						vm_page_release(dst_page,
						    FALSE);
					}

					dst_page = vm_object_page_grab(object);
				}
				if (dst_page == VM_PAGE_NULL) {
					/*
					 * need to allocate a page
					 */
					dst_page = vm_page_grab_options(grab_options);
					if (dst_page != VM_PAGE_NULL) {
						page_grab_count++;
					}
				}
				if (dst_page == VM_PAGE_NULL) {
					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
						/*
						 * we don't want to stall waiting for pages to come onto the free list
						 * while we're already holding absent pages in this UPL
						 * the caller will deal with the empty slots
						 */
						if (user_page_list) {
							user_page_list[entry].phys_addr = 0;
						}

						goto try_next_page;
					}
					/*
					 * no pages available... wait
					 * then try again for the same
					 * offset...
					 */
					vm_object_unlock(object);

					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

					VM_PAGE_WAIT();
					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);

					vm_object_lock(object);

					continue;
				}
				vm_page_insert(dst_page, object, dst_offset);

				/*
				 * the freshly inserted page carries no data yet;
				 * whether it becomes busy is decided further down
				 */
				dst_page->vmp_absent = TRUE;
				dst_page->vmp_busy = FALSE;

				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
					/*
					 * if UPL_RET_ONLY_ABSENT was specified,
					 * then we're definitely setting up a
					 * upl for a clustered read/pagein
					 * operation... mark the pages as clustered
					 * so upl_commit_range can put them on the
					 * speculative list
					 */
					dst_page->vmp_clustered = TRUE;

					if (!(cntrl_flags & UPL_FILE_IO)) {
						counter_inc(&vm_statistics_pageins);
					}
				}
			}
			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

			dst_page->vmp_overwriting = TRUE;

			if (dst_page->vmp_pmapped) {
				if (!(cntrl_flags & UPL_FILE_IO)) {
					/*
					 * eliminate all mappings from the
					 * original object and its prodigy
					 */
					refmod_state = pmap_disconnect(phys_page);
				} else {
					refmod_state = pmap_get_refmod(phys_page);
				}
			} else {
				refmod_state = 0;
			}

			hw_dirty = refmod_state & VM_MEM_MODIFIED;
			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;

			if (cntrl_flags & UPL_SET_LITE) {
				unsigned int    pg_num;

				/* record this page in the UPL's lite bitmap */
				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
				bitmap_set(upl->lite_list, pg_num);

				if (hw_dirty) {
					pmap_clear_modify(phys_page);
				}

				/*
				 * Mark original page as cleaning
				 * in place.
				 */
				dst_page->vmp_cleaning = TRUE;
				dst_page->vmp_precious = FALSE;
			} else {
				/*
				 * use pageclean setup, it is more
				 * convenient even for the pageout
				 * cases here
				 */
				vm_object_lock(upl->map_object);
				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
				vm_object_unlock(upl->map_object);

				/* the alias page is now owned by the UPL's map object */
				alias_page->vmp_absent = FALSE;
				alias_page = NULL;
			}

			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
				upl->flags &= ~UPL_CLEAR_DIRTY;
				upl->flags |= UPL_SET_DIRTY;
				dirty = TRUE;
				/*
				 * Page belonging to a code-signed object is about to
				 * be written. Mark it tainted and disconnect it from
				 * all pmaps so processes have to fault it back in and
				 * deal with the tainted bit.
				 */
				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
					vm_page_upl_tainted++;
					if (dst_page->vmp_pmapped) {
						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
						if (refmod_state & VM_MEM_REFERENCED) {
							dst_page->vmp_reference = TRUE;
						}
					}
				}
			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
				/*
				 * clean in place for read implies
				 * that a write will be done on all
				 * the pages that are dirty before
				 * a upl commit is done.  The caller
				 * is obligated to preserve the
				 * contents of all pages marked dirty
				 */
				upl->flags |= UPL_CLEAR_DIRTY;
			}
			dst_page->vmp_dirty = dirty;

			if (!dirty) {
				dst_page->vmp_precious = TRUE;
			}

			if (!VM_PAGE_WIRED(dst_page)) {
				/*
				 * deny access to the target page while
				 * it is being worked on
				 */
				dst_page->vmp_busy = TRUE;
			} else {
				dwp->dw_mask |= DW_vm_page_wire;
			}

			/*
			 * We might be about to satisfy a fault which has been
			 * requested. So no need for the "restart" bit.
			 */
			dst_page->vmp_restart = FALSE;
			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
				/*
				 * expect the page to be used
				 */
				dwp->dw_mask |= DW_set_reference;
			}
			if (cntrl_flags & UPL_PRECIOUS) {
				if (object->internal) {
					SET_PAGE_DIRTY(dst_page, FALSE);
					dst_page->vmp_precious = FALSE;
				} else {
					dst_page->vmp_precious = TRUE;
				}
			} else {
				dst_page->vmp_precious = FALSE;
			}
		}
		if (dst_page->vmp_busy) {
			upl->flags |= UPL_HAS_BUSY;
		}

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}
		assert(!pmap_is_noencrypt(phys_page));
		/* publish the page's state in the caller-visible page list */
		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
			user_page_list[entry].absent    = dst_page->vmp_absent;
			user_page_list[entry].dirty     = dst_page->vmp_dirty;
			user_page_list[entry].precious  = dst_page->vmp_precious;
			user_page_list[entry].device    = FALSE;
			user_page_list[entry].needed    = FALSE;
			if (dst_page->vmp_clustered == TRUE) {
				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
			} else {
				user_page_list[entry].speculative = FALSE;
			}
			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
			user_page_list[entry].mark      = FALSE;
		}
		/*
		 * if UPL_RET_ONLY_ABSENT is set, then
		 * we are working with a fresh page and we've
		 * just set the clustered flag on it to
		 * indicate that it was drug in as part of a
		 * speculative cluster... so leave it alone
		 */
		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
			/*
			 * someone is explicitly grabbing this page...
			 * update clustered and speculative state
			 *
			 */
			if (dst_page->vmp_clustered) {
				VM_PAGE_CONSUME_CLUSTERED(dst_page);
			}
		}
try_next_page:
		/*
		 * Queue whatever deferred queue operations were recorded for
		 * this page and flush the batch through
		 * vm_page_do_delayed_work() once dw_limit has been reached.
		 */
		if (dwp->dw_mask) {
			if (dwp->dw_mask & DW_vm_page_activate) {
				counter_inc(&vm_statistics_reactivations);
			}

			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);

			if (dw_count >= dw_limit) {
				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);

				dwp = dwp_start;
				dw_count = 0;
			}
		}
		entry++;
		dst_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;
	}
	/* flush any delayed work still pending from the last partial batch */
	if (dw_count) {
		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}

	/* release an alias page that was grabbed but never consumed */
	if (alias_page != NULL) {
		VM_PAGE_FREE(alias_page);
	}
	if (pmap_flushes_delayed == TRUE) {
		pmap_flush(&pmap_flush_context_storage);
	}

	/*
	 * Report the page count back: zero for an internal UPL, otherwise
	 * never more than the number of entries actually processed.
	 */
	if (page_list_count != NULL) {
		if (upl->flags & UPL_INTERNAL) {
			*page_list_count = 0;
		} else if (*page_list_count > entry) {
			*page_list_count = entry;
		}
	}
#if UPL_DEBUG
	upl->upl_state = 1;
#endif
	vm_object_unlock(object);

	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	/*
	 * Give the delayed-work context back, unless we were running on
	 * the on-stack fallback entry.
	 */
	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return KERN_SUCCESS;
}
6549
6550 /*
6551 * Routine: vm_object_super_upl_request
6552 * Purpose:
6553 * Cause the population of a portion of a vm_object
6554 * in much the same way as memory_object_upl_request.
6555 * Depending on the nature of the request, the pages
 *		returned may contain valid data or be uninitialized.
6557 * However, the region may be expanded up to the super
6558 * cluster size provided.
6559 */
6560
6561 __private_extern__ kern_return_t
vm_object_super_upl_request(vm_object_t object,vm_object_offset_t offset,upl_size_t size,upl_size_t super_cluster,upl_t * upl,upl_page_info_t * user_page_list,unsigned int * page_list_count,upl_control_flags_t cntrl_flags,vm_tag_t tag)6562 vm_object_super_upl_request(
6563 vm_object_t object,
6564 vm_object_offset_t offset,
6565 upl_size_t size,
6566 upl_size_t super_cluster,
6567 upl_t *upl,
6568 upl_page_info_t *user_page_list,
6569 unsigned int *page_list_count,
6570 upl_control_flags_t cntrl_flags,
6571 vm_tag_t tag)
6572 {
6573 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
6574 return KERN_FAILURE;
6575 }
6576
6577 assert(object->paging_in_progress);
6578 offset = offset - object->paging_offset;
6579
6580 if (super_cluster > size) {
6581 vm_object_offset_t base_offset;
6582 upl_size_t super_size;
6583 vm_object_size_t super_size_64;
6584
6585 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6586 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
6587 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6588 super_size = (upl_size_t) super_size_64;
6589 assert(super_size == super_size_64);
6590
6591 if (offset > (base_offset + super_size)) {
6592 panic("vm_object_super_upl_request: Missed target pageout"
6593 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6594 offset, base_offset, super_size, super_cluster,
6595 size, object->paging_offset);
6596 }
6597 /*
6598 * apparently there is a case where the vm requests a
6599 * page to be written out who's offset is beyond the
6600 * object size
6601 */
6602 if ((offset + size) > (base_offset + super_size)) {
6603 super_size_64 = (offset + size) - base_offset;
6604 super_size = (upl_size_t) super_size_64;
6605 assert(super_size == super_size_64);
6606 }
6607
6608 offset = base_offset;
6609 size = super_size;
6610 }
6611 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6612 }
6613
/*
 * Global counter, starts at zero.  Its updates are not in this part of
 * the file -- NOTE(review): presumably counts UPLs created from
 * code-signed executable mappings; confirm at the use site.
 */
int cs_executable_create_upl = 0;
/*
 * Process helpers declared locally instead of through a header; their
 * definitions live elsewhere.
 */
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
6617
6618 kern_return_t
vm_map_create_upl(vm_map_t map,vm_map_address_t offset,upl_size_t * upl_size,upl_t * upl,upl_page_info_array_t page_list,unsigned int * count,upl_control_flags_t * flags,vm_tag_t tag)6619 vm_map_create_upl(
6620 vm_map_t map,
6621 vm_map_address_t offset,
6622 upl_size_t *upl_size,
6623 upl_t *upl,
6624 upl_page_info_array_t page_list,
6625 unsigned int *count,
6626 upl_control_flags_t *flags,
6627 vm_tag_t tag)
6628 {
6629 vm_map_entry_t entry;
6630 upl_control_flags_t caller_flags;
6631 int force_data_sync;
6632 int sync_cow_data;
6633 vm_object_t local_object;
6634 vm_map_offset_t local_offset;
6635 vm_map_offset_t local_start;
6636 kern_return_t ret;
6637 vm_map_address_t original_offset;
6638 vm_map_size_t original_size, adjusted_size;
6639 vm_map_offset_t local_entry_start;
6640 vm_object_offset_t local_entry_offset;
6641 vm_object_offset_t offset_in_mapped_page;
6642 boolean_t release_map = FALSE;
6643
6644 start_with_map:
6645
6646 original_offset = offset;
6647 original_size = *upl_size;
6648 adjusted_size = original_size;
6649
6650 caller_flags = *flags;
6651
6652 if (caller_flags & ~UPL_VALID_FLAGS) {
6653 /*
6654 * For forward compatibility's sake,
6655 * reject any unknown flag.
6656 */
6657 ret = KERN_INVALID_VALUE;
6658 goto done;
6659 }
6660 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6661 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6662
6663 if (upl == NULL) {
6664 ret = KERN_INVALID_ARGUMENT;
6665 goto done;
6666 }
6667
6668 REDISCOVER_ENTRY:
6669 vm_map_lock_read(map);
6670
6671 if (!vm_map_lookup_entry(map, offset, &entry)) {
6672 vm_map_unlock_read(map);
6673 ret = KERN_FAILURE;
6674 goto done;
6675 }
6676
6677 local_entry_start = entry->vme_start;
6678 local_entry_offset = VME_OFFSET(entry);
6679
6680 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6681 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6682 }
6683
6684 if (entry->vme_end - original_offset < adjusted_size) {
6685 adjusted_size = entry->vme_end - original_offset;
6686 assert(adjusted_size > 0);
6687 *upl_size = (upl_size_t) adjusted_size;
6688 assert(*upl_size == adjusted_size);
6689 }
6690
6691 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6692 *flags = 0;
6693
6694 if (!entry->is_sub_map &&
6695 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6696 if (VME_OBJECT(entry)->private) {
6697 *flags = UPL_DEV_MEMORY;
6698 }
6699
6700 if (VME_OBJECT(entry)->phys_contiguous) {
6701 *flags |= UPL_PHYS_CONTIG;
6702 }
6703 }
6704 vm_map_unlock_read(map);
6705 ret = KERN_SUCCESS;
6706 goto done;
6707 }
6708
6709 offset_in_mapped_page = 0;
6710 if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6711 offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6712 *upl_size = (upl_size_t)
6713 (vm_map_round_page(original_offset + adjusted_size,
6714 VM_MAP_PAGE_MASK(map))
6715 - offset);
6716
6717 offset_in_mapped_page = original_offset - offset;
6718 assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6719
6720 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6721 }
6722
6723 if (!entry->is_sub_map) {
6724 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6725 !VME_OBJECT(entry)->phys_contiguous) {
6726 if (*upl_size > MAX_UPL_SIZE_BYTES) {
6727 *upl_size = MAX_UPL_SIZE_BYTES;
6728 }
6729 }
6730
6731 /*
6732 * Create an object if necessary.
6733 */
6734 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6735 if (vm_map_lock_read_to_write(map)) {
6736 goto REDISCOVER_ENTRY;
6737 }
6738
6739 VME_OBJECT_SET(entry,
6740 vm_object_allocate((vm_size_t)
6741 vm_object_round_page((entry->vme_end - entry->vme_start))),
6742 false, 0);
6743 VME_OFFSET_SET(entry, 0);
6744 assert(entry->use_pmap);
6745
6746 vm_map_lock_write_to_read(map);
6747 }
6748
6749 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6750 !(entry->protection & VM_PROT_WRITE)) {
6751 vm_map_unlock_read(map);
6752 ret = KERN_PROTECTION_FAILURE;
6753 goto done;
6754 }
6755 }
6756
6757 #if !XNU_TARGET_OS_OSX
6758 if (map->pmap != kernel_pmap &&
6759 (caller_flags & UPL_COPYOUT_FROM) &&
6760 (entry->protection & VM_PROT_EXECUTE) &&
6761 !(entry->protection & VM_PROT_WRITE)) {
6762 vm_offset_t kaddr;
6763 vm_size_t ksize;
6764
6765 /*
6766 * We're about to create a read-only UPL backed by
6767 * memory from an executable mapping.
6768 * Wiring the pages would result in the pages being copied
6769 * (due to the "MAP_PRIVATE" mapping) and no longer
6770 * code-signed, so no longer eligible for execution.
6771 * Instead, let's copy the data into a kernel buffer and
6772 * create the UPL from this kernel buffer.
6773 * The kernel buffer is then freed, leaving the UPL holding
6774 * the last reference on the VM object, so the memory will
6775 * be released when the UPL is committed.
6776 */
6777
6778 vm_map_unlock_read(map);
6779 entry = VM_MAP_ENTRY_NULL;
6780 /* allocate kernel buffer */
6781 ksize = round_page(*upl_size);
6782 kaddr = 0;
6783 ret = kmem_alloc(kernel_map, &kaddr, ksize,
6784 KMA_PAGEABLE | KMA_DATA, tag);
6785 if (ret == KERN_SUCCESS) {
6786 /* copyin the user data */
6787 ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6788 }
6789 if (ret == KERN_SUCCESS) {
6790 if (ksize > *upl_size) {
6791 /* zero out the extra space in kernel buffer */
6792 memset((void *)(kaddr + *upl_size),
6793 0,
6794 ksize - *upl_size);
6795 }
6796 /* create the UPL from the kernel buffer */
6797 vm_object_offset_t offset_in_object;
6798 vm_object_offset_t offset_in_object_page;
6799
6800 offset_in_object = offset - local_entry_start + local_entry_offset;
6801 offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6802 assert(offset_in_object_page < PAGE_SIZE);
6803 assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6804 *upl_size -= offset_in_object_page + offset_in_mapped_page;
6805 ret = vm_map_create_upl(kernel_map,
6806 (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6807 upl_size, upl, page_list, count, flags, tag);
6808 }
6809 if (kaddr != 0) {
6810 /* free the kernel buffer */
6811 kmem_free(kernel_map, kaddr, ksize);
6812 kaddr = 0;
6813 ksize = 0;
6814 }
6815 #if DEVELOPMENT || DEBUG
6816 DTRACE_VM4(create_upl_from_executable,
6817 vm_map_t, map,
6818 vm_map_address_t, offset,
6819 upl_size_t, *upl_size,
6820 kern_return_t, ret);
6821 #endif /* DEVELOPMENT || DEBUG */
6822 goto done;
6823 }
6824 #endif /* !XNU_TARGET_OS_OSX */
6825
6826 if (!entry->is_sub_map) {
6827 local_object = VME_OBJECT(entry);
6828 assert(local_object != VM_OBJECT_NULL);
6829 }
6830
6831 if (!entry->is_sub_map &&
6832 !entry->needs_copy &&
6833 *upl_size != 0 &&
6834 local_object->vo_size > *upl_size && /* partial UPL */
6835 entry->wired_count == 0 && /* No COW for entries that are wired */
6836 (map->pmap != kernel_pmap) && /* alias checks */
6837 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6838 ||
6839 ( /* case 2 */
6840 local_object->internal &&
6841 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6842 local_object->ref_count > 1))) {
6843 vm_prot_t prot;
6844
6845 /*
6846 * Case 1:
6847 * Set up the targeted range for copy-on-write to avoid
6848 * applying true_share/copy_delay to the entire object.
6849 *
6850 * Case 2:
6851 * This map entry covers only part of an internal
6852 * object. There could be other map entries covering
6853 * other areas of this object and some of these map
6854 * entries could be marked as "needs_copy", which
6855 * assumes that the object is COPY_SYMMETRIC.
6856 * To avoid marking this object as COPY_DELAY and
6857 * "true_share", let's shadow it and mark the new
6858 * (smaller) object as "true_share" and COPY_DELAY.
6859 */
6860
6861 if (vm_map_lock_read_to_write(map)) {
6862 goto REDISCOVER_ENTRY;
6863 }
6864 vm_map_lock_assert_exclusive(map);
6865 assert(VME_OBJECT(entry) == local_object);
6866
6867 vm_map_clip_start(map,
6868 entry,
6869 vm_map_trunc_page(offset,
6870 VM_MAP_PAGE_MASK(map)));
6871 vm_map_clip_end(map,
6872 entry,
6873 vm_map_round_page(offset + *upl_size,
6874 VM_MAP_PAGE_MASK(map)));
6875 if ((entry->vme_end - offset) < *upl_size) {
6876 *upl_size = (upl_size_t) (entry->vme_end - offset);
6877 assert(*upl_size == entry->vme_end - offset);
6878 }
6879
6880 prot = entry->protection & ~VM_PROT_WRITE;
6881 if (override_nx(map, VME_ALIAS(entry)) && prot) {
6882 prot |= VM_PROT_EXECUTE;
6883 }
6884 vm_object_pmap_protect(local_object,
6885 VME_OFFSET(entry),
6886 entry->vme_end - entry->vme_start,
6887 ((entry->is_shared ||
6888 map->mapped_in_other_pmaps)
6889 ? PMAP_NULL
6890 : map->pmap),
6891 VM_MAP_PAGE_SIZE(map),
6892 entry->vme_start,
6893 prot);
6894
6895 assert(entry->wired_count == 0);
6896
6897 /*
6898 * Lock the VM object and re-check its status: if it's mapped
6899 * in another address space, we could still be racing with
6900 * another thread holding that other VM map exclusively.
6901 */
6902 vm_object_lock(local_object);
6903 if (local_object->true_share) {
6904 /* object is already in proper state: no COW needed */
6905 assert(local_object->copy_strategy !=
6906 MEMORY_OBJECT_COPY_SYMMETRIC);
6907 } else {
6908 /* not true_share: ask for copy-on-write below */
6909 assert(local_object->copy_strategy ==
6910 MEMORY_OBJECT_COPY_SYMMETRIC);
6911 entry->needs_copy = TRUE;
6912 }
6913 vm_object_unlock(local_object);
6914
6915 vm_map_lock_write_to_read(map);
6916 }
6917
6918 if (entry->needs_copy) {
6919 /*
6920 * Honor copy-on-write for COPY_SYMMETRIC
6921 * strategy.
6922 */
6923 vm_map_t local_map;
6924 vm_object_t object;
6925 vm_object_offset_t new_offset;
6926 vm_prot_t prot;
6927 boolean_t wired;
6928 vm_map_version_t version;
6929 vm_map_t real_map;
6930 vm_prot_t fault_type;
6931
6932 local_map = map;
6933
6934 if (caller_flags & UPL_COPYOUT_FROM) {
6935 fault_type = VM_PROT_READ | VM_PROT_COPY;
6936 vm_counters.create_upl_extra_cow++;
6937 vm_counters.create_upl_extra_cow_pages +=
6938 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6939 } else {
6940 fault_type = VM_PROT_WRITE;
6941 }
6942 if (vm_map_lookup_and_lock_object(&local_map,
6943 offset, fault_type,
6944 OBJECT_LOCK_EXCLUSIVE,
6945 &version, &object,
6946 &new_offset, &prot, &wired,
6947 NULL,
6948 &real_map, NULL) != KERN_SUCCESS) {
6949 if (fault_type == VM_PROT_WRITE) {
6950 vm_counters.create_upl_lookup_failure_write++;
6951 } else {
6952 vm_counters.create_upl_lookup_failure_copy++;
6953 }
6954 vm_map_unlock_read(local_map);
6955 ret = KERN_FAILURE;
6956 goto done;
6957 }
6958 if (real_map != local_map) {
6959 vm_map_unlock(real_map);
6960 }
6961 vm_map_unlock_read(local_map);
6962
6963 vm_object_unlock(object);
6964
6965 goto REDISCOVER_ENTRY;
6966 }
6967
6968 if (entry->is_sub_map) {
6969 vm_map_t submap;
6970
6971 submap = VME_SUBMAP(entry);
6972 local_start = entry->vme_start;
6973 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6974
6975 vm_map_reference(submap);
6976 vm_map_unlock_read(map);
6977
6978 DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
6979 offset += offset_in_mapped_page;
6980 *upl_size -= offset_in_mapped_page;
6981
6982 if (release_map) {
6983 vm_map_deallocate(map);
6984 }
6985 map = submap;
6986 release_map = TRUE;
6987 offset = local_offset + (offset - local_start);
6988 goto start_with_map;
6989 }
6990
6991 if (sync_cow_data &&
6992 (VME_OBJECT(entry)->shadow ||
6993 VME_OBJECT(entry)->copy)) {
6994 local_object = VME_OBJECT(entry);
6995 local_start = entry->vme_start;
6996 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6997
6998 vm_object_reference(local_object);
6999 vm_map_unlock_read(map);
7000
7001 if (local_object->shadow && local_object->copy) {
7002 vm_object_lock_request(local_object->shadow,
7003 ((vm_object_offset_t)
7004 ((offset - local_start) +
7005 local_offset) +
7006 local_object->vo_shadow_offset),
7007 *upl_size, FALSE,
7008 MEMORY_OBJECT_DATA_SYNC,
7009 VM_PROT_NO_CHANGE);
7010 }
7011 sync_cow_data = FALSE;
7012 vm_object_deallocate(local_object);
7013
7014 goto REDISCOVER_ENTRY;
7015 }
7016 if (force_data_sync) {
7017 local_object = VME_OBJECT(entry);
7018 local_start = entry->vme_start;
7019 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7020
7021 vm_object_reference(local_object);
7022 vm_map_unlock_read(map);
7023
7024 vm_object_lock_request(local_object,
7025 ((vm_object_offset_t)
7026 ((offset - local_start) +
7027 local_offset)),
7028 (vm_object_size_t)*upl_size,
7029 FALSE,
7030 MEMORY_OBJECT_DATA_SYNC,
7031 VM_PROT_NO_CHANGE);
7032
7033 force_data_sync = FALSE;
7034 vm_object_deallocate(local_object);
7035
7036 goto REDISCOVER_ENTRY;
7037 }
7038 if (VME_OBJECT(entry)->private) {
7039 *flags = UPL_DEV_MEMORY;
7040 } else {
7041 *flags = 0;
7042 }
7043
7044 if (VME_OBJECT(entry)->phys_contiguous) {
7045 *flags |= UPL_PHYS_CONTIG;
7046 }
7047
7048 local_object = VME_OBJECT(entry);
7049 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7050 local_start = entry->vme_start;
7051
7052 /*
7053 * Wiring will copy the pages to the shadow object.
7054 * The shadow object will not be code-signed so
7055 * attempting to execute code from these copied pages
7056 * would trigger a code-signing violation.
7057 */
7058 if (entry->protection & VM_PROT_EXECUTE) {
7059 #if MACH_ASSERT
7060 printf("pid %d[%s] create_upl out of executable range from "
7061 "0x%llx to 0x%llx: side effects may include "
7062 "code-signing violations later on\n",
7063 proc_selfpid(),
7064 (get_bsdtask_info(current_task())
7065 ? proc_name_address(get_bsdtask_info(current_task()))
7066 : "?"),
7067 (uint64_t) entry->vme_start,
7068 (uint64_t) entry->vme_end);
7069 #endif /* MACH_ASSERT */
7070 DTRACE_VM2(cs_executable_create_upl,
7071 uint64_t, (uint64_t)entry->vme_start,
7072 uint64_t, (uint64_t)entry->vme_end);
7073 cs_executable_create_upl++;
7074 }
7075
7076 vm_object_lock(local_object);
7077
7078 /*
7079 * Ensure that this object is "true_share" and "copy_delay" now,
7080 * while we're still holding the VM map lock. After we unlock the map,
7081 * anything could happen to that mapping, including some copy-on-write
7082 * activity. We need to make sure that the IOPL will point at the
7083 * same memory as the mapping.
7084 */
7085 if (local_object->true_share) {
7086 assert(local_object->copy_strategy !=
7087 MEMORY_OBJECT_COPY_SYMMETRIC);
7088 } else if (local_object != kernel_object &&
7089 local_object != compressor_object &&
7090 !local_object->phys_contiguous) {
7091 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7092 if (!local_object->true_share &&
7093 vm_object_tracking_btlog) {
7094 btlog_record(vm_object_tracking_btlog, local_object,
7095 VM_OBJECT_TRACKING_OP_TRUESHARE,
7096 btref_get(__builtin_frame_address(0), 0));
7097 }
7098 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7099 local_object->true_share = TRUE;
7100 if (local_object->copy_strategy ==
7101 MEMORY_OBJECT_COPY_SYMMETRIC) {
7102 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7103 }
7104 }
7105
7106 vm_object_reference_locked(local_object);
7107 vm_object_unlock(local_object);
7108
7109 vm_map_unlock_read(map);
7110
7111 offset += offset_in_mapped_page;
7112 assert(*upl_size > offset_in_mapped_page);
7113 *upl_size -= offset_in_mapped_page;
7114
7115 ret = vm_object_iopl_request(local_object,
7116 ((vm_object_offset_t)
7117 ((offset - local_start) + local_offset)),
7118 *upl_size,
7119 upl,
7120 page_list,
7121 count,
7122 caller_flags,
7123 tag);
7124 vm_object_deallocate(local_object);
7125
7126 done:
7127 if (release_map) {
7128 vm_map_deallocate(map);
7129 }
7130
7131 return ret;
7132 }
7133
7134 /*
7135 * Internal routine to enter a UPL into a VM map.
7136 *
7137 * JMM - This should just be doable through the standard
7138 * vm_map_enter() API.
7139 */
7140 kern_return_t
vm_map_enter_upl_range(vm_map_t map,upl_t upl,vm_object_offset_t offset_to_map,upl_size_t size_to_map,vm_prot_t prot_to_map,vm_map_offset_t * dst_addr)7141 vm_map_enter_upl_range(
7142 vm_map_t map,
7143 upl_t upl,
7144 vm_object_offset_t offset_to_map,
7145 upl_size_t size_to_map,
7146 vm_prot_t prot_to_map,
7147 vm_map_offset_t *dst_addr)
7148 {
7149 vm_map_size_t size;
7150 vm_object_offset_t offset;
7151 vm_map_offset_t addr;
7152 vm_page_t m;
7153 kern_return_t kr;
7154 int isVectorUPL = 0, curr_upl = 0;
7155 upl_t vector_upl = NULL;
7156 mach_vm_offset_t vector_upl_dst_addr = 0;
7157 vm_map_t vector_upl_submap = NULL;
7158 upl_offset_t subupl_offset = 0;
7159 upl_size_t subupl_size = 0;
7160
7161 if (upl == UPL_NULL) {
7162 return KERN_INVALID_ARGUMENT;
7163 }
7164
7165 DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7166 assert(map == kernel_map);
7167
7168 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7169 int mapped = 0, valid_upls = 0;
7170 vector_upl = upl;
7171
7172 upl_lock(vector_upl);
7173 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7174 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7175 if (upl == NULL) {
7176 continue;
7177 }
7178 valid_upls++;
7179 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7180 mapped++;
7181 }
7182 }
7183
7184 if (mapped) {
7185 if (mapped != valid_upls) {
7186 panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7187 } else {
7188 upl_unlock(vector_upl);
7189 return KERN_FAILURE;
7190 }
7191 }
7192
7193 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7194 panic("TODO4K: vector UPL not implemented");
7195 }
7196
7197 vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7198 vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7199 VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7200 VM_KERN_MEMORY_NONE).kmr_submap;
7201 map = vector_upl_submap;
7202 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7203 curr_upl = 0;
7204 } else {
7205 upl_lock(upl);
7206 }
7207
7208 process_upl_to_enter:
7209 if (isVectorUPL) {
7210 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7211 *dst_addr = vector_upl_dst_addr;
7212 upl_unlock(vector_upl);
7213 return KERN_SUCCESS;
7214 }
7215 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7216 if (upl == NULL) {
7217 goto process_upl_to_enter;
7218 }
7219
7220 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7221 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7222 } else {
7223 /*
7224 * check to see if already mapped
7225 */
7226 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7227 upl_unlock(upl);
7228 return KERN_FAILURE;
7229 }
7230 }
7231
7232 if ((!(upl->flags & UPL_SHADOWED)) &&
7233 ((upl->flags & UPL_HAS_BUSY) ||
7234 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7235 vm_object_t object;
7236 vm_page_t alias_page;
7237 vm_object_offset_t new_offset;
7238 unsigned int pg_num;
7239
7240 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7241 object = upl->map_object;
7242 upl->map_object = vm_object_allocate(vm_object_round_page(size));
7243
7244 vm_object_lock(upl->map_object);
7245
7246 upl->map_object->shadow = object;
7247 upl->map_object->pageout = TRUE;
7248 upl->map_object->can_persist = FALSE;
7249 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7250 upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7251 assertf(page_aligned(upl->map_object->vo_shadow_offset),
7252 "object %p shadow_offset 0x%llx",
7253 upl->map_object,
7254 (uint64_t)upl->map_object->vo_shadow_offset);
7255 upl->map_object->wimg_bits = object->wimg_bits;
7256 offset = upl->map_object->vo_shadow_offset;
7257 new_offset = 0;
7258
7259 upl->flags |= UPL_SHADOWED;
7260
7261 while (size) {
7262 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7263 assert(pg_num == new_offset / PAGE_SIZE);
7264
7265 if (bitmap_test(upl->lite_list, pg_num)) {
7266 alias_page = vm_page_grab_fictitious(TRUE);
7267
7268 vm_object_lock(object);
7269
7270 m = vm_page_lookup(object, offset);
7271 if (m == VM_PAGE_NULL) {
7272 panic("vm_upl_map: page missing");
7273 }
7274
7275 /*
7276 * Convert the fictitious page to a private
7277 * shadow of the real page.
7278 */
7279 assert(alias_page->vmp_fictitious);
7280 alias_page->vmp_fictitious = FALSE;
7281 alias_page->vmp_private = TRUE;
7282 alias_page->vmp_free_when_done = TRUE;
7283 /*
7284 * since m is a page in the upl it must
7285 * already be wired or BUSY, so it's
7286 * safe to assign the underlying physical
7287 * page to the alias
7288 */
7289 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7290
7291 vm_object_unlock(object);
7292
7293 vm_page_lockspin_queues();
7294 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7295 vm_page_unlock_queues();
7296
7297 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7298
7299 assert(!alias_page->vmp_wanted);
7300 alias_page->vmp_busy = FALSE;
7301 alias_page->vmp_absent = FALSE;
7302 }
7303 size -= PAGE_SIZE;
7304 offset += PAGE_SIZE_64;
7305 new_offset += PAGE_SIZE_64;
7306 }
7307 vm_object_unlock(upl->map_object);
7308 }
7309 if (upl->flags & UPL_SHADOWED) {
7310 if (isVectorUPL) {
7311 offset = 0;
7312 } else {
7313 offset = offset_to_map;
7314 }
7315 } else {
7316 offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7317 if (!isVectorUPL) {
7318 offset += offset_to_map;
7319 }
7320 }
7321
7322 if (isVectorUPL) {
7323 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7324 } else {
7325 size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7326 }
7327
7328 vm_object_reference(upl->map_object);
7329
7330 if (!isVectorUPL) {
7331 *dst_addr = 0;
7332 /*
7333 * NEED A UPL_MAP ALIAS
7334 */
7335 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7336 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7337 upl->map_object, offset, FALSE,
7338 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7339
7340 if (kr != KERN_SUCCESS) {
7341 vm_object_deallocate(upl->map_object);
7342 upl_unlock(upl);
7343 return kr;
7344 }
7345 } else {
7346 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7347 VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7348 upl->map_object, offset, FALSE,
7349 prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7350 if (kr) {
7351 panic("vm_map_enter failed for a Vector UPL");
7352 }
7353 }
7354 upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7355 /* this will have to be an increment rather than */
7356 /* an assignment. */
7357 vm_object_lock(upl->map_object);
7358
7359 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7360 m = vm_page_lookup(upl->map_object, offset);
7361
7362 if (m) {
7363 m->vmp_pmapped = TRUE;
7364
7365 /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
7366 * but only in kernel space. If this was on a user map,
7367 * we'd have to set the wpmapped bit. */
7368 /* m->vmp_wpmapped = TRUE; */
7369 assert(map->pmap == kernel_pmap);
7370
7371 PMAP_ENTER(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE, kr);
7372
7373 assert(kr == KERN_SUCCESS);
7374 #if KASAN
7375 kasan_notify_address(addr, PAGE_SIZE_64);
7376 #endif
7377 }
7378 offset += PAGE_SIZE_64;
7379 }
7380 vm_object_unlock(upl->map_object);
7381
7382 /*
7383 * hold a reference for the mapping
7384 */
7385 upl->ref_count++;
7386 upl->flags |= UPL_PAGE_LIST_MAPPED;
7387 upl->kaddr = (vm_offset_t) *dst_addr;
7388 assert(upl->kaddr == *dst_addr);
7389
7390 if (isVectorUPL) {
7391 goto process_upl_to_enter;
7392 }
7393
7394 if (!isVectorUPL) {
7395 vm_map_offset_t addr_adjustment;
7396
7397 addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7398 if (addr_adjustment) {
7399 assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7400 DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7401 *dst_addr += addr_adjustment;
7402 }
7403 }
7404
7405 upl_unlock(upl);
7406
7407 return KERN_SUCCESS;
7408 }
7409
7410 kern_return_t
vm_map_enter_upl(vm_map_t map,upl_t upl,vm_map_offset_t * dst_addr)7411 vm_map_enter_upl(
7412 vm_map_t map,
7413 upl_t upl,
7414 vm_map_offset_t *dst_addr)
7415 {
7416 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7417 return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7418 }
7419
7420 /*
7421 * Internal routine to remove a UPL mapping from a VM map.
7422 *
7423 * XXX - This should just be doable through a standard
7424 * vm_map_remove() operation. Otherwise, implicit clean-up
7425 * of the target map won't be able to correctly remove
7426 * these (and release the reference on the UPL). Having
7427 * to do this means we can't map these into user-space
7428 * maps yet.
7429 */
7430 kern_return_t
vm_map_remove_upl_range(vm_map_t map,upl_t upl,__unused vm_object_offset_t offset_to_unmap,__unused upl_size_t size_to_unmap)7431 vm_map_remove_upl_range(
7432 vm_map_t map,
7433 upl_t upl,
7434 __unused vm_object_offset_t offset_to_unmap,
7435 __unused upl_size_t size_to_unmap)
7436 {
7437 vm_address_t addr;
7438 upl_size_t size;
7439 int isVectorUPL = 0, curr_upl = 0;
7440 upl_t vector_upl = NULL;
7441
7442 if (upl == UPL_NULL) {
7443 return KERN_INVALID_ARGUMENT;
7444 }
7445
7446 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7447 int unmapped = 0, valid_upls = 0;
7448 vector_upl = upl;
7449 upl_lock(vector_upl);
7450 for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7451 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7452 if (upl == NULL) {
7453 continue;
7454 }
7455 valid_upls++;
7456 if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7457 unmapped++;
7458 }
7459 }
7460
7461 if (unmapped) {
7462 if (unmapped != valid_upls) {
7463 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
7464 } else {
7465 upl_unlock(vector_upl);
7466 return KERN_FAILURE;
7467 }
7468 }
7469 curr_upl = 0;
7470 } else {
7471 upl_lock(upl);
7472 }
7473
7474 process_upl_to_remove:
7475 if (isVectorUPL) {
7476 if (curr_upl == vector_upl_max_upls(vector_upl)) {
7477 vm_map_t v_upl_submap;
7478 vm_offset_t v_upl_submap_dst_addr;
7479 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7480
7481 kmem_free_guard(map, v_upl_submap_dst_addr,
7482 vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
7483 vm_map_deallocate(v_upl_submap);
7484 upl_unlock(vector_upl);
7485 return KERN_SUCCESS;
7486 }
7487
7488 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7489 if (upl == NULL) {
7490 goto process_upl_to_remove;
7491 }
7492 }
7493
7494 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7495 addr = upl->kaddr;
7496 size = upl->u_mapped_size;
7497
7498 assert(upl->ref_count > 1);
7499 upl->ref_count--; /* removing mapping ref */
7500
7501 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7502 upl->kaddr = (vm_offset_t) 0;
7503 upl->u_mapped_size = 0;
7504
7505 if (isVectorUPL) {
7506 /*
7507 * If it's a Vectored UPL, we'll be removing the entire
7508 * submap anyways, so no need to remove individual UPL
7509 * element mappings from within the submap
7510 */
7511 goto process_upl_to_remove;
7512 }
7513
7514 upl_unlock(upl);
7515
7516 vm_map_remove(map,
7517 vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
7518 vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
7519 return KERN_SUCCESS;
7520 }
7521 upl_unlock(upl);
7522
7523 return KERN_FAILURE;
7524 }
7525
7526 kern_return_t
vm_map_remove_upl(vm_map_t map,upl_t upl)7527 vm_map_remove_upl(
7528 vm_map_t map,
7529 upl_t upl)
7530 {
7531 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7532 return vm_map_remove_upl_range(map, upl, 0, upl_size);
7533 }
7534
7535 kern_return_t
upl_commit_range(upl_t upl,upl_offset_t offset,upl_size_t size,int flags,upl_page_info_t * page_list,mach_msg_type_number_t count,boolean_t * empty)7536 upl_commit_range(
7537 upl_t upl,
7538 upl_offset_t offset,
7539 upl_size_t size,
7540 int flags,
7541 upl_page_info_t *page_list,
7542 mach_msg_type_number_t count,
7543 boolean_t *empty)
7544 {
7545 upl_size_t xfer_size, subupl_size;
7546 vm_object_t shadow_object;
7547 vm_object_t object;
7548 vm_object_t m_object;
7549 vm_object_offset_t target_offset;
7550 upl_offset_t subupl_offset = offset;
7551 int entry;
7552 int occupied;
7553 int clear_refmod = 0;
7554 int pgpgout_count = 0;
7555 struct vm_page_delayed_work dw_array;
7556 struct vm_page_delayed_work *dwp, *dwp_start;
7557 bool dwp_finish_ctx = TRUE;
7558 int dw_count;
7559 int dw_limit;
7560 int isVectorUPL = 0;
7561 upl_t vector_upl = NULL;
7562 boolean_t should_be_throttled = FALSE;
7563
7564 vm_page_t nxt_page = VM_PAGE_NULL;
7565 int fast_path_possible = 0;
7566 int fast_path_full_commit = 0;
7567 int throttle_page = 0;
7568 int unwired_count = 0;
7569 int local_queue_count = 0;
7570 vm_page_t first_local, last_local;
7571 vm_object_offset_t obj_start, obj_end, obj_offset;
7572 kern_return_t kr = KERN_SUCCESS;
7573
7574 // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7575
7576 dwp_start = dwp = NULL;
7577
7578 subupl_size = size;
7579 *empty = FALSE;
7580
7581 if (upl == UPL_NULL) {
7582 return KERN_INVALID_ARGUMENT;
7583 }
7584
7585 dw_count = 0;
7586 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7587 dwp_start = vm_page_delayed_work_get_ctx();
7588 if (dwp_start == NULL) {
7589 dwp_start = &dw_array;
7590 dw_limit = 1;
7591 dwp_finish_ctx = FALSE;
7592 }
7593
7594 dwp = dwp_start;
7595
7596 if (count == 0) {
7597 page_list = NULL;
7598 }
7599
7600 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7601 vector_upl = upl;
7602 upl_lock(vector_upl);
7603 } else {
7604 upl_lock(upl);
7605 }
7606
7607 process_upl_to_commit:
7608
7609 if (isVectorUPL) {
7610 size = subupl_size;
7611 offset = subupl_offset;
7612 if (size == 0) {
7613 upl_unlock(vector_upl);
7614 kr = KERN_SUCCESS;
7615 goto done;
7616 }
7617 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7618 if (upl == NULL) {
7619 upl_unlock(vector_upl);
7620 kr = KERN_FAILURE;
7621 goto done;
7622 }
7623 page_list = upl->page_list;
7624 subupl_size -= size;
7625 subupl_offset += size;
7626 }
7627
7628 #if UPL_DEBUG
7629 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7630 upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0);
7631 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7632 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7633
7634 upl->upl_commit_index++;
7635 }
7636 #endif
7637 if (upl->flags & UPL_DEVICE_MEMORY) {
7638 xfer_size = 0;
7639 } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7640 xfer_size = size;
7641 } else {
7642 if (!isVectorUPL) {
7643 upl_unlock(upl);
7644 } else {
7645 upl_unlock(vector_upl);
7646 }
7647 DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7648 kr = KERN_FAILURE;
7649 goto done;
7650 }
7651 if (upl->flags & UPL_SET_DIRTY) {
7652 flags |= UPL_COMMIT_SET_DIRTY;
7653 }
7654 if (upl->flags & UPL_CLEAR_DIRTY) {
7655 flags |= UPL_COMMIT_CLEAR_DIRTY;
7656 }
7657
7658 object = upl->map_object;
7659
7660 if (upl->flags & UPL_SHADOWED) {
7661 vm_object_lock(object);
7662 shadow_object = object->shadow;
7663 } else {
7664 shadow_object = object;
7665 }
7666 entry = offset / PAGE_SIZE;
7667 target_offset = (vm_object_offset_t)offset;
7668
7669 if (upl->flags & UPL_KERNEL_OBJECT) {
7670 vm_object_lock_shared(shadow_object);
7671 } else {
7672 vm_object_lock(shadow_object);
7673 }
7674
7675 VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7676
7677 if (upl->flags & UPL_ACCESS_BLOCKED) {
7678 assert(shadow_object->blocked_access);
7679 shadow_object->blocked_access = FALSE;
7680 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7681 }
7682
7683 if (shadow_object->code_signed) {
7684 /*
7685 * CODE SIGNING:
7686 * If the object is code-signed, do not let this UPL tell
7687 * us if the pages are valid or not. Let the pages be
7688 * validated by VM the normal way (when they get mapped or
7689 * copied).
7690 */
7691 flags &= ~UPL_COMMIT_CS_VALIDATED;
7692 }
7693 if (!page_list) {
7694 /*
7695 * No page list to get the code-signing info from !?
7696 */
7697 flags &= ~UPL_COMMIT_CS_VALIDATED;
7698 }
7699 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7700 should_be_throttled = TRUE;
7701 }
7702
7703 if ((upl->flags & UPL_IO_WIRE) &&
7704 !(flags & UPL_COMMIT_FREE_ABSENT) &&
7705 !isVectorUPL &&
7706 shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7707 shadow_object->purgable != VM_PURGABLE_EMPTY) {
7708 if (!vm_page_queue_empty(&shadow_object->memq)) {
7709 if (size == shadow_object->vo_size) {
7710 nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7711 fast_path_full_commit = 1;
7712 }
7713 fast_path_possible = 1;
7714
7715 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7716 (shadow_object->purgable == VM_PURGABLE_DENY ||
7717 shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7718 shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7719 throttle_page = 1;
7720 }
7721 }
7722 }
7723 first_local = VM_PAGE_NULL;
7724 last_local = VM_PAGE_NULL;
7725
7726 obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7727 obj_end = obj_start + xfer_size;
7728 obj_start = vm_object_trunc_page(obj_start);
7729 obj_end = vm_object_round_page(obj_end);
7730 for (obj_offset = obj_start;
7731 obj_offset < obj_end;
7732 obj_offset += PAGE_SIZE) {
7733 vm_page_t t, m;
7734
7735 dwp->dw_mask = 0;
7736 clear_refmod = 0;
7737
7738 m = VM_PAGE_NULL;
7739
7740 if (upl->flags & UPL_LITE) {
7741 unsigned int pg_num;
7742
7743 if (nxt_page != VM_PAGE_NULL) {
7744 m = nxt_page;
7745 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7746 target_offset = m->vmp_offset;
7747 }
7748 pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7749 assert(pg_num == target_offset / PAGE_SIZE);
7750
7751 if (bitmap_test(upl->lite_list, pg_num)) {
7752 bitmap_clear(upl->lite_list, pg_num);
7753
7754 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7755 m = vm_page_lookup(shadow_object, obj_offset);
7756 }
7757 } else {
7758 m = NULL;
7759 }
7760 }
7761 if (upl->flags & UPL_SHADOWED) {
7762 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7763 t->vmp_free_when_done = FALSE;
7764
7765 VM_PAGE_FREE(t);
7766
7767 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7768 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7769 }
7770 }
7771 }
7772 if (m == VM_PAGE_NULL) {
7773 goto commit_next_page;
7774 }
7775
7776 m_object = VM_PAGE_OBJECT(m);
7777
7778 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7779 assert(m->vmp_busy);
7780
7781 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7782 goto commit_next_page;
7783 }
7784
7785 if (flags & UPL_COMMIT_CS_VALIDATED) {
7786 /*
7787 * CODE SIGNING:
7788 * Set the code signing bits according to
7789 * what the UPL says they should be.
7790 */
7791 m->vmp_cs_validated |= page_list[entry].cs_validated;
7792 m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7793 m->vmp_cs_nx |= page_list[entry].cs_nx;
7794 }
7795 if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7796 m->vmp_written_by_kernel = TRUE;
7797 }
7798
7799 if (upl->flags & UPL_IO_WIRE) {
7800 if (page_list) {
7801 page_list[entry].phys_addr = 0;
7802 }
7803
7804 if (flags & UPL_COMMIT_SET_DIRTY) {
7805 SET_PAGE_DIRTY(m, FALSE);
7806 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7807 m->vmp_dirty = FALSE;
7808
7809 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7810 m->vmp_cs_validated &&
7811 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7812 /*
7813 * CODE SIGNING:
7814 * This page is no longer dirty
7815 * but could have been modified,
7816 * so it will need to be
7817 * re-validated.
7818 */
7819 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7820
7821 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7822
7823 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7824 }
7825 clear_refmod |= VM_MEM_MODIFIED;
7826 }
7827 if (upl->flags & UPL_ACCESS_BLOCKED) {
7828 /*
7829 * We blocked access to the pages in this UPL.
7830 * Clear the "busy" bit and wake up any waiter
7831 * for this page.
7832 */
7833 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7834 }
7835 if (fast_path_possible) {
7836 assert(m_object->purgable != VM_PURGABLE_EMPTY);
7837 assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7838 if (m->vmp_absent) {
7839 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7840 assert(m->vmp_wire_count == 0);
7841 assert(m->vmp_busy);
7842
7843 m->vmp_absent = FALSE;
7844 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7845 } else {
7846 if (m->vmp_wire_count == 0) {
7847 panic("wire_count == 0, m = %p, obj = %p", m, shadow_object);
7848 }
7849 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7850
7851 /*
7852 * XXX FBDP need to update some other
7853 * counters here (purgeable_wired_count)
7854 * (ledgers), ...
7855 */
7856 assert(m->vmp_wire_count > 0);
7857 m->vmp_wire_count--;
7858
7859 if (m->vmp_wire_count == 0) {
7860 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7861 unwired_count++;
7862 }
7863 }
7864 if (m->vmp_wire_count == 0) {
7865 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7866
7867 if (last_local == VM_PAGE_NULL) {
7868 assert(first_local == VM_PAGE_NULL);
7869
7870 last_local = m;
7871 first_local = m;
7872 } else {
7873 assert(first_local != VM_PAGE_NULL);
7874
7875 m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7876 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7877 first_local = m;
7878 }
7879 local_queue_count++;
7880
7881 if (throttle_page) {
7882 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7883 } else {
7884 if (flags & UPL_COMMIT_INACTIVATE) {
7885 if (shadow_object->internal) {
7886 m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7887 } else {
7888 m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7889 }
7890 } else {
7891 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7892 }
7893 }
7894 }
7895 } else {
7896 if (flags & UPL_COMMIT_INACTIVATE) {
7897 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7898 clear_refmod |= VM_MEM_REFERENCED;
7899 }
7900 if (m->vmp_absent) {
7901 if (flags & UPL_COMMIT_FREE_ABSENT) {
7902 dwp->dw_mask |= DW_vm_page_free;
7903 } else {
7904 m->vmp_absent = FALSE;
7905 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7906
7907 if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7908 dwp->dw_mask |= DW_vm_page_activate;
7909 }
7910 }
7911 } else {
7912 dwp->dw_mask |= DW_vm_page_unwire;
7913 }
7914 }
7915 goto commit_next_page;
7916 }
7917 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7918
7919 if (page_list) {
7920 page_list[entry].phys_addr = 0;
7921 }
7922
7923 /*
7924 * make sure to clear the hardware
7925 * modify or reference bits before
7926 * releasing the BUSY bit on this page
7927 * otherwise we risk losing a legitimate
7928 * change of state
7929 */
7930 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7931 m->vmp_dirty = FALSE;
7932
7933 clear_refmod |= VM_MEM_MODIFIED;
7934 }
7935 if (m->vmp_laundry) {
7936 dwp->dw_mask |= DW_vm_pageout_throttle_up;
7937 }
7938
7939 if (VM_PAGE_WIRED(m)) {
7940 m->vmp_free_when_done = FALSE;
7941 }
7942
7943 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7944 m->vmp_cs_validated &&
7945 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7946 /*
7947 * CODE SIGNING:
7948 * This page is no longer dirty
7949 * but could have been modified,
7950 * so it will need to be
7951 * re-validated.
7952 */
7953 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7954
7955 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7956
7957 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7958 }
7959 if (m->vmp_overwriting) {
7960 /*
7961 * the (COPY_OUT_FROM == FALSE) request_page_list case
7962 */
7963 if (m->vmp_busy) {
7964 #if CONFIG_PHANTOM_CACHE
7965 if (m->vmp_absent && !m_object->internal) {
7966 dwp->dw_mask |= DW_vm_phantom_cache_update;
7967 }
7968 #endif
7969 m->vmp_absent = FALSE;
7970
7971 dwp->dw_mask |= DW_clear_busy;
7972 } else {
7973 /*
7974 * alternate (COPY_OUT_FROM == FALSE) page_list case
7975 * Occurs when the original page was wired
7976 * at the time of the list request
7977 */
7978 assert(VM_PAGE_WIRED(m));
7979
7980 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
7981 }
7982 m->vmp_overwriting = FALSE;
7983 }
7984 m->vmp_cleaning = FALSE;
7985
7986 if (m->vmp_free_when_done) {
7987 /*
7988 * With the clean queue enabled, UPL_PAGEOUT should
7989 * no longer set the pageout bit. Its pages now go
7990 * to the clean queue.
7991 *
7992 * We don't use the cleaned Q anymore and so this
7993 * assert isn't correct. The code for the clean Q
7994 * still exists and might be used in the future. If we
7995 * go back to the cleaned Q, we will re-enable this
7996 * assert.
7997 *
7998 * assert(!(upl->flags & UPL_PAGEOUT));
7999 */
8000 assert(!m_object->internal);
8001
8002 m->vmp_free_when_done = FALSE;
8003
8004 if ((flags & UPL_COMMIT_SET_DIRTY) ||
8005 (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
8006 /*
8007 * page was re-dirtied after we started
8008 * the pageout... reactivate it since
8009 * we don't know whether the on-disk
8010 * copy matches what is now in memory
8011 */
8012 SET_PAGE_DIRTY(m, FALSE);
8013
8014 dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
8015
8016 if (upl->flags & UPL_PAGEOUT) {
8017 counter_inc(&vm_statistics_reactivations);
8018 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
8019 }
8020 } else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) {
8021 /*
8022 * Someone else might still be handling this
8023 * page (vm_fault() for example), so let's not
8024 * free it or "un-busy" it!
8025 * Put that page in the "speculative" queue
8026 * for now (since we would otherwise have freed
8027 * it) and let whoever is keeping the page
8028 * "busy" move it if needed when they're done
8029 * with it.
8030 */
8031 dwp->dw_mask |= DW_vm_page_speculate;
8032 } else {
8033 /*
8034 * page has been successfully cleaned
8035 * go ahead and free it for other use
8036 */
8037 if (m_object->internal) {
8038 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
8039 } else {
8040 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
8041 }
8042 m->vmp_dirty = FALSE;
8043 if (!(upl->flags & UPL_HAS_BUSY)) {
8044 assert(!m->vmp_busy);
8045 }
8046 m->vmp_busy = TRUE;
8047
8048 dwp->dw_mask |= DW_vm_page_free;
8049 }
8050 goto commit_next_page;
8051 }
8052 /*
8053 * It is a part of the semantic of COPYOUT_FROM
8054 * UPLs that a commit implies cache sync
8055 * between the vm page and the backing store
8056 * this can be used to strip the precious bit
8057 * as well as clean
8058 */
8059 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
8060 m->vmp_precious = FALSE;
8061 }
8062
8063 if (flags & UPL_COMMIT_SET_DIRTY) {
8064 SET_PAGE_DIRTY(m, FALSE);
8065 } else {
8066 m->vmp_dirty = FALSE;
8067 }
8068
8069 /* with the clean queue on, move *all* cleaned pages to the clean queue */
8070 if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
8071 pgpgout_count++;
8072
8073 counter_inc(&vm_statistics_pageouts);
8074 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
8075
8076 dwp->dw_mask |= DW_enqueue_cleaned;
8077 } else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
8078 /*
8079 * page coming back in from being 'frozen'...
8080 * it was dirty before it was frozen, so keep it so
8081 * the vm_page_activate will notice that it really belongs
8082 * on the throttle queue and put it there
8083 */
8084 SET_PAGE_DIRTY(m, FALSE);
8085 dwp->dw_mask |= DW_vm_page_activate;
8086 } else {
8087 if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
8088 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8089 clear_refmod |= VM_MEM_REFERENCED;
8090 } else if (!VM_PAGE_PAGEABLE(m)) {
8091 if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
8092 dwp->dw_mask |= DW_vm_page_speculate;
8093 } else if (m->vmp_reference) {
8094 dwp->dw_mask |= DW_vm_page_activate;
8095 } else {
8096 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8097 clear_refmod |= VM_MEM_REFERENCED;
8098 }
8099 }
8100 }
8101 if (upl->flags & UPL_ACCESS_BLOCKED) {
8102 /*
8103 * We blocked access to the pages in this URL.
8104 * Clear the "busy" bit on this page before we
8105 * wake up any waiter.
8106 */
8107 dwp->dw_mask |= DW_clear_busy;
8108 }
8109 /*
8110 * Wakeup any thread waiting for the page to be un-cleaning.
8111 */
8112 dwp->dw_mask |= DW_PAGE_WAKEUP;
8113
8114 commit_next_page:
8115 if (clear_refmod) {
8116 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
8117 }
8118
8119 target_offset += PAGE_SIZE_64;
8120 xfer_size -= PAGE_SIZE;
8121 entry++;
8122
8123 if (dwp->dw_mask) {
8124 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8125 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8126
8127 if (dw_count >= dw_limit) {
8128 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8129
8130 dwp = dwp_start;
8131 dw_count = 0;
8132 }
8133 } else {
8134 if (dwp->dw_mask & DW_clear_busy) {
8135 m->vmp_busy = FALSE;
8136 }
8137
8138 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8139 PAGE_WAKEUP(m);
8140 }
8141 }
8142 }
8143 }
8144 if (dw_count) {
8145 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8146 dwp = dwp_start;
8147 dw_count = 0;
8148 }
8149
8150 if (fast_path_possible) {
8151 assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
8152 assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
8153
8154 if (local_queue_count || unwired_count) {
8155 if (local_queue_count) {
8156 vm_page_t first_target;
8157 vm_page_queue_head_t *target_queue;
8158
8159 if (throttle_page) {
8160 target_queue = &vm_page_queue_throttled;
8161 } else {
8162 if (flags & UPL_COMMIT_INACTIVATE) {
8163 if (shadow_object->internal) {
8164 target_queue = &vm_page_queue_anonymous;
8165 } else {
8166 target_queue = &vm_page_queue_inactive;
8167 }
8168 } else {
8169 target_queue = &vm_page_queue_active;
8170 }
8171 }
8172 /*
8173 * Transfer the entire local queue to a regular LRU page queues.
8174 */
8175 vm_page_lockspin_queues();
8176
8177 first_target = (vm_page_t) vm_page_queue_first(target_queue);
8178
8179 if (vm_page_queue_empty(target_queue)) {
8180 target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8181 } else {
8182 first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8183 }
8184
8185 target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
8186 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
8187 last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
8188
8189 /*
8190 * Adjust the global page counts.
8191 */
8192 if (throttle_page) {
8193 vm_page_throttled_count += local_queue_count;
8194 } else {
8195 if (flags & UPL_COMMIT_INACTIVATE) {
8196 if (shadow_object->internal) {
8197 vm_page_anonymous_count += local_queue_count;
8198 }
8199 vm_page_inactive_count += local_queue_count;
8200
8201 token_new_pagecount += local_queue_count;
8202 } else {
8203 vm_page_active_count += local_queue_count;
8204 }
8205
8206 if (shadow_object->internal) {
8207 vm_page_pageable_internal_count += local_queue_count;
8208 } else {
8209 vm_page_pageable_external_count += local_queue_count;
8210 }
8211 }
8212 } else {
8213 vm_page_lockspin_queues();
8214 }
8215 if (unwired_count) {
8216 vm_page_wire_count -= unwired_count;
8217 VM_CHECK_MEMORYSTATUS;
8218 }
8219 vm_page_unlock_queues();
8220
8221 VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
8222 }
8223 }
8224
8225 if (upl->flags & UPL_DEVICE_MEMORY) {
8226 occupied = 0;
8227 } else if (upl->flags & UPL_LITE) {
8228 uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
8229
8230 occupied = !fast_path_full_commit &&
8231 !bitmap_is_empty(upl->lite_list, pages);
8232 } else {
8233 occupied = !vm_page_queue_empty(&upl->map_object->memq);
8234 }
8235 if (occupied == 0) {
8236 /*
8237 * If this UPL element belongs to a Vector UPL and is
8238 * empty, then this is the right function to deallocate
8239 * it. So go ahead set the *empty variable. The flag
8240 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8241 * should be considered relevant for the Vector UPL and not
8242 * the internal UPLs.
8243 */
8244 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8245 *empty = TRUE;
8246 }
8247
8248 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8249 /*
8250 * this is not a paging object
8251 * so we need to drop the paging reference
8252 * that was taken when we created the UPL
8253 * against this object
8254 */
8255 vm_object_activity_end(shadow_object);
8256 vm_object_collapse(shadow_object, 0, TRUE);
8257 } else {
8258 /*
8259 * we dontated the paging reference to
8260 * the map object... vm_pageout_object_terminate
8261 * will drop this reference
8262 */
8263 }
8264 }
8265 VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
8266 vm_object_unlock(shadow_object);
8267 if (object != shadow_object) {
8268 vm_object_unlock(object);
8269 }
8270
8271 if (!isVectorUPL) {
8272 upl_unlock(upl);
8273 } else {
8274 /*
8275 * If we completed our operations on an UPL that is
8276 * part of a Vectored UPL and if empty is TRUE, then
8277 * we should go ahead and deallocate this UPL element.
8278 * Then we check if this was the last of the UPL elements
8279 * within that Vectored UPL. If so, set empty to TRUE
8280 * so that in ubc_upl_commit_range or ubc_upl_commit, we
8281 * can go ahead and deallocate the Vector UPL too.
8282 */
8283 if (*empty == TRUE) {
8284 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
8285 upl_deallocate(upl);
8286 }
8287 goto process_upl_to_commit;
8288 }
8289 if (pgpgout_count) {
8290 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
8291 }
8292
8293 kr = KERN_SUCCESS;
8294 done:
8295 if (dwp_start && dwp_finish_ctx) {
8296 vm_page_delayed_work_finish_ctx(dwp_start);
8297 dwp_start = dwp = NULL;
8298 }
8299
8300 return kr;
8301 }
8302
/*
 * upl_abort_range:
 *
 * Abort the pages of "upl" that fall in [offset, offset + size), using the
 * UPL_ABORT_* bits in "error" to decide what happens to each page.
 *
 * - A UPL_IO_WIRE UPL that is not being dumped (UPL_ABORT_DUMP_PAGES clear)
 *   is not handled here: it is passed to upl_commit_range() with
 *   UPL_COMMIT_FREE_ABSENT and that call's result is returned.
 * - If "upl" is a vector UPL, the range is consumed one sub-UPL at a time
 *   by looping back to process_upl_to_abort until subupl_size reaches 0.
 * - Per-page work is either applied immediately (busy-clear / wakeup only)
 *   or batched in a delayed-work array and flushed through
 *   vm_page_do_delayed_work().
 *
 * "*empty" is written unconditionally, before "upl" is even checked, so
 * the pointer must be valid.  It ends up TRUE when the UPL holds no more
 * pages and either the UPL has UPL_COMMIT_NOTIFY_EMPTY set or it is an
 * element of a vector UPL; for a vector UPL the emptied element is
 * deallocated here and "*empty" then takes the value returned by
 * vector_upl_set_subupl().
 *
 * Returns:
 *	KERN_INVALID_ARGUMENT	upl is UPL_NULL
 *	KERN_FAILURE		the range extends past the UPL's adjusted
 *				size, or no sub-UPL matches the offset
 *	KERN_SUCCESS		otherwise
 */
kern_return_t
upl_abort_range(
	upl_t upl,
	upl_offset_t offset,
	upl_size_t size,
	int error,
	boolean_t *empty)
{
	upl_size_t xfer_size, subupl_size;
	vm_object_t shadow_object;
	vm_object_t object;
	vm_object_offset_t target_offset;
	upl_offset_t subupl_offset = offset;
	int occupied;
	struct vm_page_delayed_work dw_array;
	struct vm_page_delayed_work *dwp, *dwp_start;
	bool dwp_finish_ctx = TRUE;
	int dw_count;
	int dw_limit;
	int isVectorUPL = 0;
	upl_t vector_upl = NULL;
	vm_object_offset_t obj_start, obj_end, obj_offset;
	kern_return_t kr = KERN_SUCCESS;

	// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);

	dwp_start = dwp = NULL;

	subupl_size = size;
	*empty = FALSE;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * I/O-wired UPLs that are not being dumped are aborted by
	 * committing them with UPL_COMMIT_FREE_ABSENT.
	 */
	if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
		return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
	}

	dw_count = 0;
	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
	dwp_start = vm_page_delayed_work_get_ctx();
	if (dwp_start == NULL) {
		/*
		 * No delayed-work context was obtained: fall back to the
		 * single on-stack entry.  With dw_limit == 1 every queued
		 * page is flushed immediately, and dwp_finish_ctx records
		 * that there is no context to release at "done".
		 */
		dwp_start = &dw_array;
		dw_limit = 1;
		dwp_finish_ctx = FALSE;
	}

	dwp = dwp_start;

	/*
	 * For a vector UPL only the vector itself is locked; "upl" is
	 * re-pointed at each sub-UPL below.
	 */
	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		vector_upl = upl;
		upl_lock(vector_upl);
	} else {
		upl_lock(upl);
	}

process_upl_to_abort:
	if (isVectorUPL) {
		/*
		 * Pick the next sub-UPL covering what is left of the
		 * caller's range.  vector_upl_subupl_byoffset() is handed
		 * pointers to offset and size, which are then used as the
		 * range within the selected sub-UPL.
		 */
		size = subupl_size;
		offset = subupl_offset;
		if (size == 0) {
			/* the whole requested range has been processed */
			upl_unlock(vector_upl);
			kr = KERN_SUCCESS;
			goto done;
		}
		upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
		if (upl == NULL) {
			upl_unlock(vector_upl);
			kr = KERN_FAILURE;
			goto done;
		}
		subupl_size -= size;
		subupl_offset += size;
	}

	*empty = FALSE;

#if UPL_DEBUG
	/* record this abort in the UPL's commit/abort history */
	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
		upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0);
		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
		upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;

		upl->upl_commit_index++;
	}
#endif
	if (upl->flags & UPL_DEVICE_MEMORY) {
		/* no per-page work for device memory: the page loop is skipped */
		xfer_size = 0;
	} else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
		xfer_size = size;
	} else {
		/* the requested range does not fit inside this UPL */
		if (!isVectorUPL) {
			upl_unlock(upl);
		} else {
			upl_unlock(vector_upl);
		}
		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
		kr = KERN_FAILURE;
		goto done;
	}
	object = upl->map_object;

	/*
	 * For a shadowed UPL the pages of interest live in the map
	 * object's shadow; both objects end up locked.  Otherwise
	 * "object" and "shadow_object" are the same object.
	 */
	if (upl->flags & UPL_SHADOWED) {
		vm_object_lock(object);
		shadow_object = object->shadow;
	} else {
		shadow_object = object;
	}

	target_offset = (vm_object_offset_t)offset;

	if (upl->flags & UPL_KERNEL_OBJECT) {
		vm_object_lock_shared(shadow_object);
	} else {
		vm_object_lock(shadow_object);
	}

	if (upl->flags & UPL_ACCESS_BLOCKED) {
		/*
		 * NOTE(review): blocked_access is cleared on shadow_object
		 * but the wakeup is posted on "object"; the two differ only
		 * for UPL_SHADOWED UPLs -- confirm this is intended.
		 */
		assert(shadow_object->blocked_access);
		shadow_object->blocked_access = FALSE;
		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
	}

	if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
		panic("upl_abort_range: kernel_object being DUMPED");
	}

	/*
	 * Walk the range one page at a time.  obj_offset is the page's
	 * offset within shadow_object (rounded out to page boundaries);
	 * target_offset is the matching offset within the UPL and is
	 * advanced in step at abort_next_page.
	 */
	obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
	obj_end = obj_start + xfer_size;
	obj_start = vm_object_trunc_page(obj_start);
	obj_end = vm_object_round_page(obj_end);
	for (obj_offset = obj_start;
	    obj_offset < obj_end;
	    obj_offset += PAGE_SIZE) {
		vm_page_t t, m;
		unsigned int pg_num;
		boolean_t needed;

		/* index of this page within the UPL; the assert catches truncation */
		pg_num = (unsigned int) (target_offset / PAGE_SIZE);
		assert(pg_num == target_offset / PAGE_SIZE);

		needed = FALSE;

		if (upl->flags & UPL_INTERNAL) {
			needed = upl->page_list[pg_num].needed;
		}

		/* start this page with no pending work and no page found */
		dwp->dw_mask = 0;
		m = VM_PAGE_NULL;

		if (upl->flags & UPL_LITE) {
			/*
			 * Lite UPLs track their pages in a bitmap: drop this
			 * page from the bitmap and, unless this is a
			 * kernel-object UPL, look the page up in the object.
			 */
			if (bitmap_test(upl->lite_list, pg_num)) {
				bitmap_clear(upl->lite_list, pg_num);

				if (!(upl->flags & UPL_KERNEL_OBJECT)) {
					m = vm_page_lookup(shadow_object, obj_offset);
				}
			}
		}
		if (upl->flags & UPL_SHADOWED) {
			/*
			 * Free the page found in the map object at this
			 * offset, then fall back to the corresponding page
			 * in the shadow if none was found above.
			 */
			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
				t->vmp_free_when_done = FALSE;

				VM_PAGE_FREE(t);

				if (m == VM_PAGE_NULL) {
					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
				}
			}
		}
		/*
		 * Nothing further is done per page for kernel-object UPLs:
		 * dw_mask is still 0 here, so abort_next_page only advances
		 * the offsets.
		 */
		if ((upl->flags & UPL_KERNEL_OBJECT)) {
			goto abort_next_page;
		}

		if (m != VM_PAGE_NULL) {
			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);

			if (m->vmp_absent) {
				boolean_t must_free = TRUE;

				/*
				 * COPYOUT = FALSE case
				 * check for error conditions which must
				 * be passed back to the pages customer
				 */
				if (error & UPL_ABORT_RESTART) {
					m->vmp_restart = TRUE;
					m->vmp_absent = FALSE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				} else if (error & UPL_ABORT_UNAVAILABLE) {
					/* note: vmp_absent is left set in this case */
					m->vmp_restart = FALSE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				} else if (error & UPL_ABORT_ERROR) {
					m->vmp_restart = FALSE;
					m->vmp_absent = FALSE;
					m->vmp_error = TRUE;
					m->vmp_unusual = TRUE;
					must_free = FALSE;
				}
				if (m->vmp_clustered && needed == FALSE) {
					/*
					 * This page was a part of a speculative
					 * read-ahead initiated by the kernel
					 * itself.  No one is expecting this
					 * page and no one will clean up its
					 * error state if it ever becomes valid
					 * in the future.
					 * We have to free it here.
					 */
					must_free = TRUE;
				}
				m->vmp_cleaning = FALSE;

				if (m->vmp_overwriting && !m->vmp_busy) {
					/*
					 * this shouldn't happen since
					 * this is an 'absent' page, but
					 * it doesn't hurt to check for
					 * the 'alternate' method of
					 * stabilizing the page...
					 * we will mark 'busy' to be cleared
					 * in the following code which will
					 * take care of the primary stabilization
					 * method (i.e. setting 'busy' to TRUE)
					 */
					dwp->dw_mask |= DW_vm_page_unwire;
				}
				m->vmp_overwriting = FALSE;

				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);

				/*
				 * Either free the page or activate it so the
				 * state recorded above is kept with the page.
				 */
				if (must_free == TRUE) {
					dwp->dw_mask |= DW_vm_page_free;
				} else {
					dwp->dw_mask |= DW_vm_page_activate;
				}
			} else {
				/*
				 * Handle the trusted pager throttle.
				 */
				if (m->vmp_laundry) {
					dwp->dw_mask |= DW_vm_pageout_throttle_up;
				}

				if (upl->flags & UPL_ACCESS_BLOCKED) {
					/*
					 * We blocked access to the pages in this UPL.
					 * Clear the "busy" bit and wake up any waiter
					 * for this page.
					 */
					dwp->dw_mask |= DW_clear_busy;
				}
				if (m->vmp_overwriting) {
					if (m->vmp_busy) {
						dwp->dw_mask |= DW_clear_busy;
					} else {
						/*
						 * deal with the 'alternate' method
						 * of stabilizing the page...
						 * we will either free the page
						 * or mark 'busy' to be cleared
						 * in the following code which will
						 * take care of the primary stabilization
						 * method (i.e. setting 'busy' to TRUE)
						 */
						dwp->dw_mask |= DW_vm_page_unwire;
					}
					m->vmp_overwriting = FALSE;
				}
				m->vmp_free_when_done = FALSE;
				m->vmp_cleaning = FALSE;

				if (error & UPL_ABORT_DUMP_PAGES) {
					/* remove the page's mappings before queueing it to be freed */
					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

					dwp->dw_mask |= DW_vm_page_free;
				} else {
					/*
					 * A page already scheduled for unwire gets
					 * no additional queue operation.
					 */
					if (!(dwp->dw_mask & DW_vm_page_unwire)) {
						if (error & UPL_ABORT_REFERENCE) {
							/*
							 * we've been told to explicitly
							 * reference this page... for
							 * file I/O, this is done by
							 * implementing an LRU on the inactive q
							 */
							dwp->dw_mask |= DW_vm_page_lru;
						} else if (!VM_PAGE_PAGEABLE(m)) {
							dwp->dw_mask |= DW_vm_page_deactivate_internal;
						}
					}
					dwp->dw_mask |= DW_PAGE_WAKEUP;
				}
			}
		}
abort_next_page:
		target_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;

		if (dwp->dw_mask) {
			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
				/*
				 * Something beyond busy-clear/wakeup is
				 * required: queue the page and flush the
				 * batch once it reaches dw_limit.
				 */
				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);

				if (dw_count >= dw_limit) {
					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);

					dwp = dwp_start;
					dw_count = 0;
				}
			} else {
				/*
				 * Only busy-clear and/or wakeup requested:
				 * do it right here instead of batching.
				 */
				if (dwp->dw_mask & DW_clear_busy) {
					m->vmp_busy = FALSE;
				}

				if (dwp->dw_mask & DW_PAGE_WAKEUP) {
					PAGE_WAKEUP(m);
				}
			}
		}
	}
	/* flush whatever is still batched */
	if (dw_count) {
		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}

	/*
	 * Work out whether the UPL still holds any pages: never for device
	 * memory, via the bitmap for lite UPLs, via the map object's page
	 * queue otherwise.
	 */
	if (upl->flags & UPL_DEVICE_MEMORY) {
		occupied = 0;
	} else if (upl->flags & UPL_LITE) {
		uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));

		occupied = !bitmap_is_empty(upl->lite_list, pages);
	} else {
		occupied = !vm_page_queue_empty(&upl->map_object->memq);
	}
	if (occupied == 0) {
		/*
		 * If this UPL element belongs to a Vector UPL and is
		 * empty, then this is the right function to deallocate
		 * it. So go ahead set the *empty variable. The flag
		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
		 * should be considered relevant for the Vector UPL and
		 * not the internal UPLs.
		 */
		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
			*empty = TRUE;
		}

		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
			/*
			 * this is not a paging object
			 * so we need to drop the paging reference
			 * that was taken when we created the UPL
			 * against this object
			 */
			vm_object_activity_end(shadow_object);
			vm_object_collapse(shadow_object, 0, TRUE);
		} else {
			/*
			 * we donated the paging reference to
			 * the map object... vm_pageout_object_terminate
			 * will drop this reference
			 */
		}
	}
	vm_object_unlock(shadow_object);
	if (object != shadow_object) {
		vm_object_unlock(object);
	}

	if (!isVectorUPL) {
		upl_unlock(upl);
	} else {
		/*
		 * If we completed our operations on an UPL that is
		 * part of a Vectored UPL and if empty is TRUE, then
		 * we should go ahead and deallocate this UPL element.
		 * Then we check if this was the last of the UPL elements
		 * within that Vectored UPL. If so, set empty to TRUE
		 * so that in ubc_upl_abort_range or ubc_upl_abort, we
		 * can go ahead and deallocate the Vector UPL too.
		 */
		if (*empty == TRUE) {
			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
			upl_deallocate(upl);
		}
		/* the vector UPL stays locked; go handle the next sub-UPL */
		goto process_upl_to_abort;
	}

	kr = KERN_SUCCESS;

done:
	/* release the delayed-work context unless the on-stack fallback was used */
	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return kr;
}
8705
8706
8707 kern_return_t
upl_abort(upl_t upl,int error)8708 upl_abort(
8709 upl_t upl,
8710 int error)
8711 {
8712 boolean_t empty;
8713
8714 if (upl == UPL_NULL) {
8715 return KERN_INVALID_ARGUMENT;
8716 }
8717
8718 return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8719 }
8720
8721
8722 /* an option on commit should be wire */
8723 kern_return_t
upl_commit(upl_t upl,upl_page_info_t * page_list,mach_msg_type_number_t count)8724 upl_commit(
8725 upl_t upl,
8726 upl_page_info_t *page_list,
8727 mach_msg_type_number_t count)
8728 {
8729 boolean_t empty;
8730
8731 if (upl == UPL_NULL) {
8732 return KERN_INVALID_ARGUMENT;
8733 }
8734
8735 return upl_commit_range(upl, 0, upl->u_size, 0,
8736 page_list, count, &empty);
8737 }
8738
8739
8740 void
iopl_valid_data(upl_t upl,vm_tag_t tag)8741 iopl_valid_data(
8742 upl_t upl,
8743 vm_tag_t tag)
8744 {
8745 vm_object_t object;
8746 vm_offset_t offset;
8747 vm_page_t m, nxt_page = VM_PAGE_NULL;
8748 upl_size_t size;
8749 int wired_count = 0;
8750
8751 if (upl == NULL) {
8752 panic("iopl_valid_data: NULL upl");
8753 }
8754 if (vector_upl_is_valid(upl)) {
8755 panic("iopl_valid_data: vector upl");
8756 }
8757 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
8758 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
8759 }
8760
8761 object = upl->map_object;
8762
8763 if (object == kernel_object || object == compressor_object) {
8764 panic("iopl_valid_data: object == kernel or compressor");
8765 }
8766
8767 if (object->purgable == VM_PURGABLE_VOLATILE ||
8768 object->purgable == VM_PURGABLE_EMPTY) {
8769 panic("iopl_valid_data: object %p purgable %d",
8770 object, object->purgable);
8771 }
8772
8773 size = upl_adjusted_size(upl, PAGE_MASK);
8774
8775 vm_object_lock(object);
8776 VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
8777
8778 bool whole_object;
8779
8780 if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
8781 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
8782 whole_object = true;
8783 } else {
8784 offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
8785 whole_object = false;
8786 }
8787
8788 while (size) {
8789 if (whole_object) {
8790 if (nxt_page != VM_PAGE_NULL) {
8791 m = nxt_page;
8792 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
8793 }
8794 } else {
8795 m = vm_page_lookup(object, offset);
8796 offset += PAGE_SIZE;
8797
8798 if (m == VM_PAGE_NULL) {
8799 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
8800 }
8801 }
8802 if (m->vmp_busy) {
8803 if (!m->vmp_absent) {
8804 panic("iopl_valid_data: busy page w/o absent");
8805 }
8806
8807 if (m->vmp_pageq.next || m->vmp_pageq.prev) {
8808 panic("iopl_valid_data: busy+absent page on page queue");
8809 }
8810 if (m->vmp_reusable) {
8811 panic("iopl_valid_data: %p is reusable", m);
8812 }
8813
8814 m->vmp_absent = FALSE;
8815 m->vmp_dirty = TRUE;
8816 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
8817 assert(m->vmp_wire_count == 0);
8818 m->vmp_wire_count++;
8819 assert(m->vmp_wire_count);
8820 if (m->vmp_wire_count == 1) {
8821 m->vmp_q_state = VM_PAGE_IS_WIRED;
8822 wired_count++;
8823 } else {
8824 panic("iopl_valid_data: %p already wired", m);
8825 }
8826
8827 PAGE_WAKEUP_DONE(m);
8828 }
8829 size -= PAGE_SIZE;
8830 }
8831 if (wired_count) {
8832 VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
8833 assert(object->resident_page_count >= object->wired_page_count);
8834
8835 /* no need to adjust purgeable accounting for this object: */
8836 assert(object->purgable != VM_PURGABLE_VOLATILE);
8837 assert(object->purgable != VM_PURGABLE_EMPTY);
8838
8839 vm_page_lockspin_queues();
8840 vm_page_wire_count += wired_count;
8841 vm_page_unlock_queues();
8842 }
8843 VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
8844 vm_object_unlock(object);
8845 }
8846
8847
8848 void
vm_object_set_pmap_cache_attr(vm_object_t object,upl_page_info_array_t user_page_list,unsigned int num_pages,boolean_t batch_pmap_op)8849 vm_object_set_pmap_cache_attr(
8850 vm_object_t object,
8851 upl_page_info_array_t user_page_list,
8852 unsigned int num_pages,
8853 boolean_t batch_pmap_op)
8854 {
8855 unsigned int cache_attr = 0;
8856
8857 cache_attr = object->wimg_bits & VM_WIMG_MASK;
8858 assert(user_page_list);
8859 if (cache_attr != VM_WIMG_USE_DEFAULT) {
8860 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8861 }
8862 }
8863
8864
8865 static bool
vm_object_iopl_wire_full(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,upl_control_flags_t cntrl_flags,vm_tag_t tag)8866 vm_object_iopl_wire_full(
8867 vm_object_t object,
8868 upl_t upl,
8869 upl_page_info_array_t user_page_list,
8870 upl_control_flags_t cntrl_flags,
8871 vm_tag_t tag)
8872 {
8873 vm_page_t dst_page;
8874 unsigned int entry;
8875 int page_count;
8876 int delayed_unlock = 0;
8877 boolean_t retval = TRUE;
8878 ppnum_t phys_page;
8879
8880 vm_object_lock_assert_exclusive(object);
8881 assert(object->purgable != VM_PURGABLE_VOLATILE);
8882 assert(object->purgable != VM_PURGABLE_EMPTY);
8883 assert(object->pager == NULL);
8884 assert(object->copy == NULL);
8885 assert(object->shadow == NULL);
8886
8887 page_count = object->resident_page_count;
8888 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
8889
8890 vm_page_lock_queues();
8891
8892 while (page_count--) {
8893 if (dst_page->vmp_busy ||
8894 dst_page->vmp_fictitious ||
8895 dst_page->vmp_absent ||
8896 VMP_ERROR_GET(dst_page) ||
8897 dst_page->vmp_cleaning ||
8898 dst_page->vmp_restart ||
8899 dst_page->vmp_laundry) {
8900 retval = FALSE;
8901 goto done;
8902 }
8903 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8904 retval = FALSE;
8905 goto done;
8906 }
8907 dst_page->vmp_reference = TRUE;
8908
8909 vm_page_wire(dst_page, tag, FALSE);
8910
8911 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8912 SET_PAGE_DIRTY(dst_page, FALSE);
8913 }
8914 entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
8915 assert(entry >= 0 && entry < object->resident_page_count);
8916 bitmap_set(upl->lite_list, entry);
8917
8918 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8919
8920 if (phys_page > upl->highest_page) {
8921 upl->highest_page = phys_page;
8922 }
8923
8924 if (user_page_list) {
8925 user_page_list[entry].phys_addr = phys_page;
8926 user_page_list[entry].absent = dst_page->vmp_absent;
8927 user_page_list[entry].dirty = dst_page->vmp_dirty;
8928 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
8929 user_page_list[entry].precious = dst_page->vmp_precious;
8930 user_page_list[entry].device = FALSE;
8931 user_page_list[entry].speculative = FALSE;
8932 user_page_list[entry].cs_validated = FALSE;
8933 user_page_list[entry].cs_tainted = FALSE;
8934 user_page_list[entry].cs_nx = FALSE;
8935 user_page_list[entry].needed = FALSE;
8936 user_page_list[entry].mark = FALSE;
8937 }
8938 if (delayed_unlock++ > 256) {
8939 delayed_unlock = 0;
8940 lck_mtx_yield(&vm_page_queue_lock);
8941
8942 VM_CHECK_MEMORYSTATUS;
8943 }
8944 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
8945 }
8946 done:
8947 vm_page_unlock_queues();
8948
8949 VM_CHECK_MEMORYSTATUS;
8950
8951 return retval;
8952 }
8953
8954
8955 static kern_return_t
vm_object_iopl_wire_empty(vm_object_t object,upl_t upl,upl_page_info_array_t user_page_list,upl_control_flags_t cntrl_flags,vm_tag_t tag,vm_object_offset_t * dst_offset,int page_count,int * page_grab_count)8956 vm_object_iopl_wire_empty(
8957 vm_object_t object,
8958 upl_t upl,
8959 upl_page_info_array_t user_page_list,
8960 upl_control_flags_t cntrl_flags,
8961 vm_tag_t tag,
8962 vm_object_offset_t *dst_offset,
8963 int page_count,
8964 int *page_grab_count)
8965 {
8966 vm_page_t dst_page;
8967 boolean_t no_zero_fill = FALSE;
8968 int interruptible;
8969 int pages_wired = 0;
8970 int pages_inserted = 0;
8971 int entry = 0;
8972 uint64_t delayed_ledger_update = 0;
8973 kern_return_t ret = KERN_SUCCESS;
8974 int grab_options;
8975 ppnum_t phys_page;
8976
8977 vm_object_lock_assert_exclusive(object);
8978 assert(object->purgable != VM_PURGABLE_VOLATILE);
8979 assert(object->purgable != VM_PURGABLE_EMPTY);
8980 assert(object->pager == NULL);
8981 assert(object->copy == NULL);
8982 assert(object->shadow == NULL);
8983
8984 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
8985 interruptible = THREAD_ABORTSAFE;
8986 } else {
8987 interruptible = THREAD_UNINT;
8988 }
8989
8990 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8991 no_zero_fill = TRUE;
8992 }
8993
8994 grab_options = 0;
8995 #if CONFIG_SECLUDED_MEMORY
8996 if (object->can_grab_secluded) {
8997 grab_options |= VM_PAGE_GRAB_SECLUDED;
8998 }
8999 #endif /* CONFIG_SECLUDED_MEMORY */
9000
9001 while (page_count--) {
9002 while ((dst_page = vm_page_grab_options(grab_options))
9003 == VM_PAGE_NULL) {
9004 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
9005
9006 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9007
9008 if (vm_page_wait(interruptible) == FALSE) {
9009 /*
9010 * interrupted case
9011 */
9012 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9013
9014 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9015
9016 ret = MACH_SEND_INTERRUPTED;
9017 goto done;
9018 }
9019 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9020
9021 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9022 }
9023 if (no_zero_fill == FALSE) {
9024 vm_page_zero_fill(dst_page);
9025 } else {
9026 dst_page->vmp_absent = TRUE;
9027 }
9028
9029 dst_page->vmp_reference = TRUE;
9030
9031 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9032 SET_PAGE_DIRTY(dst_page, FALSE);
9033 }
9034 if (dst_page->vmp_absent == FALSE) {
9035 assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
9036 assert(dst_page->vmp_wire_count == 0);
9037 dst_page->vmp_wire_count++;
9038 dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
9039 assert(dst_page->vmp_wire_count);
9040 pages_wired++;
9041 PAGE_WAKEUP_DONE(dst_page);
9042 }
9043 pages_inserted++;
9044
9045 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
9046
9047 bitmap_set(upl->lite_list, entry);
9048
9049 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9050
9051 if (phys_page > upl->highest_page) {
9052 upl->highest_page = phys_page;
9053 }
9054
9055 if (user_page_list) {
9056 user_page_list[entry].phys_addr = phys_page;
9057 user_page_list[entry].absent = dst_page->vmp_absent;
9058 user_page_list[entry].dirty = dst_page->vmp_dirty;
9059 user_page_list[entry].free_when_done = FALSE;
9060 user_page_list[entry].precious = FALSE;
9061 user_page_list[entry].device = FALSE;
9062 user_page_list[entry].speculative = FALSE;
9063 user_page_list[entry].cs_validated = FALSE;
9064 user_page_list[entry].cs_tainted = FALSE;
9065 user_page_list[entry].cs_nx = FALSE;
9066 user_page_list[entry].needed = FALSE;
9067 user_page_list[entry].mark = FALSE;
9068 }
9069 entry++;
9070 *dst_offset += PAGE_SIZE_64;
9071 }
9072 done:
9073 if (pages_wired) {
9074 vm_page_lockspin_queues();
9075 vm_page_wire_count += pages_wired;
9076 vm_page_unlock_queues();
9077 }
9078 if (pages_inserted) {
9079 if (object->internal) {
9080 OSAddAtomic(pages_inserted, &vm_page_internal_count);
9081 } else {
9082 OSAddAtomic(pages_inserted, &vm_page_external_count);
9083 }
9084 }
9085 if (delayed_ledger_update) {
9086 task_t owner;
9087 int ledger_idx_volatile;
9088 int ledger_idx_nonvolatile;
9089 int ledger_idx_volatile_compressed;
9090 int ledger_idx_nonvolatile_compressed;
9091 boolean_t do_footprint;
9092
9093 owner = VM_OBJECT_OWNER(object);
9094 assert(owner);
9095
9096 vm_object_ledger_tag_ledgers(object,
9097 &ledger_idx_volatile,
9098 &ledger_idx_nonvolatile,
9099 &ledger_idx_volatile_compressed,
9100 &ledger_idx_nonvolatile_compressed,
9101 &do_footprint);
9102
9103 /* more non-volatile bytes */
9104 ledger_credit(owner->ledger,
9105 ledger_idx_nonvolatile,
9106 delayed_ledger_update);
9107 if (do_footprint) {
9108 /* more footprint */
9109 ledger_credit(owner->ledger,
9110 task_ledgers.phys_footprint,
9111 delayed_ledger_update);
9112 }
9113 }
9114
9115 assert(page_grab_count);
9116 *page_grab_count = pages_inserted;
9117
9118 return ret;
9119 }
9120
9121
9122
/*
 * vm_object_iopl_request:
 *
 * Build an I/O UPL (created with UPL_IO_WIRE) covering [offset, offset + size)
 * of "object", after expanding that range to whole pages, and hand it back
 * through *upl_ptr.
 *
 *	object		object to build the UPL against
 *	offset, size	byte range requested; rounded out to page boundaries
 *	upl_ptr		receives the new UPL (stored before any page is processed)
 *	user_page_list	optional array filled with one descriptor per page;
 *			replaced by the UPL's own list when UPL_SET_INTERNAL
 *			is set
 *	page_list_count	optional; on success set to 0 for an internal UPL,
 *			otherwise 1 for a physically contiguous object or
 *			clamped to the number of pages covered
 *	cntrl_flags	UPL_* control flags; any flag outside UPL_VALID_FLAGS
 *			is rejected
 *	tag		passed to the page wiring / insertion routines
 *
 * Three ways of gathering the pages are used:
 *   - a physically contiguous object only gets its first page recorded and
 *     the UPL flagged UPL_DEVICE_MEMORY;
 *   - a request covering a whole object with no pager, shadow or copy object
 *     may use vm_object_iopl_wire_full() (all pages resident) or
 *     vm_object_iopl_wire_empty() (no page resident);
 *   - everything else goes through the per-page loop, which faults pages in
 *     with vm_fault_page() and wires them via the delayed work mechanism.
 *
 * Returns KERN_SUCCESS, or:
 *	KERN_INVALID_VALUE	unknown flag, or UPL_NEED_32BIT_ADDR without
 *				both UPL_SET_IO_WIRE and UPL_SET_LITE
 *	KERN_INVALID_ADDRESS	contiguous object at or beyond
 *				max_valid_dma_address with UPL_NEED_32BIT_ADDR
 *	KERN_MEMORY_ERROR	a page would have to be faulted in but
 *				UPL_REQUEST_NO_FAULT is set, or the fault failed
 *				without an error code
 *	MACH_SEND_INTERRUPTED	a wait for memory or a fault was interrupted
 *	KERN_PROTECTION_FAILURE	a page above the DMA limit is already wired
 *	KERN_RESOURCE_SHORTAGE	no low-memory page available for substitution
 *	or the error code reported by vm_fault_page().
 *
 * On the "return_err" path the pages processed so far are unwired or freed
 * and the UPL is passed to upl_destroy().
 * NOTE(review): *upl_ptr is not reset on that path, so it should be treated
 * as invalid whenever the return value is not KERN_SUCCESS.
 */
kern_return_t
vm_object_iopl_request(
	vm_object_t             object,
	vm_object_offset_t      offset,
	upl_size_t              size,
	upl_t                   *upl_ptr,
	upl_page_info_array_t   user_page_list,
	unsigned int            *page_list_count,
	upl_control_flags_t     cntrl_flags,
	vm_tag_t                tag)
{
	vm_page_t               dst_page;
	vm_object_offset_t      dst_offset;
	upl_size_t              xfer_size;
	upl_t                   upl = NULL;
	unsigned int            entry;
	int                     no_zero_fill = FALSE;
	unsigned int            size_in_pages;
	int                     page_grab_count = 0;
	u_int32_t               psize;
	kern_return_t           ret;
	vm_prot_t               prot;
	struct vm_object_fault_info fault_info = {};
	struct vm_page_delayed_work     dw_array;
	struct vm_page_delayed_work     *dwp, *dwp_start;
	bool                    dwp_finish_ctx = TRUE;
	int                     dw_count;
	int                     dw_limit;
	int                     dw_index;
	boolean_t               caller_lookup;
	int                     io_tracking_flag = 0;
	int                     interruptible;
	ppnum_t                 phys_page;

	boolean_t               set_cache_attr_needed = FALSE;
	boolean_t               free_wired_pages = FALSE;
	boolean_t               fast_path_empty_req = FALSE;
	boolean_t               fast_path_full_req = FALSE;

#if DEVELOPMENT || DEBUG
	task_t                  task = current_task();
#endif /* DEVELOPMENT || DEBUG */

	dwp_start = dwp = NULL;

	/* remember the caller's exact range: it is what gets stored in the UPL */
	vm_object_offset_t original_offset = offset;
	upl_size_t original_size = size;

//	DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);

	/* work on whole pages: round the end up and the start down */
	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
	offset = vm_object_trunc_page(offset);
	if (size != original_size || offset != original_offset) {
		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
	}

	if (cntrl_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		return KERN_INVALID_VALUE;
	}
	/* the 32-bit addressing request is dropped when vm_lopage_needed is FALSE */
	if (vm_lopage_needed == FALSE) {
		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
	}

	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
		/* only supported together with both UPL_SET_IO_WIRE and UPL_SET_LITE */
		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
			return KERN_INVALID_VALUE;
		}

		if (object->phys_contiguous) {
			/* both ends of the physical range must lie below the DMA limit */
			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
				return KERN_INVALID_ADDRESS;
			}

			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
				return KERN_INVALID_ADDRESS;
			}
		}
	}
	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
		no_zero_fill = TRUE;
	}

	/* UPL_COPYOUT_FROM requests read-only access; otherwise read/write */
	if (cntrl_flags & UPL_COPYOUT_FROM) {
		prot = VM_PROT_READ;
	} else {
		prot = VM_PROT_READ | VM_PROT_WRITE;
	}

	if ((!object->internal) && (object->paging_offset != 0)) {
		panic("vm_object_iopl_request: external object with non-zero paging offset");
	}


	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);

#if CONFIG_IOSCHED || UPL_DEBUG
	if ((object->io_tracking && object != kernel_object) || upl_debug_enabled) {
		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
	}
#endif

#if CONFIG_IOSCHED
	if (object->io_tracking) {
		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
		if (object != kernel_object) {
			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
		}
	}
#endif

	if (object->phys_contiguous) {
		/* only page_list[0] is filled in for a contiguous object (see below) */
		psize = PAGE_SIZE;
	} else {
		psize = size;

		/*
		 * Set up the delayed work array used by the per-page loop.
		 * If no context can be obtained, fall back to the single
		 * on-stack element, which must not be handed to
		 * vm_page_delayed_work_finish_ctx() later.
		 */
		dw_count = 0;
		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
		dwp_start = vm_page_delayed_work_get_ctx();
		if (dwp_start == NULL) {
			dwp_start = &dw_array;
			dw_limit = 1;
			dwp_finish_ctx = FALSE;
		}

		dwp = dwp_start;
	}

	if (cntrl_flags & UPL_SET_INTERNAL) {
		/* internal UPL: use the page list embedded in the UPL itself */
		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
		user_page_list = size ? upl->page_list : NULL;
	} else {
		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
	}
	if (user_page_list) {
		user_page_list[0].device = FALSE;
	}
	*upl_ptr = upl;

	if (cntrl_flags & UPL_NOZEROFILLIO) {
		DTRACE_VM4(upl_nozerofillio,
		    vm_object_t, object,
		    vm_object_offset_t, offset,
		    upl_size_t, size,
		    upl_t, upl);
	}

	upl->map_object = object;
	upl->u_offset = original_offset;
	upl->u_size = original_size;

	size_in_pages = size / PAGE_SIZE;

	/*
	 * Plain kernel object requests take the object lock shared (exclusive
	 * when UPL_DEBUG is configured) and take no activity reference;
	 * every other request locks the object exclusively and marks
	 * activity in progress.
	 */
	if (object == kernel_object &&
	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
		upl->flags |= UPL_KERNEL_OBJECT;
#if UPL_DEBUG
		vm_object_lock(object);
#else
		vm_object_lock_shared(object);
#endif
	} else {
		vm_object_lock(object);
		vm_object_activity_begin(object);
	}
	/*
	 * paging in progress also protects the paging_offset
	 */
	upl->u_offset = original_offset + object->paging_offset;

	if (cntrl_flags & UPL_BLOCK_ACCESS) {
		/*
		 * The user requested that access to the pages in this UPL
		 * be blocked until the UPL is committed or aborted.
		 */
		upl->flags |= UPL_ACCESS_BLOCKED;
	}

#if CONFIG_IOSCHED || UPL_DEBUG
	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
		vm_object_activity_begin(object);
		queue_enter(&object->uplq, upl, upl_t, uplq);
	}
#endif

	if (object->phys_contiguous) {
		/*
		 * Physically contiguous object: no per-page work is done.
		 * Record the first physical page and return.
		 */
		if (upl->flags & UPL_ACCESS_BLOCKED) {
			assert(!object->blocked_access);
			object->blocked_access = TRUE;
		}

		vm_object_unlock(object);

		/*
		 * don't need any shadow mappings for this one
		 * since it is already I/O memory
		 */
		upl->flags |= UPL_DEVICE_MEMORY;

		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);

		if (user_page_list) {
			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
			user_page_list[0].device = TRUE;
		}
		if (page_list_count != NULL) {
			if (upl->flags & UPL_INTERNAL) {
				*page_list_count = 0;
			} else {
				*page_list_count = 1;
			}
		}

		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
		if (task != NULL) {
			ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
		}
#endif /* DEVELOPMENT || DEBUG */
		return KERN_SUCCESS;
	}
	if (object != kernel_object && object != compressor_object) {
		/*
		 * Protect user space from future COW operations
		 */
#if VM_OBJECT_TRACKING_OP_TRUESHARE
		if (!object->true_share &&
		    vm_object_tracking_btlog) {
			btlog_record(vm_object_tracking_btlog, object,
			    VM_OBJECT_TRACKING_OP_TRUESHARE,
			    btref_get(__builtin_frame_address(0), 0));
		}
#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */

		vm_object_lock_assert_exclusive(object);
		object->true_share = TRUE;

		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
	}

	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
	    object->copy != VM_OBJECT_NULL) {
		/*
		 * Honor copy-on-write obligations
		 *
		 * The caller is gathering these pages and
		 * might modify their contents.  We need to
		 * make sure that the copy object has its own
		 * private copies of these pages before we let
		 * the caller modify them.
		 *
		 * NOTE:  someone else could map the original object
		 * after we've done this copy-on-write here, and they
		 * could then see an inconsistent picture of the memory
		 * while it's being modified via the UPL.  To prevent this,
		 * we would have to block access to these pages until the
		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
		 * code path for that...
		 */
		vm_object_update(object,
		    offset,
		    size,
		    NULL,
		    NULL,
		    FALSE,              /* should_return */
		    MEMORY_OBJECT_COPY_SYNC,
		    VM_PROT_NO_CHANGE);
		VM_PAGEOUT_DEBUG(iopl_cow, 1);
		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
	}
	/*
	 * Fast path eligibility: the request must cover the whole object
	 * (offset 0, size == vo_size), the object must have no pager, shadow
	 * or copy object and must not be volatile/empty purgeable, and neither
	 * UPL_NEED_32BIT_ADDR nor UPL_BLOCK_ACCESS may be requested.
	 * "full" = every page resident, "empty" = no page resident.
	 */
	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
	    object->purgable != VM_PURGABLE_VOLATILE &&
	    object->purgable != VM_PURGABLE_EMPTY &&
	    object->copy == NULL &&
	    size == object->vo_size &&
	    offset == 0 &&
	    object->shadow == NULL &&
	    object->pager == NULL) {
		if (object->resident_page_count == size_in_pages) {
			assert(object != compressor_object);
			assert(object != kernel_object);
			fast_path_full_req = TRUE;
		} else if (object->resident_page_count == 0) {
			assert(object != compressor_object);
			assert(object != kernel_object);
			fast_path_empty_req = TRUE;
			set_cache_attr_needed = TRUE;
		}
	}

	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
		interruptible = THREAD_ABORTSAFE;
	} else {
		interruptible = THREAD_UNINT;
	}

	entry = 0;

	xfer_size = size;
	dst_offset = offset;

	if (fast_path_full_req) {
		if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
			goto finish;
		}
		/*
		 * we couldn't complete the processing of this request on the fast path
		 * so fall through to the slow path and finish up
		 */
	} else if (fast_path_empty_req) {
		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
			ret = KERN_MEMORY_ERROR;
			goto return_err;
		}
		ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
		    cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);

		if (ret) {
			/*
			 * dst_offset was advanced past the pages that did get
			 * inserted; the error path frees them outright.
			 */
			free_wired_pages = TRUE;
			goto return_err;
		}
		goto finish;
	}

	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.lo_offset = offset;
	fault_info.hi_offset = offset + xfer_size;
	fault_info.mark_zf_absent = TRUE;
	fault_info.interruptible = interruptible;
	fault_info.batch_pmap_op = TRUE;

	/*
	 * Slow path: one iteration per page.  "entry" is the page's index in
	 * the UPL, "dst_offset" its offset in the object.
	 */
	while (xfer_size) {
		vm_fault_return_t       result;

		dwp->dw_mask = 0;

		if (fast_path_full_req) {
			/*
			 * if we get here, it means that we ran into a page
			 * state we couldn't handle in the fast path and
			 * bailed out to the slow path... since the order
			 * we look at pages is different between the 2 paths,
			 * the following check is needed to determine whether
			 * this page was already processed in the fast path
			 */
			if (bitmap_test(upl->lite_list, entry)) {
				goto skip_page;
			}
		}
		dst_page = vm_page_lookup(object, dst_offset);

		if (dst_page == VM_PAGE_NULL ||
		    dst_page->vmp_busy ||
		    VMP_ERROR_GET(dst_page) ||
		    dst_page->vmp_restart ||
		    dst_page->vmp_absent ||
		    dst_page->vmp_fictitious) {
			if (object == kernel_object) {
				panic("vm_object_iopl_request: missing/bad page in kernel object");
			}
			if (object == compressor_object) {
				panic("vm_object_iopl_request: missing/bad page in compressor object");
			}

			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
				ret = KERN_MEMORY_ERROR;
				goto return_err;
			}
			set_cache_attr_needed = TRUE;

			/*
			 * We just looked up the page and the result remains valid
			 * until the object lock is released, so send it to
			 * vm_fault_page() (as "dst_page"), to avoid having to
			 * look it up again there.
			 */
			caller_lookup = TRUE;

			/* keep faulting until the page is successfully obtained */
			do {
				vm_page_t       top_page;
				kern_return_t   error_code;

				fault_info.cluster_size = xfer_size;

				vm_object_paging_begin(object);

				result = vm_fault_page(object, dst_offset,
				    prot | VM_PROT_WRITE, FALSE,
				    caller_lookup,
				    &prot, &dst_page, &top_page,
				    (int *)0,
				    &error_code, no_zero_fill,
				    &fault_info);

				/* our lookup is no longer valid at this point */
				caller_lookup = FALSE;

				switch (result) {
				case VM_FAULT_SUCCESS:
					page_grab_count++;

					if (!dst_page->vmp_absent) {
						PAGE_WAKEUP_DONE(dst_page);
					} else {
						/*
						 * we only get back an absent page if we
						 * requested that it not be zero-filled
						 * because we are about to fill it via I/O
						 *
						 * absent pages should be left BUSY
						 * to prevent them from being faulted
						 * into an address space before we've
						 * had a chance to complete the I/O on
						 * them since they may contain info that
						 * shouldn't be seen by the faulting task
						 */
					}
					/*
					 *	Release paging references and
					 *	top-level placeholder page, if any.
					 */
					if (top_page != VM_PAGE_NULL) {
						vm_object_t local_object;

						local_object = VM_PAGE_OBJECT(top_page);

						/*
						 * comparing 2 packed pointers
						 */
						if (top_page->vmp_object != dst_page->vmp_object) {
							vm_object_lock(local_object);
							VM_PAGE_FREE(top_page);
							vm_object_paging_end(local_object);
							vm_object_unlock(local_object);
						} else {
							VM_PAGE_FREE(top_page);
							vm_object_paging_end(local_object);
						}
					}
					vm_object_paging_end(object);
					break;

				case VM_FAULT_RETRY:
					/* re-take the object lock and go around again */
					vm_object_lock(object);
					break;

				case VM_FAULT_MEMORY_SHORTAGE:
					/* account for the pages not yet processed while waiting */
					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

					if (vm_page_wait(interruptible)) {
						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);

						VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
						vm_object_lock(object);

						break;
					}
					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);

					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
					OS_FALLTHROUGH;

				case VM_FAULT_INTERRUPTED:
					error_code = MACH_SEND_INTERRUPTED;
					OS_FALLTHROUGH;
				case VM_FAULT_MEMORY_ERROR:
memory_error:
					/* a zero error_code is reported as KERN_MEMORY_ERROR */
					ret = (error_code ? error_code: KERN_MEMORY_ERROR);

					vm_object_lock(object);
					goto return_err;

				case VM_FAULT_SUCCESS_NO_VM_PAGE:
					/* success but no page: fail */
					vm_object_paging_end(object);
					vm_object_unlock(object);
					goto memory_error;

				default:
					panic("vm_object_iopl_request: unexpected error"
					    " 0x%x from vm_fault_page()\n", result);
				}
			} while (result != VM_FAULT_SUCCESS);
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		/* kernel object UPLs only record the physical page */
		if (upl->flags & UPL_KERNEL_OBJECT) {
			goto record_phys_addr;
		}

		/* compressor pages are marked busy and recorded, but not wired here */
		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
			dst_page->vmp_busy = TRUE;
			goto record_phys_addr;
		}

		if (dst_page->vmp_cleaning) {
			/*
			 * Someone else is cleaning this page in place.
			 * In theory, we should be able to  proceed and use this
			 * page but they'll probably end up clearing the "busy"
			 * bit on it in upl_commit_range() but they didn't set
			 * it, so they would clear our "busy" bit and open
			 * us to race conditions.
			 * We'd better wait for the cleaning to complete and
			 * then try again.
			 */
			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
			PAGE_SLEEP(object, dst_page, THREAD_UNINT);
			/* entry/dst_offset/xfer_size are unchanged: this page is retried */
			continue;
		}
		if (dst_page->vmp_laundry) {
			vm_pageout_steal_laundry(dst_page, FALSE);
		}

		if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
		    phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
			vm_page_t       low_page;
			int             refmod;

			/*
			 * support devices that can't DMA above 32 bits
			 * by substituting pages from a pool of low address
			 * memory for any pages we find above the 4G mark
			 * can't substitute if the page is already wired because
			 * we don't know whether that physical address has been
			 * handed out to some other 64 bit capable DMA device to use
			 */
			if (VM_PAGE_WIRED(dst_page)) {
				ret = KERN_PROTECTION_FAILURE;
				goto return_err;
			}
			low_page = vm_page_grablo();

			if (low_page == VM_PAGE_NULL) {
				ret = KERN_RESOURCE_SHORTAGE;
				goto return_err;
			}
			/*
			 * from here until the vm_page_replace completes
			 * we mustn't drop the object lock... we don't
			 * want anyone refaulting this page in and using
			 * it after we disconnect it... we want the fault
			 * to find the new page being substituted.
			 */
			if (dst_page->vmp_pmapped) {
				refmod = pmap_disconnect(phys_page);
			} else {
				refmod = 0;
			}

			if (!dst_page->vmp_absent) {
				vm_page_copy(dst_page, low_page);
			}

			/* carry the software state over to the replacement page */
			low_page->vmp_reference = dst_page->vmp_reference;
			low_page->vmp_dirty     = dst_page->vmp_dirty;
			low_page->vmp_absent    = dst_page->vmp_absent;

			/* ... along with the ref/mod state returned by pmap_disconnect() */
			if (refmod & VM_MEM_REFERENCED) {
				low_page->vmp_reference = TRUE;
			}
			if (refmod & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(low_page, FALSE);
			}

			vm_page_replace(low_page, object, dst_offset);

			dst_page = low_page;
			/*
			 * vm_page_grablo returned the page marked
			 * BUSY... we don't need a PAGE_WAKEUP_DONE
			 * here, because we've never dropped the object lock
			 */
			if (!dst_page->vmp_absent) {
				dst_page->vmp_busy = FALSE;
			}

			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
		}
		/* wiring is deferred to the delayed work pass; busy pages are not wired */
		if (!dst_page->vmp_busy) {
			dwp->dw_mask |= DW_vm_page_wire;
		}

		if (cntrl_flags & UPL_BLOCK_ACCESS) {
			/*
			 * Mark the page "busy" to block any future page fault
			 * on this page in addition to wiring it.
			 * We'll also remove the mapping
			 * of all these pages before leaving this routine.
			 */
			assert(!dst_page->vmp_fictitious);
			dst_page->vmp_busy = TRUE;
		}
		/*
		 * expect the page to be used
		 * page queues lock must be held to set 'reference'
		 */
		dwp->dw_mask |= DW_set_reference;

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			SET_PAGE_DIRTY(dst_page, TRUE);
			/*
			 * Page belonging to a code-signed object is about to
			 * be written. Mark it tainted and disconnect it from
			 * all pmaps so processes have to fault it back in and
			 * deal with the tainted bit.
			 */
			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
				vm_page_iopl_tainted++;
				if (dst_page->vmp_pmapped) {
					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
					if (refmod & VM_MEM_REFERENCED) {
						dst_page->vmp_reference = TRUE;
					}
				}
			}
		}
		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
			pmap_sync_page_attributes_phys(phys_page);
			dst_page->vmp_written_by_kernel = FALSE;
		}

record_phys_addr:
		/* publish the page in the UPL: lite list bit, highest page, page list entry */
		if (dst_page->vmp_busy) {
			upl->flags |= UPL_HAS_BUSY;
		}

		bitmap_set(upl->lite_list, entry);

		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
			user_page_list[entry].absent    = dst_page->vmp_absent;
			user_page_list[entry].dirty     = dst_page->vmp_dirty;
			user_page_list[entry].precious  = dst_page->vmp_precious;
			user_page_list[entry].device    = FALSE;
			user_page_list[entry].needed    = FALSE;
			if (dst_page->vmp_clustered == TRUE) {
				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
			} else {
				user_page_list[entry].speculative = FALSE;
			}
			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
			user_page_list[entry].mark      = FALSE;
		}
		if (object != kernel_object && object != compressor_object) {
			/*
			 * someone is explicitly grabbing this page...
			 * update clustered and speculative state
			 *
			 */
			if (dst_page->vmp_clustered) {
				VM_PAGE_CONSUME_CLUSTERED(dst_page);
			}
		}
skip_page:
		entry++;
		dst_offset += PAGE_SIZE_64;
		xfer_size -= PAGE_SIZE;

		/* queue any deferred work for this page; flush once the batch is full */
		if (dwp->dw_mask) {
			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);

			if (dw_count >= dw_limit) {
				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);

				dwp = dwp_start;
				dw_count = 0;
			}
		}
	}
	assert(entry == size_in_pages);

	/* flush whatever deferred work is still pending */
	if (dw_count) {
		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}
finish:
	if (user_page_list && set_cache_attr_needed == TRUE) {
		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
	}

	if (page_list_count != NULL) {
		if (upl->flags & UPL_INTERNAL) {
			*page_list_count = 0;
		} else if (*page_list_count > size_in_pages) {
			*page_list_count = size_in_pages;
		}
	}
	vm_object_unlock(object);

	if (cntrl_flags & UPL_BLOCK_ACCESS) {
		/*
		 * We've marked all the pages "busy" so that future
		 * page faults will block.
		 * Now remove the mapping for these pages, so that they
		 * can't be accessed without causing a page fault.
		 */
		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
		    PMAP_NULL,
		    PAGE_SIZE,
		    0, VM_PROT_NONE);
		/* NOTE(review): blocked_access is updated after the object lock was dropped above */
		assert(!object->blocked_access);
		object->blocked_access = TRUE;
	}

	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	/* release the delayed work context unless the on-stack fallback was used */
	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}

	return KERN_SUCCESS;

return_err:
	/*
	 * Error unwind: "offset" is still the page-aligned start of the
	 * request and "dst_offset" is the first page not yet processed, so
	 * this walks exactly the pages handled so far.
	 */
	dw_index = 0;

	for (; offset < dst_offset; offset += PAGE_SIZE) {
		boolean_t need_unwire;

		dst_page = vm_page_lookup(object, offset);

		if (dst_page == VM_PAGE_NULL) {
			panic("vm_object_iopl_request: Wired page missing.");
		}

		/*
		 * if we've already processed this page in an earlier
		 * dw_do_work, we need to undo the wiring... we will
		 * leave the dirty and reference bits on if they
		 * were set, since we don't have a good way of knowing
		 * what the previous state was and we won't get here
		 * under any normal circumstances...  we will always
		 * clear BUSY and wakeup any waiters via vm_page_free
		 * or PAGE_WAKEUP_DONE
		 */
		need_unwire = TRUE;

		if (dw_count) {
			if ((dwp_start)[dw_index].dw_m == dst_page) {
				/*
				 * still in the deferred work list
				 * which means we haven't yet called
				 * vm_page_wire on this page
				 */
				need_unwire = FALSE;

				dw_index++;
				dw_count--;
			}
		}
		vm_page_lock_queues();

		/* absent pages, and all pages from a failed empty fast path, are freed */
		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
			vm_page_free(dst_page);

			need_unwire = FALSE;
		} else {
			if (need_unwire == TRUE) {
				vm_page_unwire(dst_page, TRUE);
			}

			PAGE_WAKEUP_DONE(dst_page);
		}
		vm_page_unlock_queues();

		if (need_unwire == TRUE) {
			counter_inc(&vm_statistics_reactivations);
		}
	}
#if UPL_DEBUG
	upl->upl_state = 2;
#endif
	/* drop the activity reference taken above for non kernel-object UPLs */
	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
	}
	vm_object_unlock(object);
	upl_destroy(upl);

	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
#if DEVELOPMENT || DEBUG
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	/* release the delayed work context unless the on-stack fallback was used */
	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}
	return ret;
}
9938
9939 kern_return_t
upl_transpose(upl_t upl1,upl_t upl2)9940 upl_transpose(
9941 upl_t upl1,
9942 upl_t upl2)
9943 {
9944 kern_return_t retval;
9945 boolean_t upls_locked;
9946 vm_object_t object1, object2;
9947
9948 /* LD: Should mapped UPLs be eligible for a transpose? */
9949 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
9950 return KERN_INVALID_ARGUMENT;
9951 }
9952
9953 upls_locked = FALSE;
9954
9955 /*
9956 * Since we need to lock both UPLs at the same time,
9957 * avoid deadlocks by always taking locks in the same order.
9958 */
9959 if (upl1 < upl2) {
9960 upl_lock(upl1);
9961 upl_lock(upl2);
9962 } else {
9963 upl_lock(upl2);
9964 upl_lock(upl1);
9965 }
9966 upls_locked = TRUE; /* the UPLs will need to be unlocked */
9967
9968 object1 = upl1->map_object;
9969 object2 = upl2->map_object;
9970
9971 if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
9972 upl1->u_size != upl2->u_size) {
9973 /*
9974 * We deal only with full objects, not subsets.
9975 * That's because we exchange the entire backing store info
9976 * for the objects: pager, resident pages, etc... We can't do
9977 * only part of it.
9978 */
9979 retval = KERN_INVALID_VALUE;
9980 goto done;
9981 }
9982
9983 /*
9984 * Tranpose the VM objects' backing store.
9985 */
9986 retval = vm_object_transpose(object1, object2,
9987 upl_adjusted_size(upl1, PAGE_MASK));
9988
9989 if (retval == KERN_SUCCESS) {
9990 /*
9991 * Make each UPL point to the correct VM object, i.e. the
9992 * object holding the pages that the UPL refers to...
9993 */
9994 #if CONFIG_IOSCHED || UPL_DEBUG
9995 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9996 vm_object_lock(object1);
9997 vm_object_lock(object2);
9998 }
9999 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10000 queue_remove(&object1->uplq, upl1, upl_t, uplq);
10001 }
10002 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10003 queue_remove(&object2->uplq, upl2, upl_t, uplq);
10004 }
10005 #endif
10006 upl1->map_object = object2;
10007 upl2->map_object = object1;
10008
10009 #if CONFIG_IOSCHED || UPL_DEBUG
10010 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10011 queue_enter(&object2->uplq, upl1, upl_t, uplq);
10012 }
10013 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10014 queue_enter(&object1->uplq, upl2, upl_t, uplq);
10015 }
10016 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10017 vm_object_unlock(object2);
10018 vm_object_unlock(object1);
10019 }
10020 #endif
10021 }
10022
10023 done:
10024 /*
10025 * Cleanup.
10026 */
10027 if (upls_locked) {
10028 upl_unlock(upl1);
10029 upl_unlock(upl2);
10030 upls_locked = FALSE;
10031 }
10032
10033 return retval;
10034 }
10035
10036 void
upl_range_needed(upl_t upl,int index,int count)10037 upl_range_needed(
10038 upl_t upl,
10039 int index,
10040 int count)
10041 {
10042 int size_in_pages;
10043
10044 if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
10045 return;
10046 }
10047
10048 size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
10049
10050 while (count-- && index < size_in_pages) {
10051 upl->page_list[index++].needed = TRUE;
10052 }
10053 }
10054
10055
/*
 * Reserve of virtual addresses in the kernel address space.
 * We need to map the physical pages in the kernel, so that we
 * can call the code-signing or slide routines with a kernel
 * virtual address.  We keep this pool of pre-allocated kernel
 * virtual addresses so that we don't have to scan the kernel's
 * virtual address space each time we need to work with
 * a physical page.
 */
/* taken around accesses to the slot table and the waiter counters below */
SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
/* number of one-page slots in the pre-allocated pool */
#define VM_PAGING_NUM_PAGES 64
/* start of the pool's kernel virtual range; filled in by vm_paging_map_init() */
SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
/* per-slot "in use" flag, indexed by page offset from vm_paging_base_address */
bool            vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
/* highest slot index handed out so far */
int             vm_paging_max_index = 0;
/* number of threads currently waiting for a slot; also the wait/wakeup event */
int             vm_paging_page_waiter = 0;
/* running total of waits for a slot (only ever incremented) */
int             vm_paging_page_waiter_total = 0;

/* times no pool slot was available and the slow path was taken instead */
unsigned long   vm_paging_no_kernel_page = 0;
/* mappings / pages established through a pool slot */
unsigned long   vm_paging_objects_mapped = 0;
unsigned long   vm_paging_pages_mapped = 0;
/* mappings / pages established through the kernel_map slow path */
unsigned long   vm_paging_objects_mapped_slow = 0;
unsigned long   vm_paging_pages_mapped_slow = 0;
10078
10079 __startup_func
10080 static void
vm_paging_map_init(void)10081 vm_paging_map_init(void)
10082 {
10083 kmem_alloc(kernel_map, &vm_paging_base_address,
10084 ptoa(VM_PAGING_NUM_PAGES),
10085 KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
10086 VM_KERN_MEMORY_NONE);
10087 }
10088 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
10089
10090 /*
10091 * vm_paging_map_object:
10092 * Maps part of a VM object's pages in the kernel
10093 * virtual address space, using the pre-allocated
10094 * kernel virtual addresses, if possible.
10095 * Context:
10096 * The VM object is locked. This lock will get
10097 * dropped and re-acquired though, so the caller
10098 * must make sure the VM object is kept alive
10099 * (by holding a VM map that has a reference
10100 * on it, for example, or taking an extra reference).
10101 * The page should also be kept busy to prevent
10102 * it from being reclaimed.
10103 */
10104 kern_return_t
vm_paging_map_object(vm_page_t page,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection,boolean_t can_unlock_object,vm_map_size_t * size,vm_map_offset_t * address,boolean_t * need_unmap)10105 vm_paging_map_object(
10106 vm_page_t page,
10107 vm_object_t object,
10108 vm_object_offset_t offset,
10109 vm_prot_t protection,
10110 boolean_t can_unlock_object,
10111 vm_map_size_t *size, /* IN/OUT */
10112 vm_map_offset_t *address, /* OUT */
10113 boolean_t *need_unmap) /* OUT */
10114 {
10115 kern_return_t kr;
10116 vm_map_offset_t page_map_offset;
10117 vm_map_size_t map_size;
10118 vm_object_offset_t object_offset;
10119 int i;
10120
10121 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
10122 /* use permanent 1-to-1 kernel mapping of physical memory ? */
10123 *address = (vm_map_offset_t)
10124 phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
10125 *need_unmap = FALSE;
10126 return KERN_SUCCESS;
10127
10128 assert(page->vmp_busy);
10129 /*
10130 * Use one of the pre-allocated kernel virtual addresses
10131 * and just enter the VM page in the kernel address space
10132 * at that virtual address.
10133 */
10134 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10135
10136 /*
10137 * Try and find an available kernel virtual address
10138 * from our pre-allocated pool.
10139 */
10140 page_map_offset = 0;
10141 for (;;) {
10142 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
10143 if (vm_paging_page_inuse[i] == FALSE) {
10144 page_map_offset =
10145 vm_paging_base_address +
10146 (i * PAGE_SIZE);
10147 break;
10148 }
10149 }
10150 if (page_map_offset != 0) {
10151 /* found a space to map our page ! */
10152 break;
10153 }
10154
10155 if (can_unlock_object) {
10156 /*
10157 * If we can afford to unlock the VM object,
10158 * let's take the slow path now...
10159 */
10160 break;
10161 }
10162 /*
10163 * We can't afford to unlock the VM object, so
10164 * let's wait for a space to become available...
10165 */
10166 vm_paging_page_waiter_total++;
10167 vm_paging_page_waiter++;
10168 kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
10169 if (kr == THREAD_WAITING) {
10170 simple_unlock(&vm_paging_lock);
10171 kr = thread_block(THREAD_CONTINUE_NULL);
10172 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10173 }
10174 vm_paging_page_waiter--;
10175 /* ... and try again */
10176 }
10177
10178 if (page_map_offset != 0) {
10179 /*
10180 * We found a kernel virtual address;
10181 * map the physical page to that virtual address.
10182 */
10183 if (i > vm_paging_max_index) {
10184 vm_paging_max_index = i;
10185 }
10186 vm_paging_page_inuse[i] = TRUE;
10187 simple_unlock(&vm_paging_lock);
10188
10189 page->vmp_pmapped = TRUE;
10190
10191 /*
10192 * Keep the VM object locked over the PMAP_ENTER
10193 * and the actual use of the page by the kernel,
10194 * or this pmap mapping might get undone by a
10195 * vm_object_pmap_protect() call...
10196 */
10197 PMAP_ENTER(kernel_pmap,
10198 page_map_offset,
10199 page,
10200 protection,
10201 VM_PROT_NONE,
10202 0,
10203 TRUE,
10204 kr);
10205 assert(kr == KERN_SUCCESS);
10206 vm_paging_objects_mapped++;
10207 vm_paging_pages_mapped++;
10208 *address = page_map_offset;
10209 *need_unmap = TRUE;
10210
10211 #if KASAN
10212 kasan_notify_address(page_map_offset, PAGE_SIZE);
10213 #endif
10214
10215 /* all done and mapped, ready to use ! */
10216 return KERN_SUCCESS;
10217 }
10218
10219 /*
10220 * We ran out of pre-allocated kernel virtual
10221 * addresses. Just map the page in the kernel
10222 * the slow and regular way.
10223 */
10224 vm_paging_no_kernel_page++;
10225 simple_unlock(&vm_paging_lock);
10226 }
10227
10228 if (!can_unlock_object) {
10229 *address = 0;
10230 *size = 0;
10231 *need_unmap = FALSE;
10232 return KERN_NOT_SUPPORTED;
10233 }
10234
10235 object_offset = vm_object_trunc_page(offset);
10236 map_size = vm_map_round_page(*size,
10237 VM_MAP_PAGE_MASK(kernel_map));
10238
10239 /*
10240 * Try and map the required range of the object
10241 * in the kernel_map. Given that allocation is
10242 * for pageable memory, it shouldn't contain
10243 * pointers and is mapped into the data range.
10244 */
10245
10246 vm_object_reference_locked(object); /* for the map entry */
10247 vm_object_unlock(object);
10248
10249 kr = vm_map_enter(kernel_map,
10250 address,
10251 map_size,
10252 0,
10253 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
10254 object,
10255 object_offset,
10256 FALSE,
10257 protection,
10258 VM_PROT_ALL,
10259 VM_INHERIT_NONE);
10260 if (kr != KERN_SUCCESS) {
10261 *address = 0;
10262 *size = 0;
10263 *need_unmap = FALSE;
10264 vm_object_deallocate(object); /* for the map entry */
10265 vm_object_lock(object);
10266 return kr;
10267 }
10268
10269 *size = map_size;
10270
10271 /*
10272 * Enter the mapped pages in the page table now.
10273 */
10274 vm_object_lock(object);
10275 /*
10276 * VM object must be kept locked from before PMAP_ENTER()
10277 * until after the kernel is done accessing the page(s).
10278 * Otherwise, the pmap mappings in the kernel could be
10279 * undone by a call to vm_object_pmap_protect().
10280 */
10281
10282 for (page_map_offset = 0;
10283 map_size != 0;
10284 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
10285 page = vm_page_lookup(object, offset + page_map_offset);
10286 if (page == VM_PAGE_NULL) {
10287 printf("vm_paging_map_object: no page !?");
10288 vm_object_unlock(object);
10289 vm_map_remove(kernel_map, *address, *size);
10290 *address = 0;
10291 *size = 0;
10292 *need_unmap = FALSE;
10293 vm_object_lock(object);
10294 return KERN_MEMORY_ERROR;
10295 }
10296 page->vmp_pmapped = TRUE;
10297
10298 PMAP_ENTER(kernel_pmap,
10299 *address + page_map_offset,
10300 page,
10301 protection,
10302 VM_PROT_NONE,
10303 0,
10304 TRUE,
10305 kr);
10306 assert(kr == KERN_SUCCESS);
10307 #if KASAN
10308 kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
10309 #endif
10310 }
10311
10312 vm_paging_objects_mapped_slow++;
10313 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
10314
10315 *need_unmap = TRUE;
10316
10317 return KERN_SUCCESS;
10318 }
10319
10320 /*
10321 * vm_paging_unmap_object:
10322 * Unmaps part of a VM object's pages from the kernel
10323 * virtual address space.
10324 * Context:
10325 * The VM object is locked. This lock will get
10326 * dropped and re-acquired though.
10327 */
10328 void
vm_paging_unmap_object(vm_object_t object,vm_map_offset_t start,vm_map_offset_t end)10329 vm_paging_unmap_object(
10330 vm_object_t object,
10331 vm_map_offset_t start,
10332 vm_map_offset_t end)
10333 {
10334 int i;
10335
10336 if ((vm_paging_base_address == 0) ||
10337 (start < vm_paging_base_address) ||
10338 (end > (vm_paging_base_address
10339 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
10340 /*
10341 * We didn't use our pre-allocated pool of
10342 * kernel virtual address. Deallocate the
10343 * virtual memory.
10344 */
10345 if (object != VM_OBJECT_NULL) {
10346 vm_object_unlock(object);
10347 }
10348 vm_map_remove(kernel_map, start, end);
10349 if (object != VM_OBJECT_NULL) {
10350 vm_object_lock(object);
10351 }
10352 } else {
10353 /*
10354 * We used a kernel virtual address from our
10355 * pre-allocated pool. Put it back in the pool
10356 * for next time.
10357 */
10358 assert(end - start == PAGE_SIZE);
10359 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
10360 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
10361
10362 /* undo the pmap mapping */
10363 pmap_remove(kernel_pmap, start, end);
10364
10365 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10366 vm_paging_page_inuse[i] = FALSE;
10367 if (vm_paging_page_waiter) {
10368 thread_wakeup(&vm_paging_page_waiter);
10369 }
10370 simple_unlock(&vm_paging_lock);
10371 }
10372 }
10373
10374
10375 /*
10376 * page->vmp_object must be locked
10377 */
10378 void
vm_pageout_steal_laundry(vm_page_t page,boolean_t queues_locked)10379 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10380 {
10381 if (!queues_locked) {
10382 vm_page_lockspin_queues();
10383 }
10384
10385 page->vmp_free_when_done = FALSE;
10386 /*
10387 * need to drop the laundry count...
10388 * we may also need to remove it
10389 * from the I/O paging queue...
10390 * vm_pageout_throttle_up handles both cases
10391 *
10392 * the laundry and pageout_queue flags are cleared...
10393 */
10394 vm_pageout_throttle_up(page);
10395
10396 if (!queues_locked) {
10397 vm_page_unlock_queues();
10398 }
10399 }
10400
10401 #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
10402
10403 upl_t
vector_upl_create(vm_offset_t upl_offset,uint32_t max_upls)10404 vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
10405 {
10406 int i = 0;
10407 upl_t upl;
10408
10409 assert(max_upls > 0);
10410 if (max_upls == 0) {
10411 return NULL;
10412 }
10413
10414 if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
10415 max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
10416 }
10417 vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
10418
10419 upl = upl_create(0, UPL_VECTOR, 0);
10420 upl->vector_upl = vector_upl;
10421 upl->u_offset = upl_offset;
10422 vector_upl->size = 0;
10423 vector_upl->offset = upl_offset;
10424 vector_upl->invalid_upls = 0;
10425 vector_upl->num_upls = 0;
10426 vector_upl->pagelist = NULL;
10427 vector_upl->max_upls = max_upls;
10428
10429 for (i = 0; i < max_upls; i++) {
10430 vector_upl->upls[i].iostate.size = 0;
10431 vector_upl->upls[i].iostate.offset = 0;
10432 }
10433 return upl;
10434 }
10435
10436 uint32_t
vector_upl_max_upls(const upl_t upl)10437 vector_upl_max_upls(const upl_t upl)
10438 {
10439 if (!vector_upl_is_valid(upl)) {
10440 return 0;
10441 }
10442 return ((vector_upl_t)(upl->vector_upl))->max_upls;
10443 }
10444
10445 void
vector_upl_deallocate(upl_t upl)10446 vector_upl_deallocate(upl_t upl)
10447 {
10448 vector_upl_t vector_upl = upl->vector_upl;
10449
10450 assert(vector_upl_is_valid(upl));
10451
10452 if (vector_upl->invalid_upls != vector_upl->num_upls) {
10453 panic("Deallocating non-empty Vectored UPL");
10454 }
10455 uint32_t max_upls = vector_upl->max_upls;
10456 kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
10457 kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
10458 upl->vector_upl = NULL;
10459 }
10460
10461 boolean_t
vector_upl_is_valid(upl_t upl)10462 vector_upl_is_valid(upl_t upl)
10463 {
10464 return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
10465 }
10466
10467 boolean_t
vector_upl_set_subupl(upl_t upl,upl_t subupl,uint32_t io_size)10468 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
10469 {
10470 if (vector_upl_is_valid(upl)) {
10471 vector_upl_t vector_upl = upl->vector_upl;
10472
10473 if (vector_upl) {
10474 if (subupl) {
10475 if (io_size) {
10476 if (io_size < PAGE_SIZE) {
10477 io_size = PAGE_SIZE;
10478 }
10479 subupl->vector_upl = (void*)vector_upl;
10480 vector_upl->upls[vector_upl->num_upls++].elem = subupl;
10481 vector_upl->size += io_size;
10482 upl->u_size += io_size;
10483 } else {
10484 uint32_t i = 0, invalid_upls = 0;
10485 for (i = 0; i < vector_upl->num_upls; i++) {
10486 if (vector_upl->upls[i].elem == subupl) {
10487 break;
10488 }
10489 }
10490 if (i == vector_upl->num_upls) {
10491 panic("Trying to remove sub-upl when none exists");
10492 }
10493
10494 vector_upl->upls[i].elem = NULL;
10495 invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
10496 relaxed);
10497 if (invalid_upls == vector_upl->num_upls) {
10498 return TRUE;
10499 } else {
10500 return FALSE;
10501 }
10502 }
10503 } else {
10504 panic("vector_upl_set_subupl was passed a NULL upl element");
10505 }
10506 } else {
10507 panic("vector_upl_set_subupl was passed a non-vectored upl");
10508 }
10509 } else {
10510 panic("vector_upl_set_subupl was passed a NULL upl");
10511 }
10512
10513 return FALSE;
10514 }
10515
10516 void
vector_upl_set_pagelist(upl_t upl)10517 vector_upl_set_pagelist(upl_t upl)
10518 {
10519 if (vector_upl_is_valid(upl)) {
10520 uint32_t i = 0;
10521 vector_upl_t vector_upl = upl->vector_upl;
10522
10523 if (vector_upl) {
10524 vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
10525
10526 vector_upl->pagelist = kalloc_type(struct upl_page_info,
10527 atop(vector_upl->size), Z_WAITOK);
10528
10529 for (i = 0; i < vector_upl->num_upls; i++) {
10530 cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
10531 bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10532 pagelist_size += cur_upl_pagelist_size;
10533 if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
10534 upl->highest_page = vector_upl->upls[i].elem->highest_page;
10535 }
10536 }
10537 assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10538 } else {
10539 panic("vector_upl_set_pagelist was passed a non-vectored upl");
10540 }
10541 } else {
10542 panic("vector_upl_set_pagelist was passed a NULL upl");
10543 }
10544 }
10545
10546 upl_t
vector_upl_subupl_byindex(upl_t upl,uint32_t index)10547 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10548 {
10549 if (vector_upl_is_valid(upl)) {
10550 vector_upl_t vector_upl = upl->vector_upl;
10551 if (vector_upl) {
10552 if (index < vector_upl->num_upls) {
10553 return vector_upl->upls[index].elem;
10554 }
10555 } else {
10556 panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10557 }
10558 }
10559 return NULL;
10560 }
10561
10562 upl_t
vector_upl_subupl_byoffset(upl_t upl,upl_offset_t * upl_offset,upl_size_t * upl_size)10563 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
10564 {
10565 if (vector_upl_is_valid(upl)) {
10566 uint32_t i = 0;
10567 vector_upl_t vector_upl = upl->vector_upl;
10568
10569 if (vector_upl) {
10570 upl_t subupl = NULL;
10571 vector_upl_iostates_t subupl_state;
10572
10573 for (i = 0; i < vector_upl->num_upls; i++) {
10574 subupl = vector_upl->upls[i].elem;
10575 subupl_state = vector_upl->upls[i].iostate;
10576 if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
10577 /* We could have been passed an offset/size pair that belongs
10578 * to an UPL element that has already been committed/aborted.
10579 * If so, return NULL.
10580 */
10581 if (subupl == NULL) {
10582 return NULL;
10583 }
10584 if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
10585 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
10586 if (*upl_size > subupl_state.size) {
10587 *upl_size = subupl_state.size;
10588 }
10589 }
10590 if (*upl_offset >= subupl_state.offset) {
10591 *upl_offset -= subupl_state.offset;
10592 } else if (i) {
10593 panic("Vector UPL offset miscalculation");
10594 }
10595 return subupl;
10596 }
10597 }
10598 } else {
10599 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
10600 }
10601 }
10602 return NULL;
10603 }
10604
10605 void
vector_upl_get_submap(upl_t upl,vm_map_t * v_upl_submap,vm_offset_t * submap_dst_addr)10606 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10607 {
10608 *v_upl_submap = NULL;
10609
10610 if (vector_upl_is_valid(upl)) {
10611 vector_upl_t vector_upl = upl->vector_upl;
10612 if (vector_upl) {
10613 *v_upl_submap = vector_upl->submap;
10614 *submap_dst_addr = vector_upl->submap_dst_addr;
10615 } else {
10616 panic("vector_upl_get_submap was passed a non-vectored UPL");
10617 }
10618 } else {
10619 panic("vector_upl_get_submap was passed a null UPL");
10620 }
10621 }
10622
10623 void
vector_upl_set_submap(upl_t upl,vm_map_t submap,vm_offset_t submap_dst_addr)10624 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10625 {
10626 if (vector_upl_is_valid(upl)) {
10627 vector_upl_t vector_upl = upl->vector_upl;
10628 if (vector_upl) {
10629 vector_upl->submap = submap;
10630 vector_upl->submap_dst_addr = submap_dst_addr;
10631 } else {
10632 panic("vector_upl_get_submap was passed a non-vectored UPL");
10633 }
10634 } else {
10635 panic("vector_upl_get_submap was passed a NULL UPL");
10636 }
10637 }
10638
10639 void
vector_upl_set_iostate(upl_t upl,upl_t subupl,upl_offset_t offset,upl_size_t size)10640 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10641 {
10642 if (vector_upl_is_valid(upl)) {
10643 uint32_t i = 0;
10644 vector_upl_t vector_upl = upl->vector_upl;
10645
10646 if (vector_upl) {
10647 for (i = 0; i < vector_upl->num_upls; i++) {
10648 if (vector_upl->upls[i].elem == subupl) {
10649 break;
10650 }
10651 }
10652
10653 if (i == vector_upl->num_upls) {
10654 panic("setting sub-upl iostate when none exists");
10655 }
10656
10657 vector_upl->upls[i].iostate.offset = offset;
10658 if (size < PAGE_SIZE) {
10659 size = PAGE_SIZE;
10660 }
10661 vector_upl->upls[i].iostate.size = size;
10662 } else {
10663 panic("vector_upl_set_iostate was passed a non-vectored UPL");
10664 }
10665 } else {
10666 panic("vector_upl_set_iostate was passed a NULL UPL");
10667 }
10668 }
10669
10670 void
vector_upl_get_iostate(upl_t upl,upl_t subupl,upl_offset_t * offset,upl_size_t * size)10671 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10672 {
10673 if (vector_upl_is_valid(upl)) {
10674 uint32_t i = 0;
10675 vector_upl_t vector_upl = upl->vector_upl;
10676
10677 if (vector_upl) {
10678 for (i = 0; i < vector_upl->num_upls; i++) {
10679 if (vector_upl->upls[i].elem == subupl) {
10680 break;
10681 }
10682 }
10683
10684 if (i == vector_upl->num_upls) {
10685 panic("getting sub-upl iostate when none exists");
10686 }
10687
10688 *offset = vector_upl->upls[i].iostate.offset;
10689 *size = vector_upl->upls[i].iostate.size;
10690 } else {
10691 panic("vector_upl_get_iostate was passed a non-vectored UPL");
10692 }
10693 } else {
10694 panic("vector_upl_get_iostate was passed a NULL UPL");
10695 }
10696 }
10697
10698 void
vector_upl_get_iostate_byindex(upl_t upl,uint32_t index,upl_offset_t * offset,upl_size_t * size)10699 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10700 {
10701 if (vector_upl_is_valid(upl)) {
10702 vector_upl_t vector_upl = upl->vector_upl;
10703 if (vector_upl) {
10704 if (index < vector_upl->num_upls) {
10705 *offset = vector_upl->upls[index].iostate.offset;
10706 *size = vector_upl->upls[index].iostate.size;
10707 } else {
10708 *offset = *size = 0;
10709 }
10710 } else {
10711 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
10712 }
10713 } else {
10714 panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10715 }
10716 }
10717
10718 void *
upl_get_internal_vectorupl(upl_t upl)10719 upl_get_internal_vectorupl(upl_t upl)
10720 {
10721 return upl->vector_upl;
10722 }
10723
10724 upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)10725 upl_get_internal_vectorupl_pagelist(upl_t upl)
10726 {
10727 return upl->vector_upl->pagelist;
10728 }
10729
10730 upl_page_info_t *
upl_get_internal_page_list(upl_t upl)10731 upl_get_internal_page_list(upl_t upl)
10732 {
10733 return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
10734 }
10735
10736 void
upl_clear_dirty(upl_t upl,boolean_t value)10737 upl_clear_dirty(
10738 upl_t upl,
10739 boolean_t value)
10740 {
10741 if (value) {
10742 upl->flags |= UPL_CLEAR_DIRTY;
10743 } else {
10744 upl->flags &= ~UPL_CLEAR_DIRTY;
10745 }
10746 }
10747
10748 void
upl_set_referenced(upl_t upl,boolean_t value)10749 upl_set_referenced(
10750 upl_t upl,
10751 boolean_t value)
10752 {
10753 upl_lock(upl);
10754 if (value) {
10755 upl->ext_ref_count++;
10756 } else {
10757 if (!upl->ext_ref_count) {
10758 panic("upl_set_referenced not %p", upl);
10759 }
10760 upl->ext_ref_count--;
10761 }
10762 upl_unlock(upl);
10763 }
10764
10765 #if CONFIG_IOSCHED
10766 void
upl_set_blkno(upl_t upl,vm_offset_t upl_offset,int io_size,int64_t blkno)10767 upl_set_blkno(
10768 upl_t upl,
10769 vm_offset_t upl_offset,
10770 int io_size,
10771 int64_t blkno)
10772 {
10773 int i, j;
10774 if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
10775 return;
10776 }
10777
10778 assert(upl->upl_reprio_info != 0);
10779 for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
10780 UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
10781 }
10782 }
10783 #endif
10784
10785 void inline
memoryshot(unsigned int event,unsigned int control)10786 memoryshot(unsigned int event, unsigned int control)
10787 {
10788 if (vm_debug_events) {
10789 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10790 vm_page_active_count, vm_page_inactive_count,
10791 vm_page_free_count, vm_page_speculative_count,
10792 vm_page_throttled_count);
10793 } else {
10794 (void) event;
10795 (void) control;
10796 }
10797 }
10798
10799 #ifdef MACH_BSD
10800
10801 boolean_t
upl_device_page(upl_page_info_t * upl)10802 upl_device_page(upl_page_info_t *upl)
10803 {
10804 return UPL_DEVICE_PAGE(upl);
10805 }
10806 boolean_t
upl_page_present(upl_page_info_t * upl,int index)10807 upl_page_present(upl_page_info_t *upl, int index)
10808 {
10809 return UPL_PAGE_PRESENT(upl, index);
10810 }
10811 boolean_t
upl_speculative_page(upl_page_info_t * upl,int index)10812 upl_speculative_page(upl_page_info_t *upl, int index)
10813 {
10814 return UPL_SPECULATIVE_PAGE(upl, index);
10815 }
10816 boolean_t
upl_dirty_page(upl_page_info_t * upl,int index)10817 upl_dirty_page(upl_page_info_t *upl, int index)
10818 {
10819 return UPL_DIRTY_PAGE(upl, index);
10820 }
10821 boolean_t
upl_valid_page(upl_page_info_t * upl,int index)10822 upl_valid_page(upl_page_info_t *upl, int index)
10823 {
10824 return UPL_VALID_PAGE(upl, index);
10825 }
10826 ppnum_t
upl_phys_page(upl_page_info_t * upl,int index)10827 upl_phys_page(upl_page_info_t *upl, int index)
10828 {
10829 return UPL_PHYS_PAGE(upl, index);
10830 }
10831
10832 void
upl_page_set_mark(upl_page_info_t * upl,int index,boolean_t v)10833 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
10834 {
10835 upl[index].mark = v;
10836 }
10837
10838 boolean_t
upl_page_get_mark(upl_page_info_t * upl,int index)10839 upl_page_get_mark(upl_page_info_t *upl, int index)
10840 {
10841 return upl[index].mark;
10842 }
10843
10844 void
vm_countdirtypages(void)10845 vm_countdirtypages(void)
10846 {
10847 vm_page_t m;
10848 int dpages;
10849 int pgopages;
10850 int precpages;
10851
10852
10853 dpages = 0;
10854 pgopages = 0;
10855 precpages = 0;
10856
10857 vm_page_lock_queues();
10858 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10859 do {
10860 if (m == (vm_page_t)0) {
10861 break;
10862 }
10863
10864 if (m->vmp_dirty) {
10865 dpages++;
10866 }
10867 if (m->vmp_free_when_done) {
10868 pgopages++;
10869 }
10870 if (m->vmp_precious) {
10871 precpages++;
10872 }
10873
10874 assert(VM_PAGE_OBJECT(m) != kernel_object);
10875 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10876 if (m == (vm_page_t)0) {
10877 break;
10878 }
10879 } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
10880 vm_page_unlock_queues();
10881
10882 vm_page_lock_queues();
10883 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
10884 do {
10885 if (m == (vm_page_t)0) {
10886 break;
10887 }
10888
10889 dpages++;
10890 assert(m->vmp_dirty);
10891 assert(!m->vmp_free_when_done);
10892 assert(VM_PAGE_OBJECT(m) != kernel_object);
10893 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10894 if (m == (vm_page_t)0) {
10895 break;
10896 }
10897 } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
10898 vm_page_unlock_queues();
10899
10900 vm_page_lock_queues();
10901 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10902 do {
10903 if (m == (vm_page_t)0) {
10904 break;
10905 }
10906
10907 if (m->vmp_dirty) {
10908 dpages++;
10909 }
10910 if (m->vmp_free_when_done) {
10911 pgopages++;
10912 }
10913 if (m->vmp_precious) {
10914 precpages++;
10915 }
10916
10917 assert(VM_PAGE_OBJECT(m) != kernel_object);
10918 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10919 if (m == (vm_page_t)0) {
10920 break;
10921 }
10922 } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
10923 vm_page_unlock_queues();
10924
10925 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10926
10927 dpages = 0;
10928 pgopages = 0;
10929 precpages = 0;
10930
10931 vm_page_lock_queues();
10932 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10933
10934 do {
10935 if (m == (vm_page_t)0) {
10936 break;
10937 }
10938 if (m->vmp_dirty) {
10939 dpages++;
10940 }
10941 if (m->vmp_free_when_done) {
10942 pgopages++;
10943 }
10944 if (m->vmp_precious) {
10945 precpages++;
10946 }
10947
10948 assert(VM_PAGE_OBJECT(m) != kernel_object);
10949 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10950 if (m == (vm_page_t)0) {
10951 break;
10952 }
10953 } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
10954 vm_page_unlock_queues();
10955
10956 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
10957 }
10958 #endif /* MACH_BSD */
10959
10960
10961 #if CONFIG_IOSCHED
10962 int
upl_get_cached_tier(upl_t upl)10963 upl_get_cached_tier(upl_t upl)
10964 {
10965 assert(upl);
10966 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
10967 return upl->upl_priority;
10968 }
10969 return -1;
10970 }
10971 #endif /* CONFIG_IOSCHED */
10972
10973
10974 void
upl_callout_iodone(upl_t upl)10975 upl_callout_iodone(upl_t upl)
10976 {
10977 struct upl_io_completion *upl_ctx = upl->upl_iodone;
10978
10979 if (upl_ctx) {
10980 void (*iodone_func)(void *, int) = upl_ctx->io_done;
10981
10982 assert(upl_ctx->io_done);
10983
10984 (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
10985 }
10986 }
10987
10988 void
upl_set_iodone(upl_t upl,void * upl_iodone)10989 upl_set_iodone(upl_t upl, void *upl_iodone)
10990 {
10991 upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
10992 }
10993
10994 void
upl_set_iodone_error(upl_t upl,int error)10995 upl_set_iodone_error(upl_t upl, int error)
10996 {
10997 struct upl_io_completion *upl_ctx = upl->upl_iodone;
10998
10999 if (upl_ctx) {
11000 upl_ctx->io_error = error;
11001 }
11002 }
11003
11004
11005 ppnum_t
upl_get_highest_page(upl_t upl)11006 upl_get_highest_page(
11007 upl_t upl)
11008 {
11009 return upl->highest_page;
11010 }
11011
11012 upl_size_t
upl_get_size(upl_t upl)11013 upl_get_size(
11014 upl_t upl)
11015 {
11016 return upl_adjusted_size(upl, PAGE_MASK);
11017 }
11018
11019 upl_size_t
upl_adjusted_size(upl_t upl,vm_map_offset_t pgmask)11020 upl_adjusted_size(
11021 upl_t upl,
11022 vm_map_offset_t pgmask)
11023 {
11024 vm_object_offset_t start_offset, end_offset;
11025
11026 start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
11027 end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
11028
11029 return (upl_size_t)(end_offset - start_offset);
11030 }
11031
11032 vm_object_offset_t
upl_adjusted_offset(upl_t upl,vm_map_offset_t pgmask)11033 upl_adjusted_offset(
11034 upl_t upl,
11035 vm_map_offset_t pgmask)
11036 {
11037 return trunc_page_mask_64(upl->u_offset, pgmask);
11038 }
11039
11040 vm_object_offset_t
upl_get_data_offset(upl_t upl)11041 upl_get_data_offset(
11042 upl_t upl)
11043 {
11044 return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
11045 }
11046
11047 upl_t
upl_associated_upl(upl_t upl)11048 upl_associated_upl(upl_t upl)
11049 {
11050 return upl->associated_upl;
11051 }
11052
11053 void
upl_set_associated_upl(upl_t upl,upl_t associated_upl)11054 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
11055 {
11056 upl->associated_upl = associated_upl;
11057 }
11058
11059 struct vnode *
upl_lookup_vnode(upl_t upl)11060 upl_lookup_vnode(upl_t upl)
11061 {
11062 if (!upl->map_object->internal) {
11063 return vnode_pager_lookup_vnode(upl->map_object->pager);
11064 } else {
11065 return NULL;
11066 }
11067 }
11068
11069 #if UPL_DEBUG
11070 kern_return_t
upl_ubc_alias_set(upl_t upl,uintptr_t alias1,uintptr_t alias2)11071 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
11072 {
11073 upl->ubc_alias1 = alias1;
11074 upl->ubc_alias2 = alias2;
11075 return KERN_SUCCESS;
11076 }
11077 int
upl_ubc_alias_get(upl_t upl,uintptr_t * al,uintptr_t * al2)11078 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
11079 {
11080 if (al) {
11081 *al = upl->ubc_alias1;
11082 }
11083 if (al2) {
11084 *al2 = upl->ubc_alias2;
11085 }
11086 return KERN_SUCCESS;
11087 }
11088 #endif /* UPL_DEBUG */
11089
11090 #if VM_PRESSURE_EVENTS
11091 /*
11092 * Upward trajectory.
11093 */
11094 extern boolean_t vm_compressor_low_on_space(void);
11095
11096 boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)11097 VM_PRESSURE_NORMAL_TO_WARNING(void)
11098 {
11099 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11100 /* Available pages below our threshold */
11101 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11102 /* No frozen processes to kill */
11103 if (memorystatus_frozen_count == 0) {
11104 /* Not enough suspended processes available. */
11105 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11106 return TRUE;
11107 }
11108 }
11109 }
11110 return FALSE;
11111 } else {
11112 return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
11113 }
11114 }
11115
11116 boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)11117 VM_PRESSURE_WARNING_TO_CRITICAL(void)
11118 {
11119 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11120 /* Available pages below our threshold */
11121 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11122 return TRUE;
11123 }
11124 return FALSE;
11125 } else {
11126 return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11127 }
11128 }
11129
11130 /*
11131 * Downward trajectory.
11132 */
11133 boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)11134 VM_PRESSURE_WARNING_TO_NORMAL(void)
11135 {
11136 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11137 /* Available pages above our threshold */
11138 unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
11139 if (memorystatus_available_pages > target_threshold) {
11140 return TRUE;
11141 }
11142 return FALSE;
11143 } else {
11144 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
11145 }
11146 }
11147
11148 boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)11149 VM_PRESSURE_CRITICAL_TO_WARNING(void)
11150 {
11151 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11152 /* Available pages above our threshold */
11153 unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
11154 if (memorystatus_available_pages > target_threshold) {
11155 return TRUE;
11156 }
11157 return FALSE;
11158 } else {
11159 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11160 }
11161 }
11162 #endif /* VM_PRESSURE_EVENTS */
11163
11164 #if DEVELOPMENT || DEBUG
11165 bool compressor_running_perf_test;
11166 uint64_t compressor_perf_test_pages_processed;
11167
11168 kern_return_t
11169 run_compressor_perf_test(
11170 user_addr_t buf,
11171 size_t buffer_size,
11172 uint64_t *time,
11173 uint64_t *bytes_compressed,
11174 uint64_t *compressor_growth);
11175
11176 static kern_return_t
move_pages_to_queue(vm_map_t map,user_addr_t start_addr,size_t buffer_size,vm_page_queue_head_t * queue,size_t * pages_moved)11177 move_pages_to_queue(
11178 vm_map_t map,
11179 user_addr_t start_addr,
11180 size_t buffer_size,
11181 vm_page_queue_head_t *queue,
11182 size_t *pages_moved)
11183 {
11184 kern_return_t err = KERN_SUCCESS;
11185 vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
11186 boolean_t addr_in_map = FALSE;
11187 user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
11188 vm_object_t curr_object = VM_OBJECT_NULL;
11189 *pages_moved = 0;
11190
11191
11192 if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
11193 /*
11194 * We don't currently support benchmarking maps with a different page size
11195 * than the kernel.
11196 */
11197 return KERN_INVALID_ARGUMENT;
11198 }
11199
11200 if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
11201 return KERN_INVALID_ARGUMENT;
11202 }
11203
11204 vm_map_lock_read(map);
11205 curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
11206 end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
11207
11208
11209 while (curr_addr < end_addr) {
11210 addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
11211 if (!addr_in_map) {
11212 err = KERN_INVALID_ARGUMENT;
11213 break;
11214 }
11215 curr_object = VME_OBJECT(curr_entry);
11216 if (curr_object) {
11217 vm_object_lock(curr_object);
11218 /* We really only want anonymous memory that's in the top level map and object here. */
11219 if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
11220 curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
11221 err = KERN_INVALID_ARGUMENT;
11222 vm_object_unlock(curr_object);
11223 break;
11224 }
11225 vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
11226 vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
11227 (curr_entry->vme_start + VME_OFFSET(curr_entry));
11228 vm_map_offset_t curr_offset = start_offset;
11229 vm_page_t curr_page;
11230 while (curr_offset < end_offset) {
11231 curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
11232 if (curr_page != VM_PAGE_NULL) {
11233 vm_page_lock_queues();
11234 if (curr_page->vmp_laundry) {
11235 vm_pageout_steal_laundry(curr_page, TRUE);
11236 }
11237 /*
11238 * we've already factored out pages in the laundry which
11239 * means this page can't be on the pageout queue so it's
11240 * safe to do the vm_page_queues_remove
11241 */
11242 bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
11243 vm_page_queues_remove(curr_page, TRUE);
11244 if (donate) {
11245 /*
11246 * The compressor needs to see this bit to know
11247 * where this page needs to land. Also if stolen,
11248 * this bit helps put the page back in the right
11249 * special queue where it belongs.
11250 */
11251 curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
11252 }
11253 // Clear the referenced bit so we ensure this gets paged out
11254 curr_page->vmp_reference = false;
11255 if (curr_page->vmp_pmapped) {
11256 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
11257 VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
11258 }
11259 vm_page_queue_enter(queue, curr_page, vmp_pageq);
11260 vm_page_unlock_queues();
11261 *pages_moved += 1;
11262 }
11263 curr_offset += PAGE_SIZE_64;
11264 curr_addr += PAGE_SIZE_64;
11265 }
11266 }
11267 vm_object_unlock(curr_object);
11268 }
11269 vm_map_unlock_read(map);
11270 return err;
11271 }
11272
11273 /*
11274 * Local queue for processing benchmark pages.
11275 * Can't be allocated on the stack because the pointer has to
11276 * be packable.
11277 */
11278 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
11279 kern_return_t
run_compressor_perf_test(user_addr_t buf,size_t buffer_size,uint64_t * time,uint64_t * bytes_compressed,uint64_t * compressor_growth)11280 run_compressor_perf_test(
11281 user_addr_t buf,
11282 size_t buffer_size,
11283 uint64_t *time,
11284 uint64_t *bytes_compressed,
11285 uint64_t *compressor_growth)
11286 {
11287 kern_return_t err = KERN_SUCCESS;
11288 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11289 return KERN_NOT_SUPPORTED;
11290 }
11291 if (current_task() == kernel_task) {
11292 return KERN_INVALID_ARGUMENT;
11293 }
11294 vm_page_lock_queues();
11295 if (compressor_running_perf_test) {
11296 /* Only run one instance of the benchmark at a time. */
11297 vm_page_unlock_queues();
11298 return KERN_RESOURCE_SHORTAGE;
11299 }
11300 vm_page_unlock_queues();
11301 size_t page_count = 0;
11302 vm_map_t map;
11303 vm_page_t p, next;
11304 uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
11305 uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
11306 *bytes_compressed = *compressor_growth = 0;
11307
11308 vm_page_queue_init(&compressor_perf_test_queue);
11309 map = current_task()->map;
11310 err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
11311 if (err != KERN_SUCCESS) {
11312 goto out;
11313 }
11314
11315 vm_page_lock_queues();
11316 compressor_running_perf_test = true;
11317 compressor_perf_test_pages_processed = 0;
11318 /*
11319 * At this point the compressor threads should only process the benchmark queue
11320 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
11321 * to determine how many compressed bytes we ended up using.
11322 */
11323 compressed_bytes_start = c_segment_compressed_bytes;
11324 vm_page_unlock_queues();
11325
11326 page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);
11327
11328 vm_page_lock_queues();
11329 compressor_perf_test_start = mach_absolute_time();
11330
11331 // Wake up the compressor thread(s)
11332 sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
11333 pgo_iothread_internal_state[0].pgo_iothread);
11334
11335 /*
11336 * Depending on when this test is run we could overshoot or be right on the mark
11337 * with our page_count. So the comparison is of the _less than_ variety.
11338 */
11339 while (compressor_perf_test_pages_processed < page_count) {
11340 assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
11341 vm_page_unlock_queues();
11342 thread_block(THREAD_CONTINUE_NULL);
11343 vm_page_lock_queues();
11344 }
11345 compressor_perf_test_end = mach_absolute_time();
11346 compressed_bytes_end = c_segment_compressed_bytes;
11347 vm_page_unlock_queues();
11348
11349
11350 out:
11351 /*
11352 * If we errored out above, then we could still have some pages
11353 * on the local queue. Make sure to put them back on the active queue before
11354 * returning so they're not orphaned.
11355 */
11356 vm_page_lock_queues();
11357 absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
11358 p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
11359 while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
11360 next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);
11361
11362 vm_page_enqueue_active(p, FALSE);
11363 p = next;
11364 }
11365
11366 compressor_running_perf_test = false;
11367 vm_page_unlock_queues();
11368 if (err == KERN_SUCCESS) {
11369 *bytes_compressed = page_count * PAGE_SIZE_64;
11370 *compressor_growth = compressed_bytes_end - compressed_bytes_start;
11371 }
11372
11373 /*
11374 * pageout_scan will consider waking the compactor swapper
11375 * before it blocks. Do the same thing here before we return
11376 * to ensure that back to back benchmark runs can't overly fragment the
11377 * compressor pool.
11378 */
11379 vm_consider_waking_compactor_swapper();
11380 return err;
11381 }
11382 #endif /* DEVELOPMENT || DEBUG */
11383