1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_object.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Virtual memory object module.
63 */
64
65 #include <debug.h>
66
67 #include <mach/mach_types.h>
68 #include <mach/memory_object.h>
69 #include <mach/vm_param.h>
70
71 #include <mach/sdt.h>
72
73 #include <ipc/ipc_types.h>
74 #include <ipc/ipc_port.h>
75
76 #include <kern/kern_types.h>
77 #include <kern/assert.h>
78 #include <kern/queue.h>
79 #include <kern/kalloc.h>
80 #include <kern/zalloc.h>
81 #include <kern/host.h>
82 #include <kern/host_statistics.h>
83 #include <kern/processor.h>
84 #include <kern/misc_protos.h>
85 #include <kern/policy_internal.h>
86 #include <kern/coalition.h>
87
88 #include <sys/kdebug.h>
89 #include <sys/kdebug_triage.h>
90
91 #include <vm/memory_object_internal.h>
92 #include <vm/vm_compressor_pager_internal.h>
93 #include <vm/vm_fault_internal.h>
94 #include <vm/vm_map.h>
95 #include <vm/vm_object_internal.h>
96 #include <vm/vm_page_internal.h>
97 #include <vm/vm_pageout_internal.h>
98 #include <vm/vm_protos_internal.h>
99 #include <vm/vm_purgeable_internal.h>
100 #include <vm/vm_ubc.h>
101 #include <vm/vm_compressor_xnu.h>
102 #include <os/hash.h>
103
104 #if CONFIG_PHANTOM_CACHE
105 #include <vm/vm_phantom_cache_internal.h>
106 #endif
107
108 #if VM_OBJECT_ACCESS_TRACKING
109 uint64_t vm_object_access_tracking_reads = 0;
110 uint64_t vm_object_access_tracking_writes = 0;
111 #endif /* VM_OBJECT_ACCESS_TRACKING */
112
113 boolean_t vm_object_collapse_compressor_allowed = TRUE;
114
115 struct vm_counters vm_counters;
116
117 os_refgrp_decl(, vm_object_refgrp, "vm_object", NULL);
118
119 #if DEVELOPMENT || DEBUG
120 extern struct memory_object_pager_ops shared_region_pager_ops;
121 extern unsigned int shared_region_pagers_resident_count;
122 extern unsigned int shared_region_pagers_resident_peak;
123 #endif /* DEVELOPMENT || DEBUG */
124
125 #if VM_OBJECT_TRACKING
126 btlog_t vm_object_tracking_btlog;
127
128 void
vm_object_tracking_init(void)129 vm_object_tracking_init(void)
130 {
131 int vm_object_tracking;
132
133 vm_object_tracking = 1;
134 PE_parse_boot_argn("vm_object_tracking", &vm_object_tracking,
135 sizeof(vm_object_tracking));
136
137 if (vm_object_tracking) {
138 vm_object_tracking_btlog = btlog_create(BTLOG_HASH,
139 VM_OBJECT_TRACKING_NUM_RECORDS);
140 assert(vm_object_tracking_btlog);
141 }
142 }
143 #endif /* VM_OBJECT_TRACKING */
144
145 /*
146 * Virtual memory objects maintain the actual data
147 * associated with allocated virtual memory. A given
148 * page of memory exists within exactly one object.
149 *
150 * An object is only deallocated when all "references"
151 * are given up.
152 *
153 * Associated with each object is a list of all resident
154 * memory pages belonging to that object; this list is
155 * maintained by the "vm_page" module, but locked by the object's
156 * lock.
157 *
158 * Each object also records the memory object reference
159 * that is used by the kernel to request and write
160 * back data (the memory object, field "pager"), etc...
161 *
162 * Virtual memory objects are allocated to provide
163 * zero-filled memory (vm_allocate) or map a user-defined
164 * memory object into a virtual address space (vm_map).
165 *
166 * Virtual memory objects that refer to a user-defined
167 * memory object are called "permanent", because all changes
168 * made in virtual memory are reflected back to the
169 * memory manager, which may then store it permanently.
170 * Other virtual memory objects are called "temporary",
171 * meaning that changes need be written back only when
172 * necessary to reclaim pages, and that storage associated
173 * with the object can be discarded once it is no longer
174 * mapped.
175 *
176 * A permanent memory object may be mapped into more
177 * than one virtual address space. Moreover, two threads
178 * may attempt to make the first mapping of a memory
179 * object concurrently. Only one thread is allowed to
 * complete this mapping; all others wait until the
 * "pager_initialized" field is asserted, indicating
182 * that the first thread has initialized all of the
183 * necessary fields in the virtual memory object structure.
184 *
185 * The kernel relies on a *default memory manager* to
186 * provide backing storage for the zero-filled virtual
187 * memory objects. The pager memory objects associated
188 * with these temporary virtual memory objects are only
189 * requested from the default memory manager when it
190 * becomes necessary. Virtual memory objects
191 * that depend on the default memory manager are called
192 * "internal". The "pager_created" field is provided to
193 * indicate whether these ports have ever been allocated.
194 *
195 * The kernel may also create virtual memory objects to
196 * hold changed pages after a copy-on-write operation.
197 * In this case, the virtual memory object (and its
198 * backing storage -- its memory object) only contain
199 * those pages that have been changed. The "shadow"
200 * field refers to the virtual memory object that contains
201 * the remainder of the contents. The "shadow_offset"
202 * field indicates where in the "shadow" these contents begin.
203 * The "copy" field refers to a virtual memory object
204 * to which changed pages must be copied before changing
205 * this object, in order to implement another form
206 * of copy-on-write optimization.
207 *
208 * The virtual memory object structure also records
209 * the attributes associated with its memory object.
210 * The "pager_ready", "can_persist" and "copy_strategy"
211 * fields represent those attributes. The "cached_list"
212 * field is used in the implementation of the persistence
213 * attribute.
214 *
215 * ZZZ Continue this comment.
216 */
217
218 /* Forward declarations for internal functions. */
219 static kern_return_t vm_object_terminate(
220 vm_object_t object);
221
222 static void vm_object_do_collapse(
223 vm_object_t object,
224 vm_object_t backing_object);
225
226 static void vm_object_do_bypass(
227 vm_object_t object,
228 vm_object_t backing_object);
229
230 static void vm_object_release_pager(
231 memory_object_t pager);
232
233 SECURITY_READ_ONLY_LATE(zone_t) vm_object_zone; /* vm backing store zone */
234
235 /*
236 * Wired-down kernel memory belongs to this memory object (kernel_object)
237 * by default to avoid wasting data structures.
238 */
239 static struct vm_object kernel_object_store VM_PAGE_PACKED_ALIGNED;
240 const vm_object_t kernel_object_default = &kernel_object_store;
241
242 static struct vm_object compressor_object_store VM_PAGE_PACKED_ALIGNED;
243 const vm_object_t compressor_object = &compressor_object_store;
244
245 /*
246 * This object holds all pages that have been retired due to errors like ECC.
247 * The system should never use the page or look at its contents. The offset
248 * in this object is the same as the page's physical address.
249 */
250 static struct vm_object retired_pages_object_store VM_PAGE_PACKED_ALIGNED;
251 const vm_object_t retired_pages_object = &retired_pages_object_store;
252
253 #if HAS_MTE
254 /*
255 * This object holds all pages that are currently being used to hold MTE tags.
256 * The pages are wired and may have no pmap mappings of any kind.
257 * The object offset will be the same as physical address.
258 */
259 static struct vm_object mte_tags_object_store VM_PAGE_PACKED_ALIGNED;
260 const vm_object_t mte_tags_object = &mte_tags_object_store;
261
262 /*
263 * This object is for pages that would have been on kernel_object_default, except
264 * that they are using MTE tags.
265 */
266 static struct vm_object kernel_object_tagged_store VM_PAGE_PACKED_ALIGNED;
267 const vm_object_t kernel_object_tagged = &kernel_object_tagged_store;
268 #endif /* HAS_MTE */
269
270 static struct vm_object exclaves_object_store VM_PAGE_PACKED_ALIGNED;
271 const vm_object_t exclaves_object = &exclaves_object_store;
272 #if HAS_MTE
273 static struct vm_object exclaves_object_tagged_store VM_PAGE_PACKED_ALIGNED;
274 const vm_object_t exclaves_object_tagged = &exclaves_object_tagged_store;
275 #endif /* HAS_MTE */
276
277
278 /*
279 * Virtual memory objects are initialized from
280 * a template (see vm_object_allocate).
281 *
282 * When adding a new field to the virtual memory
283 * object structure, be sure to add initialization
284 * (see _vm_object_allocate()).
285 */
static const struct vm_object vm_object_template = {
	.memq.prev = 0,
	.memq.next = 0,
	/*
	 * The lock will be initialized for each allocated object in
	 * _vm_object_allocate(), so we don't need to initialize it in
	 * the vm_object_template.
	 */
	.vo_size = 0,
	.memq_hint = VM_PAGE_NULL,
	/*
	 * The ref count will be initialized for each allocated object in
	 * _vm_object_allocate(), so we don't need to initialize it in the
	 * vm_object_template.
	 */
	.resident_page_count = 0,
	.wired_page_count = 0,
	.reusable_page_count = 0,
	.vo_copy = VM_OBJECT_NULL,
	.vo_copy_version = 0,
	.vo_inherit_copy_none = false,
	.shadow = VM_OBJECT_NULL,
	.vo_shadow_offset = (vm_object_offset_t) 0,
	.pager = MEMORY_OBJECT_NULL,
	.paging_offset = 0,
	.pager_control = MEMORY_OBJECT_CONTROL_NULL,
	.copy_strategy = MEMORY_OBJECT_COPY_SYMMETRIC,
	.paging_in_progress = 0,
	.vo_size_delta = 0,
	.activity_in_progress = 0,

	/* Begin bitfields */
	.all_wanted = 0, /* all bits FALSE */
	.pager_created = FALSE,
	.pager_initialized = FALSE,
	.pager_ready = FALSE,
	.pager_trusted = FALSE,
	.can_persist = FALSE,
	/* a new object starts out flagged "internal" and "alive" */
	.internal = TRUE,
	.private = FALSE,
	.pageout = FALSE,
	.alive = TRUE,
	.purgable = VM_PURGABLE_DENY,
	.purgeable_when_ripe = FALSE,
	.purgeable_only_by_kernel = FALSE,
	.shadowed = FALSE,
	.true_share = FALSE,
	.terminating = FALSE,
	.named = FALSE,
	.shadow_severed = FALSE,
	.phys_contiguous = FALSE,
	.nophyscache = FALSE,
	/* End bitfields */

	.cached_list.prev = NULL,
	.cached_list.next = NULL,

	.last_alloc = (vm_object_offset_t) 0,
	.sequential = (vm_object_offset_t) 0,
	.pages_created = 0,
	.pages_used = 0,
	.scan_collisions = 0,
#if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1
	.vo_chead_hint = 0,
#endif /* COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 */
#if CONFIG_PHANTOM_CACHE
	.phantom_object_id = 0,
#endif /* CONFIG_PHANTOM_CACHE */
	/* all bits set */
	.cow_hint = ~(vm_offset_t)0,

	/* cache bitfields */
	.wimg_bits = VM_WIMG_USE_DEFAULT,
	.set_cache_attr = FALSE,
	.object_is_shared_cache = FALSE,
	.code_signed = FALSE,
	.transposed = FALSE,
	.mapping_in_progress = FALSE,
	.phantom_isssd = FALSE,
	.volatile_empty = FALSE,
	.volatile_fault = FALSE,
	.all_reusable = FALSE,
	.blocked_access = FALSE,
	.vo_ledger_tag = VM_LEDGER_TAG_NONE,
	.vo_no_footprint = FALSE,
	/*
	 * "uplq" only exists in these configurations; it is re-initialized
	 * with queue_init() in _vm_object_allocate().
	 */
#if CONFIG_IOSCHED || UPL_DEBUG
	.uplq.prev = NULL,
	.uplq.next = NULL,
#endif /* CONFIG_IOSCHED || UPL_DEBUG */
#ifdef VM_PIP_DEBUG
	.pip_holders = {0},
#endif /* VM_PIP_DEBUG */

	.objq.next = NULL,
	.objq.prev = NULL,
	.task_objq.next = NULL,
	.task_objq.prev = NULL,

	/* NOTE(review): presumably "not on any purgeable queue" -- confirm */
	.purgeable_queue_type = PURGEABLE_Q_TYPE_MAX,
	.purgeable_queue_group = 0,

	.wire_tag = VM_KERN_MEMORY_NONE,
#if !VM_TAG_ACTIVE_UPDATE
	.wired_objq.next = NULL,
	.wired_objq.prev = NULL,
#endif /* ! VM_TAG_ACTIVE_UPDATE */

	.io_tracking = FALSE,

#if CONFIG_SECLUDED_MEMORY
	.eligible_for_secluded = FALSE,
	.can_grab_secluded = FALSE,
#else /* CONFIG_SECLUDED_MEMORY */
	.__object3_unused_bits = 0,
#endif /* CONFIG_SECLUDED_MEMORY */

	.for_realtime = false,
	.no_pager_reason = VM_OBJECT_DESTROY_UNKNOWN_REASON,

#if VM_OBJECT_ACCESS_TRACKING
	.access_tracking = FALSE,
	.access_tracking_reads = 0,
	.access_tracking_writes = 0,
#endif /* VM_OBJECT_ACCESS_TRACKING */

#if DEBUG
	.purgeable_owner_bt = {0},
	.vo_purgeable_volatilizer = NULL,
	.purgeable_volatilizer_bt = {0},
#endif /* DEBUG */
	/* overwritten with the caller's value in _vm_object_allocate() */
	.vmo_provenance = VM_MAP_SERIAL_NONE,
	.vmo_pl_req_in_progress = 0,
};
418
419 LCK_GRP_DECLARE(vm_object_lck_grp, "vm_object");
420 LCK_GRP_DECLARE(vm_object_cache_lck_grp, "vm_object_cache");
421 LCK_ATTR_DECLARE(vm_object_lck_attr, 0, 0);
422 LCK_ATTR_DECLARE(kernel_object_lck_attr, 0, LCK_ATTR_DEBUG);
423 LCK_ATTR_DECLARE(compressor_object_lck_attr, 0, LCK_ATTR_DEBUG);
424
425 unsigned int vm_page_purged_wired = 0;
426 unsigned int vm_page_purged_busy = 0;
427 unsigned int vm_page_purged_others = 0;
428
429 static queue_head_t vm_object_cached_list;
430 static uint32_t vm_object_cache_pages_freed = 0;
431 static uint32_t vm_object_cache_pages_moved = 0;
432 static uint32_t vm_object_cache_pages_skipped = 0;
433 static uint32_t vm_object_cache_adds = 0;
434 static uint32_t vm_object_cached_count = 0;
435 static LCK_MTX_DECLARE_ATTR(vm_object_cached_lock_data,
436 &vm_object_cache_lck_grp, &vm_object_lck_attr);
437
438 static uint32_t vm_object_page_grab_failed = 0;
439 static uint32_t vm_object_page_grab_skipped = 0;
440 static uint32_t vm_object_page_grab_returned = 0;
441 static uint32_t vm_object_page_grab_pmapped = 0;
442 static uint32_t vm_object_page_grab_reactivations = 0;
443
444 #define vm_object_cache_lock_spin() \
445 lck_mtx_lock_spin(&vm_object_cached_lock_data)
446 #define vm_object_cache_unlock() \
447 lck_mtx_unlock(&vm_object_cached_lock_data)
448
449 static void vm_object_cache_remove_locked(vm_object_t);
450
451
452 static void vm_object_reap(vm_object_t object);
453 static void vm_object_reap_async(vm_object_t object);
454 static void vm_object_reaper_thread(void);
455
456 static LCK_MTX_DECLARE_ATTR(vm_object_reaper_lock_data,
457 &vm_object_lck_grp, &vm_object_lck_attr);
458
459 static queue_head_t vm_object_reaper_queue; /* protected by vm_object_reaper_lock() */
460 unsigned int vm_object_reap_count = 0;
461 unsigned int vm_object_reap_count_async = 0;
462
463 #if HAS_MTE
464 unsigned int vm_object_no_compressor_pager_for_mte_count = 0;
465 TUNABLE(bool, vm_object_allow_compressor_pager_for_mte, "compress_mte", true);
466 #endif
467
468 #define vm_object_reaper_lock() \
469 lck_mtx_lock(&vm_object_reaper_lock_data)
470 #define vm_object_reaper_lock_spin() \
471 lck_mtx_lock_spin(&vm_object_reaper_lock_data)
472 #define vm_object_reaper_unlock() \
473 lck_mtx_unlock(&vm_object_reaper_lock_data)
474
475 #if CONFIG_IOSCHED
476 /* I/O Re-prioritization request list */
477 struct mpsc_daemon_queue io_reprioritize_q;
478
479 ZONE_DEFINE_TYPE(io_reprioritize_req_zone, "io_reprioritize_req",
480 struct io_reprioritize_req, ZC_NONE);
481
482 /* I/O re-prioritization MPSC callback */
483 static void io_reprioritize(mpsc_queue_chain_t elm, mpsc_daemon_queue_t dq);
484
485 void vm_page_request_reprioritize(vm_object_t, uint64_t, uint32_t, int);
486 void vm_page_handle_prio_inversion(vm_object_t, vm_page_t);
487 void vm_decmp_upl_reprioritize(upl_t, int);
488 #endif
489
490 void
vm_object_set_size(vm_object_t object,vm_object_size_t outer_size,vm_object_size_t inner_size)491 vm_object_set_size(
492 vm_object_t object,
493 vm_object_size_t outer_size,
494 vm_object_size_t inner_size)
495 {
496 object->vo_size = vm_object_round_page(outer_size);
497 #if KASAN
498 assert(object->vo_size - inner_size <= USHRT_MAX);
499 object->vo_size_delta = (unsigned short)(object->vo_size - inner_size);
500 #else
501 (void)inner_size;
502 #endif
503 }
504
505
506 /*
507 * vm_object_allocate:
508 *
509 * Returns a new object with the given size.
510 */
511
512 __private_extern__ void
_vm_object_allocate(vm_object_size_t size,vm_object_t object,vm_map_serial_t provenance)513 _vm_object_allocate(
514 vm_object_size_t size,
515 vm_object_t object,
516 vm_map_serial_t provenance)
517 {
518 *object = vm_object_template;
519 object->vmo_provenance = provenance;
520
521 vm_page_queue_init(&object->memq);
522 #if UPL_DEBUG || CONFIG_IOSCHED
523 queue_init(&object->uplq);
524 #endif
525 vm_object_lock_init(object);
526 vm_object_set_size(object, size, size);
527
528 os_ref_init_raw(&object->ref_count, &vm_object_refgrp);
529
530 #if VM_OBJECT_TRACKING_OP_CREATED
531 if (vm_object_tracking_btlog) {
532 btlog_record(vm_object_tracking_btlog, object,
533 VM_OBJECT_TRACKING_OP_CREATED,
534 btref_get(__builtin_frame_address(0), 0));
535 }
536 #endif /* VM_OBJECT_TRACKING_OP_CREATED */
537 }
538
539 __private_extern__ vm_object_t
vm_object_allocate(vm_object_size_t size,vm_map_serial_t provenance)540 vm_object_allocate(
541 vm_object_size_t size, vm_map_serial_t provenance)
542 {
543 vm_object_t object;
544
545 object = zalloc_flags(vm_object_zone, Z_WAITOK | Z_NOFAIL);
546 _vm_object_allocate(size, object, provenance);
547
548 return object;
549 }
550
551 TUNABLE(bool, workaround_41447923, "workaround_41447923", false);
552
553 /*
554 * vm_object_bootstrap:
555 *
556 * Initialize the VM objects module.
557 */
558 __startup_func
559 void
vm_object_bootstrap(void)560 vm_object_bootstrap(void)
561 {
562 vm_size_t vm_object_size;
563
564 assert(sizeof(mo_ipc_object_bits_t) == sizeof(ipc_object_bits_t));
565
566 vm_object_size = (sizeof(struct vm_object) + (VM_PAGE_PACKED_PTR_ALIGNMENT - 1)) &
567 ~(VM_PAGE_PACKED_PTR_ALIGNMENT - 1);
568
569 vm_object_zone = zone_create("vm objects", vm_object_size,
570 ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED | ZC_VM);
571
572 queue_init(&vm_object_cached_list);
573
574 queue_init(&vm_object_reaper_queue);
575
576 /*
577 * Initialize the "kernel object"
578 */
579
580 /*
581 * Note that in the following size specifications, we need to add 1 because
582 * VM_MAX_KERNEL_ADDRESS (vm_last_addr) is a maximum address, not a size.
583 */
584 _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, kernel_object_default, VM_MAP_SERIAL_SPECIAL);
585 _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, compressor_object, VM_MAP_SERIAL_SPECIAL);
586 kernel_object_default->copy_strategy = MEMORY_OBJECT_COPY_NONE;
587 compressor_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
588 kernel_object_default->no_tag_update = TRUE;
589
590 /*
591 * The object to hold retired VM pages.
592 */
593 _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, retired_pages_object, VM_MAP_SERIAL_SPECIAL);
594 retired_pages_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
595
596 #if HAS_MTE
597 /*
598 * The object to hold MTE tag pages.
599 */
600 _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, mte_tags_object, VM_MAP_SERIAL_SPECIAL);
601 mte_tags_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
602
603 _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, kernel_object_tagged, VM_MAP_SERIAL_SPECIAL);
604 kernel_object_tagged->copy_strategy = MEMORY_OBJECT_COPY_NONE;
605 kernel_object_tagged->no_tag_update = TRUE;
606 kernel_object_tagged->wimg_bits = VM_WIMG_MTE;
607 #endif /* HAS_MTE */
608
609 /**
610 * The object to hold pages owned by exclaves.
611 */
612 _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, exclaves_object, VM_MAP_SERIAL_SPECIAL);
613 exclaves_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
614 #if HAS_MTE
615 /**
616 * The object to hold MTE tag pages owned by exclaves.
617 */
618 _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, exclaves_object_tagged, VM_MAP_SERIAL_SPECIAL);
619 exclaves_object_tagged->copy_strategy = MEMORY_OBJECT_COPY_NONE;
620 exclaves_object_tagged->wimg_bits = VM_WIMG_MTE;
621 #endif /* HAS_MTE */
622 }
623
624 #if CONFIG_IOSCHED
625 void
vm_io_reprioritize_init(void)626 vm_io_reprioritize_init(void)
627 {
628 kern_return_t result;
629
630 result = mpsc_daemon_queue_init_with_thread(&io_reprioritize_q, io_reprioritize, BASEPRI_KERNEL,
631 "VM_io_reprioritize_thread", MPSC_DAEMON_INIT_NONE);
632 if (result != KERN_SUCCESS) {
633 panic("Unable to start I/O reprioritization thread (%d)", result);
634 }
635 }
636 #endif
637
638 void
vm_object_reaper_init(void)639 vm_object_reaper_init(void)
640 {
641 kern_return_t kr;
642 thread_t thread;
643
644 kr = kernel_thread_start_priority(
645 (thread_continue_t) vm_object_reaper_thread,
646 NULL,
647 BASEPRI_VM,
648 &thread);
649 if (kr != KERN_SUCCESS) {
650 panic("failed to launch vm_object_reaper_thread kr=0x%x", kr);
651 }
652 thread_set_thread_name(thread, "VM_object_reaper_thread");
653 thread_deallocate(thread);
654 }
655
656
657 /*
658 * vm_object_deallocate:
659 *
660 * Release a reference to the specified object,
661 * gained either through a vm_object_allocate
662 * or a vm_object_reference call. When all references
663 * are gone, storage associated with this object
664 * may be relinquished.
665 *
666 * No object may be locked.
667 */
668 unsigned long vm_object_deallocate_shared_successes = 0;
669 unsigned long vm_object_deallocate_shared_failures = 0;
670 unsigned long vm_object_deallocate_shared_swap_failures = 0;
671
672 __private_extern__ void
vm_object_deallocate(vm_object_t object)673 vm_object_deallocate(
674 vm_object_t object)
675 {
676 vm_object_t shadow = VM_OBJECT_NULL;
677
678 // if(object)dbgLog(object, object->ref_count, object->can_persist, 3); /* (TEST/DEBUG) */
679 // else dbgLog(object, 0, 0, 3); /* (TEST/DEBUG) */
680
681 if (object == VM_OBJECT_NULL) {
682 return;
683 }
684
685 if (is_kernel_object(object) || object == compressor_object || object == retired_pages_object) {
686 vm_object_lock_shared(object);
687
688 if (os_ref_get_count_raw(&object->ref_count) == 1) {
689 if (is_kernel_object(object)) {
690 panic("vm_object_deallocate: losing a kernel_object");
691 } else if (object == retired_pages_object) {
692 panic("vm_object_deallocate: losing retired_pages_object");
693 } else {
694 panic("vm_object_deallocate: losing compressor_object");
695 }
696 }
697
698 os_ref_release_live_raw(&object->ref_count, &vm_object_refgrp);
699
700 vm_object_unlock(object);
701 return;
702 }
703
704 if (os_ref_get_count_raw(&object->ref_count) == 2 &&
705 object->named) {
706 /*
707 * This "named" object's reference count is about to
708 * drop from 2 to 1:
709 * we'll need to call memory_object_last_unmap().
710 */
711 } else if (os_ref_get_count_raw(&object->ref_count) == 2 &&
712 object->internal &&
713 object->shadow != VM_OBJECT_NULL) {
714 /*
715 * This internal object's reference count is about to
716 * drop from 2 to 1 and it has a shadow object:
717 * we'll want to try and collapse this object with its
718 * shadow.
719 */
720 } else if (os_ref_get_count_raw(&object->ref_count) >= 2) {
721 UInt32 original_ref_count;
722 volatile UInt32 *ref_count_p;
723 Boolean atomic_swap;
724
725 /*
726 * The object currently looks like it is not being
727 * kept alive solely by the reference we're about to release.
728 * Let's try and release our reference without taking
729 * all the locks we would need if we had to terminate the
730 * object (cache lock + exclusive object lock).
731 * Lock the object "shared" to make sure we don't race with
732 * anyone holding it "exclusive".
733 */
734 vm_object_lock_shared(object);
735 ref_count_p = (volatile UInt32 *) &object->ref_count;
736 original_ref_count = os_ref_get_count_raw(&object->ref_count);
737 /*
738 * Test again as "ref_count" could have changed.
739 * "named" shouldn't change.
740 */
741 if (original_ref_count == 2 &&
742 object->named) {
743 /* need to take slow path for m_o_last_unmap() */
744 atomic_swap = FALSE;
745 } else if (original_ref_count == 2 &&
746 object->internal &&
747 object->shadow != VM_OBJECT_NULL) {
748 /* need to take slow path for vm_object_collapse() */
749 atomic_swap = FALSE;
750 } else if (original_ref_count < 2) {
751 /* need to take slow path for vm_object_terminate() */
752 atomic_swap = FALSE;
753 } else {
754 /* try an atomic update with the shared lock */
755 atomic_swap = OSCompareAndSwap(
756 original_ref_count,
757 original_ref_count - 1,
758 (UInt32 *) &object->ref_count);
759 if (atomic_swap == FALSE) {
760 vm_object_deallocate_shared_swap_failures++;
761 /* fall back to the slow path... */
762 }
763 }
764
765 vm_object_unlock(object);
766
767 if (atomic_swap) {
768 /*
769 * ref_count was updated atomically !
770 */
771 vm_object_deallocate_shared_successes++;
772 return;
773 }
774
775 /*
776 * Someone else updated the ref_count at the same
777 * time and we lost the race. Fall back to the usual
778 * slow but safe path...
779 */
780 vm_object_deallocate_shared_failures++;
781 }
782
783 while (object != VM_OBJECT_NULL) {
784 vm_object_lock(object);
785
786 assert(os_ref_get_count_raw(&object->ref_count) > 0);
787
788 /*
789 * If the object has a named reference, and only
790 * that reference would remain, inform the pager
791 * about the last "mapping" reference going away.
792 */
793 if ((os_ref_get_count_raw(&object->ref_count) == 2) && (object->named)) {
794 memory_object_t pager = object->pager;
795
796 /* Notify the Pager that there are no */
797 /* more mappers for this object */
798
799 if (pager != MEMORY_OBJECT_NULL) {
800 vm_object_mapping_wait(object, THREAD_UNINT);
801 /* object might have lost its pager while waiting */
802 pager = object->pager;
803 if (object->ref_count == 2 &&
804 object->named &&
805 pager != MEMORY_OBJECT_NULL) {
806 vm_object_mapping_begin(object);
807 assert(pager->mo_last_unmap_ctid == 0);
808 /*
809 * Signal that we're the thread that triggered
810 * the memory_object_last_unmap(), so that we
811 * don't deadlock in vm_object_destroy() if this
812 * was the last reference and we're releasing
813 * the pager there.
814 */
815 pager->mo_last_unmap_ctid = thread_get_ctid(current_thread());
816 vm_object_unlock(object);
817
818 memory_object_last_unmap(pager);
819 /* pager might no longer be valid now */
820 pager = MEMORY_OBJECT_NULL;
821
822 vm_object_lock(object);
823
824 vm_object_mapping_end(object);
825 pager = object->pager;
826 if (pager != MEMORY_OBJECT_NULL) {
827 /*
828 * The pager is still there, so reset its
829 * "mo_last_unmap_ctid" now that we're done.
830 */
831 assert3u(pager->mo_last_unmap_ctid, ==, thread_get_ctid(current_thread()));
832 pager->mo_last_unmap_ctid = 0;
833 }
834 }
835 }
836 assert(os_ref_get_count_raw(&object->ref_count) > 0);
837 }
838
839 /*
840 * Lose the reference. If other references
841 * remain, then we are done, unless we need
842 * to retry a cache trim.
843 * If it is the last reference, then keep it
844 * until any pending initialization is completed.
845 */
846
847 /* if the object is terminating, it cannot go into */
848 /* the cache and we obviously should not call */
849 /* terminate again. */
850
851 if ((os_ref_get_count_raw(&object->ref_count) > 1) ||
852 object->terminating) {
853 vm_object_lock_assert_exclusive(object);
854 os_ref_release_live_locked_raw(&object->ref_count,
855 &vm_object_refgrp);
856
857 if (os_ref_get_count_raw(&object->ref_count) == 1 &&
858 object->shadow != VM_OBJECT_NULL) {
859 /*
860 * There's only one reference left on this
861 * VM object. We can't tell if it's a valid
862 * one (from a mapping for example) or if this
863 * object is just part of a possibly stale and
864 * useless shadow chain.
865 * We would like to try and collapse it into
866 * its parent, but we don't have any pointers
867 * back to this parent object.
868 * But we can try and collapse this object with
869 * its own shadows, in case these are useless
870 * too...
871 * We can't bypass this object though, since we
872 * don't know if this last reference on it is
873 * meaningful or not.
874 */
875 vm_object_collapse(object, 0, FALSE);
876 }
877 vm_object_unlock(object);
878 return;
879 }
880
881 /*
882 * We have to wait for initialization
883 * before destroying or caching the object.
884 */
885
886 if (object->pager_created && !object->pager_ready) {
887 assert(!object->can_persist);
888 vm_object_sleep(object,
889 VM_OBJECT_EVENT_PAGER_READY,
890 THREAD_UNINT,
891 LCK_SLEEP_UNLOCK);
892 continue;
893 }
894
895 /*
896 * Terminate this object. If it had a shadow,
897 * then deallocate it; otherwise, if we need
898 * to retry a cache trim, do so now; otherwise,
899 * we are done. "pageout" objects have a shadow,
900 * but maintain a "paging reference" rather than
901 * a normal reference.
902 */
903 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
904
905 if (vm_object_terminate(object) != KERN_SUCCESS) {
906 return;
907 }
908 if (shadow != VM_OBJECT_NULL) {
909 object = shadow;
910 continue;
911 }
912 return;
913 }
914 }
915
916
917
/*
 *	vm_object_page_grab:
 *
 *	Walk the start of "object"'s resident page list looking for a
 *	page that can be taken from the object.  A page that is found is
 *	run through vm_page_free_prepare_queues() and
 *	vm_page_free_prepare_object() and returned; pages that cannot be
 *	taken are removed from and re-entered on the object's page list
 *	and counted as skipped.  Returns NULL if no page was taken
 *	within the scan limit.
 *
 *	The object must be locked exclusive (asserted below).
 */
vm_page_t
vm_object_page_grab(
	vm_object_t     object)
{
	vm_page_t       p, next_p;
	int             p_limit = 0;
	int             p_skipped = 0;

	vm_object_lock_assert_exclusive(object);

	next_p = (vm_page_t)vm_page_queue_first(&object->memq);
	p_limit = MIN(50, object->resident_page_count);

	/*
	 * NOTE(review): the limit is pre-decremented before it is tested,
	 * so at most p_limit - 1 pages are examined and an object with a
	 * single resident page is never scanned -- confirm this is intended.
	 */
	while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next_p) && --p_limit > 0) {
		p = next_p;
		/* fetch the successor now: "p" may be re-queued or taken below */
		next_p = (vm_page_t)vm_page_queue_next(&next_p->vmp_listq);

		if (VM_PAGE_WIRED(p) || p->vmp_busy || p->vmp_cleaning ||
		    p->vmp_laundry || vm_page_is_fictitious(p)) {
			/*
			 * Not a candidate.  This jumps into the block below,
			 * past vm_page_unlock_queues(): the page queues lock
			 * has not been taken on this path.
			 */
			goto move_page_in_obj;
		}

		if (p->vmp_pmapped || p->vmp_dirty || p->vmp_precious) {
			vm_page_lockspin_queues();

			if (p->vmp_pmapped) {
				int     refmod_state;

				vm_object_page_grab_pmapped++;

				/* refresh the reference/dirty bits from the pmap layer */
				if (p->vmp_reference == FALSE || p->vmp_dirty == FALSE) {
					refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(p));

					if (refmod_state & VM_MEM_REFERENCED) {
						p->vmp_reference = TRUE;
					}
					if (refmod_state & VM_MEM_MODIFIED) {
						SET_PAGE_DIRTY(p, FALSE);
					}
				}
				if (p->vmp_dirty == FALSE && p->vmp_precious == FALSE) {
					vm_page_lockconvert_queues();
					/*
					 * pmap_disconnect() reports the ref/mod state
					 * again; re-check "dirty" with that result
					 * before taking the page.
					 */
					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));

					if (refmod_state & VM_MEM_REFERENCED) {
						p->vmp_reference = TRUE;
					}
					if (refmod_state & VM_MEM_MODIFIED) {
						SET_PAGE_DIRTY(p, FALSE);
					}

					if (p->vmp_dirty == FALSE) {
						/* page queues lock is still held here */
						goto take_page;
					}
				}
			}
			/* the page stays: reactivate it if it was referenced and isn't on the active queue */
			if ((p->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) && p->vmp_reference == TRUE) {
				vm_page_activate(p);

				counter_inc(&vm_statistics_reactivations);
				vm_object_page_grab_reactivations++;
			}
			vm_page_unlock_queues();
move_page_in_obj:
			/*
			 * Re-queue the skipped page on the object's list
			 * (NOTE(review): presumably this moves it to the tail
			 * so later scans start with other pages -- confirm).
			 */
			vm_page_queue_remove(&object->memq, p, vmp_listq);
			vm_page_queue_enter(&object->memq, p, vmp_listq);

			p_skipped++;
			continue;
		}
		/* neither pmapped, dirty nor precious: take it directly */
		vm_page_lockspin_queues();
take_page:
		/* reached with the page queues lock held on both paths */
		vm_page_free_prepare_queues(p);
		vm_object_page_grab_returned++;
		vm_object_page_grab_skipped += p_skipped;

		vm_page_unlock_queues();

		vm_page_free_prepare_object(p, TRUE);

		return p;
	}
	/* scan ended without taking a page */
	vm_object_page_grab_skipped += p_skipped;
	vm_object_page_grab_failed++;

	return NULL;
}
1005
#if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1

/* This is the actual number of filling cheads that is going to be used.
 * It must satisfy 1 <= vm_cheads <= COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT */
TUNABLE_WRITEABLE(uint32_t, vm_cheads, "vm_cheads", 8);
/* This determines which criterion is used for selecting the chead:
 * either the PID of the grabber task or its coalition */
TUNABLE_WRITEABLE(vm_chead_select_t, vm_chead_select, "vm_chead_select", CSEL_BY_PID);
/* This determines if the grabber-id is set on every page-fault insert or just the first insert */
TUNABLE_WRITEABLE(boolean_t, vm_chead_rehint, "vm_chead_rehint", false);

/*
 * This function is called from vm_page_insert_internal(). When it is called in the context
 * of a vm_fault where a task has just requested a new page or paged in an existing page,
 * it records a few bits of information about that task. These bits are later used, when
 * the page is sent to the compressor, to select the compressor-head that will be used.
 *
 * The goal is to have pages that come from the same task/coalition compressed into the
 * same compressor segment, which helps the locality of swap-in and decompression.
 *
 * This optimization relies on the heuristic assumption that a vm_object is only ever mapped
 * in a single task/coalition. vm_objects that violate it do not benefit from the optimization.
 *
 * The stored hint is the low 8 bits of the selected id (pid or coalition id), reduced
 * modulo MIN(vm_cheads, COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT). A hint of 0 is also what
 * gets stored when no task-specific hint applies.
 *
 * See also vm_pageout_select_filling_chead()
 */
void
vm_object_set_chead_hint(
	vm_object_t     object)
{
	task_t          grabber;
	int             selector_id = 0;
	uint8_t         selector_low_bits;
	uint32_t        head_count;

	/* only internal objects have pages that can reach the compressor */
	if (!object->internal) {
		return;
	}

	/* keep an existing non-zero hint unless re-hinting is enabled */
	if (object->vo_chead_hint != 0 && !vm_chead_rehint) {
		return;
	}

	grabber = current_task_early();
	if (grabber == TASK_NULL || grabber == kernel_task || vm_cheads <= 1) {
		/* no task context, kernel task, or a single chead: skip the extra work */
		object->vo_chead_hint = 0;
		return;
	}

	if (vm_chead_select == CSEL_BY_PID) {
		selector_id = task_pid(grabber);
	} else if (vm_chead_select == CSEL_BY_COALITION) {
		/*
		 * The choice of coalition type is not very significant here since both
		 * types seem to have a similar task division.
		 */
		coalition_t grabber_coalition = task_get_coalition(grabber, COALITION_TYPE_JETSAM);

		if (grabber_coalition != COALITION_NULL) {
			selector_id = coalition_id(grabber_coalition);
		}
	}

	head_count = MIN(vm_cheads, COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT);

	/* the id is truncated to 8 bits first, then reduced to a chead index */
	selector_low_bits = (uint8_t)selector_id;
	object->vo_chead_hint = selector_low_bits % head_count;
}

#endif /* COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 */
1064
1065 #define EVICT_PREPARE_LIMIT 64
1066 #define EVICT_AGE 10
1067
1068 static clock_sec_t vm_object_cache_aging_ts = 0;
1069
1070 static void
vm_object_cache_remove_locked(vm_object_t object)1071 vm_object_cache_remove_locked(
1072 vm_object_t object)
1073 {
1074 assert(object->purgable == VM_PURGABLE_DENY);
1075
1076 queue_remove(&vm_object_cached_list, object, vm_object_t, cached_list);
1077 object->cached_list.next = NULL;
1078 object->cached_list.prev = NULL;
1079
1080 vm_object_cached_count--;
1081 }
1082
1083 void
vm_object_cache_remove(vm_object_t object)1084 vm_object_cache_remove(
1085 vm_object_t object)
1086 {
1087 vm_object_cache_lock_spin();
1088
1089 if (object->cached_list.next &&
1090 object->cached_list.prev) {
1091 vm_object_cache_remove_locked(object);
1092 }
1093
1094 vm_object_cache_unlock();
1095 }
1096
1097 void
vm_object_cache_add(vm_object_t object)1098 vm_object_cache_add(
1099 vm_object_t object)
1100 {
1101 clock_sec_t sec;
1102 clock_nsec_t nsec;
1103
1104 assert(object->purgable == VM_PURGABLE_DENY);
1105
1106 if (object->resident_page_count == 0) {
1107 return;
1108 }
1109 if (object->vo_ledger_tag) {
1110 /*
1111 * We can't add an "owned" object to the cache because
1112 * the "vo_owner" and "vo_cache_ts" fields are part of the
1113 * same "union" and can't be used at the same time.
1114 */
1115 return;
1116 }
1117 clock_get_system_nanotime(&sec, &nsec);
1118
1119 vm_object_cache_lock_spin();
1120
1121 if (object->cached_list.next == NULL &&
1122 object->cached_list.prev == NULL) {
1123 queue_enter(&vm_object_cached_list, object, vm_object_t, cached_list);
1124 object->vo_cache_ts = sec + EVICT_AGE;
1125 object->vo_cache_pages_to_scan = object->resident_page_count;
1126
1127 vm_object_cached_count++;
1128 vm_object_cache_adds++;
1129 }
1130 vm_object_cache_unlock();
1131 }
1132
1133 int
vm_object_cache_evict(int num_to_evict,int max_objects_to_examine)1134 vm_object_cache_evict(
1135 int num_to_evict,
1136 int max_objects_to_examine)
1137 {
1138 vm_object_t object = VM_OBJECT_NULL;
1139 vm_object_t next_obj = VM_OBJECT_NULL;
1140 vm_page_t local_free_q = VM_PAGE_NULL;
1141 vm_page_t p;
1142 vm_page_t next_p;
1143 int object_cnt = 0;
1144 vm_page_t ep_array[EVICT_PREPARE_LIMIT];
1145 int ep_count;
1146 int ep_limit;
1147 int ep_index;
1148 int ep_freed = 0;
1149 int ep_moved = 0;
1150 uint32_t ep_skipped = 0;
1151 clock_sec_t sec;
1152 clock_nsec_t nsec;
1153
1154 KDBG_DEBUG(0x13001ec | DBG_FUNC_START);
1155 /*
1156 * do a couple of quick checks to see if it's
1157 * worthwhile grabbing the lock
1158 */
1159 if (queue_empty(&vm_object_cached_list)) {
1160 KDBG_DEBUG(0x13001ec | DBG_FUNC_END);
1161 return 0;
1162 }
1163 clock_get_system_nanotime(&sec, &nsec);
1164 if (max_objects_to_examine == INT_MAX) {
1165 /* evict all pages from all cached objects now */
1166 sec = (clock_sec_t)-1;
1167 }
1168
1169 /*
1170 * the object on the head of the queue has not
1171 * yet sufficiently aged
1172 */
1173 if (sec < vm_object_cache_aging_ts) {
1174 KDBG_DEBUG(0x13001ec | DBG_FUNC_END);
1175 return 0;
1176 }
1177 /*
1178 * don't need the queue lock to find
1179 * and lock an object on the cached list
1180 */
1181 vm_page_unlock_queues();
1182
1183 vm_object_cache_lock_spin();
1184
1185 for (;;) { /* loop for as long as we have objects to process */
1186 next_obj = (vm_object_t)queue_first(&vm_object_cached_list);
1187
1188 /* loop to find the next target in the cache_list */
1189 while (!queue_end(&vm_object_cached_list, (queue_entry_t)next_obj) && object_cnt++ < max_objects_to_examine) {
1190 object = next_obj;
1191 next_obj = (vm_object_t)queue_next(&next_obj->cached_list);
1192
1193 assert(object->purgable == VM_PURGABLE_DENY);
1194
1195 if (sec < object->vo_cache_ts) { // reached the point in the queue beyond the time we started
1196 KDBG_DEBUG(0x130020c, object, object->resident_page_count, object->vo_cache_ts, sec);
1197
1198 vm_object_cache_aging_ts = object->vo_cache_ts;
1199 object = VM_OBJECT_NULL; /* this will cause to break away from the outer loop */
1200 break;
1201 }
1202 if (!vm_object_lock_try_scan(object)) {
1203 /*
1204 * just skip over this guy for now... if we find
1205 * an object to steal pages from, we'll revist in a bit...
1206 * hopefully, the lock will have cleared
1207 */
1208 KDBG_DEBUG(0x13001f8, object, object->resident_page_count);
1209
1210 object = VM_OBJECT_NULL;
1211 continue;
1212 }
1213 if (vm_page_queue_empty(&object->memq) || object->vo_cache_pages_to_scan == 0) {
1214 /*
1215 * this case really shouldn't happen, but it's not fatal
1216 * so deal with it... if we don't remove the object from
1217 * the list, we'll never move past it.
1218 */
1219 KDBG_DEBUG(0x13001fc, object, object->resident_page_count, ep_freed, ep_moved);
1220
1221 vm_object_cache_remove_locked(object);
1222 vm_object_unlock(object);
1223 object = VM_OBJECT_NULL;
1224 continue;
1225 }
1226 /*
1227 * we have a locked object with pages...
1228 * time to start harvesting
1229 */
1230 break;
1231 }
1232 vm_object_cache_unlock();
1233
1234 if (object == VM_OBJECT_NULL) {
1235 break;
1236 }
1237
1238 /*
1239 * object is locked at this point and
1240 * has resident pages
1241 */
1242 next_p = (vm_page_t)vm_page_queue_first(&object->memq);
1243
1244 /*
1245 * break the page scan into 2 pieces to minimize the time spent
1246 * behind the page queue lock...
1247 * the list of pages on these unused objects is likely to be cold
1248 * w/r to the cpu cache which increases the time to scan the list
1249 * tenfold... and we may have a 'run' of pages we can't utilize that
1250 * needs to be skipped over...
1251 */
1252 if ((ep_limit = num_to_evict - (ep_freed + ep_moved)) > EVICT_PREPARE_LIMIT) {
1253 ep_limit = EVICT_PREPARE_LIMIT;
1254 }
1255 ep_count = 0;
1256
1257 while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next_p) && object->vo_cache_pages_to_scan && ep_count < ep_limit) {
1258 p = next_p;
1259 next_p = (vm_page_t)vm_page_queue_next(&next_p->vmp_listq);
1260
1261 object->vo_cache_pages_to_scan--;
1262
1263 if (VM_PAGE_WIRED(p) || p->vmp_busy || p->vmp_cleaning || p->vmp_laundry) {
1264 vm_page_queue_remove(&object->memq, p, vmp_listq);
1265 vm_page_queue_enter(&object->memq, p, vmp_listq);
1266
1267 ep_skipped++;
1268 continue;
1269 }
1270 if (!object->internal &&
1271 object->pager_created &&
1272 object->pager == NULL) {
1273 /*
1274 * This object has lost its pager, most likely
1275 * due to a force-unmount or ungraft. The pager
1276 * will never come back, so there's no point in
1277 * keeping these pages, even if modified.
1278 * The object could still be mapped, so we need
1279 * to clear any PTE that might still be pointing
1280 * at this physical page before we can reclaim
1281 * it.
1282 */
1283 if (p->vmp_pmapped) {
1284 int refmod;
1285 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));
1286 if (refmod & VM_MEM_MODIFIED) {
1287 assert(p->vmp_wpmapped);
1288 p->vmp_dirty = TRUE;
1289 }
1290 }
1291 // printf("FBDP %s:%d object %p reason %d page %p offset 0x%llx pmapped %d wpmapped %d xpmapped %d dirty %d precious %d\n", __FUNCTION__, __LINE__, object, object->no_pager_reason, p, p->vmp_offset, p->vmp_pmapped, p->vmp_wpmapped, p->vmp_xpmapped, p->vmp_dirty, p->vmp_precious);
1292 /* clear any reason to skip this page below */
1293 p->vmp_dirty = FALSE;
1294 p->vmp_precious = FALSE;
1295 p->vmp_wpmapped = FALSE;
1296 }
1297 if (p->vmp_wpmapped || p->vmp_dirty || p->vmp_precious) {
1298 vm_page_queue_remove(&object->memq, p, vmp_listq);
1299 vm_page_queue_enter(&object->memq, p, vmp_listq);
1300
1301 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(p));
1302 }
1303 ep_array[ep_count++] = p;
1304 }
1305 KDBG_DEBUG(0x13001f4 | DBG_FUNC_START, object, object->resident_page_count, ep_freed, ep_moved);
1306
1307 vm_page_lockspin_queues();
1308
1309 for (ep_index = 0; ep_index < ep_count; ep_index++) {
1310 p = ep_array[ep_index];
1311
1312 if (p->vmp_wpmapped || p->vmp_dirty || p->vmp_precious) {
1313 p->vmp_reference = FALSE;
1314 p->vmp_no_cache = FALSE;
1315
1316 /*
1317 * we've already filtered out pages that are in the laundry
1318 * so if we get here, this page can't be on the pageout queue
1319 */
1320 vm_page_queues_remove(p, FALSE);
1321 vm_page_enqueue_inactive(p, TRUE);
1322
1323 ep_moved++;
1324 } else {
1325 #if CONFIG_PHANTOM_CACHE
1326 vm_phantom_cache_add_ghost(p);
1327 #endif
1328 vm_page_free_prepare_queues(p);
1329
1330 assert(p->vmp_pageq.next == 0 && p->vmp_pageq.prev == 0);
1331 /*
1332 * Add this page to our list of reclaimed pages,
1333 * to be freed later.
1334 */
1335 p->vmp_snext = local_free_q;
1336 local_free_q = p;
1337
1338 ep_freed++;
1339 }
1340 }
1341 vm_page_unlock_queues();
1342
1343 KDBG_DEBUG(0x13001f4 | DBG_FUNC_END, object, object->resident_page_count, ep_freed, ep_moved);
1344
1345 if (local_free_q) {
1346 vm_page_free_list(local_free_q, TRUE);
1347 local_free_q = VM_PAGE_NULL;
1348 }
1349 if (object->vo_cache_pages_to_scan == 0) {
1350 KDBG_DEBUG(0x1300208, object, object->resident_page_count, ep_freed, ep_moved);
1351
1352 vm_object_cache_remove(object);
1353
1354 KDBG_DEBUG(0x13001fc, object, object->resident_page_count, ep_freed, ep_moved);
1355 }
1356 /*
1357 * done with this object
1358 */
1359 vm_object_unlock(object);
1360 object = VM_OBJECT_NULL;
1361
1362 /*
1363 * at this point, we are not holding any locks
1364 */
1365 if ((ep_freed + ep_moved) >= num_to_evict) {
1366 /*
1367 * we've reached our target for the
1368 * number of pages to evict
1369 */
1370 break;
1371 }
1372 vm_object_cache_lock_spin();
1373 }
1374 /*
1375 * put the page queues lock back to the caller's
1376 * idea of it
1377 */
1378 vm_page_lock_queues();
1379
1380 vm_object_cache_pages_freed += ep_freed;
1381 vm_object_cache_pages_moved += ep_moved;
1382 vm_object_cache_pages_skipped += ep_skipped;
1383
1384 KDBG_DEBUG(0x13001ec | DBG_FUNC_END, ep_freed);
1385 // printf("FBDP %s(0x%x,0x%x) freed %d moved %d skipped %u\n", __func__, num_to_evict, max_objects_to_examine, ep_freed, ep_moved, ep_skipped);
1386 return ep_freed;
1387 }
1388
1389 int vm_object_cache_evict_all(void);
1390 int
vm_object_cache_evict_all(void)1391 vm_object_cache_evict_all(void)
1392 {
1393 int freed;
1394
1395 vm_page_lock_queues();
1396 freed = vm_object_cache_evict(INT_MAX, INT_MAX);
1397 vm_page_unlock_queues();
1398 printf("%s: freed %d\n", __func__, freed);
1399 return freed;
1400 }
1401
1402 /*
1403 * Routine: vm_object_terminate
1404 * Purpose:
1405 * Free all resources associated with a vm_object.
1406 * In/out conditions:
1407 * Upon entry, the object must be locked,
1408 * and the object must have exactly one reference.
1409 *
1410 * The shadow object reference is left alone.
1411 *
1412 * The object must be unlocked if its found that pages
1413 * must be flushed to a backing object. If someone
1414 * manages to map the object while it is being flushed
1415 * the object is returned unlocked and unchanged. Otherwise,
1416 * upon exit, the cache will be unlocked, and the
1417 * object will cease to exist.
1418 */
1419 static kern_return_t
vm_object_terminate(vm_object_t object)1420 vm_object_terminate(
1421 vm_object_t object)
1422 {
1423 vm_object_t shadow_object;
1424
1425 vm_object_lock_assert_exclusive(object);
1426
1427 if (!object->pageout && (!object->internal && object->can_persist) &&
1428 (object->pager != NULL || object->shadow_severed)) {
1429 /*
1430 * Clear pager_trusted bit so that the pages get yanked
1431 * out of the object instead of cleaned in place. This
1432 * prevents a deadlock in XMM and makes more sense anyway.
1433 */
1434 VM_OBJECT_SET_PAGER_TRUSTED(object, FALSE);
1435
1436 vm_object_reap_pages(object, REAP_TERMINATE);
1437 }
1438 /*
1439 * Make sure the object isn't already being terminated
1440 */
1441 if (object->terminating) {
1442 vm_object_lock_assert_exclusive(object);
1443 os_ref_release_live_locked_raw(&object->ref_count, &vm_object_refgrp);
1444 vm_object_unlock(object);
1445 return KERN_FAILURE;
1446 }
1447
1448 /*
1449 * Did somebody get a reference to the object while we were
1450 * cleaning it?
1451 */
1452 if (os_ref_get_count_raw(&object->ref_count) != 1) {
1453 vm_object_lock_assert_exclusive(object);
1454 os_ref_release_live_locked_raw(&object->ref_count, &vm_object_refgrp);
1455 vm_object_unlock(object);
1456 return KERN_FAILURE;
1457 }
1458
1459 /*
1460 * Make sure no one can look us up now.
1461 */
1462
1463 VM_OBJECT_SET_TERMINATING(object, TRUE);
1464 VM_OBJECT_SET_ALIVE(object, FALSE);
1465
1466 if (!object->internal &&
1467 object->cached_list.next &&
1468 object->cached_list.prev) {
1469 vm_object_cache_remove(object);
1470 }
1471
1472 /*
1473 * Detach the object from its shadow if we are the shadow's
1474 * copy. The reference we hold on the shadow must be dropped
1475 * by our caller.
1476 */
1477 if (((shadow_object = object->shadow) != VM_OBJECT_NULL) &&
1478 !(object->pageout)) {
1479 vm_object_lock(shadow_object);
1480 if (shadow_object->vo_copy == object) {
1481 VM_OBJECT_COPY_SET(shadow_object, VM_OBJECT_NULL);
1482 }
1483 vm_object_unlock(shadow_object);
1484 }
1485
1486 if (object->paging_in_progress != 0 ||
1487 object->activity_in_progress != 0) {
1488 /*
1489 * There are still some paging_in_progress references
1490 * on this object, meaning that there are some paging
1491 * or other I/O operations in progress for this VM object.
1492 * Such operations take some paging_in_progress references
1493 * up front to ensure that the object doesn't go away, but
1494 * they may also need to acquire a reference on the VM object,
1495 * to map it in kernel space, for example. That means that
1496 * they may end up releasing the last reference on the VM
1497 * object, triggering its termination, while still holding
1498 * paging_in_progress references. Waiting for these
1499 * pending paging_in_progress references to go away here would
1500 * deadlock.
1501 *
1502 * To avoid deadlocking, we'll let the vm_object_reaper_thread
1503 * complete the VM object termination if it still holds
1504 * paging_in_progress references at this point.
1505 *
1506 * No new paging_in_progress should appear now that the
1507 * VM object is "terminating" and not "alive".
1508 */
1509 vm_object_reap_async(object);
1510 vm_object_unlock(object);
1511 /*
1512 * Return KERN_FAILURE to let the caller know that we
1513 * haven't completed the termination and it can't drop this
1514 * object's reference on its shadow object yet.
1515 * The reaper thread will take care of that once it has
1516 * completed this object's termination.
1517 */
1518 return KERN_FAILURE;
1519 }
1520 /*
1521 * complete the VM object termination
1522 */
1523 vm_object_reap(object);
1524 object = VM_OBJECT_NULL;
1525
1526 /*
1527 * the object lock was released by vm_object_reap()
1528 *
1529 * KERN_SUCCESS means that this object has been terminated
1530 * and no longer needs its shadow object but still holds a
1531 * reference on it.
1532 * The caller is responsible for dropping that reference.
1533 * We can't call vm_object_deallocate() here because that
1534 * would create a recursion.
1535 */
1536 return KERN_SUCCESS;
1537 }
1538
1539
1540 /*
1541 * vm_object_reap():
1542 *
1543 * Complete the termination of a VM object after it's been marked
1544 * as "terminating" and "!alive" by vm_object_terminate().
1545 *
1546 * The VM object must be locked by caller.
1547 * The lock will be released on return and the VM object is no longer valid.
1548 */
1549
1550 void
vm_object_reap(vm_object_t object)1551 vm_object_reap(
1552 vm_object_t object)
1553 {
1554 memory_object_t pager;
1555 os_ref_count_t ref_count;
1556
1557 vm_object_lock_assert_exclusive(object);
1558 assert(object->paging_in_progress == 0);
1559 assert(object->activity_in_progress == 0);
1560
1561 vm_object_reap_count++;
1562
1563 /*
1564 * Disown this purgeable object to cleanup its owner's purgeable
1565 * ledgers. We need to do this before disconnecting the object
1566 * from its pager, to properly account for compressed pages.
1567 */
1568 if (/* object->internal && */
1569 (object->purgable != VM_PURGABLE_DENY ||
1570 object->vo_ledger_tag)) {
1571 int ledger_flags;
1572 kern_return_t kr;
1573
1574 ledger_flags = 0;
1575 assert(!object->alive);
1576 assert(object->terminating);
1577 kr = vm_object_ownership_change(object,
1578 VM_LEDGER_TAG_NONE,
1579 NULL, /* no owner */
1580 ledger_flags,
1581 FALSE); /* task_objq not locked */
1582 assert(kr == KERN_SUCCESS);
1583 assert(object->vo_owner == NULL);
1584 }
1585
1586 #if DEVELOPMENT || DEBUG
1587 if (object->object_is_shared_cache &&
1588 object->pager != NULL &&
1589 object->pager->mo_pager_ops == &shared_region_pager_ops) {
1590 OSAddAtomic(-object->resident_page_count, &shared_region_pagers_resident_count);
1591 }
1592 #endif /* DEVELOPMENT || DEBUG */
1593
1594 pager = object->pager;
1595 object->pager = MEMORY_OBJECT_NULL;
1596
1597 if (pager != MEMORY_OBJECT_NULL) {
1598 memory_object_control_disable(&object->pager_control);
1599 }
1600
1601 ref_count = os_ref_release_locked_raw(&object->ref_count,
1602 &vm_object_refgrp);
1603 if (__improbable(ref_count != 0)) {
1604 panic("Attempting to deallocate vm_object with outstanding refs: %u",
1605 ref_count);
1606 }
1607
1608 /*
1609 * remove from purgeable queue if it's on
1610 */
1611 if (object->internal) {
1612 assert(VM_OBJECT_OWNER(object) == TASK_NULL);
1613
1614 VM_OBJECT_UNWIRED(object);
1615
1616 if (object->purgable == VM_PURGABLE_DENY) {
1617 /* not purgeable: nothing to do */
1618 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
1619 purgeable_q_t queue;
1620
1621 queue = vm_purgeable_object_remove(object);
1622 assert(queue);
1623
1624 if (object->purgeable_when_ripe) {
1625 /*
1626 * Must take page lock for this -
1627 * using it to protect token queue
1628 */
1629 vm_page_lock_queues();
1630 vm_purgeable_token_delete_first(queue);
1631
1632 assert(queue->debug_count_objects >= 0);
1633 vm_page_unlock_queues();
1634 }
1635
1636 /*
1637 * Update "vm_page_purgeable_count" in bulk and mark
1638 * object as VM_PURGABLE_EMPTY to avoid updating
1639 * "vm_page_purgeable_count" again in vm_page_remove()
1640 * when reaping the pages.
1641 */
1642 unsigned int delta;
1643 assert(object->resident_page_count >=
1644 object->wired_page_count);
1645 delta = (object->resident_page_count -
1646 object->wired_page_count);
1647 if (delta != 0) {
1648 assert(vm_page_purgeable_count >= delta);
1649 OSAddAtomic(-delta,
1650 (SInt32 *)&vm_page_purgeable_count);
1651 }
1652 if (object->wired_page_count != 0) {
1653 assert(vm_page_purgeable_wired_count >=
1654 object->wired_page_count);
1655 OSAddAtomic(-object->wired_page_count,
1656 (SInt32 *)&vm_page_purgeable_wired_count);
1657 }
1658 VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_EMPTY);
1659 } else if (object->purgable == VM_PURGABLE_NONVOLATILE ||
1660 object->purgable == VM_PURGABLE_EMPTY) {
1661 /* remove from nonvolatile queue */
1662 vm_purgeable_nonvolatile_dequeue(object);
1663 } else {
1664 panic("object %p in unexpected purgeable state 0x%x",
1665 object, object->purgable);
1666 }
1667 if (object->transposed &&
1668 object->cached_list.next != NULL &&
1669 object->cached_list.prev == NULL) {
1670 /*
1671 * object->cached_list.next "points" to the
1672 * object that was transposed with this object.
1673 */
1674 } else {
1675 assert(object->cached_list.next == NULL);
1676 }
1677 assert(object->cached_list.prev == NULL);
1678 }
1679
1680 if (object->pageout) {
1681 /*
1682 * free all remaining pages tabled on
1683 * this object
1684 * clean up it's shadow
1685 */
1686 assert(object->shadow != VM_OBJECT_NULL);
1687
1688 vm_pageout_object_terminate(object);
1689 } else if (object->resident_page_count) {
1690 /*
1691 * free all remaining pages tabled on
1692 * this object
1693 */
1694 vm_object_reap_pages(object, REAP_REAP);
1695 }
1696 assert(vm_page_queue_empty(&object->memq));
1697 assert(object->paging_in_progress == 0);
1698 assert(object->activity_in_progress == 0);
1699 assert(os_ref_get_count_raw(&object->ref_count) == 0);
1700
1701 /*
1702 * If the pager has not already been released by
1703 * vm_object_destroy, we need to terminate it and
1704 * release our reference to it here.
1705 */
1706 if (pager != MEMORY_OBJECT_NULL) {
1707 vm_object_unlock(object);
1708 vm_object_release_pager(pager);
1709 vm_object_lock(object);
1710 }
1711
1712 /* kick off anyone waiting on terminating */
1713 VM_OBJECT_SET_TERMINATING(object, FALSE);
1714 vm_object_paging_begin(object);
1715 vm_object_paging_end(object);
1716 vm_object_unlock(object);
1717
1718 object->shadow = VM_OBJECT_NULL;
1719
1720 #if VM_OBJECT_TRACKING
1721 if (vm_object_tracking_btlog) {
1722 btlog_erase(vm_object_tracking_btlog, object);
1723 }
1724 #endif /* VM_OBJECT_TRACKING */
1725
1726 vm_object_lock_destroy(object);
1727 /*
1728 * Free the space for the object.
1729 */
1730 zfree(vm_object_zone, object);
1731 object = VM_OBJECT_NULL;
1732 }
1733
1734
/* global cap on how many pages are processed per batch; see BATCH_LIMIT() */
unsigned int vm_max_batch = 256;

/* per-batch page limit requested by vm_object_reap_pages() */
#define V_O_R_MAX_BATCH 128

/*
 * BATCH_LIMIT(max): the smaller of "max" and vm_max_batch.
 *
 * The argument is parenthesized so that an expression argument (for
 * example one containing ?: or a comma-free arithmetic expression)
 * cannot re-associate with the macro's own operators.  Note that "max"
 * is still evaluated twice, so it must not have side effects.
 */
#define BATCH_LIMIT(max) (vm_max_batch >= (max) ? (max) : vm_max_batch)
1740
1741 static inline vm_page_t
vm_object_reap_freelist(vm_page_t local_free_q,bool do_disconnect,bool set_cache_attr)1742 vm_object_reap_freelist(vm_page_t local_free_q, bool do_disconnect, bool set_cache_attr)
1743 {
1744 vm_page_t page;
1745 if (local_free_q) {
1746 if (do_disconnect) {
1747 _vm_page_list_foreach(page, local_free_q) {
1748 if (page->vmp_pmapped) {
1749 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
1750 }
1751 }
1752 }
1753
1754 if (set_cache_attr) {
1755 #if HAS_MTE
1756 assert(!local_free_q->vmp_using_mte);
1757 #endif /* HAS_MTE */
1758 const unified_page_list_t pmap_batch_list = {
1759 .page_slist = local_free_q,
1760 .type = UNIFIED_PAGE_LIST_TYPE_VM_PAGE_LIST,
1761 };
1762 pmap_batch_set_cache_attributes(&pmap_batch_list, 0);
1763 }
1764 vm_page_free_list(local_free_q, TRUE);
1765 }
1766 return VM_PAGE_NULL;
1767 }
1768
1769 void
vm_object_reap_pages(vm_object_t object,int reap_type)1770 vm_object_reap_pages(
1771 vm_object_t object,
1772 int reap_type)
1773 {
1774 vm_page_t p;
1775 vm_page_t next;
1776 vm_page_t local_free_q = VM_PAGE_NULL;
1777 int loop_count;
1778 bool disconnect_on_release;
1779 bool set_cache_attr_needed;
1780 pmap_flush_context pmap_flush_context_storage;
1781
1782 if (reap_type == REAP_DATA_FLUSH || reap_type == REAP_DATA_FLUSH_CLEAN) {
1783 /*
1784 * We need to disconnect pages from all pmaps before
1785 * releasing them to the free list
1786 */
1787 disconnect_on_release = true;
1788 } else {
1789 /*
1790 * Either the caller has already disconnected the pages
1791 * from all pmaps, or we disconnect them here as we add
1792 * them to out local list of pages to be released.
1793 * No need to re-disconnect them when we release the pages
1794 * to the free list.
1795 */
1796 disconnect_on_release = false;
1797 }
1798
1799 restart_after_sleep:
1800 set_cache_attr_needed = false;
1801 if (object->set_cache_attr) {
1802 /**
1803 * If the cache attributes need to be reset for the pages to
1804 * be freed, we clear object->set_cache_attr here so that
1805 * our call to vm_page_free_list (which will ultimately call
1806 * vm_page_remove() on each page) won't try to reset the
1807 * cache attributes on each page individually. Depending on
1808 * the architecture, it may be much faster for us to call
1809 * pmap_batch_set_cache_attributes() instead. Note that
1810 * this function must restore object->set_cache_attr in any
1811 * case where it is required to drop the object lock, e.g.
1812 * to wait for a busy page.
1813 */
1814 object->set_cache_attr = FALSE;
1815 set_cache_attr_needed = true;
1816 }
1817
1818 if (vm_page_queue_empty(&object->memq)) {
1819 return;
1820 }
1821 loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH);
1822
1823 if (reap_type == REAP_PURGEABLE) {
1824 pmap_flush_context_init(&pmap_flush_context_storage);
1825 }
1826
1827 vm_page_lock_queues();
1828
1829 next = (vm_page_t)vm_page_queue_first(&object->memq);
1830
1831 while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next)) {
1832 p = next;
1833 next = (vm_page_t)vm_page_queue_next(&next->vmp_listq);
1834
1835 if (--loop_count == 0) {
1836 vm_page_unlock_queues();
1837
1838 if (local_free_q) {
1839 if (reap_type == REAP_PURGEABLE) {
1840 pmap_flush(&pmap_flush_context_storage);
1841 pmap_flush_context_init(&pmap_flush_context_storage);
1842 }
1843 /*
1844 * Free the pages we reclaimed so far
1845 * and take a little break to avoid
1846 * hogging the page queue lock too long
1847 */
1848 local_free_q = vm_object_reap_freelist(local_free_q,
1849 disconnect_on_release, set_cache_attr_needed);
1850 } else {
1851 mutex_pause(0);
1852 }
1853
1854 loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH);
1855
1856 vm_page_lock_queues();
1857 }
1858 if (reap_type == REAP_DATA_FLUSH ||
1859 reap_type == REAP_DATA_FLUSH_CLEAN ||
1860 reap_type == REAP_TERMINATE) {
1861 if (p->vmp_busy || p->vmp_cleaning) {
1862 vm_page_unlock_queues();
1863 /*
1864 * free the pages reclaimed so far
1865 */
1866 local_free_q = vm_object_reap_freelist(local_free_q,
1867 disconnect_on_release, set_cache_attr_needed);
1868
1869 if (set_cache_attr_needed) {
1870 object->set_cache_attr = TRUE;
1871 }
1872 vm_page_sleep(object, p, THREAD_UNINT, LCK_SLEEP_DEFAULT);
1873
1874 goto restart_after_sleep;
1875 }
1876 if (p->vmp_laundry && reap_type != REAP_DATA_FLUSH_CLEAN) {
1877 vm_pageout_steal_laundry(p, TRUE);
1878 }
1879 }
1880 switch (reap_type) {
1881 case REAP_DATA_FLUSH_CLEAN:
1882 if (!p->vmp_dirty &&
1883 p->vmp_wpmapped &&
1884 pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p))) {
1885 SET_PAGE_DIRTY(p, FALSE);
1886 }
1887 if (p->vmp_dirty) {
1888 /* only flush clean pages */
1889 continue;
1890 }
1891 OS_FALLTHROUGH;
1892
1893 case REAP_DATA_FLUSH:
1894 if (VM_PAGE_WIRED(p)) {
1895 /*
1896 * this is an odd case... perhaps we should
1897 * zero-fill this page since we're conceptually
1898 * tossing its data at this point, but leaving
1899 * it on the object to honor the 'wire' contract
1900 */
1901 continue;
1902 }
1903 break;
1904
1905 case REAP_PURGEABLE:
1906 if (VM_PAGE_WIRED(p)) {
1907 /*
1908 * can't purge a wired page
1909 */
1910 vm_page_purged_wired++;
1911 continue;
1912 }
1913 if (p->vmp_laundry && !p->vmp_busy && !p->vmp_cleaning) {
1914 vm_pageout_steal_laundry(p, TRUE);
1915 }
1916
1917 if (p->vmp_cleaning || p->vmp_laundry || p->vmp_absent) {
1918 /*
1919 * page is being acted upon,
1920 * so don't mess with it
1921 */
1922 vm_page_purged_others++;
1923 continue;
1924 }
1925 if (p->vmp_busy) {
1926 /*
1927 * We can't reclaim a busy page but we can
1928 * make it more likely to be paged (it's not wired) to make
1929 * sure that it gets considered by
1930 * vm_pageout_scan() later.
1931 */
1932 if (VM_PAGE_PAGEABLE(p)) {
1933 vm_page_deactivate(p);
1934 }
1935 vm_page_purged_busy++;
1936 continue;
1937 }
1938
1939 assert(!is_kernel_object(VM_PAGE_OBJECT(p)));
1940
1941 /*
1942 * we can discard this page...
1943 */
1944 if (p->vmp_pmapped == TRUE) {
1945 /*
1946 * unmap the page
1947 */
1948 pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(p), PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_NOREFMOD, (void *)&pmap_flush_context_storage);
1949 }
1950 vm_page_purged_count++;
1951
1952 break;
1953
1954 case REAP_TERMINATE:
1955 if (p->vmp_absent || vm_page_is_private(p)) {
1956 /*
1957 * For private pages, VM_PAGE_FREE just
1958 * leaves the page structure around for
1959 * its owner to clean up. For absent
1960 * pages, the structure is returned to
1961 * the appropriate pool.
1962 */
1963 break;
1964 }
1965 if (vm_page_is_fictitious(p)) {
1966 assert(vm_page_is_guard(p));
1967 break;
1968 }
1969 if (!p->vmp_dirty && p->vmp_wpmapped) {
1970 p->vmp_dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p));
1971 }
1972
1973 if ((p->vmp_dirty || p->vmp_precious) && !VMP_ERROR_GET(p) && object->alive) {
1974 assert(!object->internal);
1975
1976 p->vmp_free_when_done = TRUE;
1977
1978 if (!p->vmp_laundry) {
1979 vm_page_queues_remove(p, TRUE);
1980 /*
1981 * flush page... page will be freed
1982 * upon completion of I/O
1983 */
1984 vm_pageout_cluster(p);
1985 }
1986 vm_page_unlock_queues();
1987 /*
1988 * free the pages reclaimed so far
1989 */
1990 local_free_q = vm_object_reap_freelist(local_free_q,
1991 disconnect_on_release, set_cache_attr_needed);
1992
1993 if (set_cache_attr_needed) {
1994 object->set_cache_attr = TRUE;
1995 }
1996 vm_object_paging_wait(object, THREAD_UNINT);
1997
1998 goto restart_after_sleep;
1999 }
2000 break;
2001
2002 case REAP_REAP:
2003 break;
2004 }
2005 vm_page_free_prepare_queues(p);
2006 assert(p->vmp_pageq.next == 0 && p->vmp_pageq.prev == 0);
2007 /*
2008 * Add this page to our list of reclaimed pages,
2009 * to be freed later.
2010 */
2011 p->vmp_snext = local_free_q;
2012 local_free_q = p;
2013 }
2014 vm_page_unlock_queues();
2015
2016 /*
2017 * Free the remaining reclaimed pages
2018 */
2019 if (reap_type == REAP_PURGEABLE) {
2020 pmap_flush(&pmap_flush_context_storage);
2021 }
2022
2023 vm_object_reap_freelist(local_free_q,
2024 disconnect_on_release, set_cache_attr_needed);
2025 if (set_cache_attr_needed) {
2026 object->set_cache_attr = TRUE;
2027 }
2028 }
2029
2030
2031 void
vm_object_reap_async(vm_object_t object)2032 vm_object_reap_async(
2033 vm_object_t object)
2034 {
2035 vm_object_lock_assert_exclusive(object);
2036
2037 vm_object_reaper_lock_spin();
2038
2039 vm_object_reap_count_async++;
2040
2041 /* enqueue the VM object... */
2042 queue_enter(&vm_object_reaper_queue, object,
2043 vm_object_t, cached_list);
2044
2045 vm_object_reaper_unlock();
2046
2047 /* ... and wake up the reaper thread */
2048 thread_wakeup((event_t) &vm_object_reaper_queue);
2049 }
2050
2051
2052 void
vm_object_reaper_thread(void)2053 vm_object_reaper_thread(void)
2054 {
2055 vm_object_t object, shadow_object;
2056
2057 vm_object_reaper_lock_spin();
2058
2059 while (!queue_empty(&vm_object_reaper_queue)) {
2060 queue_remove_first(&vm_object_reaper_queue,
2061 object,
2062 vm_object_t,
2063 cached_list);
2064
2065 vm_object_reaper_unlock();
2066 vm_object_lock(object);
2067
2068 assert(object->terminating);
2069 assert(!object->alive);
2070
2071 /*
2072 * The pageout daemon might be playing with our pages.
2073 * Now that the object is dead, it won't touch any more
2074 * pages, but some pages might already be on their way out.
2075 * Hence, we wait until the active paging activities have
2076 * ceased before we break the association with the pager
2077 * itself.
2078 */
2079 vm_object_paging_wait(object, THREAD_UNINT);
2080
2081 shadow_object =
2082 object->pageout ? VM_OBJECT_NULL : object->shadow;
2083
2084 vm_object_reap(object);
2085 /* cache is unlocked and object is no longer valid */
2086 object = VM_OBJECT_NULL;
2087
2088 if (shadow_object != VM_OBJECT_NULL) {
2089 /*
2090 * Drop the reference "object" was holding on
2091 * its shadow object.
2092 */
2093 vm_object_deallocate(shadow_object);
2094 shadow_object = VM_OBJECT_NULL;
2095 }
2096 vm_object_reaper_lock_spin();
2097 }
2098
2099 /* wait for more work... */
2100 assert_wait((event_t) &vm_object_reaper_queue, THREAD_UNINT);
2101
2102 vm_object_reaper_unlock();
2103
2104 thread_block((thread_continue_t) vm_object_reaper_thread);
2105 /*NOTREACHED*/
2106 }
2107
2108 /*
2109 * Routine: vm_object_release_pager
2110 * Purpose: Terminate the pager and, upon completion,
2111 * release our last reference to it.
2112 */
2113 static void
vm_object_release_pager(memory_object_t pager)2114 vm_object_release_pager(
2115 memory_object_t pager)
2116 {
2117 /*
2118 * Terminate the pager.
2119 */
2120
2121 (void) memory_object_terminate(pager);
2122
2123 /*
2124 * Release reference to pager.
2125 */
2126 memory_object_deallocate(pager);
2127 }
2128
/*
 *	Routine:	vm_object_destroy
 *	Purpose:
 *		Shut down a VM object, despite the
 *		presence of address map (or other) references
 *		to the vm_object.
 *	Parameters:
 *		object	the object to shut down; VM_OBJECT_NULL is accepted
 *			and treated as a no-op.
 *		reason	recorded on the object through
 *			VM_OBJECT_SET_NO_PAGER_REASON().
 *	Returns:
 *		KERN_SUCCESS on every path.
 *	Conditions:
 *		The object lock is taken and dropped by this routine.  It
 *		waits (uninterruptibly) for paging, page-list requests and,
 *		in most cases, mapping activity to drain.  If the object had
 *		a pager, the pager is terminated and released and one
 *		reference on the object is dropped.
 */
#if FBDP_DEBUG_OBJECT_NO_PAGER
extern uint32_t system_inshutdown;
/* non-zero: the debug checks below only trace, they never panic */
int fbdp_no_panic = 1;
#endif /* FBDP_DEBUG_OBJECT_NO_PAGER */
kern_return_t
vm_object_destroy(
	vm_object_t                                     object,
	vm_object_destroy_reason_t   reason)
{
	memory_object_t         old_pager;

	if (object == VM_OBJECT_NULL) {
		return KERN_SUCCESS;
	}

	/*
	 *	Remove the pager association immediately.
	 *
	 *	This will prevent the memory manager from further
	 *	meddling.  [If it wanted to flush data or make
	 *	other changes, it should have done so before performing
	 *	the destroy call.]
	 */

	vm_object_lock(object);

#if FBDP_DEBUG_OBJECT_NO_PAGER
	/* read the "fbdp_no_panic4" boot-arg override the first time through */
	static bool fbdp_no_panic_retrieved = false;
	if (!fbdp_no_panic_retrieved) {
		PE_parse_boot_argn("fbdp_no_panic4", &fbdp_no_panic, sizeof(fbdp_no_panic));
		fbdp_no_panic_retrieved = true;
	}

	/*
	 * Debug check: a named object with more than 2 references is being
	 * destroyed although its pager does not report a forced unmount.
	 */
	bool forced_unmount = false;
	if (object->named &&
	    os_ref_get_count_raw(&object->ref_count) > 2 &&
	    object->pager != NULL &&
	    vnode_pager_get_forced_unmount(object->pager, &forced_unmount) == KERN_SUCCESS &&
	    forced_unmount == false) {
		if (!fbdp_no_panic) {
			panic("FBDP rdar://99829401 object %p refs %d pager %p (no forced unmount)\n", object, os_ref_get_count_raw(&object->ref_count), object->pager);
		}
		DTRACE_VM3(vm_object_destroy_no_forced_unmount,
		    vm_object_t, object,
		    int, os_ref_get_count_raw(&object->ref_count),
		    memory_object_t, object->pager);
	}

	/*
	 * Debug check: a tracked object with more than 2 references is being
	 * destroyed while the system is not shutting down.  Tracking ends
	 * here either way.
	 */
	if (object->fbdp_tracked) {
		if (os_ref_get_count_raw(&object->ref_count) > 2 && !system_inshutdown) {
			if (!fbdp_no_panic) {
				panic("FBDP/4 rdar://99829401 object %p refs %d pager %p (tracked)\n", object, os_ref_get_count_raw(&object->ref_count), object->pager);
			}
		}
		VM_OBJECT_SET_FBDP_TRACKED(object, false);
	}
#endif /* FBDP_DEBUG_OBJECT_NO_PAGER */

	VM_OBJECT_SET_NO_PAGER_REASON(object, reason);

	VM_OBJECT_SET_CAN_PERSIST(object, FALSE);
	VM_OBJECT_SET_NAMED(object, FALSE);
#if 00
	/* compiled out: the object is not marked dead here */
	VM_OBJECT_SET_ALIVE(object, FALSE);
#endif /* 00 */

#if DEVELOPMENT || DEBUG
	/*
	 * Debug accounting: subtract this object's resident pages from the
	 * shared region pagers' resident count.
	 */
	if (object->object_is_shared_cache &&
	    object->pager != NULL &&
	    object->pager->mo_pager_ops == &shared_region_pager_ops) {
		OSAddAtomic(-object->resident_page_count, &shared_region_pagers_resident_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	/*
	 * Detach the pager from the object.  "old_pager" keeps the pointer
	 * so the pager can be terminated once the object lock is dropped.
	 */
	old_pager = object->pager;
	object->pager = MEMORY_OBJECT_NULL;
	if (old_pager != MEMORY_OBJECT_NULL) {
		memory_object_control_disable(&object->pager_control);
	}

	/*
	 * Wait for the existing paging activity (that got
	 * through before we nulled out the pager) to subside.
	 */
	vm_object_paging_wait(object, THREAD_UNINT);
	vm_object_pl_req_wait(object, THREAD_UNINT);

	/*
	 * Memory objects usually stay alive while their
	 * VM object is still mapped but vnodes can get
	 * reclaimed by forced unmounts while still mapped,
	 * for example, so we could be racing with a
	 * memory_object_map() or memory_object_last_unmap()
	 * here.
	 * We should wait for any memory_object_map/last_unmap()
	 * to complete, except if we're the thread calling
	 * memory_object_last_unmap() on this memory object.
	 */
	if (old_pager != MEMORY_OBJECT_NULL &&
	    old_pager->mo_last_unmap_ctid == thread_get_ctid(current_thread())) {
		/* we are the last-unmap thread: clear the marker, don't wait */
		old_pager->mo_last_unmap_ctid = 0;
	} else {
		vm_object_mapping_wait(object, THREAD_UNINT);
	}

	vm_object_unlock(object);

	/*
	 *	Terminate the object now.
	 */
	if (old_pager != MEMORY_OBJECT_NULL) {
		vm_object_release_pager(old_pager);

		/*
		 * JMM - Release the caller's reference.  This assumes the
		 * caller had a reference to release, which is a big (but
		 * currently valid) assumption if this is driven from the
		 * vnode pager (it is holding a named reference when making
		 * this call)..
		 */
		vm_object_deallocate(object);
	}
	return KERN_SUCCESS;
}
2260
2261 /*
2262 * The "chunk" macros are used by routines below when looking for pages to deactivate. These
2263 * exist because of the need to handle shadow chains. When deactivating pages, we only
 * want to deactivate the ones at the topmost level in the object chain.  In order to do
2265 * this efficiently, the specified address range is divided up into "chunks" and we use
2266 * a bit map to keep track of which pages have already been processed as we descend down
2267 * the shadow chain. These chunk macros hide the details of the bit map implementation
2268 * as much as we can.
2269 *
2270 * For convenience, we use a 64-bit data type as the bit map, and therefore a chunk is
2271 * set to 64 pages. The bit map is indexed from the low-order end, so that the lowest
2272 * order bit represents page 0 in the current range and highest order bit represents
2273 * page 63.
2274 *
2275 * For further convenience, we also use negative logic for the page state in the bit map.
2276 * The bit is set to 1 to indicate it has not yet been seen, and to 0 to indicate it has
2277 * been processed. This way we can simply test the 64-bit long word to see if it's zero
2278 * to easily tell if the whole range has been processed. Therefore, the bit map starts
2279 * out with all the bits set. The macros below hide all these details from the caller.
2280 */
2281
#define PAGES_IN_A_CHUNK        64      /* The number of pages in the chunk must */
                                        /* be the same as the number of bits in  */
                                        /* the chunk_state_t type. We use 64     */
                                        /* just for convenience.                 */

#define CHUNK_SIZE      (PAGES_IN_A_CHUNK * PAGE_SIZE_64)       /* Size of a chunk in bytes */

typedef uint64_t        chunk_state_t;

/*
 * Build the initial bit map for a chunk that covers 'page_count' pages.
 *
 * The bit map uses negative logic: a set bit means "not processed yet".  The
 * low 'page_count' bits are therefore set, and every bit at or above
 * 'page_count' is left clear so that pages beyond the requested length count
 * as already processed.  A count of PAGES_IN_A_CHUNK or more yields a fully
 * set map; that case is handled separately because shifting a 64-bit value
 * by 64 or more is undefined.
 */
static inline chunk_state_t
chunk_state_for_page_count(uint64_t page_count)
{
	if (page_count >= PAGES_IN_A_CHUNK) {
		return ~(chunk_state_t)0;
	}
	return ((chunk_state_t)1 << page_count) - 1;
}

/*
 * The bit map uses negative logic, so we start out with all 64 bits set to indicate
 * that no pages have been processed yet.  Also, if len is less than the full CHUNK_SIZE,
 * then we mark pages beyond the len as having been "processed" so that we don't waste time
 * looking at pages in that range.  This can save us from unnecessarily chasing down the
 * shadow chain.
 *
 * The mask is computed by a helper function rather than by a loop inside the
 * macro, so the macro introduces no local variable that an argument
 * expression could collide with.
 */

#define CHUNK_INIT(c, len)                                              \
	MACRO_BEGIN                                                     \
	(c) = chunk_state_for_page_count((len) / PAGE_SIZE_64);         \
	MACRO_END


/*
 * Return true if at least one page in the chunk has not yet been processed,
 * i.e. if any bit is still set in the bit map.
 */

#define CHUNK_NOT_COMPLETE(c)   ((c) != 0)

/*
 * Return true if the page at offset 'p' in the bit map has already been handled
 * while processing a higher level object in the shadow chain.
 */

#define PAGE_ALREADY_HANDLED(c, p)      (((c) & (1ULL << (p))) == 0)

/*
 * Mark the page at offset 'p' in the bit map as having been processed.
 */

#define MARK_PAGE_HANDLED(c, p)                                         \
	MACRO_BEGIN                                                     \
	(c) = (c) & ~(1ULL << (p));                                     \
	MACRO_END
2331
2332
2333 /*
2334 * Return true if the page at the given offset has been paged out. Object is
2335 * locked upon entry and returned locked.
2336 *
2337 * NB: It is the callers responsibility to ensure that the offset in question
2338 * is not in the process of being paged in/out (i.e. not busy or no backing
2339 * page)
2340 */
2341 static bool
page_is_paged_out(vm_object_t object,vm_object_offset_t offset)2342 page_is_paged_out(
2343 vm_object_t object,
2344 vm_object_offset_t offset)
2345 {
2346 if (object->internal &&
2347 object->alive &&
2348 !object->terminating &&
2349 object->pager_ready) {
2350 if (vm_object_compressor_pager_state_get(object, offset)
2351 == VM_EXTERNAL_STATE_EXISTS) {
2352 return true;
2353 }
2354 }
2355 return false;
2356 }
2357
2358
2359
/*
 * madvise_free_debug
 *
 * To help debug madvise(MADV_FREE*) mis-usage, this triggers a
 * zero-fill as soon as a page is affected by a madvise(MADV_FREE*), to
 * simulate the loss of the page's contents as if the page had been
 * reclaimed and then re-faulted.
 *
 * madvise_free_debug		non-zero: zero-fill every killed page.
 * madvise_free_debug_sometimes	non-zero: zero-fill only when the running
 *				madvise_free_counter is odd, i.e. every other
 *				candidate page.  Enabled by default on
 *				DEVELOPMENT and DEBUG kernels only.
 * madvise_free_counter		counter driving the "sometimes" mode.
 */
#if DEVELOPMENT || DEBUG
int madvise_free_debug = 0;
int madvise_free_debug_sometimes = 1;
#else /* DEVELOPMENT || DEBUG */
int madvise_free_debug = 0;
int madvise_free_debug_sometimes = 0;
#endif /* DEVELOPMENT || DEBUG */
int madvise_free_counter = 0;

/*
 * Options understood by deactivate_a_chunk() and
 * deactivate_pages_in_object().
 */
__options_decl(deactivate_flags_t, uint32_t, {
	/* discard the contents of pages of internal objects ("kill_page") */
	DEACTIVATE_KILL         = 0x1,
	/* mark each affected page as reusable ("reusable_page") */
	DEACTIVATE_REUSABLE     = 0x2,
	/* whole-object "all reusable" mode; only set on a compiled-out path */
	DEACTIVATE_ALL_REUSABLE = 0x4,
	/* the caller did not clear the pmap ref/mod bits: do it page by page */
	DEACTIVATE_CLEAR_REFMOD = 0x8,
	/* with DEACTIVATE_KILL: never do the debug zero-fill of the page */
	DEACTIVATE_KILL_NO_WRITE = 0x10
});
2384
/*
 * Deactivate the pages in the specified object and range.  If DEACTIVATE_KILL is set in
 * "flags" and the object is internal, also discard the pages' dirty/precious state and any
 * compressor copy.  Update the chunk_state as we go along.  The caller must specify
 * a size that is less than or equal to the CHUNK_SIZE.
 *
 * Parameters:
 *	object		object whose pages are examined at this level of the shadow chain
 *	offset		offset in "object" of the first page of the range
 *	size		length of the range in bytes
 *	flags		DEACTIVATE_* options
 *	chunk_state	in/out bit map of the pages still to be handled; pages found here
 *			(resident or in the compressor) are marked as handled
 *	pfc		pmap flush context handed to pmap_clear_refmod_options()
 *	pmap		if not PMAP_NULL, pmap to update when a compressed page is killed
 *	pmap_offset	address in "pmap" corresponding to "offset"
 *
 * NOTE(review): the object is presumably locked by the caller (deactivate_a_chunk()
 * locks each shadow object before handing it to this routine) -- confirm for the
 * top-level object.
 */

static void
deactivate_pages_in_object(
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_object_size_t        size,
	deactivate_flags_t      flags,
	chunk_state_t           *chunk_state,
	pmap_flush_context      *pfc,
	struct pmap             *pmap,
	vm_map_offset_t         pmap_offset)
{
	vm_page_t       m;
	int             p;
	struct  vm_page_delayed_work    dw_array;
	struct  vm_page_delayed_work    *dwp, *dwp_start;
	bool            dwp_finish_ctx = TRUE;
	int             dw_count;
	int             dw_limit;
	unsigned int    reusable = 0;

	/*
	 * Examine each page in the chunk.  The variable 'p' is the page number relative to the start of the
	 * chunk.  Since this routine is called once for each level in the shadow chain, the chunk_state may
	 * have pages marked as having been processed already.  We stop the loop early if we find we've handled
	 * all the pages in the chunk.
	 */

	/*
	 * Page queue updates are batched as "delayed work".  If no batching
	 * context is available, fall back to the single on-stack entry and
	 * flush after every page (dw_limit == 1).
	 */
	dwp_start = dwp = NULL;
	dw_count = 0;
	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
	dwp_start = vm_page_delayed_work_get_ctx();
	if (dwp_start == NULL) {
		dwp_start = &dw_array;
		dw_limit = 1;
		dwp_finish_ctx = FALSE;
	}

	dwp = dwp_start;

	for (p = 0; size && CHUNK_NOT_COMPLETE(*chunk_state); p++, size -= PAGE_SIZE_64, offset += PAGE_SIZE_64, pmap_offset += PAGE_SIZE_64) {
		/*
		 * If this offset has already been found and handled in a higher level object, then don't
		 * do anything with it in the current shadow object.
		 */

		if (PAGE_ALREADY_HANDLED(*chunk_state, p)) {
			continue;
		}

		/*
		 * See if the page at this offset is around.  First check to see if the page is resident,
		 * then if not, check the existence map or with the pager.
		 */

		if ((m = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			/*
			 * We found a page we were looking for.  Mark it as "handled" now in the chunk_state
			 * so that we won't bother looking for a page at this offset again if there are more
			 * shadow objects.  Then deactivate the page.
			 */

			MARK_PAGE_HANDLED(*chunk_state, p);

			/*
			 * Leave the page alone if it is wired, private, gobbled, busy or
			 * already on its way out (laundry / cleaning / free_when_done).
			 */
			if ((!VM_PAGE_WIRED(m)) && (!vm_page_is_private(m)) && (!m->vmp_gobbled) && (!m->vmp_busy) &&
			    (!m->vmp_laundry) && (!m->vmp_cleaning) && !(m->vmp_free_when_done)) {
				int     clear_refmod_mask;
				int     pmap_options;
				dwp->dw_mask = 0;

				pmap_options = 0;
				clear_refmod_mask = VM_MEM_REFERENCED;
				dwp->dw_mask |= DW_clear_reference;

				if ((flags & DEACTIVATE_KILL) && (object->internal)) {
					if (!(flags & DEACTIVATE_KILL_NO_WRITE) &&
					    (madvise_free_debug ||
					    (madvise_free_debug_sometimes &&
					    madvise_free_counter++ & 0x1))) {
						/*
						 * zero-fill the page (or every
						 * other page) now to simulate
						 * it being reclaimed and
						 * re-faulted.
						 */
#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
						if (!m->vmp_unmodified_ro) {
#else /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
						if (true) {
#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
							pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
						}
					}
					/* the contents no longer matter: forget they were modified */
					m->vmp_precious = FALSE;
					m->vmp_dirty = FALSE;

					clear_refmod_mask |= VM_MEM_MODIFIED;
					if (m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) {
						/*
						 * This page is now clean and
						 * reclaimable.  Move it out
						 * of the throttled queue, so
						 * that vm_pageout_scan() can
						 * find it.
						 */
						dwp->dw_mask |= DW_move_page;
					}

#if 0
#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
					/*
					 * COMMENT BLOCK ON WHY THIS SHOULDN'T BE DONE.
					 *
					 * Since we are about to do a vm_object_compressor_pager_state_clr
					 * below for this page, which drops any existing compressor
					 * storage of this page (eg side-effect of a CoW operation or
					 * a collapse operation), it is tempting to think that we should
					 * treat this page as if it was just decompressed (during which
					 * we also drop existing compressor storage) and so start its life
					 * out with vmp_unmodified_ro set to FALSE.
					 *
					 * However, we can't do that here because we could swing around
					 * and re-access this page in a read-only fault.
					 * Clearing this bit means we'll try to zero it up above
					 * and fail.
					 *
					 * Note that clearing the bit is unnecessary regardless because
					 * dirty state has been cleared. During the next soft fault, the
					 * right state will be restored and things will progress just fine.
					 */
					if (m->vmp_unmodified_ro == true) {
						/* Need object and pageq locks for bit manipulation*/
						m->vmp_unmodified_ro = false;
						os_atomic_dec(&compressor_ro_uncompressed);
					}
#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
#endif /* 0 */
					/* drop any compressor state recorded for this offset */
					vm_object_compressor_pager_state_clr(object, offset);

					if ((flags & DEACTIVATE_REUSABLE) && !m->vmp_reusable) {
						assert(!(flags & DEACTIVATE_ALL_REUSABLE));
						assert(!object->all_reusable);
						m->vmp_reusable = TRUE;
						object->reusable_page_count++;
						assert(object->resident_page_count >= object->reusable_page_count);
						reusable++;
						/*
						 * Tell pmap this page is now
						 * "reusable" (to update pmap
						 * stats for all mappings).
						 */
						pmap_options |= PMAP_OPTIONS_SET_REUSABLE;
					}
				}
				if (flags & DEACTIVATE_CLEAR_REFMOD) {
					/*
					 * The caller didn't clear the refmod bits in advance.
					 * Clear them for this page now.
					 */
					pmap_options |= PMAP_OPTIONS_NOFLUSH;
					pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m),
					    clear_refmod_mask,
					    pmap_options,
					    (void *)pfc);
				}

				/*
				 * Pages being made reusable are not requeued; every other
				 * page that is not on the throttled queue is.
				 */
				if ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) &&
				    !(flags & (DEACTIVATE_REUSABLE | DEACTIVATE_ALL_REUSABLE))) {
					dwp->dw_mask |= DW_move_page;
				}

				if (dwp->dw_mask) {
					VM_PAGE_ADD_DELAYED_WORK(dwp, m,
					    dw_count);
				}

				/* batch is full: publish the stats and run the queued work */
				if (dw_count >= dw_limit) {
					if (reusable) {
						OSAddAtomic(reusable,
						    &vm_page_stats_reusable.reusable_count);
						vm_page_stats_reusable.reusable += reusable;
						reusable = 0;
					}
					vm_page_do_delayed_work(object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);

					dwp = dwp_start;
					dw_count = 0;
				}
			}
		} else {
			/*
			 * The page at this offset isn't memory resident, check to see if it's
			 * been paged out.  If so, mark it as handled so we don't bother looking
			 * for it in the shadow chain.
			 */

			if (page_is_paged_out(object, offset)) {
				MARK_PAGE_HANDLED(*chunk_state, p);

				/*
				 * If we're killing a non-resident page, then clear the page in the existence
				 * map so we don't bother paging it back in if it's touched again in the future.
				 */

				if ((flags & DEACTIVATE_KILL) && (object->internal)) {
					vm_object_compressor_pager_state_clr(object, offset);

					if (pmap != PMAP_NULL) {
						/*
						 * Tell pmap that this page
						 * is no longer mapped, to
						 * adjust the footprint ledger
						 * because this page is no
						 * longer compressed.
						 */
						pmap_remove_options(
							pmap,
							pmap_offset,
							(pmap_offset +
							PAGE_SIZE),
							PMAP_OPTIONS_REMOVE);
					}
				}
			}
		}
	}

	/* publish whatever is left of the reusable stats ... */
	if (reusable) {
		OSAddAtomic(reusable, &vm_page_stats_reusable.reusable_count);
		vm_page_stats_reusable.reusable += reusable;
		reusable = 0;
	}

	/* ... run the last, partially filled batch ... */
	if (dw_count) {
		vm_page_do_delayed_work(object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
		dwp = dwp_start;
		dw_count = 0;
	}

	/* ... and give the batching context back, unless we used the stack entry */
	if (dwp_start && dwp_finish_ctx) {
		vm_page_delayed_work_finish_ctx(dwp_start);
		dwp_start = dwp = NULL;
	}
}
2634
2635
2636 /*
2637 * Deactive a "chunk" of the given range of the object starting at offset. A "chunk"
2638 * will always be less than or equal to the given size. The total range is divided up
2639 * into chunks for efficiency and performance related to the locks and handling the shadow
2640 * chain. This routine returns how much of the given "size" it actually processed. It's
2641 * up to the caler to loop and keep calling this routine until the entire range they want
2642 * to process has been done.
2643 * Iff clear_refmod is true, pmap_clear_refmod_options is called for each physical page in this range.
2644 */
2645
2646 static vm_object_size_t
2647 deactivate_a_chunk(
2648 vm_object_t orig_object,
2649 vm_object_offset_t offset,
2650 vm_object_size_t size,
2651 deactivate_flags_t flags,
2652 pmap_flush_context *pfc,
2653 struct pmap *pmap,
2654 vm_map_offset_t pmap_offset)
2655 {
2656 vm_object_t object;
2657 vm_object_t tmp_object;
2658 vm_object_size_t length;
2659 chunk_state_t chunk_state;
2660
2661
2662 /*
2663 * Get set to do a chunk. We'll do up to CHUNK_SIZE, but no more than the
2664 * remaining size the caller asked for.
2665 */
2666
2667 length = MIN(size, CHUNK_SIZE);
2668
2669 /*
2670 * The chunk_state keeps track of which pages we've already processed if there's
2671 * a shadow chain on this object. At this point, we haven't done anything with this
2672 * range of pages yet, so initialize the state to indicate no pages processed yet.
2673 */
2674
2675 CHUNK_INIT(chunk_state, length);
2676 object = orig_object;
2677
2678 /*
2679 * Start at the top level object and iterate around the loop once for each object
2680 * in the shadow chain. We stop processing early if we've already found all the pages
2681 * in the range. Otherwise we stop when we run out of shadow objects.
2682 */
2683
2684 while (object && CHUNK_NOT_COMPLETE(chunk_state)) {
2685 vm_object_paging_begin(object);
2686
2687 deactivate_pages_in_object(object, offset, length, flags, &chunk_state, pfc, pmap, pmap_offset);
2688
2689 vm_object_paging_end(object);
2690
2691 /*
2692 * We've finished with this object, see if there's a shadow object. If
2693 * there is, update the offset and lock the new object. We also turn off
2694 * kill_page at this point since we only kill pages in the top most object.
2695 */
2696
2697 tmp_object = object->shadow;
2698
2699 if (tmp_object) {
2700 assert(!(flags & DEACTIVATE_KILL) || (flags & DEACTIVATE_CLEAR_REFMOD));
2701 flags &= ~(DEACTIVATE_KILL | DEACTIVATE_REUSABLE | DEACTIVATE_ALL_REUSABLE);
2702 offset += object->vo_shadow_offset;
2703 vm_object_lock(tmp_object);
2704 }
2705
2706 if (object != orig_object) {
2707 vm_object_unlock(object);
2708 }
2709
2710 object = tmp_object;
2711 }
2712
2713 if (object && object != orig_object) {
2714 vm_object_unlock(object);
2715 }
2716
2717 return length;
2718 }
2719
2720
2721
/*
 * Move any resident pages in the specified range to the inactive queue.  If kill_page is set,
 * we also clear the modified status of the page and "forget" any changes that have been made
 * to the page.
 *
 * Parameters:
 *	object		top-level object of the range
 *	offset		offset in "object" where the range starts
 *	size		length of the range in bytes; 0 returns immediately
 *	kill_page	discard page contents (DEACTIVATE_KILL)
 *	reusable_page	mark the pages as reusable (DEACTIVATE_REUSABLE)
 *	kill_no_write	suppress the debug zero-fill (DEACTIVATE_KILL_NO_WRITE)
 *	pmap		pmap used for the bulk ref/mod clear and handed down
 *			to deactivate_a_chunk()
 *	pmap_offset	address in "pmap" corresponding to "offset"
 */

__private_extern__ void
vm_object_deactivate_pages(
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_object_size_t        size,
	boolean_t               kill_page,
	boolean_t               reusable_page,
	boolean_t               kill_no_write,
	struct pmap             *pmap,
	vm_map_offset_t         pmap_offset)
{
	vm_object_size_t        length;
	boolean_t               all_reusable;
	pmap_flush_context      pmap_flush_context_storage;
	unsigned int pmap_clear_refmod_mask = VM_MEM_REFERENCED;
	unsigned int pmap_clear_refmod_options = 0;
	/* per-page ref/mod clearing is the default; see the bulk clear below */
	deactivate_flags_t      flags = DEACTIVATE_CLEAR_REFMOD;
	bool                    refmod_cleared = false;
	if (kill_page) {
		flags |= DEACTIVATE_KILL;
	}
	if (reusable_page) {
		flags |= DEACTIVATE_REUSABLE;
	}
	if (kill_no_write) {
		flags |= DEACTIVATE_KILL_NO_WRITE;
	}

	/*
	 * We break the range up into chunks and do one chunk at a time.  This is for
	 * efficiency and performance while handling the shadow chains and the locks.
	 * The deactivate_a_chunk() function returns how much of the range it processed.
	 * We keep calling this routine until the given size is exhausted.
	 */


	all_reusable = FALSE;
#if 11
	/*
	 * For the sake of accurate "reusable" pmap stats, we need
	 * to tell pmap about each page that is no longer "reusable",
	 * so we can't do the "all_reusable" optimization.
	 *
	 * If we do go with the all_reusable optimization, we can't
	 * return if size is 0 since we could have "all_reusable == TRUE"
	 * In this case, we save the overhead of doing the pmap_flush_context
	 * work.
	 */
	if (size == 0) {
		return;
	}
#else
	/* compiled out: whole-object "all reusable" optimization */
	if (reusable_page &&
	    object->internal &&
	    object->vo_size != 0 &&
	    object->vo_size == size &&
	    object->reusable_page_count == 0) {
		all_reusable = TRUE;
		reusable_page = FALSE;
		flags |= DEACTIVATE_ALL_REUSABLE;
	}
#endif

	if ((reusable_page || all_reusable) && object->all_reusable) {
		/* This means MADV_FREE_REUSABLE has been called twice, which
		 * is probably illegal. */
		return;
	}


	pmap_flush_context_init(&pmap_flush_context_storage);

	/*
	 * If we're deactivating multiple pages, try to perform one bulk pmap operation.
	 * We can't do this if we're killing pages and there's a shadow chain as
	 * we don't yet know which pages are in the top object (pages in shadow copies aren't
	 * safe to kill).
	 * And we can only do this on hardware that supports it.
	 */
	if (size > PAGE_SIZE && (!kill_page || !object->shadow)) {
		if (kill_page && object->internal) {
			pmap_clear_refmod_mask |= VM_MEM_MODIFIED;
		}
		if (reusable_page) {
			pmap_clear_refmod_options |= PMAP_OPTIONS_SET_REUSABLE;
		}

		refmod_cleared = pmap_clear_refmod_range_options(pmap, pmap_offset, pmap_offset + size, pmap_clear_refmod_mask, pmap_clear_refmod_options);
		if (refmod_cleared) {
			// We were able to clear all the refmod bits. So deactivate_a_chunk doesn't need to do it.
			flags &= ~DEACTIVATE_CLEAR_REFMOD;
		}
	}

	/* process the range one chunk at a time, advancing all three cursors */
	while (size) {
		length = deactivate_a_chunk(object, offset, size, flags,
		    &pmap_flush_context_storage, pmap, pmap_offset);

		size -= length;
		offset += length;
		pmap_offset += length;
	}
	/* issue the flushes that were deferred into the flush context */
	pmap_flush(&pmap_flush_context_storage);

	if (all_reusable) {
		if (!object->all_reusable) {
			unsigned int reusable;

			object->all_reusable = TRUE;
			assert(object->reusable_page_count == 0);
			/* update global stats */
			reusable = object->resident_page_count;
			OSAddAtomic(reusable,
			    &vm_page_stats_reusable.reusable_count);
			vm_page_stats_reusable.reusable += reusable;
			vm_page_stats_reusable.all_reusable_calls++;
		}
	} else if (reusable_page) {
		vm_page_stats_reusable.partial_reusable_calls++;
	}
}
2849
2850 void
2851 vm_object_reuse_pages(
2852 vm_object_t object,
2853 vm_object_offset_t start_offset,
2854 vm_object_offset_t end_offset,
2855 boolean_t allow_partial_reuse)
2856 {
2857 vm_object_offset_t cur_offset;
2858 vm_page_t m;
2859 unsigned int reused, reusable;
2860
2861 #define VM_OBJECT_REUSE_PAGE(object, m, reused) \
2862 MACRO_BEGIN \
2863 if ((m) != VM_PAGE_NULL && \
2864 (m)->vmp_reusable) { \
2865 assert((object)->reusable_page_count <= \
2866 (object)->resident_page_count); \
2867 assert((object)->reusable_page_count > 0); \
2868 (object)->reusable_page_count--; \
2869 (m)->vmp_reusable = FALSE; \
2870 (reused)++; \
2871 /* \
2872 * Tell pmap that this page is no longer \
2873 * "reusable", to update the "reusable" stats \
2874 * for all the pmaps that have mapped this \
2875 * page. \
2876 */ \
2877 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE((m)), \
2878 0, /* refmod */ \
2879 (PMAP_OPTIONS_CLEAR_REUSABLE \
2880 | PMAP_OPTIONS_NOFLUSH), \
2881 NULL); \
2882 } \
2883 MACRO_END
2884
2885 reused = 0;
2886 reusable = 0;
2887
2888 vm_object_lock_assert_exclusive(object);
2889
2890 if (object->all_reusable) {
2891 panic("object %p all_reusable: can't update pmap stats",
2892 object);
2893 assert(object->reusable_page_count == 0);
2894 object->all_reusable = FALSE;
2895 if (end_offset - start_offset == object->vo_size ||
2896 !allow_partial_reuse) {
2897 vm_page_stats_reusable.all_reuse_calls++;
2898 reused = object->resident_page_count;
2899 } else {
2900 vm_page_stats_reusable.partial_reuse_calls++;
2901 vm_page_queue_iterate(&object->memq, m, vmp_listq) {
2902 if (m->vmp_offset < start_offset ||
2903 m->vmp_offset >= end_offset) {
2904 m->vmp_reusable = TRUE;
2905 object->reusable_page_count++;
2906 assert(object->resident_page_count >= object->reusable_page_count);
2907 continue;
2908 } else {
2909 assert(!m->vmp_reusable);
2910 reused++;
2911 }
2912 }
2913 }
2914 } else if (object->resident_page_count >
2915 ((end_offset - start_offset) >> PAGE_SHIFT)) {
2916 vm_page_stats_reusable.partial_reuse_calls++;
2917 for (cur_offset = start_offset;
2918 cur_offset < end_offset;
2919 cur_offset += PAGE_SIZE_64) {
2920 if (object->reusable_page_count == 0) {
2921 break;
2922 }
2923 m = vm_page_lookup(object, cur_offset);
2924 VM_OBJECT_REUSE_PAGE(object, m, reused);
2925 }
2926 } else {
2927 vm_page_stats_reusable.partial_reuse_calls++;
2928 vm_page_queue_iterate(&object->memq, m, vmp_listq) {
2929 if (object->reusable_page_count == 0) {
2930 break;
2931 }
2932 if (m->vmp_offset < start_offset ||
2933 m->vmp_offset >= end_offset) {
2934 continue;
2935 }
2936 VM_OBJECT_REUSE_PAGE(object, m, reused);
2937 }
2938 }
2939
2940 /* update global stats */
2941 OSAddAtomic(reusable - reused, &vm_page_stats_reusable.reusable_count);
2942 vm_page_stats_reusable.reused += reused;
2943 vm_page_stats_reusable.reusable += reusable;
2944 }
2945
2946 /*
2947 * This function determines if the zero operation can be run on the
2948 * object. The checks on the entry have already been performed by
2949 * vm_map_zero_entry_preflight.
2950 */
2951 static kern_return_t
2952 vm_object_zero_preflight(
2953 vm_object_t object,
2954 vm_object_offset_t start,
2955 vm_object_offset_t end)
2956 {
2957 /*
2958 * Zeroing is further restricted to anonymous memory.
2959 */
2960 if (!object->internal) {
2961 return KERN_PROTECTION_FAILURE;
2962 }
2963
2964 /*
2965 * Zeroing for copy on write isn't yet supported
2966 */
2967 if (object->shadow != NULL ||
2968 object->vo_copy != NULL) {
2969 return KERN_NO_ACCESS;
2970 }
2971
2972 /*
2973 * Ensure the that bounds makes sense wrt the object
2974 */
2975 if (end - start > object->vo_size) {
2976 return KERN_INVALID_ADDRESS;
2977 }
2978
2979 if (object->terminating || !object->alive) {
2980 return KERN_ABORTED;
2981 }
2982
2983 return KERN_SUCCESS;
2984 }
2985
2986 static void
2987 vm_object_zero_page(vm_page_t m)
2988 {
2989 if (m != VM_PAGE_NULL) {
2990 ppnum_t phy_page_num = VM_PAGE_GET_PHYS_PAGE(m);
2991
2992 /*
2993 * Skip fictitious guard pages
2994 */
2995 if (vm_page_is_fictitious(m)) {
2996 assert(vm_page_is_guard(m));
2997 return;
2998 }
2999 pmap_zero_page(phy_page_num);
3000 }
3001 }
3002
3003 /*
3004 * This function iterates the range of pages specified in the object and
3005 * discards the ones that are compressed and zeroes the ones that are wired.
3006 * This function may drop the object lock while waiting for a page that is
3007 * busy and will restart the operation for the specific offset.
3008 */
3009 kern_return_t
3010 vm_object_zero(
3011 vm_object_t object,
3012 vm_object_offset_t *cur_offset_p,
3013 vm_object_offset_t end_offset)
3014 {
3015 kern_return_t ret;
3016
3017 vm_object_lock_assert_exclusive(object);
3018 ret = vm_object_zero_preflight(object, *cur_offset_p, end_offset);
3019 if (ret != KERN_SUCCESS) {
3020 return ret;
3021 }
3022
3023 while (*cur_offset_p < end_offset) {
3024 vm_page_t m = vm_page_lookup(object, *cur_offset_p);
3025
3026 if (m != VM_PAGE_NULL && m->vmp_busy) {
3027 vm_page_sleep(object, m, THREAD_UNINT, LCK_SLEEP_DEFAULT);
3028 /* Object lock was dropped -- reverify validity */
3029 ret = vm_object_zero_preflight(object, *cur_offset_p, end_offset);
3030 if (ret != KERN_SUCCESS) {
3031 return ret;
3032 }
3033 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
3034 /*
3035 * Our mapping could have been made "needs_copy" while
3036 * the map and object were unlocked.
3037 * We need to do the mapping preflight again...
3038 */
3039 return KERN_SUCCESS;
3040 }
3041 continue;
3042 }
3043
3044 /*
3045 * If the compressor has the page then just discard it instead
3046 * of faulting it in and zeroing it else zero the page if it exists. If
3047 * we dropped the object lock during the lookup retry the lookup for the
3048 * cur_offset.
3049 */
3050 if (page_is_paged_out(object, *cur_offset_p)) {
3051 vm_object_compressor_pager_state_clr(object, *cur_offset_p);
3052 } else {
3053 vm_object_zero_page(m);
3054 }
3055 *cur_offset_p += PAGE_SIZE_64;
3056 /*
3057 * TODO: May need a vm_object_lock_yield_shared in this loop if it takes
3058 * too long, as holding the object lock for too long can stall pageout
3059 * scan (or other users of the object)
3060 */
3061 }
3062
3063 return KERN_SUCCESS;
3064 }
3065
3066 /*
3067 * Routine: vm_object_pmap_protect
3068 *
3069 * Purpose:
3070 * Reduces the permission for all physical
3071 * pages in the specified object range.
3072 *
3073 * If removing write permission only, it is
3074 * sufficient to protect only the pages in
3075 * the top-level object; only those pages may
3076 * have write permission.
3077 *
3078 * If removing all access, we must follow the
3079 * shadow chain from the top-level object to
3080 * remove access to all pages in shadowed objects.
3081 *
3082 * The object must *not* be locked. The object must
3083 * be internal.
3084 *
3085 * If pmap is not NULL, this routine assumes that
3086 * the only mappings for the pages are in that
3087 * pmap.
3088 */
3089
3090 __private_extern__ void
3091 vm_object_pmap_protect(
3092 vm_object_t object,
3093 vm_object_offset_t offset,
3094 vm_object_size_t size,
3095 pmap_t pmap,
3096 vm_map_size_t pmap_page_size,
3097 vm_map_offset_t pmap_start,
3098 vm_prot_t prot)
3099 {
3100 vm_object_pmap_protect_options(object, offset, size, pmap,
3101 pmap_page_size,
3102 pmap_start, prot, 0);
3103 }
3104
/*
 *	Routine:	vm_object_pmap_protect_options
 *
 *	Purpose:
 *		Worker for vm_object_pmap_protect(): reduce the permission
 *		to "prot" for the pages of "object" in the range
 *		[offset, offset + size), passing "options" on to the
 *		pmap layer.
 *
 *		When "pmap" is not NULL, the protection is applied to the
 *		virtual range that starts at "pmap_start" in that pmap;
 *		otherwise it is applied to each physical page with
 *		pmap_page_protect_options().
 *
 *		When "prot" is VM_PROT_NONE the shadow chain is followed
 *		and each object in it is processed in turn; for any other
 *		"prot" only "object" itself is processed.
 *
 *		A NULL "object" is a no-op.  The object lock is taken
 *		here and released again before returning.
 */
__private_extern__ void
vm_object_pmap_protect_options(
	vm_object_t                     object,
	vm_object_offset_t              offset,
	vm_object_size_t                size,
	pmap_t                          pmap,
	vm_map_size_t                   pmap_page_size,
	vm_map_offset_t                 pmap_start,
	vm_prot_t                       prot,
	int                             options)
{
	pmap_flush_context      pmap_flush_context_storage;
	boolean_t               delayed_pmap_flush = FALSE;
	vm_object_offset_t      offset_in_object;
	vm_object_size_t        size_in_object;

	if (object == VM_OBJECT_NULL) {
		return;
	}
	if (pmap_page_size > PAGE_SIZE) {
		/* for 16K map on 4K device... */
		pmap_page_size = PAGE_SIZE;
	}
	/*
	 * If we decide to work on the object itself, extend the range to
	 * cover a full number of native pages.
	 */
	size_in_object = vm_object_round_page(offset + size) - vm_object_trunc_page(offset);
	offset_in_object = vm_object_trunc_page(offset);
	/*
	 * If we decide to work on the pmap, use the exact range specified,
	 * so no rounding/truncating offset and size.  They should already
	 * be aligned to pmap_page_size.
	 */
	assertf(!(offset & (pmap_page_size - 1)) && !(size & (pmap_page_size - 1)),
	    "offset 0x%llx size 0x%llx pmap_page_size 0x%llx",
	    offset, size, (uint64_t)pmap_page_size);

	vm_object_lock(object);

	if (object->phys_contiguous) {
		/*
		 * Physically contiguous object: no page list to walk.
		 * Either protect the whole virtual range in one call,
		 * or step through the physical range page by page.
		 */
		if (pmap != NULL) {
			vm_object_unlock(object);
			/* single ranged call: let it flush (NOFLUSH cleared) */
			pmap_protect_options(pmap,
			    pmap_start,
			    pmap_start + size,
			    prot,
			    options & ~PMAP_OPTIONS_NOFLUSH,
			    NULL);
		} else {
			vm_object_offset_t phys_start, phys_end, phys_addr;

			/* physical range derived from the object's shadow offset */
			phys_start = object->vo_shadow_offset + offset_in_object;
			phys_end = phys_start + size_in_object;
			assert(phys_start <= phys_end);
			assert(phys_end <= object->vo_shadow_offset + object->vo_size);
			vm_object_unlock(object);

			pmap_flush_context_init(&pmap_flush_context_storage);
			delayed_pmap_flush = FALSE;

			/* protect each page with NOFLUSH, then flush once below */
			for (phys_addr = phys_start;
			    phys_addr < phys_end;
			    phys_addr += PAGE_SIZE_64) {
				pmap_page_protect_options(
					(ppnum_t) (phys_addr >> PAGE_SHIFT),
					prot,
					options | PMAP_OPTIONS_NOFLUSH,
					(void *)&pmap_flush_context_storage);
				delayed_pmap_flush = TRUE;
			}
			if (delayed_pmap_flush == TRUE) {
				pmap_flush(&pmap_flush_context_storage);
			}
		}
		return;
	}

	assert(object->internal);

	/*
	 * Each pass of this loop handles one object; "object" is locked
	 * at the top of every pass.  The loop only goes around again when
	 * following the shadow chain for VM_PROT_NONE.
	 */
	while (TRUE) {
		/*
		 * With a pmap, and more resident bytes than half of the
		 * range, a single ranged pmap_protect_options() call is
		 * used instead of visiting individual pages.
		 */
		if (ptoa_64(object->resident_page_count) > size_in_object / 2 && pmap != PMAP_NULL) {
			vm_object_unlock(object);
			if (pmap_page_size < PAGE_SIZE) {
				DEBUG4K_PMAP("pmap %p start 0x%llx end 0x%llx prot 0x%x: pmap_protect()\n", pmap, (uint64_t)pmap_start, pmap_start + size, prot);
			}
			pmap_protect_options(pmap, pmap_start, pmap_start + size, prot,
			    options & ~PMAP_OPTIONS_NOFLUSH, NULL);
			return;
		}

		if (pmap_page_size < PAGE_SIZE) {
			DEBUG4K_PMAP("pmap %p start 0x%llx end 0x%llx prot 0x%x: offset 0x%llx size 0x%llx object %p offset 0x%llx size 0x%llx\n", pmap, (uint64_t)pmap_start, pmap_start + size, prot, offset, size, object, offset_in_object, size_in_object);
		}

		/* per-page calls below use NOFLUSH; one pmap_flush() follows */
		pmap_flush_context_init(&pmap_flush_context_storage);
		delayed_pmap_flush = FALSE;

		/*
		 * if we are doing large ranges with respect to resident
		 * page count then we should iterate over pages otherwise
		 * inverse page look-up will be faster
		 */
		if (ptoa_64(object->resident_page_count / 4) < size_in_object) {
			vm_page_t               p;
			vm_object_offset_t      end;

			end = offset_in_object + size_in_object;

			/* walk the resident page list, keeping pages in range */
			vm_page_queue_iterate(&object->memq, p, vmp_listq) {
				if (!vm_page_is_fictitious(p) &&
				    (offset_in_object <= p->vmp_offset) &&
				    (p->vmp_offset < end)) {
					vm_map_offset_t start;

					/*
					 * XXX FBDP 4K: intentionally using "offset" here instead
					 * of "offset_in_object", since "start" is a pmap address.
					 */
					start = pmap_start + p->vmp_offset - offset;

					if (pmap != PMAP_NULL) {
						vm_map_offset_t curr;
						/*
						 * Cover this native page in pmap_page_size
						 * steps, clipped to the range
						 * [pmap_start, pmap_start + size).
						 */
						for (curr = start;
						    curr < start + PAGE_SIZE_64;
						    curr += pmap_page_size) {
							if (curr < pmap_start) {
								continue;
							}
							if (curr >= pmap_start + size) {
								break;
							}
							pmap_protect_options(
								pmap,
								curr,
								curr + pmap_page_size,
								prot,
								options | PMAP_OPTIONS_NOFLUSH,
								&pmap_flush_context_storage);
						}
					} else {
						/* no pmap given: act on the physical page */
						pmap_page_protect_options(
							VM_PAGE_GET_PHYS_PAGE(p),
							prot,
							options | PMAP_OPTIONS_NOFLUSH,
							&pmap_flush_context_storage);
					}
					delayed_pmap_flush = TRUE;
				}
			}
		} else {
			vm_page_t               p;
			vm_object_offset_t      end;
			vm_object_offset_t      target_off;

			end = offset_in_object + size_in_object;

			/* look up each offset of the range in turn */
			for (target_off = offset_in_object;
			    target_off < end; target_off += PAGE_SIZE) {
				p = vm_page_lookup(object, target_off);

				if (p != VM_PAGE_NULL) {
					vm_object_offset_t start;

					/*
					 * XXX FBDP 4K: intentionally using "offset" here instead
					 * of "offset_in_object", since "start" is a pmap address.
					 */
					start = pmap_start + (p->vmp_offset - offset);

					if (pmap != PMAP_NULL) {
						vm_map_offset_t curr;
						/*
						 * Same clipped, pmap_page_size-stepped
						 * walk as in the page-list case above.
						 */
						for (curr = start;
						    curr < start + PAGE_SIZE;
						    curr += pmap_page_size) {
							if (curr < pmap_start) {
								continue;
							}
							if (curr >= pmap_start + size) {
								break;
							}
							pmap_protect_options(
								pmap,
								curr,
								curr + pmap_page_size,
								prot,
								options | PMAP_OPTIONS_NOFLUSH,
								&pmap_flush_context_storage);
						}
					} else {
						/* no pmap given: act on the physical page */
						pmap_page_protect_options(
							VM_PAGE_GET_PHYS_PAGE(p),
							prot,
							options | PMAP_OPTIONS_NOFLUSH,
							&pmap_flush_context_storage);
					}
					delayed_pmap_flush = TRUE;
				}
			}
		}
		if (delayed_pmap_flush == TRUE) {
			pmap_flush(&pmap_flush_context_storage);
		}

		if (prot == VM_PROT_NONE) {
			/*
			 * Must follow shadow chain to remove access
			 * to pages in shadowed objects.
			 */
			vm_object_t     next_object;

			next_object = object->shadow;
			if (next_object != VM_OBJECT_NULL) {
				/* translate the range into the shadow object's offsets */
				offset_in_object += object->vo_shadow_offset;
				offset += object->vo_shadow_offset;
				/* lock the next object before letting go of this one */
				vm_object_lock(next_object);
				vm_object_unlock(object);
				object = next_object;
			} else {
				/*
				 * End of chain - we are done.
				 */
				break;
			}
		} else {
			/*
			 * Pages in shadowed objects may never have
			 * write permission - we may stop here.
			 */
			break;
		}
	}

	/* "object" is the last object visited and is still locked here */
	vm_object_unlock(object);
}
3340
/*
 * Number of source pages that vm_object_copy_slowly() left out of the
 * copy because they were both "busy" and "absent" (the
 * workaround_41447923 case).
 */
uint32_t vm_page_busy_absent_skipped = 0;
3342
3343 /*
3344 * Routine: vm_object_copy_slowly
3345 *
3346 * Description:
3347 * Copy the specified range of the source
3348 * virtual memory object without using
3349 * protection-based optimizations (such
3350 * as copy-on-write). The pages in the
3351 * region are actually copied.
3352 *
3353 * In/out conditions:
3354 * The caller must hold a reference and a lock
3355 * for the source virtual memory object. The source
3356 * object will be returned *unlocked*.
3357 *
3358 * Results:
3359 * If the copy is completed successfully, KERN_SUCCESS is
3360 * returned. If the caller asserted the interruptible
3361 * argument, and an interruption occurred while waiting
3362 * for a user-generated event, MACH_SEND_INTERRUPTED is
3363 * returned. Other values may be returned to indicate
3364 * hard errors during the copy operation.
3365 *
3366 * A new virtual memory object is returned in a
3367 * parameter (_result_object). The contents of this
3368 * new object, starting at a zero offset, are a copy
3369 * of the source memory region. In the event of
3370 * an error, this parameter will contain the value
3371 * VM_OBJECT_NULL.
3372 */
3373 __exported_hidden kern_return_t
3374 vm_object_copy_slowly(
3375 vm_object_t src_object,
3376 vm_object_offset_t src_offset,
3377 vm_object_size_t size,
3378 boolean_t interruptible,
3379 #if HAS_MTE
3380 bool create_mte_object,
3381 #endif /* HAS_MTE */
3382 vm_object_t *_result_object) /* OUT */
3383 {
3384 vm_object_t new_object;
3385 vm_object_offset_t new_offset;
3386
3387 struct vm_object_fault_info fault_info = {};
3388
3389 if (size == 0) {
3390 vm_object_unlock(src_object);
3391 *_result_object = VM_OBJECT_NULL;
3392 return KERN_INVALID_ARGUMENT;
3393 }
3394
3395 /*
3396 * Prevent destruction of the source object while we copy.
3397 */
3398
3399 vm_object_reference_locked(src_object);
3400 vm_object_unlock(src_object);
3401
3402 /*
3403 * Create a new object to hold the copied pages.
3404 * A few notes:
3405 * We fill the new object starting at offset 0,
3406 * regardless of the input offset.
3407 * We don't bother to lock the new object within
3408 * this routine, since we have the only reference.
3409 */
3410
3411 size = vm_object_round_page(src_offset + size) - vm_object_trunc_page(src_offset);
3412 src_offset = vm_object_trunc_page(src_offset);
3413
3414 #if HAS_MTE
3415 /*
3416 * Retain the original provenance despite the fact we're creating a byte-for-byte copy.
3417 * As far as I can think, this doesn't have a consequence either way:
3418 * The only path for which we copy slowly MTE-enabled objects is on the fork path,
3419 * during which the two maps will hold the same ID anyway.
3420 * For objects that'll never be MTE-mapped, the provenance has no consequence anyway.
3421 * I'm carrying over the ID here just because it seems more tidy than dropping it.
3422 */
3423 #endif /* HAS_MTE */
3424 new_object = vm_object_allocate(size, src_object->vmo_provenance);
3425 new_offset = 0;
3426 if (src_object->copy_strategy == MEMORY_OBJECT_COPY_NONE &&
3427 src_object->vo_inherit_copy_none) {
3428 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3429 new_object->vo_inherit_copy_none = true;
3430 }
3431
3432 #if HAS_MTE
3433 /*
3434 * The new object should hold MTE enabled pages. This is a byproduct
3435 * of our current forking strategy.
3436 */
3437 if (create_mte_object) {
3438 vm_object_mte_set(new_object);
3439
3440 assert(src_object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
3441 new_object->copy_strategy = src_object->copy_strategy;
3442 }
3443 #endif /* HAS_MTE */
3444
3445 assert(size == trunc_page_64(size)); /* Will the loop terminate? */
3446
3447 fault_info.interruptible = interruptible;
3448 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
3449 fault_info.lo_offset = src_offset;
3450 fault_info.hi_offset = src_offset + size;
3451 fault_info.stealth = TRUE;
3452
3453 for (;
3454 size != 0;
3455 src_offset += PAGE_SIZE_64,
3456 new_offset += PAGE_SIZE_64, size -= PAGE_SIZE_64
3457 ) {
3458 vm_page_t new_page;
3459 vm_fault_return_t result;
3460 vm_grab_options_t options;
3461
3462 options = vm_page_grab_options_for_object(new_object);
3463
3464 while ((new_page = vm_page_grab_options(options)) == VM_PAGE_NULL) {
3465 if (!vm_page_wait(interruptible)) {
3466 vm_object_deallocate(new_object);
3467 vm_object_deallocate(src_object);
3468 *_result_object = VM_OBJECT_NULL;
3469 return MACH_SEND_INTERRUPTED;
3470 }
3471 }
3472
3473 vm_object_lock(new_object);
3474 vm_page_insert(new_page, new_object, new_offset);
3475 vm_object_unlock(new_object);
3476
3477 do {
3478 vm_prot_t prot = VM_PROT_READ;
3479 vm_page_t _result_page;
3480 vm_page_t top_page;
3481 vm_page_t result_page;
3482 kern_return_t error_code;
3483 vm_object_t result_page_object;
3484
3485
3486 vm_object_lock(src_object);
3487
3488 if (src_object->internal &&
3489 src_object->shadow == VM_OBJECT_NULL &&
3490 (src_object->pager == NULL ||
3491 (vm_object_compressor_pager_state_get(src_object,
3492 src_offset) ==
3493 VM_EXTERNAL_STATE_ABSENT))) {
3494 boolean_t can_skip_page;
3495
3496 _result_page = vm_page_lookup(src_object,
3497 src_offset);
3498 if (_result_page == VM_PAGE_NULL) {
3499 /*
3500 * This page is neither resident nor
3501 * compressed and there's no shadow
3502 * object below "src_object", so this
3503 * page is really missing.
3504 * There's no need to zero-fill it just
3505 * to copy it: let's leave it missing
3506 * in "new_object" and get zero-filled
3507 * on demand.
3508 */
3509 can_skip_page = TRUE;
3510 } else if (workaround_41447923 &&
3511 src_object->pager == NULL &&
3512 _result_page != VM_PAGE_NULL &&
3513 _result_page->vmp_busy &&
3514 _result_page->vmp_absent &&
3515 src_object->purgable == VM_PURGABLE_DENY &&
3516 !src_object->blocked_access) {
3517 /*
3518 * This page is "busy" and "absent"
3519 * but not because we're waiting for
3520 * it to be decompressed. It must
3521 * be because it's a "no zero fill"
3522 * page that is currently not
3523 * accessible until it gets overwritten
3524 * by a device driver.
3525 * Since its initial state would have
3526 * been "zero-filled", let's leave the
3527 * copy page missing and get zero-filled
3528 * on demand.
3529 */
3530 assert(src_object->internal);
3531 assert(src_object->shadow == NULL);
3532 assert(src_object->pager == NULL);
3533 can_skip_page = TRUE;
3534 vm_page_busy_absent_skipped++;
3535 } else {
3536 can_skip_page = FALSE;
3537 }
3538 if (can_skip_page) {
3539 vm_object_unlock(src_object);
3540 /* free the unused "new_page"... */
3541 vm_object_lock(new_object);
3542 VM_PAGE_FREE(new_page);
3543 new_page = VM_PAGE_NULL;
3544 vm_object_unlock(new_object);
3545 /* ...and go to next page in "src_object" */
3546 result = VM_FAULT_SUCCESS;
3547 break;
3548 }
3549 }
3550
3551 vm_object_paging_begin(src_object);
3552
3553 /* cap size at maximum UPL size */
3554 upl_size_t cluster_size;
3555 if (os_convert_overflow(size, &cluster_size)) {
3556 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
3557 }
3558 fault_info.cluster_size = cluster_size;
3559
3560 _result_page = VM_PAGE_NULL;
3561 result = vm_fault_page(src_object, src_offset,
3562 VM_PROT_READ, FALSE,
3563 FALSE, /* page not looked up */
3564 &prot, &_result_page, &top_page,
3565 (int *)0,
3566 &error_code, FALSE, &fault_info);
3567
3568 switch (result) {
3569 case VM_FAULT_SUCCESS:
3570 result_page = _result_page;
3571 result_page_object = VM_PAGE_OBJECT(result_page);
3572
3573 /*
3574 * Copy the page to the new object.
3575 *
3576 * POLICY DECISION:
3577 * If result_page is clean,
3578 * we could steal it instead
3579 * of copying.
3580 */
3581 vm_page_copy(result_page, new_page);
3582
3583 vm_object_unlock(result_page_object);
3584
3585 /*
3586 * Let go of both pages (make them
3587 * not busy, perform wakeup, activate).
3588 */
3589 vm_object_lock(new_object);
3590 SET_PAGE_DIRTY(new_page, FALSE);
3591 vm_page_wakeup_done(new_object, new_page);
3592 vm_object_unlock(new_object);
3593
3594 vm_object_lock(result_page_object);
3595 vm_page_wakeup_done(result_page_object, result_page);
3596
3597 vm_page_lockspin_queues();
3598 if ((result_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3599 (result_page->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
3600 vm_page_activate(result_page);
3601 }
3602 vm_page_activate(new_page);
3603 vm_page_unlock_queues();
3604
3605 /*
3606 * Release paging references and
3607 * top-level placeholder page, if any.
3608 */
3609
3610 vm_fault_cleanup(result_page_object,
3611 top_page);
3612
3613 break;
3614
3615 case VM_FAULT_RETRY:
3616 break;
3617
3618 case VM_FAULT_MEMORY_SHORTAGE:
3619 if (vm_page_wait(interruptible)) {
3620 break;
3621 }
3622 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJCOPYSLOWLY_MEMORY_SHORTAGE), 0 /* arg */);
3623 OS_FALLTHROUGH;
3624
3625 case VM_FAULT_INTERRUPTED:
3626 vm_object_lock(new_object);
3627 VM_PAGE_FREE(new_page);
3628 vm_object_unlock(new_object);
3629
3630 vm_object_deallocate(new_object);
3631 vm_object_deallocate(src_object);
3632 *_result_object = VM_OBJECT_NULL;
3633 return MACH_SEND_INTERRUPTED;
3634
3635 case VM_FAULT_SUCCESS_NO_VM_PAGE:
3636 /* success but no VM page: fail */
3637 vm_object_paging_end(src_object);
3638 vm_object_unlock(src_object);
3639 OS_FALLTHROUGH;
3640 case VM_FAULT_MEMORY_ERROR:
3641 /*
3642 * A policy choice:
3643 * (a) ignore pages that we can't
3644 * copy
3645 * (b) return the null object if
3646 * any page fails [chosen]
3647 */
3648
3649 vm_object_lock(new_object);
3650 VM_PAGE_FREE(new_page);
3651 vm_object_unlock(new_object);
3652
3653 vm_object_deallocate(new_object);
3654 vm_object_deallocate(src_object);
3655 *_result_object = VM_OBJECT_NULL;
3656 return error_code ? error_code:
3657 KERN_MEMORY_ERROR;
3658
3659 default:
3660 panic("vm_object_copy_slowly: unexpected error"
3661 " 0x%x from vm_fault_page()\n", result);
3662 }
3663 } while (result != VM_FAULT_SUCCESS);
3664 }
3665
3666 /*
3667 * Lose the extra reference, and return our object.
3668 */
3669 vm_object_deallocate(src_object);
3670 *_result_object = new_object;
3671 return KERN_SUCCESS;
3672 }
3673
3674 /*
3675 * Routine: vm_object_copy_quickly
3676 *
3677 * Purpose:
3678 * Copy the specified range of the source virtual
3679 * memory object, if it can be done without waiting
3680 * for user-generated events.
3681 *
3682 * Results:
3683 * If the copy is successful, the copy is returned in
3684 * the arguments; otherwise, the arguments are not
3685 * affected.
3686 *
3687 * In/out conditions:
3688 * The object should be unlocked on entry and exit.
3689 */
3690
3691 /*ARGSUSED*/
3692 __private_extern__ boolean_t
3693 vm_object_copy_quickly(
3694 vm_object_t object, /* IN */
3695 __unused vm_object_offset_t offset, /* IN */
3696 __unused vm_object_size_t size, /* IN */
3697 boolean_t *_src_needs_copy, /* OUT */
3698 boolean_t *_dst_needs_copy) /* OUT */
3699 {
3700 memory_object_copy_strategy_t copy_strategy;
3701
3702 if (object == VM_OBJECT_NULL) {
3703 *_src_needs_copy = FALSE;
3704 *_dst_needs_copy = FALSE;
3705 return TRUE;
3706 }
3707
3708 vm_object_lock(object);
3709
3710 copy_strategy = object->copy_strategy;
3711
3712 switch (copy_strategy) {
3713 case MEMORY_OBJECT_COPY_SYMMETRIC:
3714
3715 /*
3716 * Symmetric copy strategy.
3717 * Make another reference to the object.
3718 * Leave object/offset unchanged.
3719 */
3720
3721 vm_object_reference_locked(object);
3722 VM_OBJECT_SET_SHADOWED(object, TRUE);
3723 vm_object_unlock(object);
3724
3725 /*
3726 * Both source and destination must make
3727 * shadows, and the source must be made
3728 * read-only if not already.
3729 */
3730
3731 *_src_needs_copy = TRUE;
3732 *_dst_needs_copy = TRUE;
3733
3734 break;
3735
3736 case MEMORY_OBJECT_COPY_DELAY:
3737 vm_object_unlock(object);
3738 return FALSE;
3739
3740 default:
3741 vm_object_unlock(object);
3742 return FALSE;
3743 }
3744 return TRUE;
3745 }
3746
/*
 * Statistics maintained by vm_object_copy_delayed().
 */
/* total number of failed attempts to try-lock the old copy object */
static uint32_t copy_delayed_lock_collisions;
/* largest number of such failed attempts seen in a single call */
static uint32_t copy_delayed_max_collisions;
/* number of calls that failed to try-lock the old copy object at least once */
static uint32_t copy_delayed_lock_contention;
/* number of walks of the source object's page list to write-protect pages */
static uint32_t copy_delayed_protect_iterate;

/*
 * When this is zero, vm_object_copy_delayed() waits for paging and other
 * activity in progress on a source object that is not "true_share" before
 * going on; when non-zero, that wait is skipped.
 * The default is 0 when XNU_TARGET_OS_OSX is set and 1 otherwise.
 */
#if XNU_TARGET_OS_OSX
unsigned int vm_object_copy_delayed_paging_wait_disable = 0;
#else /* XNU_TARGET_OS_OSX */
unsigned int vm_object_copy_delayed_paging_wait_disable = 1;
#endif /* XNU_TARGET_OS_OSX */
3757
3758 /*
3759 * Routine: vm_object_copy_delayed [internal]
3760 *
3761 * Description:
3762 * Copy the specified virtual memory object, using
3763 * the asymmetric copy-on-write algorithm.
3764 *
3765 * In/out conditions:
3766 * The src_object must be locked on entry. It will be unlocked
3767 * on exit - so the caller must also hold a reference to it.
3768 *
3769 * This routine will not block waiting for user-generated
3770 * events. It is not interruptible.
3771 */
3772 __private_extern__ vm_object_t
3773 vm_object_copy_delayed(
3774 vm_object_t src_object,
3775 vm_object_offset_t src_offset,
3776 vm_object_size_t size,
3777 boolean_t src_object_shared)
3778 {
3779 vm_object_t new_copy = VM_OBJECT_NULL;
3780 vm_object_t old_copy;
3781 vm_page_t p;
3782 vm_object_size_t copy_size = src_offset + size;
3783 pmap_flush_context pmap_flush_context_storage;
3784 boolean_t delayed_pmap_flush = FALSE;
3785
3786
3787 uint32_t collisions = 0;
3788 /*
3789 * The user-level memory manager wants to see all of the changes
3790 * to this object, but it has promised not to make any changes on
3791 * its own.
3792 *
3793 * Perform an asymmetric copy-on-write, as follows:
3794 * Create a new object, called a "copy object" to hold
3795 * pages modified by the new mapping (i.e., the copy,
3796 * not the original mapping).
3797 * Record the original object as the backing object for
3798 * the copy object. If the original mapping does not
3799 * change a page, it may be used read-only by the copy.
3800 * Record the copy object in the original object.
3801 * When the original mapping causes a page to be modified,
3802 * it must be copied to a new page that is "pushed" to
3803 * the copy object.
3804 * Mark the new mapping (the copy object) copy-on-write.
3805 * This makes the copy object itself read-only, allowing
3806 * it to be reused if the original mapping makes no
3807 * changes, and simplifying the synchronization required
3808 * in the "push" operation described above.
3809 *
	 *	The copy-on-write is said to be asymmetric because the original
3811 * object is *not* marked copy-on-write. A copied page is pushed
3812 * to the copy object, regardless which party attempted to modify
3813 * the page.
3814 *
3815 * Repeated asymmetric copy operations may be done. If the
3816 * original object has not been changed since the last copy, its
3817 * copy object can be reused. Otherwise, a new copy object can be
3818 * inserted between the original object and its previous copy
3819 * object. Since any copy object is read-only, this cannot affect
	 *	the contents of the previous copy object.
3821 *
3822 * Note that a copy object is higher in the object tree than the
3823 * original object; therefore, use of the copy object recorded in
3824 * the original object must be done carefully, to avoid deadlock.
3825 */
3826
3827 copy_size = vm_object_round_page(copy_size);
3828 Retry:
3829 if (!vm_object_copy_delayed_paging_wait_disable) {
3830 /*
3831 * Wait for paging in progress.
3832 */
3833 if (!src_object->true_share &&
3834 (src_object->paging_in_progress != 0 ||
3835 src_object->activity_in_progress != 0)) {
3836 if (src_object_shared == TRUE) {
3837 vm_object_unlock(src_object);
3838 vm_object_lock(src_object);
3839 src_object_shared = FALSE;
3840 goto Retry;
3841 }
3842 vm_object_paging_wait(src_object, THREAD_UNINT);
3843 }
3844 }
3845 if (src_object->vmo_pl_req_in_progress) {
3846 if (src_object_shared) {
3847 vm_object_unlock(src_object);
3848 vm_object_lock(src_object);
3849 src_object_shared = false;
3850 goto Retry;
3851 }
3852 vm_object_pl_req_wait(src_object, THREAD_UNINT);
3853 }
3854
3855 /*
3856 * See whether we can reuse the result of a previous
3857 * copy operation.
3858 */
3859
3860 old_copy = src_object->vo_copy;
3861 if (old_copy != VM_OBJECT_NULL) {
3862 int lock_granted;
3863
3864 /*
3865 * Try to get the locks (out of order)
3866 */
3867 if (src_object_shared == TRUE) {
3868 lock_granted = vm_object_lock_try_shared(old_copy);
3869 } else {
3870 lock_granted = vm_object_lock_try(old_copy);
3871 }
3872
3873 if (!lock_granted) {
3874 vm_object_unlock(src_object);
3875
3876 if (collisions++ == 0) {
3877 copy_delayed_lock_contention++;
3878 }
3879 mutex_pause(collisions);
3880
3881 /* Heisenberg Rules */
3882 copy_delayed_lock_collisions++;
3883
3884 if (collisions > copy_delayed_max_collisions) {
3885 copy_delayed_max_collisions = collisions;
3886 }
3887
3888 if (src_object_shared == TRUE) {
3889 vm_object_lock_shared(src_object);
3890 } else {
3891 vm_object_lock(src_object);
3892 }
3893
3894 goto Retry;
3895 }
3896
3897 /*
3898 * Determine whether the old copy object has
3899 * been modified.
3900 */
3901
3902 if (old_copy->resident_page_count == 0 &&
3903 !old_copy->pager_created) {
3904 /*
3905 * It has not been modified.
3906 *
3907 * Return another reference to
3908 * the existing copy-object if
3909 * we can safely grow it (if
3910 * needed).
3911 */
3912
3913 if (old_copy->vo_size < copy_size) {
3914 if (src_object_shared == TRUE) {
3915 vm_object_unlock(old_copy);
3916 vm_object_unlock(src_object);
3917
3918 vm_object_lock(src_object);
3919 src_object_shared = FALSE;
3920 goto Retry;
3921 }
3922 /*
3923 * We can't perform a delayed copy if any of the
3924 * pages in the extended range are wired (because
3925 * we can't safely take write permission away from
3926 * wired pages). If the pages aren't wired, then
3927 * go ahead and protect them.
3928 */
3929 copy_delayed_protect_iterate++;
3930
3931 pmap_flush_context_init(&pmap_flush_context_storage);
3932 delayed_pmap_flush = FALSE;
3933
3934 vm_page_queue_iterate(&src_object->memq, p, vmp_listq) {
3935 if (!vm_page_is_fictitious(p) &&
3936 p->vmp_offset >= old_copy->vo_size &&
3937 p->vmp_offset < copy_size) {
3938 if (p->vmp_busy && p->vmp_absent) {
3939 /*
3940 * A busy/absent page is still
3941 * waiting for its contents.
3942 * It should not be mapped in user
3943 * space (because it has no valid
3944 * contents) so no need to
3945 * write-protect it for copy-on-write.
3946 * It could have been mapped in the
3947 * kernel by the content provider
3948 * (a network filesystem, for example)
3949 * and we do not want to write-protect
3950 * that mapping, so we skip this page.
3951 */
3952 continue;
3953 }
3954 if (VM_PAGE_WIRED(p)) {
3955 vm_object_unlock(old_copy);
3956 vm_object_unlock(src_object);
3957
3958 if (new_copy != VM_OBJECT_NULL) {
3959 vm_object_unlock(new_copy);
3960 vm_object_deallocate(new_copy);
3961 }
3962 if (delayed_pmap_flush == TRUE) {
3963 pmap_flush(&pmap_flush_context_storage);
3964 }
3965
3966 return VM_OBJECT_NULL;
3967 } else {
3968 pmap_page_protect_options(VM_PAGE_GET_PHYS_PAGE(p),
3969 (p->vmp_xpmapped ? (VM_PROT_READ | VM_PROT_EXECUTE) : VM_PROT_READ),
3970 PMAP_OPTIONS_NOFLUSH, (void *)&pmap_flush_context_storage);
3971 delayed_pmap_flush = TRUE;
3972 }
3973 }
3974 }
3975 if (delayed_pmap_flush == TRUE) {
3976 pmap_flush(&pmap_flush_context_storage);
3977 }
3978
3979 assertf(page_aligned(copy_size),
3980 "object %p size 0x%llx",
3981 old_copy, (uint64_t)copy_size);
3982 old_copy->vo_size = copy_size;
3983
3984 /*
3985 * src_object's "vo_copy" object now covers
3986 * a larger portion of src_object.
3987 * Increment src_object's "vo_copy_version"
3988 * to make any racing vm_fault() on
3989 * "src_object" re-check if it needs to honor
3990 * any new copy-on-write obligation.
3991 */
3992 src_object->vo_copy_version++;
3993 }
3994 if (src_object_shared == TRUE) {
3995 vm_object_reference_shared(old_copy);
3996 } else {
3997 vm_object_reference_locked(old_copy);
3998 }
3999 assert3u(old_copy->copy_strategy, ==, MEMORY_OBJECT_COPY_SYMMETRIC);
4000 vm_object_unlock(old_copy);
4001 vm_object_unlock(src_object);
4002
4003 if (new_copy != VM_OBJECT_NULL) {
4004 vm_object_unlock(new_copy);
4005 vm_object_deallocate(new_copy);
4006 }
4007 return old_copy;
4008 }
4009
4010
4011
4012 /*
4013 * Adjust the size argument so that the newly-created
4014 * copy object will be large enough to back either the
4015 * old copy object or the new mapping.
4016 */
4017 if (old_copy->vo_size > copy_size) {
4018 copy_size = old_copy->vo_size;
4019 }
4020
4021 if (new_copy == VM_OBJECT_NULL) {
4022 vm_object_unlock(old_copy);
4023 vm_object_unlock(src_object);
4024 /* Carry over the provenance from the object that's backing us */
4025 new_copy = vm_object_allocate(copy_size, src_object->vmo_provenance);
4026 vm_object_lock(src_object);
4027 vm_object_lock(new_copy);
4028
4029 src_object_shared = FALSE;
4030 goto Retry;
4031 }
4032 assertf(page_aligned(copy_size),
4033 "object %p size 0x%llx",
4034 new_copy, (uint64_t)copy_size);
4035 new_copy->vo_size = copy_size;
4036
4037 /*
4038 * The copy-object is always made large enough to
4039 * completely shadow the original object, since
4040 * it may have several users who want to shadow
4041 * the original object at different points.
4042 */
4043
4044 assert((old_copy->shadow == src_object) &&
4045 (old_copy->vo_shadow_offset == (vm_object_offset_t) 0));
4046 } else if (new_copy == VM_OBJECT_NULL) {
4047 vm_object_unlock(src_object);
4048 /* Carry over the provenance from the object that's backing us */
4049 new_copy = vm_object_allocate(copy_size, src_object->vmo_provenance);
4050 vm_object_lock(src_object);
4051 vm_object_lock(new_copy);
4052
4053 src_object_shared = FALSE;
4054 goto Retry;
4055 }
4056
4057 /*
4058 * We now have the src object locked, and the new copy object
4059 * allocated and locked (and potentially the old copy locked).
4060 * Before we go any further, make sure we can still perform
4061 * a delayed copy, as the situation may have changed.
4062 *
4063 * Specifically, we can't perform a delayed copy if any of the
4064 * pages in the range are wired (because we can't safely take
4065 * write permission away from wired pages). If the pages aren't
4066 * wired, then go ahead and protect them.
4067 */
4068 copy_delayed_protect_iterate++;
4069
4070 pmap_flush_context_init(&pmap_flush_context_storage);
4071 delayed_pmap_flush = FALSE;
4072
4073 vm_page_queue_iterate(&src_object->memq, p, vmp_listq) {
4074 if (!vm_page_is_fictitious(p) && p->vmp_offset < copy_size) {
4075 if (VM_PAGE_WIRED(p)) {
4076 if (old_copy) {
4077 vm_object_unlock(old_copy);
4078 }
4079 vm_object_unlock(src_object);
4080 vm_object_unlock(new_copy);
4081 vm_object_deallocate(new_copy);
4082
4083 if (delayed_pmap_flush == TRUE) {
4084 pmap_flush(&pmap_flush_context_storage);
4085 }
4086
4087 return VM_OBJECT_NULL;
4088 } else {
4089 pmap_page_protect_options(VM_PAGE_GET_PHYS_PAGE(p),
4090 (p->vmp_xpmapped ? (VM_PROT_READ | VM_PROT_EXECUTE) : VM_PROT_READ),
4091 PMAP_OPTIONS_NOFLUSH, (void *)&pmap_flush_context_storage);
4092 delayed_pmap_flush = TRUE;
4093 }
4094 }
4095 }
4096 if (delayed_pmap_flush == TRUE) {
4097 pmap_flush(&pmap_flush_context_storage);
4098 }
4099
4100 if (old_copy != VM_OBJECT_NULL) {
4101 /*
4102 * Make the old copy-object shadow the new one.
4103 * It will receive no more pages from the original
4104 * object.
4105 */
4106
4107 /* remove ref. from old_copy */
4108 vm_object_lock_assert_exclusive(src_object);
4109 os_ref_release_live_locked_raw(&src_object->ref_count,
4110 &vm_object_refgrp);
4111 vm_object_lock_assert_exclusive(old_copy);
4112 old_copy->shadow = new_copy;
4113 vm_object_lock_assert_exclusive(new_copy);
4114 assert(os_ref_get_count_raw(&new_copy->ref_count) > 0);
4115 /* for old_copy->shadow ref. */
4116 os_ref_retain_locked_raw(&new_copy->ref_count, &vm_object_refgrp);
4117
4118 vm_object_unlock(old_copy); /* done with old_copy */
4119 }
4120
4121 /*
4122 * Point the new copy at the existing object.
4123 */
4124 vm_object_lock_assert_exclusive(new_copy);
4125 new_copy->shadow = src_object;
4126 new_copy->vo_shadow_offset = 0;
4127 VM_OBJECT_SET_SHADOWED(new_copy, TRUE); /* caller must set needs_copy */
4128
4129 vm_object_lock_assert_exclusive(src_object);
4130 vm_object_reference_locked(src_object);
4131 VM_OBJECT_COPY_SET(src_object, new_copy);
4132 vm_object_unlock(src_object);
4133 assert3u(new_copy->copy_strategy, ==, MEMORY_OBJECT_COPY_SYMMETRIC);
4134 vm_object_unlock(new_copy);
4135
4136 return new_copy;
4137 }
4138
4139 /*
4140 * Routine: vm_object_copy_strategically
4141 *
4142 * Purpose:
4143 * Perform a copy according to the source object's
4144 * declared strategy. This operation may block,
4145 * and may be interrupted.
4146 */
4147 __private_extern__ kern_return_t
4148 vm_object_copy_strategically(
4149 vm_object_t src_object,
4150 vm_object_offset_t src_offset,
4151 vm_object_size_t size,
4152 bool forking,
4153 vm_object_t *dst_object, /* OUT */
4154 vm_object_offset_t *dst_offset, /* OUT */
4155 boolean_t *dst_needs_copy) /* OUT */
4156 {
4157 boolean_t result;
4158 boolean_t interruptible = THREAD_ABORTSAFE; /* XXX */
4159 boolean_t object_lock_shared = FALSE;
4160 memory_object_copy_strategy_t copy_strategy;
4161
4162 assert(src_object != VM_OBJECT_NULL);
4163
4164 copy_strategy = src_object->copy_strategy;
4165
4166 if (copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4167 vm_object_lock_shared(src_object);
4168 object_lock_shared = TRUE;
4169 } else {
4170 vm_object_lock(src_object);
4171 }
4172
4173 /*
4174 * The copy strategy is only valid if the memory manager
4175 * is "ready". Internal objects are always ready.
4176 */
4177
4178 while (!src_object->internal && !src_object->pager_ready) {
4179 wait_result_t wait_result;
4180
4181 if (object_lock_shared == TRUE) {
4182 vm_object_unlock(src_object);
4183 vm_object_lock(src_object);
4184 object_lock_shared = FALSE;
4185 continue;
4186 }
4187 wait_result = vm_object_sleep( src_object,
4188 VM_OBJECT_EVENT_PAGER_READY,
4189 interruptible, LCK_SLEEP_EXCLUSIVE);
4190 if (wait_result != THREAD_AWAKENED) {
4191 vm_object_unlock(src_object);
4192 *dst_object = VM_OBJECT_NULL;
4193 *dst_offset = 0;
4194 *dst_needs_copy = FALSE;
4195 return MACH_SEND_INTERRUPTED;
4196 }
4197 }
4198
4199 /*
4200 * Use the appropriate copy strategy.
4201 */
4202
4203 if (copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK) {
4204 if (forking) {
4205 copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4206 } else {
4207 copy_strategy = MEMORY_OBJECT_COPY_NONE;
4208 if (object_lock_shared) {
4209 vm_object_unlock(src_object);
4210 vm_object_lock(src_object);
4211 object_lock_shared = FALSE;
4212 }
4213 }
4214 }
4215
4216 switch (copy_strategy) {
4217 case MEMORY_OBJECT_COPY_DELAY:
4218 *dst_object = vm_object_copy_delayed(src_object,
4219 src_offset, size, object_lock_shared);
4220 if (*dst_object != VM_OBJECT_NULL) {
4221 *dst_offset = src_offset;
4222 *dst_needs_copy = TRUE;
4223 result = KERN_SUCCESS;
4224 break;
4225 }
4226 vm_object_lock(src_object);
4227 OS_FALLTHROUGH; /* fall thru when delayed copy not allowed */
4228
4229 case MEMORY_OBJECT_COPY_NONE:
4230 result = vm_object_copy_slowly(src_object,
4231 src_offset, size,
4232 interruptible,
4233 #if HAS_MTE
4234 forking && vm_object_is_mte_mappable(src_object), /* create_mte_object */
4235 #endif /* HAS_MTE */
4236 dst_object);
4237 if (result == KERN_SUCCESS) {
4238 *dst_offset = src_offset - vm_object_trunc_page(src_offset);
4239 *dst_needs_copy = FALSE;
4240 }
4241 break;
4242
4243 case MEMORY_OBJECT_COPY_SYMMETRIC:
4244 vm_object_unlock(src_object);
4245 result = KERN_MEMORY_RESTART_COPY;
4246 break;
4247
4248 default:
4249 panic("copy_strategically: bad strategy %d for object %p",
4250 copy_strategy, src_object);
4251 result = KERN_INVALID_ARGUMENT;
4252 }
4253 return result;
4254 }
4255
4256 /*
4257 * vm_object_shadow:
4258 *
4259 * Create a new object which is backed by the
4260 * specified existing object range. The source
4261 * object reference is deallocated.
4262 *
4263 * The new object and offset into that object
4264 * are returned in the source parameters.
4265 */
4266 boolean_t vm_object_shadow_check = TRUE;
4267 uint64_t vm_object_shadow_forced = 0;
4268 uint64_t vm_object_shadow_skipped = 0;
4269
4270 __private_extern__ boolean_t
4271 vm_object_shadow(
4272 vm_object_t *object, /* IN/OUT */
4273 vm_object_offset_t *offset, /* IN/OUT */
4274 vm_object_size_t length,
4275 boolean_t always_shadow)
4276 {
4277 vm_object_t source;
4278 vm_object_t result;
4279
4280 source = *object;
4281 assert(source != VM_OBJECT_NULL);
4282 if (source == VM_OBJECT_NULL) {
4283 return FALSE;
4284 }
4285
4286 assert(source->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4287
4288 /*
4289 * Determine if we really need a shadow.
4290 *
4291 * If the source object is larger than what we are trying
4292 * to create, then force the shadow creation even if the
4293 * ref count is 1. This will allow us to [potentially]
4294 * collapse the underlying object away in the future
4295 * (freeing up the extra data it might contain and that
4296 * we don't need).
4297 */
4298
4299 assert(source->copy_strategy != MEMORY_OBJECT_COPY_NONE); /* Purgeable objects shouldn't have shadow objects. */
4300
4301 /*
4302 * The following optimization does not work in the context of submaps
4303 * (the shared region, in particular).
4304 * This object might have only 1 reference (in the submap) but that
4305 * submap can itself be mapped multiple times, so the object is
4306 * actually indirectly referenced more than once...
4307 * The caller can specify to "always_shadow" to bypass the optimization.
4308 */
4309 if (vm_object_shadow_check &&
4310 source->vo_size == length &&
4311 os_ref_get_count_raw(&source->ref_count) == 1) {
4312 if (always_shadow) {
4313 vm_object_shadow_forced++;
4314 } else {
4315 /*
4316 * Lock the object and check again.
4317 * We also check to see if there's
4318 * a shadow or copy object involved.
4319 * We can't do that earlier because
4320 * without the object locked, there
4321 * could be a collapse and the chain
4322 * gets modified leaving us with an
4323 * invalid pointer.
4324 */
4325 vm_object_lock(source);
4326 if (source->vo_size == length &&
4327 os_ref_get_count_raw(&source->ref_count) == 1 &&
4328 (source->shadow == VM_OBJECT_NULL ||
4329 source->shadow->vo_copy == VM_OBJECT_NULL)) {
4330 VM_OBJECT_SET_SHADOWED(source, FALSE);
4331 vm_object_unlock(source);
4332 vm_object_shadow_skipped++;
4333 return FALSE;
4334 }
4335 /* things changed while we were locking "source"... */
4336 vm_object_unlock(source);
4337 }
4338 }
4339
4340 /*
4341 * *offset is the map entry's offset into the VM object and
4342 * is aligned to the map's page size.
4343 * VM objects need to be aligned to the system's page size.
4344 * Record the necessary adjustment and re-align the offset so
4345 * that result->vo_shadow_offset is properly page-aligned.
4346 */
4347 vm_object_offset_t offset_adjustment;
4348 offset_adjustment = *offset - vm_object_trunc_page(*offset);
4349 length = vm_object_round_page(length + offset_adjustment);
4350 *offset = vm_object_trunc_page(*offset);
4351
4352 /*
4353 * Allocate a new object with the given length
4354 */
4355
4356 if ((result = vm_object_allocate(length, source->vmo_provenance)) == VM_OBJECT_NULL) {
4357 panic("vm_object_shadow: no object for shadowing");
4358 }
4359
4360 /*
4361 * The new object shadows the source object, adding
4362 * a reference to it. Our caller changes his reference
4363 * to point to the new object, removing a reference to
4364 * the source object. Net result: no change of reference
4365 * count.
4366 */
4367 result->shadow = source;
4368
4369 /*
4370 * Store the offset into the source object,
4371 * and fix up the offset into the new object.
4372 */
4373
4374 result->vo_shadow_offset = *offset;
4375 assertf(page_aligned(result->vo_shadow_offset),
4376 "result %p shadow offset 0x%llx",
4377 result, result->vo_shadow_offset);
4378
4379 /*
4380 * Return the new things
4381 */
4382
4383 *offset = 0;
4384 if (offset_adjustment) {
4385 /*
4386 * Make the map entry point to the equivalent offset
4387 * in the new object.
4388 */
4389 DEBUG4K_COPY("adjusting offset @ %p from 0x%llx to 0x%llx for object %p length: 0x%llx\n", offset, *offset, *offset + offset_adjustment, result, length);
4390 *offset += offset_adjustment;
4391 }
4392 *object = result;
4393 return TRUE;
4394 }
4395
4396 /*
4397 * The relationship between vm_object structures and
4398 * the memory_object requires careful synchronization.
4399 *
4400 * All associations are created by memory_object_create_named
4401 * for external pagers and vm_object_compressor_pager_create for internal
4402 * objects as follows:
4403 *
4404 * pager: the memory_object itself, supplied by
4405 * the user requesting a mapping (or the kernel,
4406 * when initializing internal objects); the
4407 * kernel simulates holding send rights by keeping
4408 * a port reference;
4409 *
4410 * pager_request:
4411 * the memory object control port,
4412 * created by the kernel; the kernel holds
4413 * receive (and ownership) rights to this
4414 * port, but no other references.
4415 *
4416 * When initialization is complete, the "initialized" field
4417 * is asserted. Other mappings using a particular memory object,
4418 * and any references to the vm_object gained through the
4419 * port association must wait for this initialization to occur.
4420 *
4421 * In order to allow the memory manager to set attributes before
4422 * requests (notably virtual copy operations, but also data or
4423 * unlock requests) are made, a "ready" attribute is made available.
4424 * Only the memory manager may affect the value of this attribute.
4425 * Its value does not affect critical kernel functions, such as
4426 * internal object initialization or destruction. [Furthermore,
4427 * memory objects created by the kernel are assumed to be ready
4428 * immediately; the default memory manager need not explicitly
4429 * set the "ready" attribute.]
4430 *
4431 * [Both the "initialized" and "ready" attribute wait conditions
4432 * use the "pager" field as the wait event.]
4433 *
4434 * The port associations can be broken down by any of the
4435 * following routines:
4436 * vm_object_terminate:
4437 * No references to the vm_object remain, and
4438 * the object cannot (or will not) be cached.
4439 * This is the normal case, and is done even
4440 * though one of the other cases has already been
4441 * done.
4442 * memory_object_destroy:
4443 * The memory manager has requested that the
4444 * kernel relinquish references to the memory
4445 * object. [The memory manager may not want to
4446 * destroy the memory object, but may wish to
4447 * refuse or tear down existing memory mappings.]
4448 *
4449 * Each routine that breaks an association must break all of
4450 * them at once. At some later time, that routine must clear
4451 * the pager field and release the memory object references.
4452 * [Furthermore, each routine must cope with the simultaneous
4453 * or previous operations of the others.]
4454 *
4455 * Because the pager field may be cleared spontaneously, it
4456 * cannot be used to determine whether a memory object has
4457 * ever been associated with a particular vm_object. [This
4458 * knowledge is important to the shadow object mechanism.]
4459 * For this reason, an additional "created" attribute is
4460 * provided.
4461 *
 *	During various paging operations, the pager reference found in the
 *	vm_object must be valid.  To prevent this reference from being
 *	released (other than being removed, i.e., made null), routines may
 *	use the vm_object_paging_begin/end routines [actually, macros].
4466 * The implementation uses the "paging_in_progress" and "wanted" fields.
4467 * [Operations that alter the validity of the pager values include the
4468 * termination routines and vm_object_collapse.]
4469 */
4470
4471
4472 /*
4473 * Routine: vm_object_memory_object_associate
4474 * Purpose:
4475 * Associate a VM object to the given pager.
4476 * If a VM object is not provided, create one.
4477 * Initialize the pager.
4478 */
4479 vm_object_t
4480 vm_object_memory_object_associate(
4481 memory_object_t pager,
4482 vm_object_t object,
4483 vm_object_size_t size,
4484 boolean_t named)
4485 {
4486 memory_object_control_t control;
4487
4488 assert(pager != MEMORY_OBJECT_NULL);
4489
4490 if (object != VM_OBJECT_NULL) {
4491 vm_object_lock(object);
4492 assert(object->internal);
4493 assert(object->pager_created);
4494 assert(!object->pager_initialized);
4495 assert(!object->pager_ready);
4496 assert(object->pager_trusted);
4497 } else {
4498 /* No provenance yet */
4499 object = vm_object_allocate(size, VM_MAP_SERIAL_NONE);
4500 assert(object != VM_OBJECT_NULL);
4501 vm_object_lock(object);
4502 VM_OBJECT_SET_INTERNAL(object, FALSE);
4503 VM_OBJECT_SET_PAGER_TRUSTED(object, FALSE);
4504 /* copy strategy invalid until set by memory manager */
4505 object->copy_strategy = MEMORY_OBJECT_COPY_INVALID;
4506 }
4507
4508 /*
4509 * Allocate request port.
4510 */
4511
4512 control = memory_object_control_allocate(object);
4513 assert(control != MEMORY_OBJECT_CONTROL_NULL);
4514
4515 assert(!object->pager_ready);
4516 assert(!object->pager_initialized);
4517 assert(object->pager == NULL);
4518 assert(object->pager_control == NULL);
4519
4520 /*
4521 * Copy the reference we were given.
4522 */
4523
4524 memory_object_reference(pager);
4525 VM_OBJECT_SET_PAGER_CREATED(object, TRUE);
4526 object->pager = pager;
4527 object->pager_control = control;
4528 VM_OBJECT_SET_PAGER_READY(object, FALSE);
4529
4530 vm_object_unlock(object);
4531
4532 /*
4533 * Let the pager know we're using it.
4534 */
4535
4536 (void) memory_object_init(pager,
4537 object->pager_control,
4538 PAGE_SIZE);
4539
4540 vm_object_lock(object);
4541 if (named) {
4542 VM_OBJECT_SET_NAMED(object, TRUE);
4543 }
4544 if (object->internal) {
4545 VM_OBJECT_SET_PAGER_READY(object, TRUE);
4546 vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY);
4547 }
4548
4549 VM_OBJECT_SET_PAGER_INITIALIZED(object, TRUE);
4550 // vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_INIT);
4551
4552 vm_object_unlock(object);
4553
4554 return object;
4555 }
4556
4557 /*
4558 * Routine: vm_object_compressor_pager_create
4559 * Purpose:
4560 * Create a memory object for an internal object.
4561 * In/out conditions:
4562 * The object is locked on entry and exit;
4563 * it may be unlocked within this call.
4564 * Limitations:
4565 * Only one thread may be performing a
4566 * vm_object_compressor_pager_create on an object at
4567 * a time. Presumably, only the pageout
4568 * daemon will be using this routine.
4569 */
4570
4571 void
4572 vm_object_compressor_pager_create(
4573 vm_object_t object)
4574 {
4575 memory_object_t pager;
4576 vm_object_t pager_object = VM_OBJECT_NULL;
4577
4578 assert(!is_kernel_object(object));
4579
4580 /*
4581 * Prevent collapse or termination by holding a paging reference
4582 */
4583
4584 vm_object_paging_begin(object);
4585 if (object->pager_created) {
4586 /*
4587 * Someone else got to it first...
4588 * wait for them to finish initializing the ports
4589 */
4590 while (!object->pager_ready) {
4591 vm_object_sleep(object,
4592 VM_OBJECT_EVENT_PAGER_READY,
4593 THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
4594 }
4595 vm_object_paging_end(object);
4596 return;
4597 }
4598
4599 if ((uint32_t) (object->vo_size / PAGE_SIZE) !=
4600 (object->vo_size / PAGE_SIZE)) {
4601 #if DEVELOPMENT || DEBUG
4602 printf("vm_object_compressor_pager_create(%p): "
4603 "object size 0x%llx >= 0x%llx\n",
4604 object,
4605 (uint64_t) object->vo_size,
4606 0x0FFFFFFFFULL * PAGE_SIZE);
4607 #endif /* DEVELOPMENT || DEBUG */
4608 vm_object_paging_end(object);
4609 return;
4610 }
4611
4612 #if HAS_MTE /* TODO: remove this when MTE support in the compressor is finalized */
4613 if (!vm_object_allow_compressor_pager_for_mte && vm_object_is_mte_mappable(object)) {
4614 vm_object_no_compressor_pager_for_mte_count++;
4615 vm_object_paging_end(object);
4616 return;
4617 }
4618 #endif
4619
4620 /*
4621 * Indicate that a memory object has been assigned
4622 * before dropping the lock, to prevent a race.
4623 */
4624
4625 VM_OBJECT_SET_PAGER_CREATED(object, TRUE);
4626 VM_OBJECT_SET_PAGER_TRUSTED(object, TRUE);
4627 object->paging_offset = 0;
4628
4629 vm_object_unlock(object);
4630
4631 /*
4632 * Create the [internal] pager, and associate it with this object.
4633 *
4634 * We make the association here so that vm_object_enter()
4635 * can look up the object to complete initializing it. No
4636 * user will ever map this object.
4637 */
4638 {
4639 /* create our new memory object */
4640 assert((uint32_t) (object->vo_size / PAGE_SIZE) ==
4641 (object->vo_size / PAGE_SIZE));
4642 (void) compressor_memory_object_create(
4643 (memory_object_size_t) object->vo_size,
4644 &pager);
4645 if (pager == NULL) {
4646 panic("vm_object_compressor_pager_create(): "
4647 "no pager for object %p size 0x%llx\n",
4648 object, (uint64_t) object->vo_size);
4649 }
4650 }
4651
4652 /*
4653 * A reference was returned by
4654 * memory_object_create(), and it is
4655 * copied by vm_object_memory_object_associate().
4656 */
4657
4658 pager_object = vm_object_memory_object_associate(pager,
4659 object,
4660 object->vo_size,
4661 FALSE);
4662 if (pager_object != object) {
4663 panic("vm_object_compressor_pager_create: mismatch (pager: %p, pager_object: %p, orig_object: %p, orig_object size: 0x%llx)", pager, pager_object, object, (uint64_t) object->vo_size);
4664 }
4665
4666 /*
4667 * Drop the reference we were passed.
4668 */
4669 memory_object_deallocate(pager);
4670
4671 vm_object_lock(object);
4672
4673 /*
4674 * Release the paging reference
4675 */
4676 vm_object_paging_end(object);
4677 }
4678
4679 vm_external_state_t
4680 vm_object_compressor_pager_state_get(
4681 vm_object_t object,
4682 vm_object_offset_t offset)
4683 {
4684 if (__probable(not_in_kdp)) {
4685 vm_object_lock_assert_held(object);
4686 }
4687 if (object->internal &&
4688 object->pager != NULL &&
4689 !object->terminating &&
4690 object->alive) {
4691 return vm_compressor_pager_state_get(object->pager,
4692 offset + object->paging_offset);
4693 } else {
4694 return VM_EXTERNAL_STATE_UNKNOWN;
4695 }
4696 }
4697
4698 void
4699 vm_object_compressor_pager_state_clr(
4700 vm_object_t object,
4701 vm_object_offset_t offset)
4702 {
4703 unsigned int num_pages_cleared;
4704 vm_object_lock_assert_exclusive(object);
4705 if (object->internal &&
4706 object->pager != NULL &&
4707 !object->terminating &&
4708 object->alive) {
4709 num_pages_cleared = vm_compressor_pager_state_clr(object->pager,
4710 offset + object->paging_offset);
4711 if (num_pages_cleared) {
4712 vm_compressor_pager_count(object->pager,
4713 -num_pages_cleared,
4714 FALSE, /* shared */
4715 object);
4716 }
4717 if (num_pages_cleared &&
4718 (object->purgable != VM_PURGABLE_DENY || object->vo_ledger_tag)) {
4719 /* less compressed purgeable/tagged pages */
4720 assert3u(num_pages_cleared, ==, 1);
4721 vm_object_owner_compressed_update(object, -num_pages_cleared);
4722 }
4723 }
4724 }
4725
4726 /*
4727 * Global variables for vm_object_collapse():
4728 *
4729 * Counts for normal collapses and bypasses.
4730 * Debugging variables, to watch or disable collapse.
4731 */
4732 static long object_collapses = 0;
4733 static long object_bypasses = 0;
4734
4735 static boolean_t vm_object_collapse_allowed = TRUE;
4736 static boolean_t vm_object_bypass_allowed = TRUE;
4737
4738 void vm_object_do_collapse_compressor(vm_object_t object,
4739 vm_object_t backing_object);
4740 void
4741 vm_object_do_collapse_compressor(
4742 vm_object_t object,
4743 vm_object_t backing_object)
4744 {
4745 vm_object_offset_t new_offset, backing_offset;
4746 vm_object_size_t size;
4747
4748 vm_counters.do_collapse_compressor++;
4749
4750 vm_object_lock_assert_exclusive(object);
4751 vm_object_lock_assert_exclusive(backing_object);
4752
4753 size = object->vo_size;
4754
4755 /*
4756 * Move all compressed pages from backing_object
4757 * to the parent.
4758 */
4759
4760 for (backing_offset = object->vo_shadow_offset;
4761 backing_offset < object->vo_shadow_offset + object->vo_size;
4762 backing_offset += PAGE_SIZE) {
4763 memory_object_offset_t backing_pager_offset;
4764
4765 /* find the next compressed page at or after this offset */
4766 backing_pager_offset = (backing_offset +
4767 backing_object->paging_offset);
4768 backing_pager_offset = vm_compressor_pager_next_compressed(
4769 backing_object->pager,
4770 backing_pager_offset);
4771 if (backing_pager_offset == (memory_object_offset_t) -1) {
4772 /* no more compressed pages */
4773 break;
4774 }
4775 backing_offset = (backing_pager_offset -
4776 backing_object->paging_offset);
4777
4778 new_offset = backing_offset - object->vo_shadow_offset;
4779
4780 if (new_offset >= object->vo_size) {
4781 /* we're out of the scope of "object": done */
4782 break;
4783 }
4784
4785 if ((vm_page_lookup(object, new_offset) != VM_PAGE_NULL) ||
4786 (vm_compressor_pager_state_get(object->pager,
4787 (new_offset +
4788 object->paging_offset)) ==
4789 VM_EXTERNAL_STATE_EXISTS)) {
4790 /*
4791 * This page already exists in object, resident or
4792 * compressed.
4793 * We don't need this compressed page in backing_object
4794 * and it will be reclaimed when we release
4795 * backing_object.
4796 */
4797 continue;
4798 }
4799
4800 /*
4801 * backing_object has this page in the VM compressor and
4802 * we need to transfer it to object.
4803 */
4804 vm_counters.do_collapse_compressor_pages++;
4805 vm_compressor_pager_transfer(
4806 /* destination: */
4807 object->pager,
4808 (new_offset + object->paging_offset),
4809 /* source: */
4810 backing_object->pager,
4811 (backing_offset + backing_object->paging_offset));
4812 }
4813 }
4814
4815 /*
4816 * Routine: vm_object_do_collapse
4817 * Purpose:
4818 * Collapse an object with the object backing it.
4819 * Pages in the backing object are moved into the
4820 * parent, and the backing object is deallocated.
4821 * Conditions:
4822 * Both objects and the cache are locked; the page
4823 * queues are unlocked.
4824 *
4825 */
4826 static void
4827 vm_object_do_collapse(
4828 vm_object_t object,
4829 vm_object_t backing_object)
4830 {
4831 vm_page_t p, pp;
4832 vm_object_offset_t new_offset, backing_offset;
4833 vm_object_size_t size;
4834
4835 vm_object_lock_assert_exclusive(object);
4836 vm_object_lock_assert_exclusive(backing_object);
4837
4838 assert(object->purgable == VM_PURGABLE_DENY);
4839 assert(backing_object->purgable == VM_PURGABLE_DENY);
4840
4841 backing_offset = object->vo_shadow_offset;
4842 size = object->vo_size;
4843
4844 /*
4845 * Move all in-memory pages from backing_object
4846 * to the parent. Pages that have been paged out
4847 * will be overwritten by any of the parent's
4848 * pages that shadow them.
4849 */
4850
4851 while (!vm_page_queue_empty(&backing_object->memq)) {
4852 p = (vm_page_t) vm_page_queue_first(&backing_object->memq);
4853
4854 new_offset = (p->vmp_offset - backing_offset);
4855
4856 assert(!p->vmp_busy || p->vmp_absent);
4857
4858 /*
4859 * If the parent has a page here, or if
4860 * this page falls outside the parent,
4861 * dispose of it.
4862 *
4863 * Otherwise, move it as planned.
4864 */
4865
4866 if (p->vmp_offset < backing_offset || new_offset >= size) {
4867 VM_PAGE_FREE(p);
4868 } else {
4869 pp = vm_page_lookup(object, new_offset);
4870 if (pp == VM_PAGE_NULL) {
4871 if (vm_object_compressor_pager_state_get(object,
4872 new_offset)
4873 == VM_EXTERNAL_STATE_EXISTS) {
4874 /*
4875 * Parent object has this page
4876 * in the VM compressor.
4877 * Throw away the backing
4878 * object's page.
4879 */
4880 VM_PAGE_FREE(p);
4881 } else {
4882 /*
4883 * Parent now has no page.
4884 * Move the backing object's page
4885 * up.
4886 */
4887 vm_page_rename(p, object, new_offset);
4888 }
4889 } else {
4890 assert(!pp->vmp_absent);
4891
4892 /*
4893 * Parent object has a real page.
4894 * Throw away the backing object's
4895 * page.
4896 */
4897 VM_PAGE_FREE(p);
4898 }
4899 }
4900 }
4901
4902 if (vm_object_collapse_compressor_allowed &&
4903 object->pager != MEMORY_OBJECT_NULL &&
4904 backing_object->pager != MEMORY_OBJECT_NULL) {
4905 /* move compressed pages from backing_object to object */
4906 vm_object_do_collapse_compressor(object, backing_object);
4907 } else if (backing_object->pager != MEMORY_OBJECT_NULL) {
4908 assert((!object->pager_created &&
4909 (object->pager == MEMORY_OBJECT_NULL)) ||
4910 (!backing_object->pager_created &&
4911 (backing_object->pager == MEMORY_OBJECT_NULL)));
4912 /*
4913 * Move the pager from backing_object to object.
4914 *
4915 * XXX We're only using part of the paging space
4916 * for keeps now... we ought to discard the
4917 * unused portion.
4918 */
4919
4920 assert(!object->paging_in_progress);
4921 assert(!object->activity_in_progress);
4922 assert(!object->pager_created);
4923 assert(object->pager == NULL);
4924 object->pager = backing_object->pager;
4925
4926 VM_OBJECT_SET_PAGER_CREATED(object, backing_object->pager_created);
4927 object->pager_control = backing_object->pager_control;
4928 VM_OBJECT_SET_PAGER_READY(object, backing_object->pager_ready);
4929 VM_OBJECT_SET_PAGER_INITIALIZED(object, backing_object->pager_initialized);
4930 object->paging_offset =
4931 backing_object->paging_offset + backing_offset;
4932 if (object->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
4933 memory_object_control_collapse(&object->pager_control,
4934 object);
4935 }
4936 /* the backing_object has lost its pager: reset all fields */
4937 VM_OBJECT_SET_PAGER_CREATED(backing_object, FALSE);
4938 backing_object->pager_control = NULL;
4939 VM_OBJECT_SET_PAGER_READY(backing_object, FALSE);
4940 backing_object->paging_offset = 0;
4941 backing_object->pager = NULL;
4942 }
4943 /*
4944 * Object now shadows whatever backing_object did.
4945 * Note that the reference to backing_object->shadow
4946 * moves from within backing_object to within object.
4947 */
4948
4949 assert(!object->phys_contiguous);
4950 assert(!backing_object->phys_contiguous);
4951 object->shadow = backing_object->shadow;
4952 if (object->shadow) {
4953 assertf(page_aligned(object->vo_shadow_offset),
4954 "object %p shadow_offset 0x%llx",
4955 object, object->vo_shadow_offset);
4956 assertf(page_aligned(backing_object->vo_shadow_offset),
4957 "backing_object %p shadow_offset 0x%llx",
4958 backing_object, backing_object->vo_shadow_offset);
4959 object->vo_shadow_offset += backing_object->vo_shadow_offset;
4960 /* "backing_object" gave its shadow to "object" */
4961 backing_object->shadow = VM_OBJECT_NULL;
4962 backing_object->vo_shadow_offset = 0;
4963 } else {
4964 /* no shadow, therefore no shadow offset... */
4965 object->vo_shadow_offset = 0;
4966 }
4967 assert((object->shadow == VM_OBJECT_NULL) ||
4968 (object->shadow->vo_copy != backing_object));
4969
4970 /*
4971 * Discard backing_object.
4972 *
4973 * Since the backing object has no pages, no
4974 * pager left, and no object references within it,
4975 * all that is necessary is to dispose of it.
4976 */
4977 object_collapses++;
4978
4979 assert(os_ref_get_count_raw(&backing_object->ref_count) == 1);
4980 assert(backing_object->resident_page_count == 0);
4981 assert(backing_object->paging_in_progress == 0);
4982 assert(backing_object->activity_in_progress == 0);
4983 assert(backing_object->shadow == VM_OBJECT_NULL);
4984 assert(backing_object->vo_shadow_offset == 0);
4985
4986 if (backing_object->pager != MEMORY_OBJECT_NULL) {
4987 /* ... unless it has a pager; need to terminate pager too */
4988 vm_counters.do_collapse_terminate++;
4989 if (vm_object_terminate(backing_object) != KERN_SUCCESS) {
4990 vm_counters.do_collapse_terminate_failure++;
4991 }
4992 return;
4993 }
4994
4995 assert(backing_object->pager == NULL);
4996
4997 VM_OBJECT_SET_ALIVE(backing_object, FALSE);
4998 vm_object_unlock(backing_object);
4999
5000 #if VM_OBJECT_TRACKING
5001 if (vm_object_tracking_btlog) {
5002 btlog_erase(vm_object_tracking_btlog, backing_object);
5003 }
5004 #endif /* VM_OBJECT_TRACKING */
5005
5006 vm_object_lock_destroy(backing_object);
5007
5008 zfree(vm_object_zone, backing_object);
5009 }
5010
5011 static void
5012 vm_object_do_bypass(
5013 vm_object_t object,
5014 vm_object_t backing_object)
5015 {
5016 /*
5017 * Make the parent shadow the next object
5018 * in the chain.
5019 */
5020
5021 vm_object_lock_assert_exclusive(object);
5022 vm_object_lock_assert_exclusive(backing_object);
5023
5024 vm_object_reference(backing_object->shadow);
5025
5026 assert(!object->phys_contiguous);
5027 assert(!backing_object->phys_contiguous);
5028 object->shadow = backing_object->shadow;
5029 if (object->shadow) {
5030 assertf(page_aligned(object->vo_shadow_offset),
5031 "object %p shadow_offset 0x%llx",
5032 object, object->vo_shadow_offset);
5033 assertf(page_aligned(backing_object->vo_shadow_offset),
5034 "backing_object %p shadow_offset 0x%llx",
5035 backing_object, backing_object->vo_shadow_offset);
5036 object->vo_shadow_offset += backing_object->vo_shadow_offset;
5037 } else {
5038 /* no shadow, therefore no shadow offset... */
5039 object->vo_shadow_offset = 0;
5040 }
5041
5042 /*
5043 * Backing object might have had a copy pointer
5044 * to us. If it did, clear it.
5045 */
5046 if (backing_object->vo_copy == object) {
5047 VM_OBJECT_COPY_SET(backing_object, VM_OBJECT_NULL);
5048 }
5049
5050 /*
5051 * Drop the reference count on backing_object.
5052 #if TASK_SWAPPER
5053 * Since its ref_count was at least 2, it
5054 * will not vanish; so we don't need to call
5055 * vm_object_deallocate.
5056 * [with a caveat for "named" objects]
5057 *
5058 * The res_count on the backing object is
5059 * conditionally decremented. It's possible
5060 * (via vm_pageout_scan) to get here with
5061 * a "swapped" object, which has a 0 res_count,
5062 * in which case, the backing object res_count
5063 * is already down by one.
5064 #else
5065 * Don't call vm_object_deallocate unless
5066 * ref_count drops to zero.
5067 *
5068 * The ref_count can drop to zero here if the
5069 * backing object could be bypassed but not
5070 * collapsed, such as when the backing object
5071 * is temporary and cachable.
5072 #endif
5073 */
5074 if (os_ref_get_count_raw(&backing_object->ref_count) > 2 ||
5075 (!backing_object->named &&
5076 os_ref_get_count_raw(&backing_object->ref_count) > 1)) {
5077 vm_object_lock_assert_exclusive(backing_object);
5078 os_ref_release_live_locked_raw(&backing_object->ref_count,
5079 &vm_object_refgrp);
5080 vm_object_unlock(backing_object);
5081 } else {
5082 /*
5083 * Drop locks so that we can deallocate
5084 * the backing object.
5085 */
5086
5087 /*
5088 * vm_object_collapse (the caller of this function) is
5089 * now called from contexts that may not guarantee that a
5090 * valid reference is held on the object... w/o a valid
5091 * reference, it is unsafe and unwise (you will definitely
5092 * regret it) to unlock the object and then retake the lock
5093 * since the object may be terminated and recycled in between.
5094 * The "activity_in_progress" reference will keep the object
5095 * 'stable'.
5096 */
5097 vm_object_activity_begin(object);
5098 vm_object_unlock(object);
5099
5100 vm_object_unlock(backing_object);
5101 vm_object_deallocate(backing_object);
5102
5103 /*
5104 * Relock object. We don't have to reverify
5105 * its state since vm_object_collapse will
5106 * do that for us as it starts at the
5107 * top of its loop.
5108 */
5109
5110 vm_object_lock(object);
5111 vm_object_activity_end(object);
5112 }
5113
5114 object_bypasses++;
5115 }
5116
5117
/*
 *	vm_object_collapse:
 *
 *	Perform an object collapse or an object bypass if appropriate.
 *	The real work of collapsing and bypassing is performed in
 *	the routines vm_object_do_collapse and vm_object_do_bypass.
 *
 *	Requires that the object be locked and the page queues be unlocked.
 *
 *	Parameters:
 *		object		Top of the shadow chain to examine.  Must be
 *				locked exclusive by the caller (asserted).
 *				A VM_OBJECT_NULL object is a no-op.
 *		hint_offset	Page-aligned (asserted) offset used as the
 *				starting point when looking for a page of
 *				the backing object that "shows through";
 *				it is replaced by object->cow_hint when
 *				that hint is set (i.e. not ~0).
 *		can_bypass	TRUE if the caller allows bypassing; a bypass
 *				additionally requires vm_object_bypass_allowed.
 *
 *	The routine walks down the shadow chain.  Every object locked
 *	here other than the caller's original object is unlocked before
 *	moving further down the chain, before restarting, or before
 *	returning.  The walk ends when an object without a shadow is
 *	reached.
 *
 */
/* statistics: calls, chain links examined, collapses and bypasses done */
static unsigned long vm_object_collapse_calls = 0;
static unsigned long vm_object_collapse_objects = 0;
static unsigned long vm_object_collapse_do_collapse = 0;
static unsigned long vm_object_collapse_do_bypass = 0;

__private_extern__ void
vm_object_collapse(
	vm_object_t                             object,
	vm_object_offset_t                      hint_offset,
	boolean_t                               can_bypass)
{
	vm_object_t                             backing_object;
	vm_object_size_t                        object_vcount, object_rcount;
	vm_object_t                             original_object;
	/*
	 * NOTE(review): object_lock_type is assigned throughout this
	 * function but is not read anywhere in it.
	 */
	int                                     object_lock_type;
	int                                     backing_object_lock_type;

	vm_object_collapse_calls++;

	assertf(page_aligned(hint_offset), "hint_offset 0x%llx", hint_offset);

	/* nothing to do if neither collapsing nor bypassing is possible */
	if (!vm_object_collapse_allowed &&
	    !(can_bypass && vm_object_bypass_allowed)) {
		return;
	}

	if (object == VM_OBJECT_NULL) {
		return;
	}

	original_object = object;

	/*
	 * The top object was locked "exclusive" by the caller.
	 * In the first pass, to determine if we can collapse the shadow chain,
	 * take a "shared" lock on the shadow objects.  If we can collapse,
	 * we'll have to go down the chain again with exclusive locks.
	 */
	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
	backing_object_lock_type = OBJECT_LOCK_SHARED;

retry:
	/* (re)start the walk from the caller's object */
	object = original_object;
	vm_object_lock_assert_exclusive(object);

	while (TRUE) {
		vm_object_collapse_objects++;
		/*
		 *	Verify that the conditions are right for either
		 *	collapse or bypass:
		 */

		/*
		 *	There is a backing object, and
		 */

		backing_object = object->shadow;
		if (backing_object == VM_OBJECT_NULL) {
			/* end of the chain: this is the only normal exit */
			if (object != original_object) {
				vm_object_unlock(object);
			}
			return;
		}
		if (backing_object_lock_type == OBJECT_LOCK_SHARED) {
			vm_object_lock_shared(backing_object);
		} else {
			vm_object_lock(backing_object);
		}

		/*
		 *	No pages in the object are currently
		 *	being paged out, and
		 */
		if (object->paging_in_progress != 0 ||
		    object->activity_in_progress != 0) {
			/* try and collapse the rest of the shadow chain */
			if (object != original_object) {
				vm_object_unlock(object);
			}
			object = backing_object;
			object_lock_type = backing_object_lock_type;
			continue;
		}

		/*
		 *	...
		 *		The backing object is not read_only,
		 *		and no pages in the backing object are
		 *		currently being paged out.
		 *		The backing object is internal.
		 *
		 */

		if (!backing_object->internal ||
		    backing_object->paging_in_progress != 0 ||
		    backing_object->activity_in_progress != 0) {
			/* try and collapse the rest of the shadow chain */
			if (object != original_object) {
				vm_object_unlock(object);
			}
			object = backing_object;
			object_lock_type = backing_object_lock_type;
			continue;
		}

		/*
		 * Purgeable objects are not supposed to engage in
		 * copy-on-write activities, so should not have
		 * any shadow objects or be a shadow object to another
		 * object.
		 * Collapsing a purgeable object would require some
		 * updates to the purgeable compressed ledgers.
		 */
		if (object->purgable != VM_PURGABLE_DENY ||
		    backing_object->purgable != VM_PURGABLE_DENY) {
			panic("vm_object_collapse() attempting to collapse "
			    "purgeable object: %p(%d) %p(%d)\n",
			    object, object->purgable,
			    backing_object, backing_object->purgable);
			/*
			 * NOTE(review): the statements below only run if
			 * panic() returns, which it presumably never does.
			 */
			/* try and collapse the rest of the shadow chain */
			if (object != original_object) {
				vm_object_unlock(object);
			}
			object = backing_object;
			object_lock_type = backing_object_lock_type;
			continue;
		}

		/*
		 *	The backing object can't be a copy-object:
		 *	the shadow_offset for the copy-object must stay
		 *	as 0.  Furthermore (for the 'we have all the
		 *	pages' case), if we bypass backing_object and
		 *	just shadow the next object in the chain, old
		 *	pages from that object would then have to be copied
		 *	BOTH into the (former) backing_object and into the
		 *	parent object.
		 */
		if (backing_object->shadow != VM_OBJECT_NULL &&
		    backing_object->shadow->vo_copy == backing_object) {
			/* try and collapse the rest of the shadow chain */
			if (object != original_object) {
				vm_object_unlock(object);
			}
			object = backing_object;
			object_lock_type = backing_object_lock_type;
			continue;
		}

		/*
		 *	We can now try to either collapse the backing
		 *	object (if the parent is the only reference to
		 *	it) or (perhaps) remove the parent's reference
		 *	to it.
		 *
		 *	If there is exactly one reference to the backing
		 *	object, we may be able to collapse it into the
		 *	parent.
		 *
		 *	As long as one of the objects is still not known
		 *	to the pager, we can collapse them.
		 */
		if (os_ref_get_count_raw(&backing_object->ref_count) == 1 &&
		    (vm_object_collapse_compressor_allowed ||
		    !object->pager_created
		    || (!backing_object->pager_created)
		    ) && vm_object_collapse_allowed) {
			/*
			 * We need the exclusive lock on the VM objects.
			 */
			if (backing_object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
				/*
				 * We have an object and its shadow locked
				 * "shared".  We can't just upgrade the locks
				 * to "exclusive", as some other thread might
				 * also have these objects locked "shared" and
				 * attempt to upgrade one or the other to
				 * "exclusive".  The upgrades would block
				 * forever waiting for the other "shared" locks
				 * to get released.
				 * So we have to release the locks and go
				 * down the shadow chain again (since it could
				 * have changed) with "exclusive" locking.
				 */
				vm_object_unlock(backing_object);
				if (object != original_object) {
					vm_object_unlock(object);
				}
				object_lock_type = OBJECT_LOCK_EXCLUSIVE;
				backing_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
				goto retry;
			}

			/*
			 *	Collapse the object with its backing
			 *	object, and try again with the object's
			 *	new backing object.
			 */

			vm_object_do_collapse(object, backing_object);
			vm_object_collapse_do_collapse++;
			continue;
		}

		/*
		 *	Collapsing the backing object was not possible
		 *	or permitted, so let's try bypassing it.
		 */

		if (!(can_bypass && vm_object_bypass_allowed)) {
			/* try and collapse the rest of the shadow chain */
			if (object != original_object) {
				vm_object_unlock(object);
			}
			object = backing_object;
			object_lock_type = backing_object_lock_type;
			continue;
		}


		/*
		 *	If the object doesn't have all its pages present,
		 *	we have to make sure no pages in the backing object
		 *	"show through" before bypassing it.
		 */
		/* vcount: size of the object in pages; rcount: resident pages */
		object_vcount = object->vo_size >> PAGE_SHIFT;
		object_rcount = (vm_object_size_t)object->resident_page_count;

		if (object_rcount != object_vcount) {
			vm_object_offset_t      offset;
			vm_object_offset_t      backing_offset;
			vm_object_size_t        backing_rcount, backing_vcount;

			/*
			 *	If the backing object has a pager but no pagemap,
			 *	then we cannot bypass it, because we don't know
			 *	what pages it has.
			 */
			if (backing_object->pager_created) {
				/* try and collapse the rest of the shadow chain */
				if (object != original_object) {
					vm_object_unlock(object);
				}
				object = backing_object;
				object_lock_type = backing_object_lock_type;
				continue;
			}

			/*
			 *	If the object has a pager but no pagemap,
			 *	then we cannot bypass it, because we don't know
			 *	what pages it has.
			 */
			if (object->pager_created) {
				/* try and collapse the rest of the shadow chain */
				if (object != original_object) {
					vm_object_unlock(object);
				}
				object = backing_object;
				object_lock_type = backing_object_lock_type;
				continue;
			}

			backing_offset = object->vo_shadow_offset;
			backing_vcount = backing_object->vo_size >> PAGE_SHIFT;
			backing_rcount = (vm_object_size_t)backing_object->resident_page_count;
			assert(backing_vcount >= object_vcount);

			if (backing_rcount > (backing_vcount - object_vcount) &&
			    backing_rcount - (backing_vcount - object_vcount) > object_rcount) {
				/*
				 * we have enough pages in the backing object to guarantee that
				 * at least 1 of them must be 'uncovered' by a resident page
				 * in the object we're evaluating, so move on and
				 * try to collapse the rest of the shadow chain
				 */
				if (object != original_object) {
					vm_object_unlock(object);
				}
				object = backing_object;
				object_lock_type = backing_object_lock_type;
				continue;
			}

			/*
			 *	If all of the pages in the backing object are
			 *	shadowed by the parent object, the parent
			 *	object no longer has to shadow the backing
			 *	object; it can shadow the next one in the
			 *	chain.
			 *
			 *	If the backing object has existence info,
			 *	we must examine its existence info
			 *	as well.
			 *
			 */

			/*
			 * EXISTS_IN_OBJECT(obj, off, rc) is true when the page
			 * at "off" in "obj" is reported as existing by
			 * vm_object_compressor_pager_state_get(), or when it is
			 * found resident by vm_page_lookup().
			 *
			 * "rc" is a countdown of resident pages not yet seen:
			 * the resident lookup is skipped once it reaches 0, and
			 * it is DECREMENTED as a side effect each time a resident
			 * page is found.  "rc" must therefore be a modifiable
			 * lvalue, and is evaluated more than once.
			 */
#define EXISTS_IN_OBJECT(obj, off, rc) \
	((vm_object_compressor_pager_state_get((obj), (off)) \
	== VM_EXTERNAL_STATE_EXISTS) || \
	((rc) && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--))

			/*
			 * Check the hint location first
			 * (since it is often the quickest way out of here).
			 */
			if (object->cow_hint != ~(vm_offset_t)0) {
				hint_offset = (vm_object_offset_t)object->cow_hint;
			} else {
				/* no stored hint: back up 8 pages from the caller's hint */
				hint_offset = (hint_offset > 8 * PAGE_SIZE_64) ?
				    (hint_offset - 8 * PAGE_SIZE_64) : 0;
			}

			if (EXISTS_IN_OBJECT(backing_object, hint_offset +
			    backing_offset, backing_rcount) &&
			    !EXISTS_IN_OBJECT(object, hint_offset, object_rcount)) {
				/* dependency right at the hint */
				object->cow_hint = (vm_offset_t) hint_offset; /* atomic */
				/* try and collapse the rest of the shadow chain */
				if (object != original_object) {
					vm_object_unlock(object);
				}
				object = backing_object;
				object_lock_type = backing_object_lock_type;
				continue;
			}

			/*
			 * If the object's window onto the backing_object
			 * is large compared to the number of resident
			 * pages in the backing object, it makes sense to
			 * walk the backing_object's resident pages first.
			 *
			 * NOTE: Pages may be in both the existence map and/or
			 * resident, so if we don't find a dependency while
			 * walking the backing object's resident page list
			 * directly, and there is an existence map, we'll have
			 * to run the offset based 2nd pass.  Because we may
			 * have to run both passes, we need to be careful
			 * not to decrement 'rcount' in the 1st pass
			 */
			if (backing_rcount && backing_rcount < (object_vcount / 8)) {
				/* scratch copy so object_rcount survives this pass */
				vm_object_size_t rc = object_rcount;
				vm_page_t p;

				/*
				 * backing_rcount is reloaded and used as the loop
				 * counter; it reaches 0 only if every resident
				 * page was visited without finding a dependency.
				 */
				backing_rcount = backing_object->resident_page_count;
				p = (vm_page_t)vm_page_queue_first(&backing_object->memq);
				do {
					/* translate into the parent object's offset space */
					offset = (p->vmp_offset - backing_offset);

					if (offset < object->vo_size &&
					    offset != hint_offset &&
					    !EXISTS_IN_OBJECT(object, offset, rc)) {
						/* found a dependency */
						object->cow_hint = (vm_offset_t) offset; /* atomic */

						break;
					}
					p = (vm_page_t) vm_page_queue_next(&p->vmp_listq);
				} while (--backing_rcount);
				if (backing_rcount != 0) {
					/* try and collapse the rest of the shadow chain */
					if (object != original_object) {
						vm_object_unlock(object);
					}
					object = backing_object;
					object_lock_type = backing_object_lock_type;
					continue;
				}
			}

			/*
			 * Walk through the offsets looking for pages in the
			 * backing object that show through to the object.
			 */
			if (backing_rcount) {
				offset = hint_offset;

				/*
				 * Step one page at a time starting just after
				 * the hint, wrapping to 0 at the end of the
				 * object, until we are back at the hint.
				 */
				while ((offset =
				    (offset + PAGE_SIZE_64 < object->vo_size) ?
				    (offset + PAGE_SIZE_64) : 0) != hint_offset) {
					if (EXISTS_IN_OBJECT(backing_object, offset +
					    backing_offset, backing_rcount) &&
					    !EXISTS_IN_OBJECT(object, offset, object_rcount)) {
						/* found a dependency */
						object->cow_hint = (vm_offset_t) offset; /* atomic */
						break;
					}
				}
				/* stopped before coming full circle: dependency found */
				if (offset != hint_offset) {
					/* try and collapse the rest of the shadow chain */
					if (object != original_object) {
						vm_object_unlock(object);
					}
					object = backing_object;
					object_lock_type = backing_object_lock_type;
					continue;
				}
			}
		}

		/*
		 * We need "exclusive" locks on the 2 VM objects.
		 */
		if (backing_object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
			/* drop what we hold and redo the walk with exclusive locks */
			vm_object_unlock(backing_object);
			if (object != original_object) {
				vm_object_unlock(object);
			}
			object_lock_type = OBJECT_LOCK_EXCLUSIVE;
			backing_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
			goto retry;
		}

		/* reset the offset hint for any objects deeper in the chain */
		object->cow_hint = (vm_offset_t)0;

		/*
		 *	All interesting pages in the backing object
		 *	already live in the parent or its pager.
		 *	Thus we can bypass the backing object.
		 */

		vm_object_do_bypass(object, backing_object);
		vm_object_collapse_do_bypass++;

		/*
		 *	Try again with this object's new backing object.
		 */

		continue;
	}

	/* NOT REACHED */
	/*
	 *  if (object != original_object) {
	 *       vm_object_unlock(object);
	 *  }
	 */
}
5568
5569 /*
5570 * Routine: vm_object_page_remove: [internal]
5571 * Purpose:
5572 * Removes all physical pages in the specified
5573 * object range from the object's list of pages.
5574 *
5575 * In/out conditions:
5576 * The object must be locked.
5577 * The object must not have paging_in_progress, usually
5578 * guaranteed by not having a pager.
5579 */
5580 unsigned int vm_object_page_remove_lookup = 0;
5581 unsigned int vm_object_page_remove_iterate = 0;
5582
5583 __private_extern__ void
5584 vm_object_page_remove(
5585 vm_object_t object,
5586 vm_object_offset_t start,
5587 vm_object_offset_t end)
5588 {
5589 vm_page_t p, next;
5590
5591 /*
5592 * One and two page removals are most popular.
5593 * The factor of 16 here is somewhat arbitrary.
5594 * It balances vm_object_lookup vs iteration.
5595 */
5596
5597 if (atop_64(end - start) < (unsigned)object->resident_page_count / 16) {
5598 vm_object_page_remove_lookup++;
5599
5600 for (; start < end; start += PAGE_SIZE_64) {
5601 p = vm_page_lookup(object, start);
5602 if (p != VM_PAGE_NULL) {
5603 assert(!p->vmp_cleaning && !p->vmp_laundry);
5604 if (!vm_page_is_fictitious(p) && p->vmp_pmapped) {
5605 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));
5606 }
5607 VM_PAGE_FREE(p);
5608 }
5609 }
5610 } else {
5611 vm_object_page_remove_iterate++;
5612
5613 p = (vm_page_t) vm_page_queue_first(&object->memq);
5614 while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t) p)) {
5615 next = (vm_page_t) vm_page_queue_next(&p->vmp_listq);
5616 if ((start <= p->vmp_offset) && (p->vmp_offset < end)) {
5617 assert(!p->vmp_cleaning && !p->vmp_laundry);
5618 if (!vm_page_is_fictitious(p) && p->vmp_pmapped) {
5619 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));
5620 }
5621 VM_PAGE_FREE(p);
5622 }
5623 p = next;
5624 }
5625 }
5626 }
5627
5628
5629 /*
5630 * Routine: vm_object_coalesce
5631 * Function: Coalesces two objects backing up adjoining
5632 * regions of memory into a single object.
5633 *
5634 * returns TRUE if objects were combined.
5635 *
5636 * NOTE: Only works at the moment if the second object is NULL -
5637 * if it's not, which object do we lock first?
5638 *
5639 * Parameters:
5640 * prev_object First object to coalesce
5641 * prev_offset Offset into prev_object
5642 * next_object Second object into coalesce
5643 * next_offset Offset into next_object
5644 *
5645 * prev_size Size of reference to prev_object
5646 * next_size Size of reference to next_object
5647 *
5648 * Conditions:
5649 * The object(s) must *not* be locked. The map must be locked
5650 * to preserve the reference to the object(s).
5651 */
5652 static int vm_object_coalesce_count = 0;
5653
5654 __private_extern__ boolean_t
5655 vm_object_coalesce(
5656 vm_object_t prev_object,
5657 vm_object_t next_object,
5658 vm_object_offset_t prev_offset,
5659 __unused vm_object_offset_t next_offset,
5660 vm_object_size_t prev_size,
5661 vm_object_size_t next_size)
5662 {
5663 vm_object_size_t newsize;
5664
5665 #ifdef lint
5666 next_offset++;
5667 #endif /* lint */
5668
5669 if (next_object != VM_OBJECT_NULL) {
5670 return FALSE;
5671 }
5672
5673 if (prev_object == VM_OBJECT_NULL) {
5674 return TRUE;
5675 }
5676
5677 vm_object_lock(prev_object);
5678
5679 /*
5680 * Try to collapse the object first
5681 */
5682 vm_object_collapse(prev_object, prev_offset, TRUE);
5683
5684 /*
5685 * Can't coalesce if pages not mapped to
5686 * prev_entry may be in use any way:
5687 * . more than one reference
5688 * . paged out
5689 * . shadows another object
5690 * . has a copy elsewhere
5691 * . is purgeable
5692 * . paging references (pages might be in page-list)
5693 */
5694
5695 if ((os_ref_get_count_raw(&prev_object->ref_count) > 1) ||
5696 prev_object->pager_created ||
5697 prev_object->phys_contiguous ||
5698 (prev_object->shadow != VM_OBJECT_NULL) ||
5699 (prev_object->vo_copy != VM_OBJECT_NULL) ||
5700 (prev_object->true_share != FALSE) ||
5701 (prev_object->purgable != VM_PURGABLE_DENY) ||
5702 (prev_object->paging_in_progress != 0) ||
5703 (prev_object->activity_in_progress != 0)) {
5704 vm_object_unlock(prev_object);
5705 return FALSE;
5706 }
5707 /* newsize = prev_offset + prev_size + next_size; */
5708 if (__improbable(os_add3_overflow(prev_offset, prev_size, next_size,
5709 &newsize))) {
5710 vm_object_unlock(prev_object);
5711 return FALSE;
5712 }
5713
5714 vm_object_coalesce_count++;
5715
5716 /*
5717 * Remove any pages that may still be in the object from
5718 * a previous deallocation.
5719 */
5720 vm_object_page_remove(prev_object,
5721 prev_offset + prev_size,
5722 prev_offset + prev_size + next_size);
5723
5724 /*
5725 * Extend the object if necessary.
5726 */
5727 if (newsize > prev_object->vo_size) {
5728 assertf(page_aligned(newsize),
5729 "object %p size 0x%llx",
5730 prev_object, (uint64_t)newsize);
5731 prev_object->vo_size = newsize;
5732 }
5733
5734 vm_object_unlock(prev_object);
5735 return TRUE;
5736 }
5737
5738 kern_return_t
5739 vm_object_populate_with_private(
5740 vm_object_t object,
5741 vm_object_offset_t offset,
5742 ppnum_t phys_page,
5743 vm_size_t size)
5744 {
5745 ppnum_t base_page;
5746 vm_object_offset_t base_offset;
5747
5748
5749 if (!object->private) {
5750 return KERN_FAILURE;
5751 }
5752
5753 base_page = phys_page;
5754
5755 vm_object_lock(object);
5756
5757 if (!object->phys_contiguous) {
5758 vm_page_t m;
5759
5760 if ((base_offset = trunc_page_64(offset)) != offset) {
5761 vm_object_unlock(object);
5762 return KERN_FAILURE;
5763 }
5764 base_offset += object->paging_offset;
5765
5766 while (size) {
5767 m = vm_page_lookup(object, base_offset);
5768
5769 if (m != VM_PAGE_NULL) {
5770 ppnum_t m_phys_page = VM_PAGE_GET_PHYS_PAGE(m);
5771
5772 if (m_phys_page == vm_page_guard_addr) {
5773 /* nothing to do */
5774 } else if (m_phys_page == vm_page_fictitious_addr) {
5775 vm_page_lockspin_queues();
5776 vm_page_make_private(m, base_page);
5777 vm_page_unlock_queues();
5778 } else if (m_phys_page != base_page) {
5779 if (!vm_page_is_private(m)) {
5780 /*
5781 * we'd leak a real page... that can't be right
5782 */
5783 panic("vm_object_populate_with_private - %p not private", m);
5784 }
5785 if (m->vmp_pmapped) {
5786 /*
5787 * pmap call to clear old mapping
5788 */
5789 pmap_disconnect(m_phys_page);
5790 }
5791 VM_PAGE_SET_PHYS_PAGE(m, base_page);
5792 }
5793 } else {
5794 m = vm_page_create_private(base_page);
5795
5796 m->vmp_unusual = TRUE;
5797 m->vmp_busy = FALSE;
5798
5799 vm_page_insert(m, object, base_offset);
5800 }
5801 base_page++; /* Go to the next physical page */
5802 base_offset += PAGE_SIZE;
5803 size -= PAGE_SIZE;
5804 }
5805 } else {
5806 /* NOTE: we should check the original settings here */
5807 /* if we have a size > zero a pmap call should be made */
5808 /* to disable the range */
5809
5810 /* pmap_? */
5811
5812 /* shadows on contiguous memory are not allowed */
5813 /* we therefore can use the offset field */
5814 object->vo_shadow_offset = (vm_object_offset_t)phys_page << PAGE_SHIFT;
5815 assertf(page_aligned(size),
5816 "object %p size 0x%llx",
5817 object, (uint64_t)size);
5818 object->vo_size = size;
5819 }
5820 vm_object_unlock(object);
5821
5822 return KERN_SUCCESS;
5823 }
5824
5825
5826 kern_return_t
5827 memory_object_create_named(
5828 memory_object_t pager,
5829 memory_object_offset_t size,
5830 memory_object_control_t *control)
5831 {
5832 vm_object_t object;
5833
5834 *control = MEMORY_OBJECT_CONTROL_NULL;
5835 if (pager == MEMORY_OBJECT_NULL) {
5836 return KERN_INVALID_ARGUMENT;
5837 }
5838
5839 object = vm_object_memory_object_associate(pager,
5840 VM_OBJECT_NULL,
5841 size,
5842 TRUE);
5843 if (object == VM_OBJECT_NULL) {
5844 return KERN_INVALID_OBJECT;
5845 }
5846
5847 /* wait for object (if any) to be ready */
5848 if (object != VM_OBJECT_NULL) {
5849 vm_object_lock(object);
5850 VM_OBJECT_SET_NAMED(object, TRUE);
5851 while (!object->pager_ready) {
5852 vm_object_sleep(object,
5853 VM_OBJECT_EVENT_PAGER_READY,
5854 THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
5855 }
5856 *control = object->pager_control;
5857 vm_object_unlock(object);
5858 }
5859 return KERN_SUCCESS;
5860 }
5861
5862
5863 __private_extern__ kern_return_t
5864 vm_object_lock_request(
5865 vm_object_t object,
5866 vm_object_offset_t offset,
5867 vm_object_size_t size,
5868 memory_object_return_t should_return,
5869 int flags,
5870 vm_prot_t prot)
5871 {
5872 __unused boolean_t should_flush;
5873
5874 should_flush = flags & MEMORY_OBJECT_DATA_FLUSH;
5875
5876 /*
5877 * Check for bogus arguments.
5878 */
5879 if (object == VM_OBJECT_NULL) {
5880 return KERN_INVALID_ARGUMENT;
5881 }
5882
5883 if ((prot & ~VM_PROT_ALL) != 0 && prot != VM_PROT_NO_CHANGE) {
5884 return KERN_INVALID_ARGUMENT;
5885 }
5886
5887 /*
5888 * XXX TODO4K
5889 * extend range for conservative operations (copy-on-write, sync, ...)
5890 * truncate range for destructive operations (purge, ...)
5891 */
5892 size = vm_object_round_page(offset + size) - vm_object_trunc_page(offset);
5893 offset = vm_object_trunc_page(offset);
5894
5895 /*
5896 * Lock the object, and acquire a paging reference to
5897 * prevent the memory_object reference from being released.
5898 */
5899 vm_object_lock(object);
5900 vm_object_paging_begin(object);
5901
5902 (void)vm_object_update(object,
5903 offset, size, NULL, NULL, should_return, flags, prot);
5904
5905 vm_object_paging_end(object);
5906 vm_object_unlock(object);
5907
5908 return KERN_SUCCESS;
5909 }
5910
5911 /*
5912 * Empty a purgeable object by grabbing the physical pages assigned to it and
5913 * putting them on the free queue without writing them to backing store, etc.
5914 * When the pages are next touched they will be demand zero-fill pages. We
5915 * skip pages which are busy, being paged in/out, wired, etc. We do _not_
5916 * skip referenced/dirty pages, pages on the active queue, etc. We're more
5917 * than happy to grab these since this is a purgeable object. We mark the
5918 * object as "empty" after reaping its pages.
5919 *
5920 * On entry the object must be locked and it must be
5921 * purgeable with no delayed copies pending.
5922 */
5923 uint64_t
5924 vm_object_purge(vm_object_t object, int flags)
5925 {
5926 unsigned int object_page_count = 0, pgcount = 0;
5927 uint64_t total_purged_pgcount = 0;
5928 boolean_t skipped_object = FALSE;
5929
5930 vm_object_lock_assert_exclusive(object);
5931
5932 if (object->purgable == VM_PURGABLE_DENY) {
5933 return 0;
5934 }
5935
5936 assert(object->vo_copy == VM_OBJECT_NULL);
5937 assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
5938
5939 /*
5940 * We need to set the object's state to VM_PURGABLE_EMPTY *before*
5941 * reaping its pages. We update vm_page_purgeable_count in bulk
5942 * and we don't want vm_page_remove() to update it again for each
5943 * page we reap later.
5944 *
5945 * For the purgeable ledgers, pages from VOLATILE and EMPTY objects
5946 * are all accounted for in the "volatile" ledgers, so this does not
5947 * make any difference.
5948 * If we transitioned directly from NONVOLATILE to EMPTY,
5949 * vm_page_purgeable_count must have been updated when the object
5950 * was dequeued from its volatile queue and the purgeable ledgers
5951 * must have also been updated accordingly at that time (in
5952 * vm_object_purgable_control()).
5953 */
5954 if (object->purgable == VM_PURGABLE_VOLATILE) {
5955 unsigned int delta;
5956 assert(object->resident_page_count >=
5957 object->wired_page_count);
5958 delta = (object->resident_page_count -
5959 object->wired_page_count);
5960 if (delta != 0) {
5961 assert(vm_page_purgeable_count >=
5962 delta);
5963 OSAddAtomic(-delta,
5964 (SInt32 *)&vm_page_purgeable_count);
5965 }
5966 if (object->wired_page_count != 0) {
5967 assert(vm_page_purgeable_wired_count >=
5968 object->wired_page_count);
5969 OSAddAtomic(-object->wired_page_count,
5970 (SInt32 *)&vm_page_purgeable_wired_count);
5971 }
5972 VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_EMPTY);
5973 }
5974 assert(object->purgable == VM_PURGABLE_EMPTY);
5975
5976 object_page_count = object->resident_page_count;
5977
5978 vm_object_reap_pages(object, REAP_PURGEABLE);
5979
5980 if (object->resident_page_count >= object_page_count) {
5981 total_purged_pgcount = 0;
5982 } else {
5983 total_purged_pgcount = object_page_count - object->resident_page_count;
5984 }
5985
5986 if (object->pager != NULL) {
5987 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5988
5989 if (object->activity_in_progress == 0 &&
5990 object->paging_in_progress == 0) {
5991 /*
5992 * Also reap any memory coming from this object
5993 * in the VM compressor.
5994 *
5995 * There are no operations in progress on the VM object
5996 * and no operation can start while we're holding the
5997 * VM object lock, so it's safe to reap the compressed
5998 * pages and update the page counts.
5999 */
6000 pgcount = vm_compressor_pager_get_count(object->pager);
6001 if (pgcount) {
6002 pgcount = vm_compressor_pager_reap_pages(object->pager, flags);
6003 vm_compressor_pager_count(object->pager,
6004 -pgcount,
6005 FALSE, /* shared */
6006 object);
6007 vm_object_owner_compressed_update(object,
6008 -pgcount);
6009 }
6010 if (!(flags & C_DONT_BLOCK)) {
6011 assert(vm_compressor_pager_get_count(object->pager)
6012 == 0);
6013 }
6014 } else {
6015 /*
6016 * There's some kind of paging activity in progress
6017 * for this object, which could result in a page
6018 * being compressed or decompressed, possibly while
6019 * the VM object is not locked, so it could race
6020 * with us.
6021 *
6022 * We can't really synchronize this without possibly
6023 * causing a deadlock when the compressor needs to
6024 * allocate or free memory while compressing or
6025 * decompressing a page from a purgeable object
6026 * mapped in the kernel_map...
6027 *
6028 * So let's not attempt to purge the compressor
6029 * pager if there's any kind of operation in
6030 * progress on the VM object.
6031 */
6032 skipped_object = TRUE;
6033 }
6034 }
6035
6036 vm_object_lock_assert_exclusive(object);
6037
6038 total_purged_pgcount += pgcount;
6039
6040 KDBG_RELEASE(VMDBG_CODE(DBG_VM_PURGEABLE_OBJECT_PURGE_ONE) | DBG_FUNC_NONE,
6041 VM_KERNEL_UNSLIDE_OR_PERM(object), /* purged object */
6042 object_page_count,
6043 total_purged_pgcount,
6044 skipped_object);
6045
6046 return total_purged_pgcount;
6047 }
6048
6049
6050 /*
6051 * vm_object_purgeable_control() allows the caller to control and investigate the
6052 * state of a purgeable object. A purgeable object is created via a call to
6053 * vm_allocate() with VM_FLAGS_PURGABLE specified. A purgeable object will
6054 * never be coalesced with any other object -- even other purgeable objects --
6055 * and will thus always remain a distinct object. A purgeable object has
6056 * special semantics when its reference count is exactly 1. If its reference
6057 * count is greater than 1, then a purgeable object will behave like a normal
6058 * object and attempts to use this interface will result in an error return
6059 * of KERN_INVALID_ARGUMENT.
6060 *
6061 * A purgeable object may be put into a "volatile" state which will make the
6062 * object's pages eligible for being reclaimed without paging to backing
6063 * store if the system runs low on memory. If the pages in a volatile
6064 * purgeable object are reclaimed, the purgeable object is said to have been
6065 * "emptied." When a purgeable object is emptied the system will reclaim as
6066 * many pages from the object as it can in a convenient manner (pages already
6067 * en route to backing store or busy for other reasons are left as is). When
6068 * a purgeable object is made volatile, its pages will generally be reclaimed
6069 * before other pages in the application's working set. This semantic is
6070 * generally used by applications which can recreate the data in the object
6071 * faster than it can be paged in. One such example might be media assets
6072 * which can be reread from a much faster RAID volume.
6073 *
6074 * A purgeable object may be designated as "non-volatile" which means it will
6075 * behave like all other objects in the system with pages being written to and
6076 * read from backing store as needed to satisfy system memory needs. If the
6077 * object was emptied before the object was made non-volatile, that fact will
6078 * be returned as the old state of the purgeable object (see
6079 * VM_PURGABLE_SET_STATE below). In this case, any pages of the object which
6080 * were reclaimed as part of emptying the object will be refaulted in as
6081 * zero-fill on demand. It is up to the application to note that an object
 * was emptied and recreate the object's contents if necessary.  When a
6083 * purgeable object is made non-volatile, its pages will generally not be paged
6084 * out to backing store in the immediate future. A purgeable object may also
6085 * be manually emptied.
6086 *
6087 * Finally, the current state (non-volatile, volatile, volatile & empty) of a
6088 * volatile purgeable object may be queried at any time. This information may
6089 * be used as a control input to let the application know when the system is
6090 * experiencing memory pressure and is reclaiming memory.
6091 *
6092 * The specified address may be any address within the purgeable object. If
6093 * the specified address does not represent any object in the target task's
6094 * virtual address space, then KERN_INVALID_ADDRESS will be returned. If the
6095 * object containing the specified address is not a purgeable object, then
6096 * KERN_INVALID_ARGUMENT will be returned. Otherwise, KERN_SUCCESS will be
6097 * returned.
6098 *
 * The control parameter may be any one of VM_PURGABLE_SET_STATE,
 * VM_PURGABLE_SET_STATE_FROM_KERNEL or VM_PURGABLE_GET_STATE.  For the two
 * "set state" controls, the in/out parameter state is used to set the new
 * state of the purgeable object and return its old state.  For
 * VM_PURGABLE_GET_STATE, the current state of the purgeable object is
 * returned in the parameter state.
6104 *
6105 * The in/out parameter state may be one of VM_PURGABLE_NONVOLATILE,
6106 * VM_PURGABLE_VOLATILE or VM_PURGABLE_EMPTY. These, respectively, represent
6107 * the non-volatile, volatile and volatile/empty states described above.
6108 * Setting the state of a purgeable object to VM_PURGABLE_EMPTY will
6109 * immediately reclaim as many pages in the object as can be conveniently
6110 * collected (some may have already been written to backing store or be
6111 * otherwise busy).
6112 *
6113 * The process of making a purgeable object non-volatile and determining its
6114 * previous state is atomic. Thus, if a purgeable object is made
6115 * VM_PURGABLE_NONVOLATILE and the old state is returned as
6116 * VM_PURGABLE_VOLATILE, then the purgeable object's previous contents are
6117 * completely intact and will remain so until the object is made volatile
6118 * again. If the old state is returned as VM_PURGABLE_EMPTY then the object
6119 * was reclaimed while it was in a volatile state and its previous contents
6120 * have been lost.
6121 */
6122 /*
6123 * The object must be locked.
6124 */
6125 kern_return_t
6126 vm_object_purgable_control(
6127 vm_object_t object,
6128 vm_purgable_t control,
6129 int *state)
6130 {
6131 int old_state;
6132 int new_state;
6133
6134 if (object == VM_OBJECT_NULL) {
6135 /*
6136 * Object must already be present or it can't be purgeable.
6137 */
6138 return KERN_INVALID_ARGUMENT;
6139 }
6140
6141 vm_object_lock_assert_exclusive(object);
6142
6143 /*
6144 * Get current state of the purgeable object.
6145 */
6146 old_state = object->purgable;
6147 if (old_state == VM_PURGABLE_DENY) {
6148 return KERN_INVALID_ARGUMENT;
6149 }
6150
6151 /* purgeable cant have delayed copies - now or in the future */
6152 assert(object->vo_copy == VM_OBJECT_NULL);
6153 assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6154
6155 /*
6156 * Execute the desired operation.
6157 */
6158 if (control == VM_PURGABLE_GET_STATE) {
6159 *state = old_state;
6160 return KERN_SUCCESS;
6161 }
6162
6163 if (control == VM_PURGABLE_SET_STATE &&
6164 object->purgeable_only_by_kernel) {
6165 return KERN_PROTECTION_FAILURE;
6166 }
6167
6168 if (control != VM_PURGABLE_SET_STATE &&
6169 control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
6170 return KERN_INVALID_ARGUMENT;
6171 }
6172
6173 if ((*state) & VM_PURGABLE_DEBUG_EMPTY) {
6174 object->volatile_empty = TRUE;
6175 }
6176 if ((*state) & VM_PURGABLE_DEBUG_FAULT) {
6177 object->volatile_fault = TRUE;
6178 }
6179
6180 new_state = *state & VM_PURGABLE_STATE_MASK;
6181 if (new_state == VM_PURGABLE_VOLATILE) {
6182 if (old_state == VM_PURGABLE_EMPTY) {
6183 /* what's been emptied must stay empty */
6184 new_state = VM_PURGABLE_EMPTY;
6185 }
6186 if (object->volatile_empty) {
6187 /* debugging mode: go straight to empty */
6188 new_state = VM_PURGABLE_EMPTY;
6189 }
6190 }
6191
6192 switch (new_state) {
6193 case VM_PURGABLE_DENY:
6194 /*
6195 * Attempting to convert purgeable memory to non-purgeable:
6196 * not allowed.
6197 */
6198 return KERN_INVALID_ARGUMENT;
6199 case VM_PURGABLE_NONVOLATILE:
6200 VM_OBJECT_SET_PURGABLE(object, new_state);
6201
6202 if (old_state == VM_PURGABLE_VOLATILE) {
6203 unsigned int delta;
6204
6205 assert(object->resident_page_count >=
6206 object->wired_page_count);
6207 delta = (object->resident_page_count -
6208 object->wired_page_count);
6209
6210 assert(vm_page_purgeable_count >= delta);
6211
6212 if (delta != 0) {
6213 OSAddAtomic(-delta,
6214 (SInt32 *)&vm_page_purgeable_count);
6215 }
6216 if (object->wired_page_count != 0) {
6217 assert(vm_page_purgeable_wired_count >=
6218 object->wired_page_count);
6219 OSAddAtomic(-object->wired_page_count,
6220 (SInt32 *)&vm_page_purgeable_wired_count);
6221 }
6222
6223 vm_page_lock_queues();
6224
6225 /* object should be on a queue */
6226 assert(object->objq.next != NULL &&
6227 object->objq.prev != NULL);
6228 purgeable_q_t queue;
6229
6230 /*
6231 * Move object from its volatile queue to the
6232 * non-volatile queue...
6233 */
6234 queue = vm_purgeable_object_remove(object);
6235 assert(queue);
6236
6237 if (object->purgeable_when_ripe) {
6238 vm_purgeable_token_delete_last(queue);
6239 }
6240 assert(queue->debug_count_objects >= 0);
6241
6242 vm_page_unlock_queues();
6243 }
6244 if (old_state == VM_PURGABLE_VOLATILE ||
6245 old_state == VM_PURGABLE_EMPTY) {
6246 /*
6247 * Transfer the object's pages from the volatile to
6248 * non-volatile ledgers.
6249 */
6250 vm_purgeable_accounting(object, VM_PURGABLE_VOLATILE);
6251 }
6252
6253 break;
6254
6255 case VM_PURGABLE_VOLATILE:
6256 if (object->volatile_fault) {
6257 vm_page_t p;
6258 int refmod;
6259
6260 vm_page_queue_iterate(&object->memq, p, vmp_listq) {
6261 if (p->vmp_busy ||
6262 VM_PAGE_WIRED(p) ||
6263 vm_page_is_fictitious(p)) {
6264 continue;
6265 }
6266 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));
6267 if ((refmod & VM_MEM_MODIFIED) &&
6268 !p->vmp_dirty) {
6269 SET_PAGE_DIRTY(p, FALSE);
6270 }
6271 }
6272 }
6273
6274 assert(old_state != VM_PURGABLE_EMPTY);
6275
6276 purgeable_q_t queue;
6277
6278 /* find the correct queue */
6279 if ((*state & VM_PURGABLE_ORDERING_MASK) == VM_PURGABLE_ORDERING_OBSOLETE) {
6280 queue = &purgeable_queues[PURGEABLE_Q_TYPE_OBSOLETE];
6281 } else {
6282 if ((*state & VM_PURGABLE_BEHAVIOR_MASK) == VM_PURGABLE_BEHAVIOR_FIFO) {
6283 queue = &purgeable_queues[PURGEABLE_Q_TYPE_FIFO];
6284 } else {
6285 queue = &purgeable_queues[PURGEABLE_Q_TYPE_LIFO];
6286 }
6287 }
6288
6289 if (old_state == VM_PURGABLE_NONVOLATILE ||
6290 old_state == VM_PURGABLE_EMPTY) {
6291 unsigned int delta;
6292
6293 if ((*state & VM_PURGABLE_NO_AGING_MASK) ==
6294 VM_PURGABLE_NO_AGING) {
6295 VM_OBJECT_SET_PURGEABLE_WHEN_RIPE(object, FALSE);
6296 } else {
6297 VM_OBJECT_SET_PURGEABLE_WHEN_RIPE(object, TRUE);
6298 }
6299
6300 if (object->purgeable_when_ripe) {
6301 kern_return_t result;
6302
6303 /* try to add token... this can fail */
6304 vm_page_lock_queues();
6305
6306 result = vm_purgeable_token_add(queue);
6307 if (result != KERN_SUCCESS) {
6308 vm_page_unlock_queues();
6309 return result;
6310 }
6311 vm_page_unlock_queues();
6312 }
6313
6314 assert(object->resident_page_count >=
6315 object->wired_page_count);
6316 delta = (object->resident_page_count -
6317 object->wired_page_count);
6318
6319 if (delta != 0) {
6320 OSAddAtomic(delta,
6321 &vm_page_purgeable_count);
6322 }
6323 if (object->wired_page_count != 0) {
6324 OSAddAtomic(object->wired_page_count,
6325 &vm_page_purgeable_wired_count);
6326 }
6327
6328 VM_OBJECT_SET_PURGABLE(object, new_state);
6329
6330 /* object should be on "non-volatile" queue */
6331 assert(object->objq.next != NULL);
6332 assert(object->objq.prev != NULL);
6333 } else if (old_state == VM_PURGABLE_VOLATILE) {
6334 purgeable_q_t old_queue;
6335 boolean_t purgeable_when_ripe;
6336
6337 /*
6338 * if reassigning priorities / purgeable groups, we don't change the
6339 * token queue. So moving priorities will not make pages stay around longer.
6340 * Reasoning is that the algorithm gives most priority to the most important
6341 * object. If a new token is added, the most important object' priority is boosted.
6342 * This biases the system already for purgeable queues that move a lot.
6343 * It doesn't seem more biasing is neccessary in this case, where no new object is added.
6344 */
6345 assert(object->objq.next != NULL && object->objq.prev != NULL); /* object should be on a queue */
6346
6347 old_queue = vm_purgeable_object_remove(object);
6348 assert(old_queue);
6349
6350 if ((*state & VM_PURGABLE_NO_AGING_MASK) ==
6351 VM_PURGABLE_NO_AGING) {
6352 purgeable_when_ripe = FALSE;
6353 } else {
6354 purgeable_when_ripe = TRUE;
6355 }
6356
6357 if (old_queue != queue ||
6358 (purgeable_when_ripe !=
6359 object->purgeable_when_ripe)) {
6360 kern_return_t result;
6361
6362 /* Changing queue. Have to move token. */
6363 vm_page_lock_queues();
6364 if (object->purgeable_when_ripe) {
6365 vm_purgeable_token_delete_last(old_queue);
6366 }
6367 VM_OBJECT_SET_PURGEABLE_WHEN_RIPE(object, purgeable_when_ripe);
6368 if (object->purgeable_when_ripe) {
6369 result = vm_purgeable_token_add(queue);
6370 assert(result == KERN_SUCCESS); /* this should never fail since we just freed a token */
6371 }
6372 vm_page_unlock_queues();
6373 }
6374 }
6375 ;
6376 vm_purgeable_object_add(object, queue, (*state & VM_VOLATILE_GROUP_MASK) >> VM_VOLATILE_GROUP_SHIFT );
6377 if (old_state == VM_PURGABLE_NONVOLATILE) {
6378 vm_purgeable_accounting(object,
6379 VM_PURGABLE_NONVOLATILE);
6380 }
6381
6382 assert(queue->debug_count_objects >= 0);
6383
6384 break;
6385
6386
6387 case VM_PURGABLE_EMPTY:
6388 if (object->volatile_fault) {
6389 vm_page_t p;
6390 int refmod;
6391
6392 vm_page_queue_iterate(&object->memq, p, vmp_listq) {
6393 if (p->vmp_busy ||
6394 VM_PAGE_WIRED(p) ||
6395 vm_page_is_fictitious(p)) {
6396 continue;
6397 }
6398 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));
6399 if ((refmod & VM_MEM_MODIFIED) &&
6400 !p->vmp_dirty) {
6401 SET_PAGE_DIRTY(p, FALSE);
6402 }
6403 }
6404 }
6405
6406 if (old_state == VM_PURGABLE_VOLATILE) {
6407 purgeable_q_t old_queue;
6408
6409 /* object should be on a queue */
6410 assert(object->objq.next != NULL &&
6411 object->objq.prev != NULL);
6412
6413 old_queue = vm_purgeable_object_remove(object);
6414 assert(old_queue);
6415 if (object->purgeable_when_ripe) {
6416 vm_page_lock_queues();
6417 vm_purgeable_token_delete_first(old_queue);
6418 vm_page_unlock_queues();
6419 }
6420 }
6421
6422 if (old_state == VM_PURGABLE_NONVOLATILE) {
6423 /*
6424 * This object's pages were previously accounted as
6425 * "non-volatile" and now need to be accounted as
6426 * "volatile".
6427 */
6428 vm_purgeable_accounting(object,
6429 VM_PURGABLE_NONVOLATILE);
6430 /*
6431 * Set to VM_PURGABLE_EMPTY because the pages are no
6432 * longer accounted in the "non-volatile" ledger
6433 * and are also not accounted for in
6434 * "vm_page_purgeable_count".
6435 */
6436 VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_EMPTY);
6437 }
6438
6439 (void) vm_object_purge(object, 0);
6440 assert(object->purgable == VM_PURGABLE_EMPTY);
6441
6442 break;
6443 }
6444
6445 *state = old_state;
6446
6447 vm_object_lock_assert_exclusive(object);
6448
6449 return KERN_SUCCESS;
6450 }
6451
6452 kern_return_t
6453 vm_object_get_page_counts(
6454 vm_object_t object,
6455 vm_object_offset_t offset,
6456 vm_object_size_t size,
6457 uint64_t *resident_page_count,
6458 uint64_t *dirty_page_count,
6459 uint64_t *swapped_page_count)
6460 {
6461 vm_page_t p = VM_PAGE_NULL;
6462 unsigned int local_resident_count = 0;
6463 unsigned int local_dirty_count = 0;
6464 unsigned int local_swapped_count = 0;
6465 vm_object_offset_t cur_offset = 0;
6466 vm_object_offset_t end_offset = 0;
6467
6468 if (object == VM_OBJECT_NULL) {
6469 return KERN_INVALID_ARGUMENT;
6470 }
6471
6472 cur_offset = offset;
6473 end_offset = offset + size;
6474
6475 vm_object_lock_assert_exclusive(object);
6476
6477 if (resident_page_count != NULL &&
6478 dirty_page_count == NULL &&
6479 offset == 0 &&
6480 object->vo_size == size) {
6481 /*
6482 * Fast path when:
6483 * - we only want the resident page count, and,
6484 * - the entire object is exactly covered by the request.
6485 */
6486 local_resident_count = object->resident_page_count;
6487 if (object->internal && object->pager != NULL) {
6488 local_swapped_count = vm_compressor_pager_get_count(object->pager);
6489 }
6490 goto out;
6491 }
6492
6493 if (object->resident_page_count <= (size >> PAGE_SHIFT) &&
6494 swapped_page_count == NULL) {
6495 /*
6496 * Faster path when we don't care about non-resident pages and the object has
6497 * fewer resident pages than the requested range.
6498 */
6499 vm_page_queue_iterate(&object->memq, p, vmp_listq) {
6500 if (p->vmp_offset >= cur_offset && p->vmp_offset < end_offset) {
6501 local_resident_count++;
6502 if (p->vmp_dirty ||
6503 (p->vmp_wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
6504 local_dirty_count++;
6505 }
6506 }
6507 }
6508 goto out;
6509 }
6510
6511 for (cur_offset = offset; cur_offset < end_offset; cur_offset += PAGE_SIZE_64) {
6512 p = vm_page_lookup(object, cur_offset);
6513
6514 if (p != VM_PAGE_NULL) {
6515 local_resident_count++;
6516 if (p->vmp_dirty ||
6517 (p->vmp_wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
6518 local_dirty_count++;
6519 }
6520 } else if (page_is_paged_out(object, cur_offset)) {
6521 local_swapped_count++;
6522 }
6523 }
6524
6525 out:
6526 if (resident_page_count != NULL) {
6527 *resident_page_count = local_resident_count;
6528 }
6529
6530 if (dirty_page_count != NULL) {
6531 *dirty_page_count = local_dirty_count;
6532 }
6533
6534 if (swapped_page_count != NULL) {
6535 *swapped_page_count = local_swapped_count;
6536 }
6537
6538 return KERN_SUCCESS;
6539 }
6540
6541
6542 /*
6543 * vm_object_reference:
6544 *
6545 * Gets another reference to the given object.
6546 */
6547 #ifdef vm_object_reference
6548 #undef vm_object_reference
6549 #endif
6550 __private_extern__ void
6551 vm_object_reference(
6552 vm_object_t object)
6553 {
6554 if (object == VM_OBJECT_NULL) {
6555 return;
6556 }
6557
6558 vm_object_lock(object);
6559 vm_object_reference_locked(object);
6560 vm_object_unlock(object);
6561 }
6562
6563 /*
6564 * vm_object_transpose
6565 *
6566 * This routine takes two VM objects of the same size and exchanges
6567 * their backing store.
6568 * The objects should be "quiesced" via a UPL operation with UPL_SET_IO_WIRE
6569 * and UPL_BLOCK_ACCESS if they are referenced anywhere.
6570 *
6571 * The VM objects must not be locked by caller.
6572 */
6573 unsigned int vm_object_transpose_count = 0;
6574 kern_return_t
6575 vm_object_transpose(
6576 vm_object_t object1,
6577 vm_object_t object2,
6578 vm_object_size_t transpose_size)
6579 {
6580 vm_object_t tmp_object;
6581 kern_return_t retval;
6582 boolean_t object1_locked, object2_locked;
6583 vm_page_t page;
6584 vm_object_offset_t page_offset;
6585
6586 tmp_object = VM_OBJECT_NULL;
6587 object1_locked = FALSE; object2_locked = FALSE;
6588
6589 if (object1 == object2 ||
6590 object1 == VM_OBJECT_NULL ||
6591 object2 == VM_OBJECT_NULL) {
6592 /*
6593 * If the 2 VM objects are the same, there's
6594 * no point in exchanging their backing store.
6595 */
6596 retval = KERN_INVALID_VALUE;
6597 goto done;
6598 }
6599
6600 /*
6601 * Since we need to lock both objects at the same time,
6602 * make sure we always lock them in the same order to
6603 * avoid deadlocks.
6604 */
6605 if (object1 > object2) {
6606 tmp_object = object1;
6607 object1 = object2;
6608 object2 = tmp_object;
6609 }
6610
6611 /*
6612 * Allocate a temporary VM object to hold object1's contents
6613 * while we copy object2 to object1.
6614 */
6615 tmp_object = vm_object_allocate(transpose_size, object1->vmo_provenance);
6616 vm_object_lock(tmp_object);
6617 VM_OBJECT_SET_CAN_PERSIST(tmp_object, FALSE);
6618
6619
6620 /*
6621 * Grab control of the 1st VM object.
6622 */
6623 vm_object_lock(object1);
6624 object1_locked = TRUE;
6625 if (!object1->alive || object1->terminating ||
6626 object1->vo_copy || object1->shadow || object1->shadowed ||
6627 object1->purgable != VM_PURGABLE_DENY) {
6628 /*
6629 * We don't deal with copy or shadow objects (yet).
6630 */
6631 retval = KERN_INVALID_VALUE;
6632 goto done;
6633 }
6634 /*
6635 * We're about to mess with the object's backing store and
6636 * taking a "paging_in_progress" reference wouldn't be enough
6637 * to prevent any paging activity on this object, so the caller should
6638 * have "quiesced" the objects beforehand, via a UPL operation with
6639 * UPL_SET_IO_WIRE (to make sure all the pages are there and wired)
6640 * and UPL_BLOCK_ACCESS (to mark the pages "busy").
6641 *
6642 * Wait for any paging operation to complete (but only paging, not
6643 * other kind of activities not linked to the pager). After we're
6644 * statisfied that there's no more paging in progress, we keep the
6645 * object locked, to guarantee that no one tries to access its pager.
6646 */
6647 vm_object_paging_only_wait(object1, THREAD_UNINT);
6648
6649 /*
6650 * Same as above for the 2nd object...
6651 */
6652 vm_object_lock(object2);
6653 object2_locked = TRUE;
6654 if (!object2->alive || object2->terminating ||
6655 object2->vo_copy || object2->shadow || object2->shadowed ||
6656 object2->purgable != VM_PURGABLE_DENY) {
6657 retval = KERN_INVALID_VALUE;
6658 goto done;
6659 }
6660 vm_object_paging_only_wait(object2, THREAD_UNINT);
6661
6662
6663 if (object1->vo_size != object2->vo_size ||
6664 object1->vo_size != transpose_size) {
6665 /*
6666 * If the 2 objects don't have the same size, we can't
6667 * exchange their backing stores or one would overflow.
6668 * If their size doesn't match the caller's
6669 * "transpose_size", we can't do it either because the
6670 * transpose operation will affect the entire span of
6671 * the objects.
6672 */
6673 retval = KERN_INVALID_VALUE;
6674 goto done;
6675 }
6676
6677
6678 /*
6679 * Transpose the lists of resident pages.
6680 * This also updates the resident_page_count and the memq_hint.
6681 */
6682 if (object1->phys_contiguous || vm_page_queue_empty(&object1->memq)) {
6683 /*
6684 * No pages in object1, just transfer pages
6685 * from object2 to object1. No need to go through
6686 * an intermediate object.
6687 */
6688 while (!vm_page_queue_empty(&object2->memq)) {
6689 page = (vm_page_t) vm_page_queue_first(&object2->memq);
6690 vm_page_rename(page, object1, page->vmp_offset);
6691 }
6692 assert(vm_page_queue_empty(&object2->memq));
6693 } else if (object2->phys_contiguous || vm_page_queue_empty(&object2->memq)) {
6694 /*
6695 * No pages in object2, just transfer pages
6696 * from object1 to object2. No need to go through
6697 * an intermediate object.
6698 */
6699 while (!vm_page_queue_empty(&object1->memq)) {
6700 page = (vm_page_t) vm_page_queue_first(&object1->memq);
6701 vm_page_rename(page, object2, page->vmp_offset);
6702 }
6703 assert(vm_page_queue_empty(&object1->memq));
6704 } else {
6705 /* transfer object1's pages to tmp_object */
6706 while (!vm_page_queue_empty(&object1->memq)) {
6707 page = (vm_page_t) vm_page_queue_first(&object1->memq);
6708 page_offset = page->vmp_offset;
6709 vm_page_remove(page, TRUE);
6710 page->vmp_offset = page_offset;
6711 vm_page_queue_enter(&tmp_object->memq, page, vmp_listq);
6712 }
6713 assert(vm_page_queue_empty(&object1->memq));
6714 /* transfer object2's pages to object1 */
6715 while (!vm_page_queue_empty(&object2->memq)) {
6716 page = (vm_page_t) vm_page_queue_first(&object2->memq);
6717 vm_page_rename(page, object1, page->vmp_offset);
6718 }
6719 assert(vm_page_queue_empty(&object2->memq));
6720 /* transfer tmp_object's pages to object2 */
6721 while (!vm_page_queue_empty(&tmp_object->memq)) {
6722 page = (vm_page_t) vm_page_queue_first(&tmp_object->memq);
6723 vm_page_queue_remove(&tmp_object->memq, page, vmp_listq);
6724 vm_page_insert(page, object2, page->vmp_offset);
6725 }
6726 assert(vm_page_queue_empty(&tmp_object->memq));
6727 }
6728
6729 #define __TRANSPOSE_FIELD(field) \
6730 MACRO_BEGIN \
6731 tmp_object->field = object1->field; \
6732 object1->field = object2->field; \
6733 object2->field = tmp_object->field; \
6734 MACRO_END
6735
6736 /* "Lock" refers to the object not its contents */
6737 /* "size" should be identical */
6738 assert(object1->vo_size == object2->vo_size);
6739 /* "memq_hint" was updated above when transposing pages */
6740 /* "ref_count" refers to the object not its contents */
6741 assert(os_ref_get_count_raw(&object1->ref_count) >= 1);
6742 assert(os_ref_get_count_raw(&object2->ref_count) >= 1);
6743 /* "resident_page_count" was updated above when transposing pages */
6744 /* "wired_page_count" was updated above when transposing pages */
6745 #if !VM_TAG_ACTIVE_UPDATE
6746 /* "wired_objq" was dealt with along with "wired_page_count" */
6747 #endif /* ! VM_TAG_ACTIVE_UPDATE */
6748 /* "reusable_page_count" was updated above when transposing pages */
6749 /* there should be no "copy" */
6750 assert(!object1->vo_copy);
6751 assert(!object2->vo_copy);
6752 /* there should be no "shadow" */
6753 assert(!object1->shadow);
6754 assert(!object2->shadow);
6755 __TRANSPOSE_FIELD(vo_shadow_offset); /* used by phys_contiguous objects */
6756 __TRANSPOSE_FIELD(pager);
6757 __TRANSPOSE_FIELD(paging_offset);
6758 __TRANSPOSE_FIELD(pager_control);
6759 /* update the memory_objects' pointers back to the VM objects */
6760 if (object1->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
6761 memory_object_control_collapse(&object1->pager_control,
6762 object1);
6763 }
6764 if (object2->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
6765 memory_object_control_collapse(&object2->pager_control,
6766 object2);
6767 }
6768 __TRANSPOSE_FIELD(copy_strategy);
6769 /* "paging_in_progress" refers to the object not its contents */
6770 assert(!object1->paging_in_progress);
6771 assert(!object2->paging_in_progress);
6772 assert(object1->activity_in_progress);
6773 assert(object2->activity_in_progress);
6774 /* "all_wanted" refers to the object not its contents */
6775 __TRANSPOSE_FIELD(pager_created);
6776 __TRANSPOSE_FIELD(pager_initialized);
6777 __TRANSPOSE_FIELD(pager_ready);
6778 __TRANSPOSE_FIELD(pager_trusted);
6779 __TRANSPOSE_FIELD(can_persist);
6780 __TRANSPOSE_FIELD(internal);
6781 __TRANSPOSE_FIELD(private);
6782 __TRANSPOSE_FIELD(pageout);
6783 /* "alive" should be set */
6784 assert(object1->alive);
6785 assert(object2->alive);
6786 /* "purgeable" should be non-purgeable */
6787 assert(object1->purgable == VM_PURGABLE_DENY);
6788 assert(object2->purgable == VM_PURGABLE_DENY);
6789 /* "shadowed" refers to the the object not its contents */
6790 __TRANSPOSE_FIELD(purgeable_when_ripe);
6791 __TRANSPOSE_FIELD(true_share);
6792 /* "terminating" should not be set */
6793 assert(!object1->terminating);
6794 assert(!object2->terminating);
6795 /* transfer "named" reference if needed */
6796 if (object1->named && !object2->named) {
6797 os_ref_release_live_locked_raw(&object1->ref_count, &vm_object_refgrp);
6798 os_ref_retain_locked_raw(&object2->ref_count, &vm_object_refgrp);
6799 } else if (!object1->named && object2->named) {
6800 os_ref_retain_locked_raw(&object1->ref_count, &vm_object_refgrp);
6801 os_ref_release_live_locked_raw(&object2->ref_count, &vm_object_refgrp);
6802 }
6803 __TRANSPOSE_FIELD(named);
6804 /* "shadow_severed" refers to the object not its contents */
6805 __TRANSPOSE_FIELD(phys_contiguous);
6806 __TRANSPOSE_FIELD(nophyscache);
6807 __TRANSPOSE_FIELD(no_pager_reason);
6808 /* "cached_list.next" points to transposed object */
6809 object1->cached_list.next = (queue_entry_t) object2;
6810 object2->cached_list.next = (queue_entry_t) object1;
6811 /* "cached_list.prev" should be NULL */
6812 assert(object1->cached_list.prev == NULL);
6813 assert(object2->cached_list.prev == NULL);
6814 __TRANSPOSE_FIELD(last_alloc);
6815 __TRANSPOSE_FIELD(sequential);
6816 __TRANSPOSE_FIELD(pages_created);
6817 __TRANSPOSE_FIELD(pages_used);
6818 __TRANSPOSE_FIELD(scan_collisions);
6819 __TRANSPOSE_FIELD(cow_hint);
6820 __TRANSPOSE_FIELD(wimg_bits);
6821 __TRANSPOSE_FIELD(set_cache_attr);
6822 __TRANSPOSE_FIELD(code_signed);
6823 object1->transposed = TRUE;
6824 object2->transposed = TRUE;
6825 __TRANSPOSE_FIELD(mapping_in_progress);
6826 __TRANSPOSE_FIELD(volatile_empty);
6827 __TRANSPOSE_FIELD(volatile_fault);
6828 __TRANSPOSE_FIELD(all_reusable);
6829 assert(object1->blocked_access);
6830 assert(object2->blocked_access);
6831 __TRANSPOSE_FIELD(set_cache_attr);
6832 assert(!object1->object_is_shared_cache);
6833 assert(!object2->object_is_shared_cache);
6834 /* ignore purgeable_queue_type and purgeable_queue_group */
6835 assert(!object1->io_tracking);
6836 assert(!object2->io_tracking);
6837 #if VM_OBJECT_ACCESS_TRACKING
6838 assert(!object1->access_tracking);
6839 assert(!object2->access_tracking);
6840 #endif /* VM_OBJECT_ACCESS_TRACKING */
6841 __TRANSPOSE_FIELD(no_tag_update);
6842 #if CONFIG_SECLUDED_MEMORY
6843 assert(!object1->eligible_for_secluded);
6844 assert(!object2->eligible_for_secluded);
6845 assert(!object1->can_grab_secluded);
6846 assert(!object2->can_grab_secluded);
6847 #else /* CONFIG_SECLUDED_MEMORY */
6848 assert(object1->__object3_unused_bits == 0);
6849 assert(object2->__object3_unused_bits == 0);
6850 #endif /* CONFIG_SECLUDED_MEMORY */
6851 #if UPL_DEBUG
6852 /* "uplq" refers to the object not its contents (see upl_transpose()) */
6853 #endif
6854 assert((object1->purgable == VM_PURGABLE_DENY) || (object1->objq.next == NULL));
6855 assert((object1->purgable == VM_PURGABLE_DENY) || (object1->objq.prev == NULL));
6856 assert((object2->purgable == VM_PURGABLE_DENY) || (object2->objq.next == NULL));
6857 assert((object2->purgable == VM_PURGABLE_DENY) || (object2->objq.prev == NULL));
6858 __TRANSPOSE_FIELD(vmo_provenance);
6859
6860 #undef __TRANSPOSE_FIELD
6861
6862 retval = KERN_SUCCESS;
6863
6864 done:
6865 /*
6866 * Cleanup.
6867 */
6868 if (tmp_object != VM_OBJECT_NULL) {
6869 vm_object_unlock(tmp_object);
6870 /*
6871 * Re-initialize the temporary object to avoid
6872 * deallocating a real pager.
6873 */
6874 _vm_object_allocate(
6875 transpose_size,
6876 tmp_object,
6877 /*
6878 * Since we're reallocating purely to deallocate,
6879 * don't bother trying to set a sensible provenance.
6880 */
6881 VM_MAP_SERIAL_NONE
6882 );
6883 vm_object_deallocate(tmp_object);
6884 tmp_object = VM_OBJECT_NULL;
6885 }
6886
6887 if (object1_locked) {
6888 vm_object_unlock(object1);
6889 object1_locked = FALSE;
6890 }
6891 if (object2_locked) {
6892 vm_object_unlock(object2);
6893 object2_locked = FALSE;
6894 }
6895
6896 vm_object_transpose_count++;
6897
6898 return retval;
6899 }
6900
6901
6902 /*
6903 * vm_object_cluster_size
6904 *
6905 * Determine how big a cluster we should issue an I/O for...
6906 *
6907 * Inputs: *start == offset of page needed
6908 * *length == maximum cluster pager can handle
6909 * Outputs: *start == beginning offset of cluster
6910 * *length == length of cluster to try
6911 *
6912 * The original *start will be encompassed by the cluster
6913 *
6914 */
/*
 * Defined elsewhere.  When non-zero, vm_object_cluster_size() returns a
 * one-page cluster without doing any further sizing work.
 */
extern int speculative_reads_disabled;

/*
 * Upper and lower bounds on the pre-heat size, in bytes (NOT in number of
 * pages).  vm_object_cluster_size() page-rounds them to obtain its
 * max_ph_size and min_ph_size.
 *
 * Try to always keep these values an even multiple of PAGE_SIZE.  The
 * derived min_ph_size and max_ph_size are expected to always be
 * page-aligned, and the derivation can involve operations (e.g. division)
 * that would give non-page-aligned results if we started out with values
 * that are odd multiples of PAGE_SIZE.
 */
#if !XNU_TARGET_OS_OSX
unsigned int preheat_max_bytes = (1024 * 512);
#else /* !XNU_TARGET_OS_OSX */
unsigned int preheat_max_bytes = MAX_UPL_TRANSFER_BYTES;
#endif /* !XNU_TARGET_OS_OSX */
unsigned int preheat_min_bytes = (1024 * 32);
6930
6931
6932 __private_extern__ void
6933 vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start,
6934 vm_size_t *length, vm_object_fault_info_t fault_info, uint32_t *io_streaming)
6935 {
6936 vm_size_t pre_heat_size;
6937 vm_size_t tail_size;
6938 vm_size_t head_size;
6939 vm_size_t max_length;
6940 vm_size_t cluster_size;
6941 vm_object_offset_t object_size;
6942 vm_object_offset_t orig_start;
6943 vm_object_offset_t target_start;
6944 vm_object_offset_t offset;
6945 vm_behavior_t behavior;
6946 boolean_t look_behind = TRUE;
6947 boolean_t look_ahead = TRUE;
6948 boolean_t isSSD = FALSE;
6949 uint32_t throttle_limit;
6950 int sequential_run;
6951 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
6952 vm_size_t max_ph_size;
6953 vm_size_t min_ph_size;
6954
6955 assert( !(*length & PAGE_MASK));
6956 assert( !(*start & PAGE_MASK_64));
6957
6958 /*
6959 * remember maxiumum length of run requested
6960 */
6961 max_length = *length;
6962 /*
6963 * we'll always return a cluster size of at least
6964 * 1 page, since the original fault must always
6965 * be processed
6966 */
6967 *length = PAGE_SIZE;
6968 *io_streaming = 0;
6969
6970 if (speculative_reads_disabled || fault_info == NULL) {
6971 /*
6972 * no cluster... just fault the page in
6973 */
6974 return;
6975 }
6976 orig_start = *start;
6977 target_start = orig_start;
6978 cluster_size = round_page(fault_info->cluster_size);
6979 behavior = fault_info->behavior;
6980
6981 vm_object_lock(object);
6982
6983 if (object->pager == MEMORY_OBJECT_NULL) {
6984 goto out; /* pager is gone for this object, nothing more to do */
6985 }
6986 vnode_pager_get_isSSD(object->pager, &isSSD);
6987
6988 min_ph_size = round_page(preheat_min_bytes);
6989 max_ph_size = round_page(preheat_max_bytes);
6990
6991 #if XNU_TARGET_OS_OSX
6992 /*
6993 * If we're paging from an SSD, we cut the minimum cluster size in half
6994 * and reduce the maximum size by a factor of 8. We do this because the
6995 * latency to issue an I/O is a couple of orders of magnitude smaller than
6996 * on spinning media, so being overly aggressive on the cluster size (to
6997 * try and reduce cumulative seek penalties) isn't a good trade off over
6998 * the increased memory pressure caused by the larger speculative I/Os.
6999 * However, the latency isn't 0, so a small amount of clustering is still
7000 * a win.
7001 *
7002 * If an explicit cluster size has already been provided, then we're
7003 * receiving a strong hint that the entire range will be needed (e.g.
7004 * wiring, willneed). In these cases, we want to maximize the I/O size
7005 * to minimize the number of I/Os issued.
7006 */
7007 if (isSSD && cluster_size <= PAGE_SIZE) {
7008 min_ph_size /= 2;
7009 max_ph_size /= 8;
7010
7011 if (min_ph_size & PAGE_MASK_64) {
7012 min_ph_size = trunc_page(min_ph_size);
7013 }
7014
7015 if (max_ph_size & PAGE_MASK_64) {
7016 max_ph_size = trunc_page(max_ph_size);
7017 }
7018 }
7019 #endif /* XNU_TARGET_OS_OSX */
7020
7021 if (min_ph_size < PAGE_SIZE) {
7022 min_ph_size = PAGE_SIZE;
7023 }
7024
7025 if (max_ph_size < PAGE_SIZE) {
7026 max_ph_size = PAGE_SIZE;
7027 } else if (max_ph_size > MAX_UPL_TRANSFER_BYTES) {
7028 max_ph_size = MAX_UPL_TRANSFER_BYTES;
7029 }
7030
7031 if (max_length > max_ph_size) {
7032 max_length = max_ph_size;
7033 }
7034
7035 if (max_length <= PAGE_SIZE) {
7036 goto out;
7037 }
7038
7039 if (object->internal) {
7040 object_size = object->vo_size;
7041 } else {
7042 vnode_pager_get_object_size(object->pager, &object_size);
7043 }
7044
7045 object_size = round_page_64(object_size);
7046
7047 if (orig_start >= object_size) {
7048 /*
7049 * fault occurred beyond the EOF...
7050 * we need to punt w/o changing the
7051 * starting offset
7052 */
7053 goto out;
7054 }
7055 if (object->pages_used > object->pages_created) {
7056 /*
7057 * must have wrapped our 32 bit counters
7058 * so reset
7059 */
7060 object->pages_used = object->pages_created = 0;
7061 }
7062 if ((sequential_run = object->sequential)) {
7063 if (sequential_run < 0) {
7064 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
7065 sequential_run = 0 - sequential_run;
7066 } else {
7067 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
7068 }
7069 }
7070 switch (behavior) {
7071 default:
7072 behavior = VM_BEHAVIOR_DEFAULT;
7073 OS_FALLTHROUGH;
7074
7075 case VM_BEHAVIOR_DEFAULT:
7076 if (object->internal && fault_info->user_tag == VM_MEMORY_STACK) {
7077 goto out;
7078 }
7079
7080 if (sequential_run >= (3 * PAGE_SIZE)) {
7081 pre_heat_size = sequential_run + PAGE_SIZE;
7082
7083 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
7084 look_behind = FALSE;
7085 } else {
7086 look_ahead = FALSE;
7087 }
7088
7089 *io_streaming = 1;
7090 } else {
7091 if (object->pages_created < (20 * (min_ph_size >> PAGE_SHIFT))) {
7092 /*
7093 * prime the pump
7094 */
7095 pre_heat_size = min_ph_size;
7096 } else {
7097 /*
7098 * Linear growth in PH size: The maximum size is max_length...
7099 * this cacluation will result in a size that is neither a
7100 * power of 2 nor a multiple of PAGE_SIZE... so round
7101 * it up to the nearest PAGE_SIZE boundary
7102 */
7103 pre_heat_size = (max_length * (uint64_t)object->pages_used) / object->pages_created;
7104
7105 if (pre_heat_size < min_ph_size) {
7106 pre_heat_size = min_ph_size;
7107 } else {
7108 pre_heat_size = round_page(pre_heat_size);
7109 }
7110 }
7111 }
7112 break;
7113
7114 case VM_BEHAVIOR_RANDOM:
7115 if ((pre_heat_size = cluster_size) <= PAGE_SIZE) {
7116 goto out;
7117 }
7118 break;
7119
7120 case VM_BEHAVIOR_SEQUENTIAL:
7121 if ((pre_heat_size = cluster_size) == 0) {
7122 pre_heat_size = sequential_run + PAGE_SIZE;
7123 }
7124 look_behind = FALSE;
7125 *io_streaming = 1;
7126
7127 break;
7128
7129 case VM_BEHAVIOR_RSEQNTL:
7130 if ((pre_heat_size = cluster_size) == 0) {
7131 pre_heat_size = sequential_run + PAGE_SIZE;
7132 }
7133 look_ahead = FALSE;
7134 *io_streaming = 1;
7135
7136 break;
7137 }
7138 throttle_limit = (uint32_t) max_length;
7139 assert(throttle_limit == max_length);
7140
7141 if (vnode_pager_get_throttle_io_limit(object->pager, &throttle_limit) == KERN_SUCCESS) {
7142 if (max_length > throttle_limit) {
7143 max_length = throttle_limit;
7144 }
7145 }
7146 if (pre_heat_size > max_length) {
7147 pre_heat_size = max_length;
7148 }
7149
7150 if (behavior == VM_BEHAVIOR_DEFAULT && (pre_heat_size > min_ph_size)) {
7151 unsigned int consider_free = vm_page_free_count + vm_page_cleaned_count;
7152
7153 if (consider_free < vm_page_throttle_limit) {
7154 pre_heat_size = trunc_page(pre_heat_size / 16);
7155 } else if (consider_free < vm_page_free_target) {
7156 pre_heat_size = trunc_page(pre_heat_size / 4);
7157 }
7158
7159 if (pre_heat_size < min_ph_size) {
7160 pre_heat_size = min_ph_size;
7161 }
7162 }
7163 if (look_ahead == TRUE) {
7164 if (look_behind == TRUE) {
7165 /*
7166 * if we get here its due to a random access...
7167 * so we want to center the original fault address
7168 * within the cluster we will issue... make sure
7169 * to calculate 'head_size' as a multiple of PAGE_SIZE...
7170 * 'pre_heat_size' is a multiple of PAGE_SIZE but not
7171 * necessarily an even number of pages so we need to truncate
7172 * the result to a PAGE_SIZE boundary
7173 */
7174 head_size = trunc_page(pre_heat_size / 2);
7175
7176 if (target_start > head_size) {
7177 target_start -= head_size;
7178 } else {
7179 target_start = 0;
7180 }
7181
7182 /*
7183 * 'target_start' at this point represents the beginning offset
7184 * of the cluster we are considering... 'orig_start' will be in
7185 * the center of this cluster if we didn't have to clip the start
7186 * due to running into the start of the file
7187 */
7188 }
7189 if ((target_start + pre_heat_size) > object_size) {
7190 pre_heat_size = (vm_size_t)(round_page_64(object_size - target_start));
7191 }
7192 /*
7193 * at this point caclulate the number of pages beyond the original fault
7194 * address that we want to consider... this is guaranteed not to extend beyond
7195 * the current EOF...
7196 */
7197 assert((vm_size_t)(orig_start - target_start) == (orig_start - target_start));
7198 tail_size = pre_heat_size - (vm_size_t)(orig_start - target_start) - PAGE_SIZE;
7199 } else {
7200 if (pre_heat_size > target_start) {
7201 /*
7202 * since pre_heat_size is always smaller then 2^32,
7203 * if it is larger then target_start (a 64 bit value)
7204 * it is safe to clip target_start to 32 bits
7205 */
7206 pre_heat_size = (vm_size_t) target_start;
7207 }
7208 tail_size = 0;
7209 }
7210 assert( !(target_start & PAGE_MASK_64));
7211 assert( !(pre_heat_size & PAGE_MASK_64));
7212
7213 if (pre_heat_size <= PAGE_SIZE) {
7214 goto out;
7215 }
7216
7217 if (look_behind == TRUE) {
7218 /*
7219 * take a look at the pages before the original
7220 * faulting offset... recalculate this in case
7221 * we had to clip 'pre_heat_size' above to keep
7222 * from running past the EOF.
7223 */
7224 head_size = pre_heat_size - tail_size - PAGE_SIZE;
7225
7226 for (offset = orig_start - PAGE_SIZE_64; head_size; offset -= PAGE_SIZE_64, head_size -= PAGE_SIZE) {
7227 /*
7228 * don't poke below the lowest offset
7229 */
7230 if (offset < fault_info->lo_offset) {
7231 break;
7232 }
7233 /*
7234 * for external objects or internal objects w/o a pager,
7235 * vm_object_compressor_pager_state_get will return VM_EXTERNAL_STATE_UNKNOWN
7236 */
7237 if (vm_object_compressor_pager_state_get(object, offset) == VM_EXTERNAL_STATE_ABSENT) {
7238 break;
7239 }
7240 if (vm_page_lookup(object, offset) != VM_PAGE_NULL) {
7241 /*
7242 * don't bridge resident pages
7243 */
7244 break;
7245 }
7246 *start = offset;
7247 *length += PAGE_SIZE;
7248 }
7249 }
7250 if (look_ahead == TRUE) {
7251 for (offset = orig_start + PAGE_SIZE_64; tail_size; offset += PAGE_SIZE_64, tail_size -= PAGE_SIZE) {
7252 /*
7253 * don't poke above the highest offset
7254 */
7255 if (offset >= fault_info->hi_offset) {
7256 break;
7257 }
7258 assert(offset < object_size);
7259
7260 /*
7261 * for external objects or internal objects w/o a pager,
7262 * vm_object_compressor_pager_state_get will return VM_EXTERNAL_STATE_UNKNOWN
7263 */
7264 if (vm_object_compressor_pager_state_get(object, offset) == VM_EXTERNAL_STATE_ABSENT) {
7265 break;
7266 }
7267 if (vm_page_lookup(object, offset) != VM_PAGE_NULL) {
7268 /*
7269 * don't bridge resident pages
7270 */
7271 break;
7272 }
7273 *length += PAGE_SIZE;
7274 }
7275 }
7276 out:
7277 if (*length > max_length) {
7278 *length = max_length;
7279 }
7280
7281 vm_object_unlock(object);
7282
7283 DTRACE_VM1(clustersize, vm_size_t, *length);
7284 }
7285
7286
7287 /*
7288 * Allow manipulation of individual page state. This is actually part of
7289 * the UPL regimen but takes place on the VM object rather than on a UPL
7290 */
7291
7292 kern_return_t
7293 vm_object_page_op(
7294 vm_object_t object,
7295 vm_object_offset_t offset,
7296 int ops,
7297 ppnum_t *phys_entry,
7298 int *flags)
7299 {
7300 vm_page_t dst_page;
7301
7302 vm_object_lock(object);
7303
7304 if (ops & UPL_POP_PHYSICAL) {
7305 if (object->phys_contiguous) {
7306 if (phys_entry) {
7307 *phys_entry = (ppnum_t)
7308 (object->vo_shadow_offset >> PAGE_SHIFT);
7309 }
7310 vm_object_unlock(object);
7311 return KERN_SUCCESS;
7312 } else {
7313 vm_object_unlock(object);
7314 return KERN_INVALID_OBJECT;
7315 }
7316 }
7317 if (object->phys_contiguous) {
7318 vm_object_unlock(object);
7319 return KERN_INVALID_OBJECT;
7320 }
7321
7322 while (TRUE) {
7323 if ((dst_page = vm_page_lookup(object, offset)) == VM_PAGE_NULL) {
7324 vm_object_unlock(object);
7325 return KERN_FAILURE;
7326 }
7327
7328 /* Sync up on getting the busy bit */
7329 if ((dst_page->vmp_busy || dst_page->vmp_cleaning) &&
7330 (((ops & UPL_POP_SET) &&
7331 (ops & UPL_POP_BUSY)) || (ops & UPL_POP_DUMP))) {
7332 /* someone else is playing with the page, we will */
7333 /* have to wait */
7334 vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_DEFAULT);
7335 continue;
7336 }
7337
7338 if (ops & UPL_POP_DUMP) {
7339 if (dst_page->vmp_pmapped == TRUE) {
7340 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
7341 }
7342
7343 VM_PAGE_FREE(dst_page);
7344 break;
7345 }
7346
7347 if (flags) {
7348 *flags = 0;
7349
7350 /* Get the condition of flags before requested ops */
7351 /* are undertaken */
7352
7353 if (dst_page->vmp_dirty) {
7354 *flags |= UPL_POP_DIRTY;
7355 }
7356 if (dst_page->vmp_free_when_done) {
7357 *flags |= UPL_POP_PAGEOUT;
7358 }
7359 if (dst_page->vmp_precious) {
7360 *flags |= UPL_POP_PRECIOUS;
7361 }
7362 if (dst_page->vmp_absent) {
7363 *flags |= UPL_POP_ABSENT;
7364 }
7365 if (dst_page->vmp_busy) {
7366 *flags |= UPL_POP_BUSY;
7367 }
7368 }
7369
7370 /* The caller should have made a call either contingent with */
7371 /* or prior to this call to set UPL_POP_BUSY */
7372 if (ops & UPL_POP_SET) {
7373 /* The protection granted with this assert will */
7374 /* not be complete. If the caller violates the */
7375 /* convention and attempts to change page state */
7376 /* without first setting busy we may not see it */
7377 /* because the page may already be busy. However */
7378 /* if such violations occur we will assert sooner */
7379 /* or later. */
7380 assert(dst_page->vmp_busy || (ops & UPL_POP_BUSY));
7381 if (ops & UPL_POP_DIRTY) {
7382 SET_PAGE_DIRTY(dst_page, FALSE);
7383 }
7384 if (ops & UPL_POP_PAGEOUT) {
7385 dst_page->vmp_free_when_done = TRUE;
7386 }
7387 if (ops & UPL_POP_PRECIOUS) {
7388 dst_page->vmp_precious = TRUE;
7389 }
7390 if (ops & UPL_POP_ABSENT) {
7391 dst_page->vmp_absent = TRUE;
7392 }
7393 if (ops & UPL_POP_BUSY) {
7394 dst_page->vmp_busy = TRUE;
7395 }
7396 }
7397
7398 if (ops & UPL_POP_CLR) {
7399 assert(dst_page->vmp_busy);
7400 if (ops & UPL_POP_DIRTY) {
7401 dst_page->vmp_dirty = FALSE;
7402 }
7403 if (ops & UPL_POP_PAGEOUT) {
7404 dst_page->vmp_free_when_done = FALSE;
7405 }
7406 if (ops & UPL_POP_PRECIOUS) {
7407 dst_page->vmp_precious = FALSE;
7408 }
7409 if (ops & UPL_POP_ABSENT) {
7410 dst_page->vmp_absent = FALSE;
7411 }
7412 if (ops & UPL_POP_BUSY) {
7413 dst_page->vmp_busy = FALSE;
7414 vm_page_wakeup(object, dst_page);
7415 }
7416 }
7417 if (phys_entry) {
7418 /*
7419 * The physical page number will remain valid
7420 * only if the page is kept busy.
7421 */
7422 assert(dst_page->vmp_busy);
7423 *phys_entry = VM_PAGE_GET_PHYS_PAGE(dst_page);
7424 }
7425
7426 break;
7427 }
7428
7429 vm_object_unlock(object);
7430 return KERN_SUCCESS;
7431 }
7432
7433 /*
7434 * vm_object_range_op offers performance enhancement over
7435 * vm_object_page_op for page_op functions which do not require page
7436 * level state to be returned from the call. Page_op was created to provide
7437 * a low-cost alternative to page manipulation via UPLs when only a single
7438 * page was involved. The range_op call establishes the ability in the _op
7439 * family of functions to work on multiple pages where the lack of page level
7440 * state handling allows the caller to avoid the overhead of the upl structures.
7441 */
7442
7443 kern_return_t
7444 vm_object_range_op(
7445 vm_object_t object,
7446 vm_object_offset_t offset_beg,
7447 vm_object_offset_t offset_end,
7448 int ops,
7449 uint32_t *range)
7450 {
7451 vm_object_offset_t offset;
7452 vm_page_t dst_page;
7453
7454 if (object->resident_page_count == 0) {
7455 if (range) {
7456 if (ops & UPL_ROP_PRESENT) {
7457 *range = 0;
7458 } else {
7459 *range = (uint32_t) (offset_end - offset_beg);
7460 assert(*range == (offset_end - offset_beg));
7461 }
7462 }
7463 return KERN_SUCCESS;
7464 }
7465 vm_object_lock(object);
7466
7467 if (object->phys_contiguous) {
7468 vm_object_unlock(object);
7469 return KERN_INVALID_OBJECT;
7470 }
7471
7472 offset = offset_beg & ~PAGE_MASK_64;
7473
7474 while (offset < offset_end) {
7475 dst_page = vm_page_lookup(object, offset);
7476 if (dst_page != VM_PAGE_NULL) {
7477 if (ops & UPL_ROP_DUMP) {
7478 if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
7479 /*
7480 * someone else is playing with the
7481 * page, we will have to wait
7482 */
7483 vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_DEFAULT);
7484 /*
7485 * need to relook the page up since it's
7486 * state may have changed while we slept
7487 * it might even belong to a different object
7488 * at this point
7489 */
7490 continue;
7491 }
7492 if (dst_page->vmp_laundry) {
7493 vm_pageout_steal_laundry(dst_page, FALSE);
7494 }
7495
7496 if (dst_page->vmp_pmapped == TRUE) {
7497 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
7498 }
7499
7500 VM_PAGE_FREE(dst_page);
7501 } else if ((ops & UPL_ROP_ABSENT)
7502 && (!dst_page->vmp_absent || dst_page->vmp_busy)) {
7503 break;
7504 }
7505 } else if (ops & UPL_ROP_PRESENT) {
7506 break;
7507 }
7508
7509 offset += PAGE_SIZE;
7510 }
7511 vm_object_unlock(object);
7512
7513 if (range) {
7514 if (offset > offset_end) {
7515 offset = offset_end;
7516 }
7517 if (offset > offset_beg) {
7518 *range = (uint32_t) (offset - offset_beg);
7519 assert(*range == (offset - offset_beg));
7520 } else {
7521 *range = 0;
7522 }
7523 }
7524 return KERN_SUCCESS;
7525 }
7526
7527 /*
7528 * Used to point a pager directly to a range of memory (when the pager may be associated
7529 * with a non-device vnode). Takes a virtual address, an offset, and a size. We currently
7530 * expect that the virtual address will denote the start of a range that is physically contiguous.
7531 */
7532 kern_return_t
7533 pager_map_to_phys_contiguous(
7534 memory_object_control_t object,
7535 memory_object_offset_t offset,
7536 addr64_t base_vaddr,
7537 vm_size_t size)
7538 {
7539 ppnum_t page_num;
7540 boolean_t clobbered_private;
7541 kern_return_t retval;
7542 vm_object_t pager_object;
7543
7544 page_num = pmap_find_phys(kernel_pmap, base_vaddr);
7545
7546 if (!page_num) {
7547 retval = KERN_FAILURE;
7548 goto out;
7549 }
7550
7551 pager_object = memory_object_control_to_vm_object(object);
7552
7553 if (!pager_object) {
7554 retval = KERN_FAILURE;
7555 goto out;
7556 }
7557
7558 clobbered_private = pager_object->private;
7559 if (pager_object->private != TRUE) {
7560 vm_object_lock(pager_object);
7561 VM_OBJECT_SET_PRIVATE(pager_object, TRUE);
7562 vm_object_unlock(pager_object);
7563 }
7564 retval = vm_object_populate_with_private(pager_object, offset, page_num, size);
7565
7566 if (retval != KERN_SUCCESS) {
7567 if (pager_object->private != clobbered_private) {
7568 vm_object_lock(pager_object);
7569 VM_OBJECT_SET_PRIVATE(pager_object, clobbered_private);
7570 vm_object_unlock(pager_object);
7571 }
7572 }
7573
7574 out:
7575 return retval;
7576 }
7577
7578 uint32_t scan_object_collision = 0;
7579
7580 void
7581 vm_object_lock(vm_object_t object)
7582 {
7583 if (object == vm_pageout_scan_wants_object) {
7584 scan_object_collision++;
7585 mutex_pause(2);
7586 }
7587 DTRACE_VM(vm_object_lock_w);
7588 lck_rw_lock_exclusive(&object->Lock);
7589 }
7590
7591 boolean_t
7592 vm_object_lock_avoid(vm_object_t object)
7593 {
7594 if (object == vm_pageout_scan_wants_object) {
7595 scan_object_collision++;
7596 return TRUE;
7597 }
7598 return FALSE;
7599 }
7600
7601 boolean_t
7602 _vm_object_lock_try(vm_object_t object)
7603 {
7604 boolean_t retval;
7605
7606 retval = lck_rw_try_lock_exclusive(&object->Lock);
7607 #if DEVELOPMENT || DEBUG
7608 if (retval == TRUE) {
7609 DTRACE_VM(vm_object_lock_w);
7610 }
7611 #endif
7612 return retval;
7613 }
7614
7615 boolean_t
7616 vm_object_lock_try(vm_object_t object)
7617 {
7618 /*
7619 * Called from hibernate path so check before blocking.
7620 */
7621 if (vm_object_lock_avoid(object) && ml_get_interrupts_enabled() && get_preemption_level() == 0) {
7622 mutex_pause(2);
7623 }
7624 return _vm_object_lock_try(object);
7625 }
7626
7627 /*
7628 * Lock the object exclusive.
7629 *
7630 * Returns true iff the thread had to spin or block before
7631 * acquiring the lock.
7632 */
7633 bool
7634 vm_object_lock_check_contended(vm_object_t object)
7635 {
7636 if (object == vm_pageout_scan_wants_object) {
7637 scan_object_collision++;
7638 mutex_pause(2);
7639 }
7640 DTRACE_VM(vm_object_lock_w);
7641 return lck_rw_lock_exclusive_check_contended(&object->Lock);
7642 }
7643
7644 void
7645 vm_object_lock_shared(vm_object_t object)
7646 {
7647 if (vm_object_lock_avoid(object)) {
7648 mutex_pause(2);
7649 }
7650 DTRACE_VM(vm_object_lock_r);
7651 lck_rw_lock_shared(&object->Lock);
7652 }
7653
7654 boolean_t
7655 vm_object_lock_yield_shared(vm_object_t object)
7656 {
7657 boolean_t retval = FALSE, force_yield = FALSE;
7658
7659 vm_object_lock_assert_shared(object);
7660
7661 force_yield = vm_object_lock_avoid(object);
7662
7663 retval = lck_rw_lock_yield_shared(&object->Lock, force_yield);
7664 if (retval) {
7665 DTRACE_VM(vm_object_lock_yield);
7666 }
7667
7668 return retval;
7669 }
7670
7671 boolean_t
7672 vm_object_lock_try_shared(vm_object_t object)
7673 {
7674 boolean_t retval;
7675
7676 if (vm_object_lock_avoid(object)) {
7677 mutex_pause(2);
7678 }
7679 retval = lck_rw_try_lock_shared(&object->Lock);
7680 if (retval) {
7681 DTRACE_VM(vm_object_lock_r);
7682 }
7683 return retval;
7684 }
7685
7686 boolean_t
7687 vm_object_lock_upgrade(vm_object_t object)
7688 {
7689 boolean_t retval;
7690
7691 retval = lck_rw_lock_shared_to_exclusive(&object->Lock);
7692 #if DEVELOPMENT || DEBUG
7693 if (retval == TRUE) {
7694 DTRACE_VM(vm_object_lock_w);
7695 }
7696 #endif
7697 return retval;
7698 }
7699
7700 void
7701 vm_object_unlock(vm_object_t object)
7702 {
7703 #if DEVELOPMENT || DEBUG
7704 DTRACE_VM(vm_object_unlock);
7705 #endif
7706 lck_rw_done(&object->Lock);
7707 }
7708
7709
7710 unsigned int vm_object_change_wimg_mode_count = 0;
7711
7712 /*
7713 * The object must be locked
7714 */
7715 void
7716 vm_object_change_wimg_mode(vm_object_t object, unsigned int wimg_mode)
7717 {
7718 vm_object_lock_assert_exclusive(object);
7719
7720 vm_object_paging_only_wait(object, THREAD_UNINT);
7721
7722 #if HAS_MTE
7723 if (vm_object_is_mte_mappable(object)) {
7724 panic("Changing WIMG mode on tagged VM object: %d", wimg_mode);
7725 } else if (wimg_mode == VM_WIMG_MTE) {
7726 panic("Changing untagged VM object to VM_WIMG_MTE: %d", object->wimg_bits);
7727 }
7728 #endif /* HAS_MTE */
7729
7730 const unified_page_list_t pmap_batch_list = {
7731 .pageq = &object->memq,
7732 .type = UNIFIED_PAGE_LIST_TYPE_VM_PAGE_OBJ_Q,
7733 };
7734 pmap_batch_set_cache_attributes(&pmap_batch_list, wimg_mode);
7735 object->set_cache_attr = !HAS_DEFAULT_CACHEABILITY(wimg_mode);
7736
7737 object->wimg_bits = wimg_mode;
7738
7739 vm_object_change_wimg_mode_count++;
7740 }
7741
7742 #if CONFIG_FREEZE
7743
/*
 * Shared freezer state, defined elsewhere.  The code below uses its
 * freezer_ctx_chead, freezer_ctx_compressor_scratch_buf and
 * freezer_ctx_uncompressed_pages members.
 */
extern struct freezer_context freezer_context_global;

/*
 * This routine does the "relocation" of previously
 * compressed pages belonging to this object that are
 * residing in a number of compressed segments into
 * a set of compressed segments dedicated to hold
 * compressed pages belonging to this object.
 *
 * NOTE(review): this description appears to belong to
 * vm_object_compressed_freezer_pageout() further down rather than to the
 * declarations that immediately follow it -- confirm.
 */

/* Timestamp of the freezer's last voluntary yield; refreshed after each yield. */
extern AbsoluteTime c_freezer_last_yield_ts;

/* Compressed pages are collected locally and freed in batches of this size. */
#define MAX_FREE_BATCH  32
/* Once more than this many ms have passed since the last yield, yield again. */
#define FREEZER_DUTY_CYCLE_ON_MS 5
/* Value handed to thread_yield_internal() when the freezer yields. */
#define FREEZER_DUTY_CYCLE_OFF_MS 5

static int c_freezer_should_yield(void);
7761
7762
7763 static int
7764 c_freezer_should_yield()
7765 {
7766 AbsoluteTime cur_time;
7767 uint64_t nsecs;
7768
7769 assert(c_freezer_last_yield_ts);
7770 clock_get_uptime(&cur_time);
7771
7772 SUB_ABSOLUTETIME(&cur_time, &c_freezer_last_yield_ts);
7773 absolutetime_to_nanoseconds(cur_time, &nsecs);
7774
7775 if (nsecs > 1000 * 1000 * FREEZER_DUTY_CYCLE_ON_MS) {
7776 return 1;
7777 }
7778 return 0;
7779 }
7780
7781
7782 void
7783 vm_object_compressed_freezer_done()
7784 {
7785 vm_compressor_finished_filling( &(freezer_context_global.freezer_ctx_chead));
7786 }
7787
7788
/*
 * vm_object_compressed_freezer_pageout:
 *
 * Freezer entry point: push the resident pages of an internal object into
 * the compressor, using the segment head and scratch buffer held in
 * freezer_context_global.
 *
 * object        - internal VM object to process (asserted non-NULL and
 *                 internal).  Locked here and unlocked before returning.
 * dirty_budget  - upper bound on the number of pages to compress; the
 *                 main loop stops once paged_out_count reaches it.
 *
 * Returns the number of pages for which vm_pageout_compress_page()
 * reported KERN_SUCCESS.  Returns 0 early if no pager can be set up for the
 * object or if the object has paging or activity in progress.
 */
uint32_t
vm_object_compressed_freezer_pageout(
	vm_object_t object, uint32_t dirty_budget)
{
	vm_page_t                       p;
	vm_page_t                       local_freeq = NULL;     /* pages compressed but not yet freed, linked via vmp_snext */
	int                             local_freed = 0;        /* number of pages currently on local_freeq */
	kern_return_t                   retval = KERN_SUCCESS;
	int                             obj_resident_page_count_snapshot = 0;
	uint32_t                        paged_out_count = 0;

	assert(object != VM_OBJECT_NULL);
	assert(object->internal);

	vm_object_lock(object);

	/*
	 * Make sure the object has an initialized pager: try a collapse
	 * first, and create a compressor pager if that did not provide one.
	 */
	if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) {
		if (!object->pager_initialized) {
			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);

			if (!object->pager_initialized) {
				vm_object_compressor_pager_create(object);
			}
		}

		if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) {
			/* still no usable pager: nothing can be compressed */
			vm_object_unlock(object);
			return paged_out_count;
		}
	}

	/*
	 * We could be freezing a shared internal object that might
	 * be part of some other thread's current VM operations.
	 * We skip it if there's a paging-in-progress or activity-in-progress
	 * because we could be here a long time with the map lock held.
	 *
	 * Note: We are holding the map locked while we wait.
	 * This is fine in the freezer path because the task
	 * is suspended and so this latency is acceptable.
	 *
	 * NOTE(review): the code below returns immediately instead of
	 * waiting, so the "Note" paragraph above looks stale -- confirm.
	 */
	if (object->paging_in_progress || object->activity_in_progress) {
		vm_object_unlock(object);
		return paged_out_count;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
		vm_object_offset_t      curr_offset = 0;

		/*
		 * Go through the object and make sure that any
		 * previously compressed pages are relocated into
		 * a compressed segment associated with our "freezer_chead".
		 */
		while (curr_offset < object->vo_size) {
			curr_offset = vm_compressor_pager_next_compressed(object->pager, curr_offset);

			/* (vm_object_offset_t) -1 ends the walk */
			if (curr_offset == (vm_object_offset_t) -1) {
				break;
			}

			retval = vm_compressor_pager_relocate(object->pager, curr_offset, &(freezer_context_global.freezer_ctx_chead));

			if (retval != KERN_SUCCESS) {
				break;
			}

			curr_offset += PAGE_SIZE_64;
		}
	}

	/*
	 * We can't hold the object lock while heading down into the compressed pager
	 * layer because we might need the kernel map lock down there to allocate new
	 * compressor data structures. And if this same object is mapped in the kernel
	 * and there's a fault on it, then that thread will want the object lock while
	 * holding the kernel map lock.
	 *
	 * Since we are going to drop/grab the object lock repeatedly, we must make sure
	 * we won't be stuck in an infinite loop if the same page(s) keep getting
	 * decompressed. So we grab a snapshot of the number of pages in the object and
	 * we won't process any more than that number of pages.
	 */

	obj_resident_page_count_snapshot = object->resident_page_count;

	/* held across the whole loop; dropped by vm_object_activity_end() below */
	vm_object_activity_begin(object);

	while ((obj_resident_page_count_snapshot--) && !vm_page_queue_empty(&object->memq) && paged_out_count < dirty_budget) {
		/* always work on the page at the head of the object's page queue */
		p = (vm_page_t)vm_page_queue_first(&object->memq);

		KDBG_DEBUG(0xe0430004 | DBG_FUNC_START, object, local_freed);

		vm_page_lockspin_queues();

		if (p->vmp_cleaning || vm_page_is_fictitious(p) ||
		    p->vmp_busy || p->vmp_absent || p->vmp_unusual ||
		    VMP_ERROR_GET(p) || VM_PAGE_WIRED(p)) {
			vm_page_unlock_queues();

			KDBG_DEBUG(0xe0430004 | DBG_FUNC_END, object, local_freed, 1);

			/*
			 * Page can't be processed: rotate it to the tail of
			 * the memq so the next iteration sees a different page.
			 */
			vm_page_queue_remove(&object->memq, p, vmp_listq);
			vm_page_queue_enter(&object->memq, p, vmp_listq);

			continue;
		}

		if (p->vmp_pmapped == TRUE) {
			int             refmod_state, pmap_flags;

			/*
			 * Known-modified pages are disconnected with
			 * PMAP_OPTIONS_COMPRESSOR; otherwise the
			 * "_IFF_MODIFIED" variant is used.
			 */
			if (p->vmp_dirty || p->vmp_precious) {
				pmap_flags = PMAP_OPTIONS_COMPRESSOR;
			} else {
				pmap_flags = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}

			vm_page_lockconvert_queues();
			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(p), pmap_flags, NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				/* pick up a modification recorded by the pmap layer */
				SET_PAGE_DIRTY(p, FALSE);
			}
		}

		if (p->vmp_dirty == FALSE && p->vmp_precious == FALSE) {
			/*
			 * Clean and non-precious page.
			 * Just free it; it does not count towards paged_out_count.
			 */
			vm_page_unlock_queues();
			VM_PAGE_FREE(p);

			KDBG_DEBUG(0xe0430004 | DBG_FUNC_END, object, local_freed, 2);
			continue;
		}

		if (p->vmp_laundry) {
			vm_pageout_steal_laundry(p, TRUE);
		}

		vm_page_queues_remove(p, TRUE);

		vm_page_unlock_queues();


		/*
		 * In case the compressor fails to compress this page, we need it at
		 * the back of the object memq so that we don't keep trying to process it.
		 * Make the move here while we have the object lock held.
		 */

		vm_page_queue_remove(&object->memq, p, vmp_listq);
		vm_page_queue_enter(&object->memq, p, vmp_listq);

		/*
		 * Grab an activity_in_progress here for vm_pageout_compress_page() to consume.
		 *
		 * Mark the page busy so no one messes with it while we have the object lock dropped.
		 */
		p->vmp_busy = TRUE;

		vm_object_activity_begin(object);

		vm_object_unlock(object);

		if (vm_pageout_compress_page(&(freezer_context_global.freezer_ctx_chead),
		    (freezer_context_global.freezer_ctx_compressor_scratch_buf),
		    p) == KERN_SUCCESS) {
			/*
			 * page has already been un-tabled from the object via 'vm_page_remove'
			 */
			p->vmp_snext = local_freeq;
			local_freeq = p;
			local_freed++;
			paged_out_count++;

			/* flush the local list once a full batch has built up */
			if (local_freed >= MAX_FREE_BATCH) {
				OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

				vm_page_free_list(local_freeq, TRUE);

				local_freeq = NULL;
				local_freed = 0;
			}
			freezer_context_global.freezer_ctx_uncompressed_pages++;
		}
		KDBG_DEBUG(0xe0430004 | DBG_FUNC_END, object, local_freed);

		/*
		 * Only consider yielding when no compressed pages are waiting
		 * on the local free list.
		 */
		if (local_freed == 0 && c_freezer_should_yield()) {
			thread_yield_internal(FREEZER_DUTY_CYCLE_OFF_MS);
			clock_get_uptime(&c_freezer_last_yield_ts);
		}

		/* re-take the object lock before looking at the memq again */
		vm_object_lock(object);
	}

	/* free whatever is left from the last, partial batch */
	if (local_freeq) {
		OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

		vm_page_free_list(local_freeq, TRUE);

		local_freeq = NULL;
		local_freed = 0;
	}

	vm_object_activity_end(object);

	vm_object_unlock(object);

	if (c_freezer_should_yield()) {
		thread_yield_internal(FREEZER_DUTY_CYCLE_OFF_MS);
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
	return paged_out_count;
}
8003
8004 #endif /* CONFIG_FREEZE */
8005
8006
/*
 * Statistics kept by vm_object_pageout(), one increment per page examined:
 *   not_on_queue  - page skipped because it was in state VM_PAGE_NOT_ON_Q
 *   not_pageable  - page skipped because it was not pageable and not on an
 *                   active local queue
 *   pageable      - page was pageable
 *   active_local  - page was not pageable but on an active local queue
 */
uint64_t vm_object_pageout_not_on_queue = 0;
uint64_t vm_object_pageout_not_pageable = 0;
uint64_t vm_object_pageout_pageable = 0;
uint64_t vm_object_pageout_active_local = 0;
/*
 * vm_object_pageout:
 *
 * Walk the resident pages of an internal object and push them towards the
 * compressor: clean, non-precious pages are freed outright, every other
 * eligible page is taken off its paging queue and handed to
 * vm_pageout_cluster().
 *
 * Does nothing when no compressor is configured, or when the object is not
 * internal, is terminating, is no longer alive, or cannot be given an
 * initialized pager.  The walk stops early when vm_compressor_low_on_space()
 * reports true.
 *
 * The object lock is taken and released here; it is dropped temporarily
 * while waiting for the internal pageout queue to drain.
 */
void
vm_object_pageout(
	vm_object_t object)
{
	vm_page_t                       p, next;
	struct  vm_pageout_queue        *iq;

	if (!VM_CONFIG_COMPRESSOR_IS_PRESENT) {
		return;
	}

	iq = &vm_pageout_queue_internal;

	assert(object != VM_OBJECT_NULL );

	vm_object_lock(object);

	if (!object->internal ||
	    object->terminating ||
	    !object->alive) {
		vm_object_unlock(object);
		return;
	}

	/*
	 * Make sure the object has an initialized pager: try a collapse
	 * first, and create a compressor pager if that did not provide one.
	 */
	if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) {
		if (!object->pager_initialized) {
			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);

			if (!object->pager_initialized) {
				vm_object_compressor_pager_create(object);
			}
		}

		if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) {
			vm_object_unlock(object);
			return;
		}
	}

	/* the walk restarts here whenever the object lock had to be dropped */
ReScan:
	next = (vm_page_t)vm_page_queue_first(&object->memq);

	while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next)) {
		p = next;
		/* fetch the successor now: "p" may be freed or requeued below */
		next = (vm_page_t)vm_page_queue_next(&next->vmp_listq);

		vm_page_lockspin_queues();

		assert(p->vmp_q_state != VM_PAGE_ON_FREE_Q);
		assert(p->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);

		if ((p->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) ||
		    p->vmp_cleaning ||
		    p->vmp_laundry ||
		    p->vmp_busy ||
		    p->vmp_absent ||
		    VMP_ERROR_GET(p) ||
		    vm_page_is_fictitious(p) ||
		    VM_PAGE_WIRED(p)) {
			/*
			 * Page is already being cleaned or can't be cleaned.
			 */
			vm_page_unlock_queues();
			continue;
		}
		if (p->vmp_q_state == VM_PAGE_NOT_ON_Q) {
			// printf("FBDP %s:%d page %p object %p offset 0x%llx state %d not on queue\n", __FUNCTION__, __LINE__, p, VM_PAGE_OBJECT(p), p->vmp_offset, p->vmp_q_state);
			vm_object_pageout_not_on_queue++;
			vm_page_unlock_queues();
			continue;
		}
		if (!VM_PAGE_PAGEABLE(p)) {
			if (p->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q) {
				/* still processed below, only counted separately */
				vm_object_pageout_active_local++;
			} else {
				vm_object_pageout_not_pageable++;
				vm_page_unlock_queues();
				continue;
			}
		} else {
			vm_object_pageout_pageable++;
		}

		/* give up on the rest of the object if the compressor is nearly full */
		if (vm_compressor_low_on_space()) {
			vm_page_unlock_queues();
			break;
		}

		/* Throw to the pageout queue */

		if (VM_PAGE_Q_THROTTLED(iq)) {
			/*
			 * The internal pageout queue is throttled: ask to be
			 * woken when it drains, drop both locks, block, and
			 * then restart the walk from the head of the memq.
			 */
			iq->pgo_draining = TRUE;

			assert_wait((event_t) (&iq->pgo_laundry + 1),
			    THREAD_INTERRUPTIBLE);
			vm_page_unlock_queues();
			vm_object_unlock(object);

			thread_block(THREAD_CONTINUE_NULL);

			vm_object_lock(object);
			goto ReScan;
		}

		assert(!vm_page_is_fictitious(p));
		assert(!p->vmp_busy);
		assert(!p->vmp_absent);
		assert(!p->vmp_unusual);
		assert(!VMP_ERROR_GET(p));      /* XXX there's a window here where we could have an ECC error! */
		assert(!VM_PAGE_WIRED(p));
		assert(!p->vmp_cleaning);

		if (p->vmp_pmapped == TRUE) {
			int refmod_state;
			int pmap_options;

			/*
			 * Tell pmap the page should be accounted
			 * for as "compressed" if it's been modified.
			 */
			pmap_options =
			    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			if (p->vmp_dirty || p->vmp_precious) {
				/*
				 * We already know it's been modified,
				 * so tell pmap to account for it
				 * as "compressed".
				 */
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			}
			vm_page_lockconvert_queues();
			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(p),
			    pmap_options,
			    NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				/* pick up a modification recorded by the pmap layer */
				SET_PAGE_DIRTY(p, FALSE);
			}
		}

		if (!p->vmp_dirty && !p->vmp_precious) {
			/* clean and not precious: nothing to save, just free it */
			vm_page_unlock_queues();
			VM_PAGE_FREE(p);
			continue;
		}
		/* take the page off its paging queue and hand it to the pageout path */
		vm_page_queues_remove(p, TRUE);

		vm_pageout_cluster(p);

		vm_page_unlock_queues();
	}
	vm_object_unlock(object);
}
8163
8164
8165 #if CONFIG_IOSCHED
8166
8167 void
8168 vm_page_request_reprioritize(vm_object_t o, uint64_t blkno, uint32_t len, int prio)
8169 {
8170 io_reprioritize_req_t req;
8171 struct vnode *devvp = NULL;
8172
8173 if (vnode_pager_get_object_devvp(o->pager, (uintptr_t *)&devvp) != KERN_SUCCESS) {
8174 return;
8175 }
8176
8177 /*
8178 * Create the request for I/O reprioritization.
8179 * We use the noblock variant of zalloc because we're holding the object
8180 * lock here and we could cause a deadlock in low memory conditions.
8181 */
8182 req = (io_reprioritize_req_t)zalloc_noblock(io_reprioritize_req_zone);
8183 if (req == NULL) {
8184 return;
8185 }
8186 req->blkno = blkno;
8187 req->len = len;
8188 req->priority = prio;
8189 req->devvp = devvp;
8190
8191 /* Insert request into the reprioritization list */
8192 mpsc_daemon_enqueue(&io_reprioritize_q, &req->iorr_elm, MPSC_QUEUE_DISABLE_PREEMPTION);
8193
8194 return;
8195 }
8196
8197 void
8198 vm_decmp_upl_reprioritize(upl_t upl, int prio)
8199 {
8200 int offset;
8201 vm_object_t object;
8202 io_reprioritize_req_t req;
8203 struct vnode *devvp = NULL;
8204 uint64_t blkno;
8205 uint32_t len;
8206 upl_t io_upl;
8207 uint64_t *io_upl_reprio_info;
8208 int io_upl_size;
8209
8210 if ((upl->flags & UPL_TRACKED_BY_OBJECT) == 0 || (upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
8211 return;
8212 }
8213
8214 /*
8215 * We dont want to perform any allocations with the upl lock held since that might
8216 * result in a deadlock. If the system is low on memory, the pageout thread would
8217 * try to pageout stuff and might wait on this lock. If we are waiting for the memory to
8218 * be freed up by the pageout thread, it would be a deadlock.
8219 */
8220
8221
8222 /* First step is just to get the size of the upl to find out how big the reprio info is */
8223 if (!upl_try_lock(upl)) {
8224 return;
8225 }
8226
8227 if (upl->decmp_io_upl == NULL) {
8228 /* The real I/O upl was destroyed by the time we came in here. Nothing to do. */
8229 upl_unlock(upl);
8230 return;
8231 }
8232
8233 io_upl = upl->decmp_io_upl;
8234 assert((io_upl->flags & UPL_DECMP_REAL_IO) != 0);
8235 assertf(page_aligned(io_upl->u_offset) && page_aligned(io_upl->u_size),
8236 "upl %p offset 0x%llx size 0x%x\n",
8237 io_upl, io_upl->u_offset, io_upl->u_size);
8238 io_upl_size = io_upl->u_size;
8239 upl_unlock(upl);
8240
8241 /* Now perform the allocation */
8242 io_upl_reprio_info = kalloc_data(sizeof(uint64_t) * atop(io_upl_size), Z_WAITOK);
8243 if (io_upl_reprio_info == NULL) {
8244 return;
8245 }
8246
8247 /* Now again take the lock, recheck the state and grab out the required info */
8248 if (!upl_try_lock(upl)) {
8249 goto out;
8250 }
8251
8252 if (upl->decmp_io_upl == NULL || upl->decmp_io_upl != io_upl) {
8253 /* The real I/O upl was destroyed by the time we came in here. Nothing to do. */
8254 upl_unlock(upl);
8255 goto out;
8256 }
8257 memcpy(io_upl_reprio_info, io_upl->upl_reprio_info,
8258 sizeof(uint64_t) * atop(io_upl_size));
8259
8260 /* Get the VM object for this UPL */
8261 if (io_upl->flags & UPL_SHADOWED) {
8262 object = io_upl->map_object->shadow;
8263 } else {
8264 object = io_upl->map_object;
8265 }
8266
8267 /* Get the dev vnode ptr for this object */
8268 if (!object || !object->pager ||
8269 vnode_pager_get_object_devvp(object->pager, (uintptr_t *)&devvp) != KERN_SUCCESS) {
8270 upl_unlock(upl);
8271 goto out;
8272 }
8273
8274 upl_unlock(upl);
8275
8276 /* Now we have all the information needed to do the expedite */
8277
8278 offset = 0;
8279 while (offset < io_upl_size) {
8280 blkno = io_upl_reprio_info[atop(offset)] & UPL_REPRIO_INFO_MASK;
8281 len = (io_upl_reprio_info[atop(offset)] >> UPL_REPRIO_INFO_SHIFT) & UPL_REPRIO_INFO_MASK;
8282
8283 /*
8284 * This implementation may cause some spurious expedites due to the
8285 * fact that we dont cleanup the blkno & len from the upl_reprio_info
8286 * even after the I/O is complete.
8287 */
8288
8289 if (blkno != 0 && len != 0) {
8290 /* Create the request for I/O reprioritization */
8291 req = zalloc_flags(io_reprioritize_req_zone,
8292 Z_WAITOK | Z_NOFAIL);
8293 req->blkno = blkno;
8294 req->len = len;
8295 req->priority = prio;
8296 req->devvp = devvp;
8297
8298 /* Insert request into the reprioritization list */
8299 mpsc_daemon_enqueue(&io_reprioritize_q, &req->iorr_elm, MPSC_QUEUE_DISABLE_PREEMPTION);
8300
8301 offset += len;
8302 } else {
8303 offset += PAGE_SIZE;
8304 }
8305 }
8306
8307 out:
8308 kfree_data(io_upl_reprio_info, sizeof(uint64_t) * atop(io_upl_size));
8309 }
8310
8311 void
8312 vm_page_handle_prio_inversion(vm_object_t o, vm_page_t m)
8313 {
8314 upl_t upl;
8315 upl_page_info_t *pl;
8316 unsigned int i, num_pages;
8317 int cur_tier;
8318
8319 cur_tier = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
8320
8321 /*
8322 * Scan through all UPLs associated with the object to find the
8323 * UPL containing the contended page.
8324 */
8325 queue_iterate(&o->uplq, upl, upl_t, uplq) {
8326 if (((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) || upl->upl_priority <= cur_tier) {
8327 continue;
8328 }
8329 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
8330 assertf(page_aligned(upl->u_offset) && page_aligned(upl->u_size),
8331 "upl %p offset 0x%llx size 0x%x\n",
8332 upl, upl->u_offset, upl->u_size);
8333 num_pages = (upl->u_size / PAGE_SIZE);
8334
8335 /*
8336 * For each page in the UPL page list, see if it matches the contended
8337 * page and was issued as a low prio I/O.
8338 */
8339 for (i = 0; i < num_pages; i++) {
8340 if (UPL_PAGE_PRESENT(pl, i) && VM_PAGE_GET_PHYS_PAGE(m) == pl[i].phys_addr) {
8341 if ((upl->flags & UPL_DECMP_REQ) && upl->decmp_io_upl) {
8342 KDBG((VMDBG_CODE(DBG_VM_PAGE_EXPEDITE)) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(upl->upl_creator), VM_KERNEL_UNSLIDE_OR_PERM(m),
8343 VM_KERNEL_UNSLIDE_OR_PERM(upl), upl->upl_priority);
8344 vm_decmp_upl_reprioritize(upl, cur_tier);
8345 break;
8346 }
8347 KDBG((VMDBG_CODE(DBG_VM_PAGE_EXPEDITE)) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(upl->upl_creator), VM_KERNEL_UNSLIDE_OR_PERM(m),
8348 upl->upl_reprio_info[i], upl->upl_priority);
8349 if (UPL_REPRIO_INFO_BLKNO(upl, i) != 0 && UPL_REPRIO_INFO_LEN(upl, i) != 0) {
8350 vm_page_request_reprioritize(o, UPL_REPRIO_INFO_BLKNO(upl, i), UPL_REPRIO_INFO_LEN(upl, i), cur_tier);
8351 }
8352 break;
8353 }
8354 }
8355 /* Check if we found any hits */
8356 if (i != num_pages) {
8357 break;
8358 }
8359 }
8360
8361 return;
8362 }
8363
8364 void
8365 kdp_vm_object_sleep_find_owner(
8366 event64_t wait_event,
8367 block_hint_t wait_type,
8368 thread_waitinfo_t *waitinfo)
8369 {
8370 assert(wait_type >= kThreadWaitPagerInit && wait_type <= kThreadWaitPageInThrottle);
8371 vm_object_wait_reason_t wait_reason = wait_type - kThreadWaitPagerInit;
8372 vm_object_t object = (vm_object_t)((uintptr_t)wait_event - wait_reason);
8373 waitinfo->context = VM_KERNEL_ADDRPERM(object);
8374 /*
8375 * There is currently no non-trivial way to ascertain the thread(s)
8376 * currently operating on this object.
8377 */
8378 waitinfo->owner = 0;
8379 }
8380
8381
8382 wait_result_t
8383 vm_object_sleep(
8384 vm_object_t object,
8385 vm_object_wait_reason_t reason,
8386 wait_interrupt_t interruptible,
8387 lck_sleep_action_t action)
8388 {
8389 wait_result_t wr;
8390 block_hint_t block_hint;
8391 event_t wait_event;
8392
8393 vm_object_lock_assert_exclusive(object);
8394 assert(reason >= 0 && reason <= VM_OBJECT_EVENT_MAX);
8395 switch (reason) {
8396 case VM_OBJECT_EVENT_PL_REQ_IN_PROGRESS:
8397 block_hint = kThreadWaitPagerInit; /* XXX change that */
8398 break;
8399 case VM_OBJECT_EVENT_PAGER_READY:
8400 block_hint = kThreadWaitPagerReady;
8401 break;
8402 case VM_OBJECT_EVENT_PAGING_IN_PROGRESS:
8403 block_hint = kThreadWaitPagingActivity;
8404 break;
8405 case VM_OBJECT_EVENT_MAPPING_IN_PROGRESS:
8406 block_hint = kThreadWaitMappingInProgress;
8407 break;
8408 case VM_OBJECT_EVENT_UNBLOCKED:
8409 block_hint = kThreadWaitMemoryBlocked;
8410 break;
8411 case VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS:
8412 block_hint = kThreadWaitPagingInProgress;
8413 break;
8414 case VM_OBJECT_EVENT_PAGEIN_THROTTLE:
8415 block_hint = kThreadWaitPageInThrottle;
8416 break;
8417 default:
8418 panic("Unexpected wait reason %u", reason);
8419 }
8420 thread_set_pending_block_hint(current_thread(), block_hint);
8421
8422 KDBG_FILTERED(VMDBG_CODE(DBG_VM_OBJECT_SLEEP) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(object), reason);
8423
8424 vm_object_set_wanted(object, reason);
8425 wait_event = (event_t)((uintptr_t)object + (uintptr_t)reason);
8426 wr = lck_rw_sleep(&object->Lock, LCK_SLEEP_PROMOTED_PRI | action, wait_event, interruptible);
8427
8428 KDBG_FILTERED(VMDBG_CODE(DBG_VM_OBJECT_SLEEP) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(object), reason, wr);
8429 return wr;
8430 }
8431
8432 wait_result_t
8433 vm_object_pl_req_wait(vm_object_t object, wait_interrupt_t interruptible)
8434 {
8435 wait_result_t wr = THREAD_NOT_WAITING;
8436 vm_object_lock_assert_exclusive(object);
8437 while (object->vmo_pl_req_in_progress != 0) {
8438 wr = vm_object_sleep(object,
8439 VM_OBJECT_EVENT_PL_REQ_IN_PROGRESS,
8440 interruptible,
8441 LCK_SLEEP_EXCLUSIVE);
8442 if (wr != THREAD_AWAKENED) {
8443 break;
8444 }
8445 }
8446 return wr;
8447 }
8448
8449 wait_result_t
8450 vm_object_paging_wait(vm_object_t object, wait_interrupt_t interruptible)
8451 {
8452 wait_result_t wr = THREAD_NOT_WAITING;
8453 vm_object_lock_assert_exclusive(object);
8454 while (object->paging_in_progress != 0 ||
8455 object->activity_in_progress != 0) {
8456 wr = vm_object_sleep((object),
8457 VM_OBJECT_EVENT_PAGING_IN_PROGRESS,
8458 interruptible,
8459 LCK_SLEEP_EXCLUSIVE);
8460 if (wr != THREAD_AWAKENED) {
8461 break;
8462 }
8463 }
8464 return wr;
8465 }
8466
8467 wait_result_t
8468 vm_object_paging_only_wait(vm_object_t object, wait_interrupt_t interruptible)
8469 {
8470 wait_result_t wr = THREAD_NOT_WAITING;
8471 vm_object_lock_assert_exclusive(object);
8472 while (object->paging_in_progress != 0) {
8473 wr = vm_object_sleep(object,
8474 VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS,
8475 interruptible,
8476 LCK_SLEEP_EXCLUSIVE);
8477 if (wr != THREAD_AWAKENED) {
8478 break;
8479 }
8480 }
8481 return wr;
8482 }
8483
8484 wait_result_t
8485 vm_object_paging_throttle_wait(vm_object_t object, wait_interrupt_t interruptible)
8486 {
8487 wait_result_t wr = THREAD_NOT_WAITING;
8488 vm_object_lock_assert_exclusive(object);
8489 /*
8490 * TODO: consider raising the throttle limit specifically for
8491 * shared-cache objects, which are expected to be highly contended.
8492 * (rdar://127899888)
8493 */
8494 while (object->paging_in_progress >= vm_object_pagein_throttle) {
8495 wr = vm_object_sleep(object,
8496 VM_OBJECT_EVENT_PAGEIN_THROTTLE,
8497 interruptible,
8498 LCK_SLEEP_EXCLUSIVE);
8499 if (wr != THREAD_AWAKENED) {
8500 break;
8501 }
8502 }
8503 return wr;
8504 }
8505
8506 wait_result_t
8507 vm_object_mapping_wait(vm_object_t object, wait_interrupt_t interruptible)
8508 {
8509 wait_result_t wr = THREAD_NOT_WAITING;
8510 vm_object_lock_assert_exclusive(object);
8511 while (object->mapping_in_progress) {
8512 wr = vm_object_sleep(object,
8513 VM_OBJECT_EVENT_MAPPING_IN_PROGRESS,
8514 interruptible,
8515 LCK_SLEEP_EXCLUSIVE);
8516 if (wr != THREAD_AWAKENED) {
8517 break;
8518 }
8519 }
8520 return wr;
8521 }
8522
8523 void
8524 vm_object_wakeup(
8525 vm_object_t object,
8526 vm_object_wait_reason_t reason)
8527 {
8528 vm_object_lock_assert_exclusive(object);
8529 assert(reason >= 0 && reason <= VM_OBJECT_EVENT_MAX);
8530
8531 if (vm_object_wanted(object, reason)) {
8532 thread_wakeup((event_t)((uintptr_t)object + (uintptr_t)reason));
8533 }
8534 object->all_wanted &= ~(1 << reason);
8535 }
8536
8537
8538 void
8539 kdp_vm_page_sleep_find_owner(event64_t wait_event, thread_waitinfo_t *waitinfo)
8540 {
8541 vm_page_t m = (vm_page_t)wait_event;
8542 waitinfo->context = VM_KERNEL_ADDRPERM(m);
8543 /*
8544 * There is not currently a non-trivial way to identify the thread
8545 * holding a page busy.
8546 */
8547 waitinfo->owner = 0;
8548 }
8549
8550 #if PAGE_SLEEP_WITH_INHERITOR
8551 static wait_result_t vm_page_sleep_with_inheritor(lck_rw_t *lck, lck_sleep_action_t lck_sleep_action, event_t event, wait_interrupt_t interruptible);
8552 #endif /* PAGE_SLEEP_WITH_INHERITOR */
8553
8554 wait_result_t
8555 vm_page_sleep(vm_object_t object, vm_page_t m, wait_interrupt_t interruptible, lck_sleep_action_t action)
8556 {
8557 wait_result_t ret;
8558
8559 KDBG_FILTERED((VMDBG_CODE(DBG_VM_PAGE_SLEEP)) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(object), m->vmp_offset, VM_KERNEL_ADDRHIDE(m));
8560 #if CONFIG_IOSCHED
8561 if (object->io_tracking && ((m->vmp_busy == TRUE) || (m->vmp_cleaning == TRUE) || VM_PAGE_WIRED(m))) {
8562 /*
8563 * Indicates page is busy due to an I/O. Issue a reprioritize request if necessary.
8564 */
8565 vm_page_handle_prio_inversion(object, m);
8566 }
8567 #endif /* CONFIG_IOSCHED */
8568 m->vmp_wanted = TRUE;
8569 thread_set_pending_block_hint(current_thread(), kThreadWaitPageBusy);
8570 #if PAGE_SLEEP_WITH_INHERITOR
8571 ret = vm_page_sleep_with_inheritor(&object->Lock, action, (event_t)m, interruptible);
8572 #else
8573 ret = lck_rw_sleep(&object->Lock, LCK_SLEEP_PROMOTED_PRI | action, (event_t)m, interruptible);
8574 #endif
8575 KDBG_FILTERED((VMDBG_CODE(DBG_VM_PAGE_SLEEP)) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(object), m->vmp_offset, VM_KERNEL_ADDRHIDE(m));
8576 return ret;
8577 }
8578
8579 void
8580 vm_page_wakeup(vm_object_t object, vm_page_t m)
8581 {
8582 assert(m);
8583 /*
8584 * The page may have been freed from its object before this wakeup is issued
8585 */
8586 if (object != VM_OBJECT_NULL) {
8587 vm_object_lock_assert_exclusive(object);
8588 }
8589
8590 if (m->vmp_wanted) {
8591 KDBG(VMDBG_CODE(DBG_VM_PAGE_WAKEUP) | DBG_FUNC_NONE,
8592 VM_KERNEL_ADDRHIDE(object), m->vmp_offset,
8593 VM_KERNEL_ADDRHIDE(m));
8594 m->vmp_wanted = false;
8595 thread_wakeup((event_t)m);
8596 }
8597 }
8598
8599 void
8600 vm_page_wakeup_done(__assert_only vm_object_t object, vm_page_t m)
8601 {
8602 assert(object);
8603 assert(m->vmp_busy);
8604 vm_object_lock_assert_exclusive(object);
8605
8606 KDBG(VMDBG_CODE(DBG_VM_PAGE_WAKEUP_DONE) | DBG_FUNC_NONE,
8607 VM_KERNEL_ADDRHIDE(object), m->vmp_offset,
8608 VM_KERNEL_ADDRHIDE(m), m->vmp_wanted);
8609 m->vmp_busy = false;
8610 vm_page_wakeup(object, m);
8611 }
8612
8613 #if PAGE_SLEEP_WITH_INHERITOR
8614 static bool page_worker_unregister_worker(event_t event, thread_t expect_th, page_worker_token_t *token);
8615 #endif /* PAGE_SLEEP_WITH_INHERITOR */
8616
8617 /* This function duplicates all of what vm_page_wakeup_done() does and adds the option
8618 * that we're being called from vm_fault_page() in a page that is possibly boosted due to being an inheritor*/
8619 void
8620 vm_page_wakeup_done_with_inheritor(vm_object_t object __unused, vm_page_t m, page_worker_token_t *token __unused)
8621 {
8622 #if PAGE_SLEEP_WITH_INHERITOR
8623 assert(object);
8624 assert(m->vmp_busy);
8625 vm_object_lock_assert_exclusive(object);
8626
8627 bool had_inheritor = page_worker_unregister_worker((event_t)m, current_thread(), token);
8628
8629 KDBG(VMDBG_CODE(DBG_VM_PAGE_WAKEUP_DONE) | DBG_FUNC_NONE,
8630 VM_KERNEL_ADDRHIDE(object), VM_KERNEL_ADDRHIDE(m),
8631 m->vmp_wanted, had_inheritor);
8632 m->vmp_busy = FALSE;
8633
8634 if (m->vmp_wanted) {
8635 m->vmp_wanted = FALSE;
8636 if (had_inheritor) {
8637 wakeup_all_with_inheritor((event_t)m, THREAD_AWAKENED);
8638 } else {
8639 thread_wakeup((event_t)m);
8640 }
8641 }
8642 #else /* PAGE_SLEEP_WITH_INHERITOR */
8643 vm_page_wakeup_done(object, m);
8644 #endif /* PAGE_SLEEP_WITH_INHERITOR */
8645 }
8646
8647 #if PAGE_SLEEP_WITH_INHERITOR
8648
8649 /*
8650 * vm_page_sleep_with_inheritor:
8651 * The goal of this functionality is to prevent priority inversion that can occur when a low-priority
8652 * thread is stuck in the compressor and a higher priority thread waits for the same page.
8653 * Just before vm_fault_page() calls into the compressor it calls page_worker_register_worker()
8654 * this registers the calling thread as the "page worker" of this page.
8655 * When another thread then tries to vm_page_sleep() on that page, (wait for it to un-busy) the worker is found and
8656 * instead of a plain thread_block() (in lck_rw_sleep()) we do lck_rw_sleep_with_inheritor() and give the registered
8657 * worker thread as the inheritor of the priority boost.
8658 * The worker thread might have started its work on a low priority, and when a waiter was added, it got boost.
8659 * When the worker is done getting the page it calls vm_page_wakeup_done_with_inheritor() instead of
8660 * vm_page_wakeup_done() this unregisters the thread, clears the page busy bit (so that now other threads can
8661 * use this page), and wakes up any waiters waiting for that page with wakeup_all_with_inheritor(), which
8662 * removes the priority boost.
8663 *
8664 * The worker registration is done in a simple single entry per bucket hash table. A hash collision may occur
8665 * if two faulting pages end up in the same entry. In this case, the registration of the second one is going to
8666 * fail and the only repercussions of this is that it would not get the possible boost if anyone is going to wait
8667 * on it. This implementation was selected over a full hash-table to keep it simple and fast.
8668 */
8669
8670 struct page_worker {
8671 lck_ticket_t pw_entry_lock;
8672 event_t pw_owner_event;
8673 thread_t pw_current_worker;
8674 };
8675
8676 SECURITY_READ_ONLY_LATE(uint32_t) page_worker_table_size = 0;
8677 SECURITY_READ_ONLY_LATE(static struct page_worker *)page_worker_table = NULL;
8678 SCALABLE_COUNTER_DEFINE(page_worker_hash_collisions);
8679 SCALABLE_COUNTER_DEFINE(page_worker_inheritor_sleeps);
8680
8681 LCK_GRP_DECLARE(page_worker_table_lock_grp, "page_worker_table_locks");
8682
8683 #define page_worker_entry_unlock(entry) \
8684 lck_ticket_unlock(&entry->pw_entry_lock);
8685
8686 #define PAGE_WORKER_TABLE_BUCKETS (256)
8687
8688 void
8689 page_worker_init(void)
8690 {
8691 page_worker_table_size = PAGE_WORKER_TABLE_BUCKETS;
8692 #if DEVELOPMENT || DEBUG
8693 PE_parse_boot_argn("page_worker_table_size", &page_worker_table_size, sizeof(page_worker_table_size));
8694 #endif /* DEVELOPMENT || DEBUG */
8695 /* This checks that the size is a positive power of 2, needed for the hash function */
8696 assert(page_worker_table_size > 0 && !(page_worker_table_size & (page_worker_table_size - 1)));
8697
8698 page_worker_table = zalloc_permanent(page_worker_table_size * sizeof(struct page_worker), ZALIGN_PTR);
8699 if (page_worker_table == NULL) {
8700 panic("Page events hash table memory allocation failed!");
8701 }
8702 for (uint32_t i = 0; i < page_worker_table_size; ++i) {
8703 struct page_worker* we = &(page_worker_table[i]);
8704 lck_ticket_init(&we->pw_entry_lock, &page_worker_table_lock_grp);
8705 }
8706 }
8707
8708 static struct page_worker *
8709 page_worker_lock_table_entry(event_t event)
8710 {
8711 if (page_worker_table == NULL) {
8712 return NULL;
8713 }
8714 uint32_t hash = os_hash_kernel_pointer((void *)event);
8715 uint32_t index = hash & (page_worker_table_size - 1);
8716
8717 struct page_worker *entry = &page_worker_table[index];
8718
8719 lck_ticket_lock(&entry->pw_entry_lock, &page_worker_table_lock_grp);
8720 return entry;
8721 }
8722
8723 /* returns a locked entry if found or added, otherwise returns NULL */
8724 static struct page_worker *
8725 page_worker_lookup(event_t event, bool try_add_missing)
8726 {
8727 assert(event != NULL);
8728 struct page_worker *entry = page_worker_lock_table_entry(event);
8729 if (entry == NULL) {
8730 /* table not initialized */
8731 return NULL;
8732 }
8733 if (entry->pw_owner_event == event) {
8734 /* found existing entry and it belongs to this event */
8735 return entry;
8736 }
8737
8738 if (try_add_missing) {
8739 if (entry->pw_owner_event == NULL) {
8740 /* found empty entry, take over it */
8741 entry->pw_owner_event = event;
8742 return entry;
8743 }
8744 /* didn't find the event, need to add it, but can't because it's occupied */
8745 counter_inc(&page_worker_hash_collisions);
8746 }
8747 page_worker_entry_unlock(entry);
8748 return NULL;
8749 }
8750
8751 /* returns true if current_thread() was successfully registered as worker */
8752 void
8753 page_worker_register_worker(event_t event __unused, page_worker_token_t *out_token)
8754 {
8755 out_token->pwt_did_register_inheritor = false;
8756 out_token->pwt_floor_token.thread = THREAD_NULL;
8757
8758 struct page_worker* entry = page_worker_lookup(event, TRUE);
8759 if (entry == NULL) {
8760 /* failed registration due to a hash collision */
8761 out_token->pwt_floor_token = thread_priority_floor_start();
8762 return;
8763 }
8764 entry->pw_current_worker = current_thread();
8765 /* no need to take the thread reference because this is going to get cleared in the same call of vm_page_fault() */
8766 page_worker_entry_unlock(entry);
8767 out_token->pwt_did_register_inheritor = true;
8768 }
8769
8770 static bool
8771 page_worker_unregister_worker(event_t event, thread_t expect_th __unused, page_worker_token_t *token)
8772 {
8773 struct page_worker *entry = page_worker_lookup(event, FALSE);
8774 if (entry == NULL) {
8775 assert(!token->pwt_did_register_inheritor);
8776 /* did we do thread_priority_floor_start() ? */
8777 if (token->pwt_floor_token.thread != THREAD_NULL) {
8778 thread_priority_floor_end(&token->pwt_floor_token);
8779 }
8780 return false;
8781 }
8782 assert(token->pwt_did_register_inheritor);
8783 assert(token->pwt_floor_token.thread == THREAD_NULL); /* we shouldn't have done thread_priority_floor_start() */
8784 assert(entry->pw_owner_event != 0);
8785 assert(entry->pw_current_worker == expect_th);
8786 entry->pw_owner_event = 0;
8787 entry->pw_current_worker = THREAD_NULL;
8788 page_worker_entry_unlock(entry); /* was locked in page_worker_lookup() */
8789 return true;
8790 }
8791
8792 static wait_result_t
8793 vm_page_sleep_with_inheritor(lck_rw_t *lck, lck_sleep_action_t action, event_t event, wait_interrupt_t interruptible)
8794 {
8795 struct page_worker *entry = page_worker_lookup(event, FALSE);
8796 thread_t inheritor = THREAD_NULL;
8797 if (entry != NULL) {
8798 inheritor = entry->pw_current_worker;
8799 page_worker_entry_unlock(entry);
8800 }
8801
8802 wait_result_t ret;
8803 if (inheritor == THREAD_NULL) {
8804 /* no worker was found */
8805 ret = lck_rw_sleep(lck, LCK_SLEEP_PROMOTED_PRI | action, event, interruptible);
8806 } else {
8807 counter_inc(&page_worker_inheritor_sleeps);
8808 ret = lck_rw_sleep_with_inheritor(lck, action, event, inheritor, interruptible, TIMEOUT_WAIT_FOREVER);
8809 }
8810
8811 return ret;
8812 }
8813 #endif /* PAGE_SLEEP_WITH_INHERITOR */
8814
8815 static void
8816 io_reprioritize(mpsc_queue_chain_t elm, __assert_only mpsc_daemon_queue_t dq)
8817 {
8818 assert3p(dq, ==, &io_reprioritize_q);
8819 io_reprioritize_req_t req = mpsc_queue_element(elm, struct io_reprioritize_req, iorr_elm);
8820 vnode_pager_issue_reprioritize_io(req->devvp, req->blkno, req->len, req->priority);
8821 zfree(io_reprioritize_req_zone, req);
8822 }
8823
8824 #endif /* CONFIG_IOSCHED */
8825
8826 #if VM_OBJECT_ACCESS_TRACKING
8827 void
8828 vm_object_access_tracking(
8829 vm_object_t object,
8830 int *access_tracking_p,
8831 uint32_t *access_tracking_reads_p,
8832 uint32_t *access_tracking_writes_p)
8833 {
8834 int access_tracking;
8835
8836 access_tracking = !!*access_tracking_p;
8837
8838 vm_object_lock(object);
8839 *access_tracking_p = object->access_tracking;
8840 if (access_tracking_reads_p) {
8841 *access_tracking_reads_p = object->access_tracking_reads;
8842 }
8843 if (access_tracking_writes_p) {
8844 *access_tracking_writes_p = object->access_tracking_writes;
8845 }
8846 object->access_tracking = access_tracking;
8847 object->access_tracking_reads = 0;
8848 object->access_tracking_writes = 0;
8849 vm_object_unlock(object);
8850
8851 if (access_tracking) {
8852 vm_object_pmap_protect_options(object,
8853 0,
8854 object->vo_size,
8855 PMAP_NULL,
8856 PAGE_SIZE,
8857 0,
8858 VM_PROT_NONE,
8859 0);
8860 }
8861 }
8862 #endif /* VM_OBJECT_ACCESS_TRACKING */
8863
8864 void
8865 vm_object_ledger_tag_ledgers(
8866 vm_object_t object,
8867 int *ledger_idx_volatile,
8868 int *ledger_idx_nonvolatile,
8869 int *ledger_idx_volatile_compressed,
8870 int *ledger_idx_nonvolatile_compressed,
8871 int *ledger_idx_composite,
8872 int *ledger_idx_external_wired,
8873 boolean_t *do_footprint)
8874 {
8875 assert(object->shadow == VM_OBJECT_NULL);
8876
8877 *ledger_idx_volatile = -1;
8878 *ledger_idx_nonvolatile = -1;
8879 *ledger_idx_volatile_compressed = -1;
8880 *ledger_idx_nonvolatile_compressed = -1;
8881 *ledger_idx_composite = -1;
8882 *ledger_idx_external_wired = -1;
8883 *do_footprint = !object->vo_no_footprint;
8884
8885 if (!object->internal) {
8886 switch (object->vo_ledger_tag) {
8887 case VM_LEDGER_TAG_DEFAULT:
8888 if (*do_footprint) {
8889 *ledger_idx_external_wired = task_ledgers.tagged_footprint;
8890 } else {
8891 *ledger_idx_external_wired = task_ledgers.tagged_nofootprint;
8892 }
8893 break;
8894 case VM_LEDGER_TAG_NETWORK:
8895 *do_footprint = FALSE;
8896 *ledger_idx_external_wired = task_ledgers.network_nonvolatile;
8897 break;
8898 case VM_LEDGER_TAG_MEDIA:
8899 if (*do_footprint) {
8900 *ledger_idx_external_wired = task_ledgers.media_footprint;
8901 } else {
8902 *ledger_idx_external_wired = task_ledgers.media_nofootprint;
8903 }
8904 break;
8905 case VM_LEDGER_TAG_GRAPHICS:
8906 if (*do_footprint) {
8907 *ledger_idx_external_wired = task_ledgers.graphics_footprint;
8908 } else {
8909 *ledger_idx_external_wired = task_ledgers.graphics_nofootprint;
8910 }
8911 break;
8912 case VM_LEDGER_TAG_NEURAL:
8913 *ledger_idx_composite = task_ledgers.neural_nofootprint_total;
8914 if (*do_footprint) {
8915 *ledger_idx_external_wired = task_ledgers.neural_footprint;
8916 } else {
8917 *ledger_idx_external_wired = task_ledgers.neural_nofootprint;
8918 }
8919 break;
8920 case VM_LEDGER_TAG_NONE:
8921 default:
8922 panic("%s: external object %p has unsupported ledger_tag %d",
8923 __FUNCTION__, object, object->vo_ledger_tag);
8924 }
8925 return;
8926 }
8927
8928 assert(object->internal);
8929 switch (object->vo_ledger_tag) {
8930 case VM_LEDGER_TAG_NONE:
8931 /*
8932 * Regular purgeable memory:
8933 * counts in footprint only when nonvolatile.
8934 */
8935 *do_footprint = TRUE;
8936 assert(object->purgable != VM_PURGABLE_DENY);
8937 *ledger_idx_volatile = task_ledgers.purgeable_volatile;
8938 *ledger_idx_nonvolatile = task_ledgers.purgeable_nonvolatile;
8939 *ledger_idx_volatile_compressed = task_ledgers.purgeable_volatile_compressed;
8940 *ledger_idx_nonvolatile_compressed = task_ledgers.purgeable_nonvolatile_compressed;
8941 break;
8942 case VM_LEDGER_TAG_DEFAULT:
8943 /*
8944 * "default" tagged memory:
8945 * counts in footprint only when nonvolatile and not marked
8946 * as "no_footprint".
8947 */
8948 *ledger_idx_volatile = task_ledgers.tagged_nofootprint;
8949 *ledger_idx_volatile_compressed = task_ledgers.tagged_nofootprint_compressed;
8950 if (*do_footprint) {
8951 *ledger_idx_nonvolatile = task_ledgers.tagged_footprint;
8952 *ledger_idx_nonvolatile_compressed = task_ledgers.tagged_footprint_compressed;
8953 } else {
8954 *ledger_idx_nonvolatile = task_ledgers.tagged_nofootprint;
8955 *ledger_idx_nonvolatile_compressed = task_ledgers.tagged_nofootprint_compressed;
8956 }
8957 break;
8958 case VM_LEDGER_TAG_NETWORK:
8959 /*
8960 * "network" tagged memory:
8961 * never counts in footprint.
8962 */
8963 *do_footprint = FALSE;
8964 *ledger_idx_volatile = task_ledgers.network_volatile;
8965 *ledger_idx_volatile_compressed = task_ledgers.network_volatile_compressed;
8966 *ledger_idx_nonvolatile = task_ledgers.network_nonvolatile;
8967 *ledger_idx_nonvolatile_compressed = task_ledgers.network_nonvolatile_compressed;
8968 break;
8969 case VM_LEDGER_TAG_MEDIA:
8970 /*
8971 * "media" tagged memory:
8972 * counts in footprint only when nonvolatile and not marked
8973 * as "no footprint".
8974 */
8975 *ledger_idx_volatile = task_ledgers.media_nofootprint;
8976 *ledger_idx_volatile_compressed = task_ledgers.media_nofootprint_compressed;
8977 if (*do_footprint) {
8978 *ledger_idx_nonvolatile = task_ledgers.media_footprint;
8979 *ledger_idx_nonvolatile_compressed = task_ledgers.media_footprint_compressed;
8980 } else {
8981 *ledger_idx_nonvolatile = task_ledgers.media_nofootprint;
8982 *ledger_idx_nonvolatile_compressed = task_ledgers.media_nofootprint_compressed;
8983 }
8984 break;
8985 case VM_LEDGER_TAG_GRAPHICS:
8986 /*
8987 * "graphics" tagged memory:
8988 * counts in footprint only when nonvolatile and not marked
8989 * as "no footprint".
8990 */
8991 *ledger_idx_volatile = task_ledgers.graphics_nofootprint;
8992 *ledger_idx_volatile_compressed = task_ledgers.graphics_nofootprint_compressed;
8993 if (*do_footprint) {
8994 *ledger_idx_nonvolatile = task_ledgers.graphics_footprint;
8995 *ledger_idx_nonvolatile_compressed = task_ledgers.graphics_footprint_compressed;
8996 } else {
8997 *ledger_idx_nonvolatile = task_ledgers.graphics_nofootprint;
8998 *ledger_idx_nonvolatile_compressed = task_ledgers.graphics_nofootprint_compressed;
8999 }
9000 break;
9001 case VM_LEDGER_TAG_NEURAL:
9002 /*
9003 * "neural" tagged memory:
9004 * counts in footprint only when nonvolatile and not marked
9005 * as "no footprint".
9006 */
9007 *ledger_idx_composite = task_ledgers.neural_nofootprint_total;
9008 *ledger_idx_volatile = task_ledgers.neural_nofootprint;
9009 *ledger_idx_volatile_compressed = task_ledgers.neural_nofootprint_compressed;
9010 if (*do_footprint) {
9011 *ledger_idx_nonvolatile = task_ledgers.neural_footprint;
9012 *ledger_idx_nonvolatile_compressed = task_ledgers.neural_footprint_compressed;
9013 } else {
9014 *ledger_idx_nonvolatile = task_ledgers.neural_nofootprint;
9015 *ledger_idx_nonvolatile_compressed = task_ledgers.neural_nofootprint_compressed;
9016 }
9017 break;
9018 default:
9019 panic("%s: object %p has unsupported ledger_tag %d",
9020 __FUNCTION__, object, object->vo_ledger_tag);
9021 }
9022 }
9023
9024 kern_return_t
9025 vm_object_ownership_change(
9026 vm_object_t object,
9027 int new_ledger_tag,
9028 task_t new_owner,
9029 int new_ledger_flags,
9030 boolean_t old_task_objq_locked)
9031 {
9032 int old_ledger_tag;
9033 task_t old_owner;
9034 int resident_count, wired_count;
9035 unsigned int compressed_count;
9036 int ledger_idx_volatile;
9037 int ledger_idx_nonvolatile;
9038 int ledger_idx_volatile_compressed;
9039 int ledger_idx_nonvolatile_compressed;
9040 int ledger_idx;
9041 int ledger_idx_compressed;
9042 int ledger_idx_composite;
9043 int ledger_idx_external_wired;
9044 boolean_t do_footprint, old_no_footprint, new_no_footprint;
9045 boolean_t new_task_objq_locked;
9046
9047 vm_object_lock_assert_exclusive(object);
9048
9049 if (new_owner != VM_OBJECT_OWNER_DISOWNED &&
9050 new_owner != TASK_NULL) {
9051 if (new_ledger_tag == VM_LEDGER_TAG_NONE &&
9052 object->purgable == VM_PURGABLE_DENY) {
9053 /* non-purgeable memory must have a valid non-zero ledger tag */
9054 return KERN_INVALID_ARGUMENT;
9055 }
9056 if (!object->internal
9057 && !memory_object_is_vnode_pager(object->pager)) {
9058 /* non-file-backed "external" objects can't be owned */
9059 return KERN_INVALID_ARGUMENT;
9060 }
9061 }
9062 if (new_owner == VM_OBJECT_OWNER_UNCHANGED) {
9063 /* leave owner unchanged */
9064 new_owner = VM_OBJECT_OWNER(object);
9065 }
9066 if (new_ledger_tag == VM_LEDGER_TAG_UNCHANGED) {
9067 /* leave ledger_tag unchanged */
9068 new_ledger_tag = object->vo_ledger_tag;
9069 }
9070 if (new_ledger_tag < 0 ||
9071 new_ledger_tag > VM_LEDGER_TAG_MAX) {
9072 return KERN_INVALID_ARGUMENT;
9073 }
9074 if (new_ledger_flags & ~VM_LEDGER_FLAGS_ALL) {
9075 return KERN_INVALID_ARGUMENT;
9076 }
9077 if (object->internal &&
9078 object->vo_ledger_tag == VM_LEDGER_TAG_NONE &&
9079 object->purgable == VM_PURGABLE_DENY) {
9080 /*
9081 * This VM object is neither ledger-tagged nor purgeable.
9082 * We can convert it to "ledger tag" ownership iff it
9083 * has not been used at all yet (no resident pages and
9084 * no pager) and it's going to be assigned to a valid task.
9085 */
9086 if (object->resident_page_count != 0 ||
9087 object->pager != NULL ||
9088 object->pager_created ||
9089 os_ref_get_count_raw(&object->ref_count) != 1 ||
9090 object->vo_owner != TASK_NULL ||
9091 object->copy_strategy != MEMORY_OBJECT_COPY_NONE ||
9092 new_owner == TASK_NULL) {
9093 return KERN_FAILURE;
9094 }
9095 }
9096
9097 if (new_ledger_flags & VM_LEDGER_FLAG_NO_FOOTPRINT) {
9098 new_no_footprint = TRUE;
9099 } else {
9100 new_no_footprint = FALSE;
9101 }
9102 #if __arm64__
9103 if (!new_no_footprint &&
9104 object->purgable != VM_PURGABLE_DENY &&
9105 new_owner != TASK_NULL &&
9106 new_owner != VM_OBJECT_OWNER_DISOWNED &&
9107 new_owner->task_legacy_footprint) {
9108 /*
9109 * This task has been granted "legacy footprint" and should
9110 * not be charged for its IOKit purgeable memory. Since we
9111 * might now change the accounting of such memory to the
9112 * "graphics" ledger, for example, give it the "no footprint"
9113 * option.
9114 */
9115 new_no_footprint = TRUE;
9116 }
9117 #endif /* __arm64__ */
9118 assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
9119 assert(object->shadow == VM_OBJECT_NULL);
9120 if (object->internal) {
9121 assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
9122 assert(object->vo_copy == VM_OBJECT_NULL);
9123 }
9124
9125 old_ledger_tag = object->vo_ledger_tag;
9126 old_no_footprint = object->vo_no_footprint;
9127 old_owner = VM_OBJECT_OWNER(object);
9128
9129 if (__improbable(vm_debug_events)) {
9130 DTRACE_VM8(object_ownership_change,
9131 vm_object_t, object,
9132 task_t, old_owner,
9133 int, old_ledger_tag,
9134 int, old_no_footprint,
9135 task_t, new_owner,
9136 int, new_ledger_tag,
9137 int, new_no_footprint,
9138 int, VM_OBJECT_ID(object));
9139 }
9140
9141 resident_count = object->resident_page_count - object->wired_page_count;
9142 wired_count = object->wired_page_count;
9143 if (object->internal) {
9144 compressed_count = vm_compressor_pager_get_count(object->pager);
9145 } else {
9146 compressed_count = 0;
9147 }
9148
9149 /*
9150 * Deal with the old owner and/or ledger tag, if needed.
9151 */
9152 if (old_owner != TASK_NULL &&
9153 ((old_owner != new_owner) /* new owner ... */
9154 || /* ... or ... */
9155 (old_no_footprint != new_no_footprint) /* new "no_footprint" */
9156 || /* ... or ... */
9157 old_ledger_tag != new_ledger_tag)) { /* ... new ledger */
9158 /*
9159 * Take this object off of the old owner's ledgers.
9160 */
9161 vm_object_ledger_tag_ledgers(object,
9162 &ledger_idx_volatile,
9163 &ledger_idx_nonvolatile,
9164 &ledger_idx_volatile_compressed,
9165 &ledger_idx_nonvolatile_compressed,
9166 &ledger_idx_composite,
9167 &ledger_idx_external_wired,
9168 &do_footprint);
9169 if (object->internal) {
9170 if (object->purgable == VM_PURGABLE_VOLATILE ||
9171 object->purgable == VM_PURGABLE_EMPTY) {
9172 ledger_idx = ledger_idx_volatile;
9173 ledger_idx_compressed = ledger_idx_volatile_compressed;
9174 } else {
9175 ledger_idx = ledger_idx_nonvolatile;
9176 ledger_idx_compressed = ledger_idx_nonvolatile_compressed;
9177 }
9178 if (resident_count) {
9179 /*
9180 * Adjust the appropriate old owners's ledgers by the
9181 * number of resident pages.
9182 */
9183 ledger_debit(old_owner->ledger,
9184 ledger_idx,
9185 ptoa_64(resident_count));
9186 /* adjust old owner's footprint */
9187 if (object->purgable != VM_PURGABLE_VOLATILE &&
9188 object->purgable != VM_PURGABLE_EMPTY) {
9189 if (do_footprint) {
9190 ledger_debit(old_owner->ledger,
9191 task_ledgers.phys_footprint,
9192 ptoa_64(resident_count));
9193 } else if (ledger_idx_composite != -1) {
9194 ledger_debit(old_owner->ledger,
9195 ledger_idx_composite,
9196 ptoa_64(resident_count));
9197 }
9198 }
9199 }
9200 if (wired_count) {
9201 /* wired pages are always nonvolatile */
9202 ledger_debit(old_owner->ledger,
9203 ledger_idx_nonvolatile,
9204 ptoa_64(wired_count));
9205 if (do_footprint) {
9206 ledger_debit(old_owner->ledger,
9207 task_ledgers.phys_footprint,
9208 ptoa_64(wired_count));
9209 } else if (ledger_idx_composite != -1) {
9210 ledger_debit(old_owner->ledger,
9211 ledger_idx_composite,
9212 ptoa_64(wired_count));
9213 }
9214 }
9215 if (compressed_count) {
9216 /*
9217 * Adjust the appropriate old owner's ledgers
9218 * by the number of compressed pages.
9219 */
9220 ledger_debit(old_owner->ledger,
9221 ledger_idx_compressed,
9222 ptoa_64(compressed_count));
9223 if (object->purgable != VM_PURGABLE_VOLATILE &&
9224 object->purgable != VM_PURGABLE_EMPTY) {
9225 if (do_footprint) {
9226 ledger_debit(old_owner->ledger,
9227 task_ledgers.phys_footprint,
9228 ptoa_64(compressed_count));
9229 } else if (ledger_idx_composite != -1) {
9230 ledger_debit(old_owner->ledger,
9231 ledger_idx_composite,
9232 ptoa_64(compressed_count));
9233 }
9234 }
9235 }
9236 } else {
9237 /* external but owned object: count wired pages */
9238 if (wired_count) {
9239 ledger_debit(old_owner->ledger,
9240 ledger_idx_external_wired,
9241 ptoa_64(wired_count));
9242 if (do_footprint) {
9243 ledger_debit(old_owner->ledger,
9244 task_ledgers.phys_footprint,
9245 ptoa_64(wired_count));
9246 } else if (ledger_idx_composite != -1) {
9247 ledger_debit(old_owner->ledger,
9248 ledger_idx_composite,
9249 ptoa_64(wired_count));
9250 }
9251 }
9252 }
9253 if (old_owner != new_owner) {
9254 /* remove object from old_owner's list of owned objects */
9255 DTRACE_VM2(object_owner_remove,
9256 vm_object_t, object,
9257 task_t, old_owner);
9258 if (!old_task_objq_locked) {
9259 task_objq_lock(old_owner);
9260 }
9261 old_owner->task_owned_objects--;
9262 queue_remove(&old_owner->task_objq, object,
9263 vm_object_t, task_objq);
9264 switch (object->purgable) {
9265 case VM_PURGABLE_NONVOLATILE:
9266 case VM_PURGABLE_EMPTY:
9267 vm_purgeable_nonvolatile_owner_update(old_owner,
9268 -1);
9269 break;
9270 case VM_PURGABLE_VOLATILE:
9271 vm_purgeable_volatile_owner_update(old_owner,
9272 -1);
9273 break;
9274 default:
9275 break;
9276 }
9277 if (!old_task_objq_locked) {
9278 task_objq_unlock(old_owner);
9279 }
9280 }
9281 }
9282
9283 /*
9284 * Switch to new ledger tag and/or owner.
9285 */
9286
9287 new_task_objq_locked = FALSE;
9288 if (new_owner != old_owner &&
9289 new_owner != TASK_NULL &&
9290 new_owner != VM_OBJECT_OWNER_DISOWNED) {
9291 /*
9292 * If the new owner is not accepting new objects ("disowning"),
9293 * the object becomes "disowned" and will be added to
9294 * the kernel's task_objq.
9295 *
9296 * Check first without locking, to avoid blocking while the
9297 * task is disowning its objects.
9298 */
9299 if (new_owner->task_objects_disowning) {
9300 new_owner = VM_OBJECT_OWNER_DISOWNED;
9301 } else {
9302 task_objq_lock(new_owner);
9303 /* check again now that we have the lock */
9304 if (new_owner->task_objects_disowning) {
9305 new_owner = VM_OBJECT_OWNER_DISOWNED;
9306 task_objq_unlock(new_owner);
9307 } else {
9308 new_task_objq_locked = TRUE;
9309 }
9310 }
9311 }
9312
9313 object->vo_ledger_tag = new_ledger_tag;
9314 object->vo_owner = new_owner;
9315 object->vo_no_footprint = new_no_footprint;
9316
9317 if (new_owner == VM_OBJECT_OWNER_DISOWNED) {
9318 /*
9319 * Disowned objects are added to the kernel's task_objq but
9320 * are marked as owned by "VM_OBJECT_OWNER_DISOWNED" to
9321 * differentiate them from objects intentionally owned by
9322 * the kernel.
9323 */
9324 assert(old_owner != kernel_task);
9325 new_owner = kernel_task;
9326 assert(!new_task_objq_locked);
9327 task_objq_lock(new_owner);
9328 new_task_objq_locked = TRUE;
9329 }
9330
9331 /*
9332 * Deal with the new owner and/or ledger tag, if needed.
9333 */
9334 if (new_owner != TASK_NULL &&
9335 ((new_owner != old_owner) /* new owner ... */
9336 || /* ... or ... */
9337 (new_no_footprint != old_no_footprint) /* ... new "no_footprint" */
9338 || /* ... or ... */
9339 new_ledger_tag != old_ledger_tag)) { /* ... new ledger */
9340 /*
9341 * Add this object to the new owner's ledgers.
9342 */
9343 vm_object_ledger_tag_ledgers(object,
9344 &ledger_idx_volatile,
9345 &ledger_idx_nonvolatile,
9346 &ledger_idx_volatile_compressed,
9347 &ledger_idx_nonvolatile_compressed,
9348 &ledger_idx_composite,
9349 &ledger_idx_external_wired,
9350 &do_footprint);
9351 if (object->internal) {
9352 if (object->purgable == VM_PURGABLE_VOLATILE ||
9353 object->purgable == VM_PURGABLE_EMPTY) {
9354 ledger_idx = ledger_idx_volatile;
9355 ledger_idx_compressed = ledger_idx_volatile_compressed;
9356 } else {
9357 ledger_idx = ledger_idx_nonvolatile;
9358 ledger_idx_compressed = ledger_idx_nonvolatile_compressed;
9359 }
9360 if (resident_count) {
9361 /*
9362 * Adjust the appropriate new owners's ledgers by the
9363 * number of resident pages.
9364 */
9365 ledger_credit(new_owner->ledger,
9366 ledger_idx,
9367 ptoa_64(resident_count));
9368 /* adjust new owner's footprint */
9369 if (object->purgable != VM_PURGABLE_VOLATILE &&
9370 object->purgable != VM_PURGABLE_EMPTY) {
9371 if (do_footprint) {
9372 ledger_credit(new_owner->ledger,
9373 task_ledgers.phys_footprint,
9374 ptoa_64(resident_count));
9375 } else if (ledger_idx_composite != -1) {
9376 ledger_credit(new_owner->ledger,
9377 ledger_idx_composite,
9378 ptoa_64(resident_count));
9379 }
9380 }
9381 }
9382 if (wired_count) {
9383 /* wired pages are always nonvolatile */
9384 ledger_credit(new_owner->ledger,
9385 ledger_idx_nonvolatile,
9386 ptoa_64(wired_count));
9387 if (do_footprint) {
9388 ledger_credit(new_owner->ledger,
9389 task_ledgers.phys_footprint,
9390 ptoa_64(wired_count));
9391 } else if (ledger_idx_composite != -1) {
9392 ledger_credit(new_owner->ledger,
9393 ledger_idx_composite,
9394 ptoa_64(wired_count));
9395 }
9396 }
9397 if (compressed_count) {
9398 /*
9399 * Adjust the new owner's ledgers by the number of
9400 * compressed pages.
9401 */
9402 ledger_credit(new_owner->ledger,
9403 ledger_idx_compressed,
9404 ptoa_64(compressed_count));
9405 if (object->purgable != VM_PURGABLE_VOLATILE &&
9406 object->purgable != VM_PURGABLE_EMPTY) {
9407 if (do_footprint) {
9408 ledger_credit(new_owner->ledger,
9409 task_ledgers.phys_footprint,
9410 ptoa_64(compressed_count));
9411 } else if (ledger_idx_composite != -1) {
9412 ledger_credit(new_owner->ledger,
9413 ledger_idx_composite,
9414 ptoa_64(compressed_count));
9415 }
9416 }
9417 }
9418 } else {
9419 /* external but owned object: count wired pages */
9420 if (wired_count) {
9421 ledger_credit(new_owner->ledger,
9422 ledger_idx_external_wired,
9423 ptoa_64(wired_count));
9424 if (do_footprint) {
9425 ledger_credit(new_owner->ledger,
9426 task_ledgers.phys_footprint,
9427 ptoa_64(wired_count));
9428 } else if (ledger_idx_composite != -1) {
9429 ledger_credit(new_owner->ledger,
9430 ledger_idx_composite,
9431 ptoa_64(wired_count));
9432 }
9433 }
9434 }
9435 if (new_owner != old_owner) {
9436 /* add object to new_owner's list of owned objects */
9437 DTRACE_VM2(object_owner_add,
9438 vm_object_t, object,
9439 task_t, new_owner);
9440 assert(new_task_objq_locked);
9441 new_owner->task_owned_objects++;
9442 queue_enter(&new_owner->task_objq, object,
9443 vm_object_t, task_objq);
9444 switch (object->purgable) {
9445 case VM_PURGABLE_NONVOLATILE:
9446 case VM_PURGABLE_EMPTY:
9447 vm_purgeable_nonvolatile_owner_update(new_owner,
9448 +1);
9449 break;
9450 case VM_PURGABLE_VOLATILE:
9451 vm_purgeable_volatile_owner_update(new_owner,
9452 +1);
9453 break;
9454 default:
9455 break;
9456 }
9457 }
9458 }
9459
9460 if (new_task_objq_locked) {
9461 task_objq_unlock(new_owner);
9462 }
9463
9464 return KERN_SUCCESS;
9465 }
9466
9467 void
9468 vm_owned_objects_disown(
9469 task_t task)
9470 {
9471 vm_object_t next_object;
9472 vm_object_t object;
9473 int collisions;
9474 kern_return_t kr;
9475
9476 if (task == NULL) {
9477 return;
9478 }
9479
9480 collisions = 0;
9481
9482 again:
9483 if (task->task_objects_disowned) {
9484 /* task has already disowned its owned objects */
9485 assert(task->task_volatile_objects == 0);
9486 assert(task->task_nonvolatile_objects == 0);
9487 assert(task->task_owned_objects == 0);
9488 return;
9489 }
9490
9491 task_objq_lock(task);
9492
9493 task->task_objects_disowning = TRUE;
9494
9495 for (object = (vm_object_t) queue_first(&task->task_objq);
9496 !queue_end(&task->task_objq, (queue_entry_t) object);
9497 object = next_object) {
9498 if (task->task_nonvolatile_objects == 0 &&
9499 task->task_volatile_objects == 0 &&
9500 task->task_owned_objects == 0) {
9501 /* no more objects owned by "task" */
9502 break;
9503 }
9504
9505 next_object = (vm_object_t) queue_next(&object->task_objq);
9506
9507 #if DEBUG
9508 assert(object->vo_purgeable_volatilizer == NULL);
9509 #endif /* DEBUG */
9510 assert(object->vo_owner == task);
9511 if (!vm_object_lock_try(object)) {
9512 task_objq_unlock(task);
9513 mutex_pause(collisions++);
9514 goto again;
9515 }
9516 /* transfer ownership to the kernel */
9517 assert(VM_OBJECT_OWNER(object) != kernel_task);
9518 kr = vm_object_ownership_change(
9519 object,
9520 object->vo_ledger_tag, /* unchanged */
9521 VM_OBJECT_OWNER_DISOWNED, /* new owner */
9522 0, /* new_ledger_flags */
9523 TRUE); /* old_owner->task_objq locked */
9524 assert(kr == KERN_SUCCESS);
9525 assert(object->vo_owner == VM_OBJECT_OWNER_DISOWNED);
9526 vm_object_unlock(object);
9527 }
9528
9529 if (__improbable(task->task_owned_objects != 0)) {
9530 panic("%s(%p): volatile=%d nonvolatile=%d owned=%d q=%p q_first=%p q_last=%p",
9531 __FUNCTION__,
9532 task,
9533 task->task_volatile_objects,
9534 task->task_nonvolatile_objects,
9535 task->task_owned_objects,
9536 &task->task_objq,
9537 queue_first(&task->task_objq),
9538 queue_last(&task->task_objq));
9539 }
9540
9541 /* there shouldn't be any objects owned by task now */
9542 assert(task->task_volatile_objects == 0);
9543 assert(task->task_nonvolatile_objects == 0);
9544 assert(task->task_owned_objects == 0);
9545 assert(task->task_objects_disowning);
9546
9547 /* and we don't need to try and disown again */
9548 task->task_objects_disowned = TRUE;
9549
9550 task_objq_unlock(task);
9551 }
9552
9553 void
9554 vm_object_wired_page_update_ledgers(
9555 vm_object_t object,
9556 int64_t wired_delta)
9557 {
9558 task_t owner;
9559
9560 vm_object_lock_assert_exclusive(object);
9561 if (wired_delta == 0) {
9562 /* no change in number of wired pages */
9563 return;
9564 }
9565 if (object->internal) {
9566 /* no extra accounting needed for internal objects */
9567 return;
9568 }
9569 if (!object->vo_ledger_tag) {
9570 /* external object but not owned: no extra accounting */
9571 return;
9572 }
9573
9574 /*
9575 * For an explicitly-owned external VM object, account for
9576 * wired pages in one of the owner's ledgers.
9577 */
9578 owner = VM_OBJECT_OWNER(object);
9579 if (owner) {
9580 int ledger_idx_volatile;
9581 int ledger_idx_nonvolatile;
9582 int ledger_idx_volatile_compressed;
9583 int ledger_idx_nonvolatile_compressed;
9584 int ledger_idx_composite;
9585 int ledger_idx_external_wired;
9586 boolean_t do_footprint;
9587
9588 /* ask which ledgers need an update */
9589 vm_object_ledger_tag_ledgers(object,
9590 &ledger_idx_volatile,
9591 &ledger_idx_nonvolatile,
9592 &ledger_idx_volatile_compressed,
9593 &ledger_idx_nonvolatile_compressed,
9594 &ledger_idx_composite,
9595 &ledger_idx_external_wired,
9596 &do_footprint);
9597 if (wired_delta > 0) {
9598 /* more external wired bytes */
9599 ledger_credit(owner->ledger,
9600 ledger_idx_external_wired,
9601 ptoa(wired_delta));
9602 if (do_footprint) {
9603 /* more footprint */
9604 ledger_credit(owner->ledger,
9605 task_ledgers.phys_footprint,
9606 ptoa(wired_delta));
9607 } else if (ledger_idx_composite != -1) {
9608 ledger_credit(owner->ledger,
9609 ledger_idx_composite,
9610 ptoa(wired_delta));
9611 }
9612 } else {
9613 /* less external wired bytes */
9614 ledger_debit(owner->ledger,
9615 ledger_idx_external_wired,
9616 ptoa(-wired_delta));
9617 if (do_footprint) {
9618 /* more footprint */
9619 ledger_debit(owner->ledger,
9620 task_ledgers.phys_footprint,
9621 ptoa(-wired_delta));
9622 } else if (ledger_idx_composite != -1) {
9623 ledger_debit(owner->ledger,
9624 ledger_idx_composite,
9625 ptoa(-wired_delta));
9626 }
9627 }
9628 }
9629 }
9630