/*
 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	vm_fault.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *
 *	Page fault handling module.
 */

#include <libkern/OSAtomic.h>

#include <mach/mach_types.h>
#include <mach/kern_return.h>
#include <mach/message.h>       /* for error codes */
#include <mach/vm_param.h>
#include <mach/vm_behavior.h>
#include <mach/memory_object.h>
/* For memory_object_data_{request,unlock} */
#include <mach/sdt.h>

#include <kern/kern_types.h>
#include <kern/host_statistics.h>
#include <kern/counter.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/host.h>
#include <kern/mach_param.h>
#include <kern/macro_help.h>
#include <kern/zalloc_internal.h>
#include <kern/misc_protos.h>
#include <kern/policy_internal.h>
#include <kern/exc_guard.h>

#include <vm/vm_compressor_internal.h>
#include <vm/vm_compressor_pager_internal.h>
#include <vm/vm_dyld_pager_internal.h>
#include <vm/vm_fault_internal.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_object_internal.h>
#include <vm/vm_page_internal.h>
#include <vm/vm_kern_internal.h>
#include <vm/pmap.h>
#include <vm/vm_pageout_internal.h>
#include <vm/vm_protos_internal.h>
#include <vm/vm_external.h>
#include <vm/memory_object.h>
#include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
#include <vm/vm_shared_region.h>
#if HAS_MTE
#include <vm/vm_mteinfo_internal.h>
#include <vm/vm_memtag.h>
#endif /* HAS_MTE */

#include <sys/codesign.h>
#include <sys/code_signing.h>
#include <sys/kdebug.h>
#include <sys/kdebug_triage.h>
#include <sys/reason.h>
#include <sys/signalvar.h>

#include <san/kasan.h>
#include <libkern/coreanalytics/coreanalytics.h>

#define VM_FAULT_CLASSIFY 0

#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */

int vm_protect_privileged_from_untrusted = 1;

/*
 * Enforce a maximum number of concurrent PageIns per vm-object to prevent
 * high-I/O-volume tasks from saturating storage and starving the rest of the
 * system.
 *
 * TODO: This throttling mechanism may be more naturally done by the pager,
 * filesystem, or storage layers, which will have better information about how
 * much concurrency the backing store can reasonably support.
 */
TUNABLE(uint16_t, vm_object_pagein_throttle, "vm_object_pagein_throttle", 16);
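/*
 * Illustrative tuning example (assumes the standard boot-args path that
 * TUNABLE wires up): booting with "vm_object_pagein_throttle=32" would
 * double the default cap of 16 concurrent pageins per object.
 */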

/*
 * We apply a hard throttle, which kicks in when swap space runs out, to the
 * demand-zero rate of tasks that we believe are running out of control.  64-bit
 * programs have massive address spaces and can leak enormous amounts of memory
 * if they're buggy, and can run the system completely out of swap space.  If this
 * happens, we impose a hard throttle on them to prevent them from taking the last
 * bit of memory left.  This helps keep the UI active so that the user has a chance
 * to kill the offending task before the system completely hangs.
 *
 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 */
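
/*
 * Back-of-the-envelope effect of the hard throttle: with the
 * HARD_THROTTLE_DELAY of 10000us defined below, a throttled thread can
 * retry at most ~100 demand-zero faults per second, i.e. it can dirty
 * roughly 400KB/s of new anonymous memory with 4KB pages.
 */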

extern void throttle_lowpri_io(int);

extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);

uint64_t vm_hard_throttle_threshold;

#if DEBUG || DEVELOPMENT
static bool vmtc_panic_instead = false;
int panic_object_not_alive = 1;
#endif /* DEBUG || DEVELOPMENT */

OS_ALWAYS_INLINE
boolean_t
NEED_TO_HARD_THROTTLE_THIS_TASK(void)
{
	return vm_wants_task_throttled(current_task()) ||
	       ((vm_page_free_count < vm_page_throttle_limit ||
	       HARD_THROTTLE_LIMIT_REACHED()) &&
	       proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
}


/*
 * XXX: For now, vm faults cannot be recursively disabled. If the need for
 * nested code that disables faults arises, the implementation can be modified
 * to track a disabled-count.
 */

OS_ALWAYS_INLINE
void
vm_fault_disable(void)
{
	thread_t t = current_thread();
	assert(!t->th_vm_faults_disabled);
	t->th_vm_faults_disabled = true;
	act_set_debug_assert();
}

OS_ALWAYS_INLINE
void
vm_fault_enable(void)
{
	thread_t t = current_thread();
	assert(t->th_vm_faults_disabled);
	t->th_vm_faults_disabled = false;
}

OS_ALWAYS_INLINE
bool
vm_fault_get_disabled(void)
{
	thread_t t = current_thread();
	return t->th_vm_faults_disabled;
}
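
/*
 * A sketch of what the nesting-aware variant mentioned above could look
 * like.  Purely illustrative and not compiled: th_vm_faults_disabled is
 * currently a bool, and the th_vm_faults_disabled_count field assumed
 * below does not exist.
 */
#if 0
void
vm_fault_disable(void)
{
	/* each disable nests; no assert that faults are currently enabled */
	current_thread()->th_vm_faults_disabled_count++;
}

void
vm_fault_enable(void)
{
	thread_t t = current_thread();

	assert(t->th_vm_faults_disabled_count > 0);
	t->th_vm_faults_disabled_count--;
}
#endif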

#define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
#define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */

#define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
#define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000


#define VM_STAT_DECOMPRESSIONS()        \
MACRO_BEGIN                             \
	counter_inc(&vm_statistics_decompressions); \
	current_thread()->decompressions++;     \
MACRO_END

boolean_t current_thread_aborted(void);

/* Forward declarations of internal routines. */
static kern_return_t vm_fault_wire_fast(
	vm_map_t map,
	vm_map_offset_t va,
	vm_prot_t prot,
	vm_tag_t wire_tag,
	vm_map_entry_t entry,
	pmap_t pmap,
	vm_map_offset_t pmap_addr,
	ppnum_t *physpage_p);

static kern_return_t vm_fault_internal(
	vm_map_t map,
	vm_map_offset_t vaddr,
	vm_prot_t caller_prot,
	vm_tag_t wire_tag,
	pmap_t pmap,
	vm_map_offset_t pmap_addr,
	ppnum_t *physpage_p,
	vm_object_fault_info_t fault_info);

static void vm_fault_copy_cleanup(
	vm_page_t page,
	vm_page_t top_page);

static void vm_fault_copy_dst_cleanup(
	vm_page_t page);

#if VM_FAULT_CLASSIFY
extern void vm_fault_classify(vm_object_t object,
    vm_object_offset_t offset,
    vm_prot_t fault_type);

extern void vm_fault_classify_init(void);
#endif

unsigned long vm_pmap_enter_blocked = 0;
unsigned long vm_pmap_enter_retried = 0;

unsigned long vm_cs_validates = 0;
unsigned long vm_cs_revalidates = 0;
unsigned long vm_cs_query_modified = 0;
unsigned long vm_cs_validated_dirtied = 0;
unsigned long vm_cs_bitmap_validated = 0;

#if CODE_SIGNING_MONITOR
uint64_t vm_cs_defer_to_csm = 0;
uint64_t vm_cs_defer_to_csm_not = 0;
#endif /* CODE_SIGNING_MONITOR */

extern char *kdp_compressor_decompressed_page;
extern addr64_t kdp_compressor_decompressed_page_paddr;
extern ppnum_t kdp_compressor_decompressed_page_ppnum;

struct vmrtfr {
	int vmrtfr_maxi;
	int vmrtfr_curi;
	int64_t vmrtf_total;
	vm_rtfault_record_t *vm_rtf_records;
} vmrtfrs;
#define VMRTF_DEFAULT_BUFSIZE (4096)
#define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT);

static void vm_rtfrecord_lock(void);
static void vm_rtfrecord_unlock(void);
static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);

extern lck_grp_t vm_page_lck_grp_bucket;
extern lck_attr_t vm_page_lck_attr;
LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);

#if DEVELOPMENT || DEBUG
extern int madvise_free_debug;
extern int madvise_free_debug_sometimes;
#endif /* DEVELOPMENT || DEBUG */

extern int vm_pageout_protect_realtime;

#if CONFIG_FREEZE
#endif /* CONFIG_FREEZE */

/*
 *	Routine:	vm_fault_init
 *	Purpose:
 *		Initialize our private data structures.
 */
__startup_func
void
vm_fault_init(void)
{
	int i, vm_compressor_temp;
	boolean_t need_default_val = TRUE;
	/*
	 * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
	 * computed as a percentage of available memory, and the percentage used is scaled inversely with
	 * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
	 * and reduce the value down to 10% for very large memory configurations.  This helps give us a
	 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
	 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
	 */

	vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;
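
	/*
	 * Worked examples of the formula above: a 1GB machine scales to
	 * 35 - 1 = 34%, i.e. ~340MB; a 16GB machine to 35 - 16 = 19%,
	 * i.e. ~3GB; and anything with 25GB of ram or more bottoms out
	 * at the 10% floor (e.g. 6.4GB on a 64GB machine).
	 */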

	/*
	 * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
	 */

	if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
		for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
			if (((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
				need_default_val = FALSE;
				vm_compressor_mode = vm_compressor_temp;
				break;
			}
		}
		if (need_default_val) {
			printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
		}
	}
#if CONFIG_FREEZE
	if (need_default_val) {
		if (osenvironment_is_diagnostics() || osenvironment_is_device_recovery()) {
			printf("osenvironment == \"diagnostics or device-recovery\". Setting \"vm_compressor_mode\" to in-core compressor only\n");
			vm_compressor_mode = VM_PAGER_COMPRESSOR_NO_SWAP;
			need_default_val = false;
		}
	}
#endif /* CONFIG_FREEZE */
	if (need_default_val) {
		/* If no boot arg or incorrect boot arg, try device tree. */
		PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
	}
	printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
	vm_config_init();

	PE_parse_boot_argn("vm_protect_privileged_from_untrusted",
	    &vm_protect_privileged_from_untrusted,
	    sizeof(vm_protect_privileged_from_untrusted));

#if DEBUG || DEVELOPMENT
	(void)PE_parse_boot_argn("text_corruption_panic", &vmtc_panic_instead, sizeof(vmtc_panic_instead));

	if (kern_feature_override(KF_MADVISE_FREE_DEBUG_OVRD)) {
		madvise_free_debug = 0;
		madvise_free_debug_sometimes = 0;
	}

	PE_parse_boot_argn("panic_object_not_alive", &panic_object_not_alive, sizeof(panic_object_not_alive));
#endif /* DEBUG || DEVELOPMENT */
}

__startup_func
static void
vm_rtfault_record_init(void)
{
	size_t size;

	vmrtf_num_records = MAX(vmrtf_num_records, 1);
	size = vmrtf_num_records * sizeof(vm_rtfault_record_t);
	vmrtfrs.vm_rtf_records = zalloc_permanent_tag(size,
	    ZALIGN(vm_rtfault_record_t), VM_KERN_MEMORY_DIAG);
	vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
}
STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init);

/*
 *	Routine:	vm_fault_cleanup
 *	Purpose:
 *		Clean up the result of vm_fault_page.
 *	Results:
 *		The paging reference for "object" is released.
 *		"object" is unlocked.
 *		If "top_page" is not null, "top_page" is
 *		freed and the paging reference for the object
 *		containing it is released.
 *
 *	In/out conditions:
 *		"object" must be locked.
 */
void
vm_fault_cleanup(
	vm_object_t object,
	vm_page_t top_page)
{
	thread_pri_floor_t token = {
		.thread = THREAD_NULL
	};
	if (top_page != VM_PAGE_NULL &&
	    top_page->vmp_busy) {
		/*
		 * We busied the top page. Apply a priority floor before dropping the
		 * current object (and therefore the rw-lock boost) to avoid
		 * inversions due to another thread sleeping on the top-level page.
		 *
		 * TODO: Register a page-worker token when busying the top-level page instead
		 * (rdar://154313767)
		 */
		token = thread_priority_floor_start();
	}

	vm_object_paging_end(object);
	vm_object_unlock(object);

	if (top_page != VM_PAGE_NULL) {
		object = VM_PAGE_OBJECT(top_page);

		vm_object_lock(object);
		VM_PAGE_FREE(top_page);
		vm_object_paging_end(object);
		vm_object_unlock(object);
	}
	if (token.thread != THREAD_NULL) {
		thread_priority_floor_end(&token);
	}
}

#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)


TUNABLE(bool, vm_page_deactivate_behind, "vm_deactivate_behind", true);
TUNABLE(uint32_t, vm_page_deactivate_behind_min_resident_ratio, "vm_deactivate_behind_min_resident_ratio", 3);
/*
 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 */
#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16      /* don't make this too big... */
                                                        /* we use it to size an array on the stack */

int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;

#define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)

/*
 * vm_fault_is_sequential
 *
 * Determine if sequential access is in progress
 * in accordance with the behavior specified.
 * Update state to indicate current access pattern.
 *
 * object must have at least the shared lock held
 */
static
void
vm_fault_is_sequential(
	vm_object_t object,
	vm_object_offset_t offset,
	vm_behavior_t behavior)
{
	vm_object_offset_t last_alloc;
	int sequential;
	int orig_sequential;

	last_alloc = object->last_alloc;
	sequential = object->sequential;
	orig_sequential = sequential;

	offset = vm_object_trunc_page(offset);
	if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) {
		/* re-faulting in the same page: no change in behavior */
		return;
	}

	switch (behavior) {
	case VM_BEHAVIOR_RANDOM:
		/*
		 * reset indicator of sequential behavior
		 */
		sequential = 0;
		break;

	case VM_BEHAVIOR_SEQUENTIAL:
		if (offset && last_alloc == offset - PAGE_SIZE_64) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential < MAX_SEQUENTIAL_RUN) {
				sequential += PAGE_SIZE;
			}
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;

	case VM_BEHAVIOR_RSEQNTL:
		if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential > -MAX_SEQUENTIAL_RUN) {
				sequential -= PAGE_SIZE;
			}
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;

	case VM_BEHAVIOR_DEFAULT:
	default:
		if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential < 0) {
				sequential = 0;
			}
			if (sequential < MAX_SEQUENTIAL_RUN) {
				sequential += PAGE_SIZE;
			}
		} else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential > 0) {
				sequential = 0;
			}
			if (sequential > -MAX_SEQUENTIAL_RUN) {
				sequential -= PAGE_SIZE;
			}
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;
	}
	if (sequential != orig_sequential) {
		if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
			/*
			 * if someone else has already updated object->sequential
			 * don't bother trying to update it or object->last_alloc
			 */
			return;
		}
	}
	/*
	 * I'd like to do this with a OSCompareAndSwap64, but that
	 * doesn't exist for PPC...  however, it shouldn't matter
	 * that much... last_alloc is maintained so that we can determine
	 * if a sequential access pattern is taking place... if only
	 * one thread is banging on this object, no problem with the unprotected
	 * update... if 2 or more threads are banging away, we run the risk of
	 * someone seeing a mangled update... however, in the face of multiple
	 * accesses, no sequential access pattern can develop anyway, so we
	 * haven't lost any real info.
	 */
	object->last_alloc = offset;
}
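
/*
 * Example of the tracking above (starting from a fresh object): under
 * VM_BEHAVIOR_DEFAULT, faulting offsets 0x1000, 0x2000 and 0x3000 in order
 * advances object->sequential to 3 * PAGE_SIZE; a following fault at 0x9000
 * (neither one page above nor one page below last_alloc) resets it to 0.
 */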

#if DEVELOPMENT || DEBUG
SCALABLE_COUNTER_DEFINE(vm_page_deactivate_behind_count);
#endif /* DEVELOPMENT || DEBUG */

/*
 * @func vm_fault_deactivate_behind
 *
 * @description
 * Determine if sequential access is in progress
 * in accordance with the behavior specified.  If
 * so, compute a potential page to deactivate and
 * deactivate it.
 *
 * object must be locked.
 *
 * @returns the number of deactivated pages
 */
static
uint32_t
vm_fault_deactivate_behind(
	vm_object_t object,
	vm_object_offset_t offset,
	vm_behavior_t behavior)
{
	uint32_t pages_in_run = 0;
	uint32_t max_pages_in_run = 0;
	int32_t sequential_run;
	vm_behavior_t sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
	vm_object_offset_t run_offset = 0;
	vm_object_offset_t pg_offset = 0;
	vm_page_t m;
	vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];

#if TRACEFAULTPAGE
	dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
#endif
	if (is_kernel_object(object) ||
	    !vm_page_deactivate_behind ||
	    (vm_object_trunc_page(offset) != offset) ||
	    (object->resident_page_count - object->wired_page_count <
	    vm_page_active_count / vm_page_deactivate_behind_min_resident_ratio)) {
		/*
		 * Do not deactivate pages from the kernel object: they
		 * are not intended to become pageable;
		 * or we've disabled the deactivate behind mechanism;
		 * or we are dealing with an offset that is not aligned to
		 * the system's PAGE_SIZE because in that case we will
		 * handle the deactivation on the aligned offset and, thus,
		 * the full PAGE_SIZE page once.  This helps us avoid the redundant
		 * deactivates and the extra faults.
		 *
		 * Objects need only participate in backwards
		 * deactivation if they are exceedingly large (i.e. their
		 * resident pages are liable to comprise a substantially large
		 * portion of the active queue and push out the rest of the
		 * system's working set).
		 */
		return 0;
	}

	KDBG_FILTERED(VMDBG_CODE(DBG_VM_FAULT_DEACTIVATE_BEHIND) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(object), offset, behavior);

	if ((sequential_run = object->sequential)) {
		if (sequential_run < 0) {
			sequential_behavior = VM_BEHAVIOR_RSEQNTL;
			sequential_run = 0 - sequential_run;
		} else {
			sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
		}
	}
	switch (behavior) {
	case VM_BEHAVIOR_RANDOM:
		break;
	case VM_BEHAVIOR_SEQUENTIAL:
		if (sequential_run >= (int)PAGE_SIZE) {
			run_offset = 0 - PAGE_SIZE_64;
			max_pages_in_run = 1;
		}
		break;
	case VM_BEHAVIOR_RSEQNTL:
		if (sequential_run >= (int)PAGE_SIZE) {
			run_offset = PAGE_SIZE_64;
			max_pages_in_run = 1;
		}
		break;
	case VM_BEHAVIOR_DEFAULT:
	default:
	{
		vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;

		/*
		 * determine if the run of sequential access has been
		 * long enough on an object with default access behavior
		 * to consider it for deactivation
		 */
		if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
			/*
			 * the comparisons between offset and behind are done
			 * in this kind of odd fashion in order to prevent wrap around
			 * at the end points
			 */
			if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
				if (offset >= behind) {
					run_offset = 0 - behind;
					pg_offset = PAGE_SIZE_64;
					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
				}
			} else {
				if (offset < -behind) {
					run_offset = behind;
					pg_offset = 0 - PAGE_SIZE_64;
					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
				}
			}
		}
		break;
	}
	}
	for (unsigned n = 0; n < max_pages_in_run; n++) {
		m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));

		if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache &&
		    (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) &&
		    !vm_page_is_fictitious(m) && !m->vmp_absent) {
			page_run[pages_in_run++] = m;

			/*
			 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
			 *
			 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
			 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
			 * new reference happens. If no further references happen on the page after that remote TLB flushes
			 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
			 * by pageout_scan, which is just fine since the last reference would have happened quite far
			 * in the past (TLB caches don't hang around for very long), and of course could just as easily
			 * have happened before we did the deactivate_behind.
			 */
			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
		}
	}

	if (pages_in_run) {
		vm_page_lockspin_queues();

		for (unsigned n = 0; n < pages_in_run; n++) {
			m = page_run[n];

			vm_page_deactivate_internal(m, FALSE);

#if DEVELOPMENT || DEBUG
			counter_inc(&vm_page_deactivate_behind_count);
#endif /* DEVELOPMENT || DEBUG */

#if TRACEFAULTPAGE
			dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
#endif
		}
		vm_page_unlock_queues();
	}

	KDBG_FILTERED(VMDBG_CODE(DBG_VM_FAULT_DEACTIVATE_BEHIND) | DBG_FUNC_END,
	    pages_in_run);

	return pages_in_run;
}
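
/*
 * With the defaults above (window of 128 pages, cluster of 16), once a
 * forward sequential run reaches 128 pages, every 16th page faulted
 * thereafter deactivates the 16-page cluster that starts 128 pages behind
 * the faulting offset (and symmetrically ahead of it for reverse runs).
 */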


#if (DEVELOPMENT || DEBUG)
uint32_t vm_page_creation_throttled_hard = 0;
uint32_t vm_page_creation_throttled_soft = 0;
uint64_t vm_page_creation_throttle_avoided = 0;
#endif /* DEVELOPMENT || DEBUG */

static int
vm_page_throttled(boolean_t page_kept)
{
	clock_sec_t elapsed_sec;
	clock_sec_t tv_sec;
	clock_usec_t tv_usec;
	task_t curtask = current_task_early();

	thread_t thread = current_thread();

	if (thread->options & TH_OPT_VMPRIV) {
		return 0;
	}

	if (curtask && !curtask->active) {
		return 0;
	}

	if (thread->t_page_creation_throttled) {
		thread->t_page_creation_throttled = 0;

		if (page_kept == FALSE) {
			goto no_throttle;
		}
	}
	if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
#if (DEVELOPMENT || DEBUG)
		thread->t_page_creation_throttled_hard++;
		OSAddAtomic(1, &vm_page_creation_throttled_hard);
#endif /* DEVELOPMENT || DEBUG */
		return HARD_THROTTLE_DELAY;
	}

	if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
	    thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
		if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
#if (DEVELOPMENT || DEBUG)
			OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
#endif
			goto no_throttle;
		}
		clock_get_system_microtime(&tv_sec, &tv_usec);

		elapsed_sec = tv_sec - thread->t_page_creation_time;

		if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
		    (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
			if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
				/*
				 * we'll reset our stats to give a well behaved app
				 * that was unlucky enough to accumulate a bunch of pages
				 * over a long period of time a chance to get out of
				 * the throttled state... we reset the counter and timestamp
				 * so that if it stays under the rate limit for the next second
				 * it will be back in our good graces... if it exceeds it, it
				 * will remain in the throttled state
				 */
				thread->t_page_creation_time = tv_sec;
				thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
			}
			VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);

			thread->t_page_creation_throttled = 1;

			if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
#if (DEVELOPMENT || DEBUG)
				thread->t_page_creation_throttled_hard++;
				OSAddAtomic(1, &vm_page_creation_throttled_hard);
#endif /* DEVELOPMENT || DEBUG */
				return HARD_THROTTLE_DELAY;
			} else {
#if (DEVELOPMENT || DEBUG)
				thread->t_page_creation_throttled_soft++;
				OSAddAtomic(1, &vm_page_creation_throttled_soft);
#endif /* DEVELOPMENT || DEBUG */
				return SOFT_THROTTLE_DELAY;
			}
		}
		thread->t_page_creation_time = tv_sec;
		thread->t_page_creation_count = 0;
	}
no_throttle:
	thread->t_page_creation_count++;

	return 0;
}
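
/*
 * With the default tunables, a thread is considered for throttling only
 * after creating more than 6s * 20000 = 120000 pages while still averaging
 * roughly 20000 pages/sec or more; the reset path above re-arms the counter
 * at 5 seconds' worth (100000 pages) so a single quiet second clears it.
 */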

extern boolean_t vm_pageout_running;
static __attribute__((noinline, not_tail_called)) void
__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(
	int throttle_delay)
{
	/* make sure vm_pageout_scan() gets to work while we're throttled */
	if (!vm_pageout_running) {
		thread_wakeup((event_t)&vm_page_free_wanted);
	}
	delay(throttle_delay);
}


/*
 * check for various conditions that would
 * prevent us from creating a ZF page...
 * cleanup is based on being called from vm_fault_page
 *
 * object must be locked
 * object == m->vmp_object
 */
static vm_fault_return_t
vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
{
	int throttle_delay;

	if (object->shadow_severed ||
	    VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
		/*
		 * Either:
		 * 1. the shadow chain was severed,
		 * 2. the purgeable object is volatile or empty and is marked
		 *    to fault on access while volatile.
		 * Just have to return an error at this point
		 */
		if (m != VM_PAGE_NULL) {
			VM_PAGE_FREE(m);
		}
		vm_fault_cleanup(object, first_m);

		thread_interrupt_level(interruptible_state);

		if (VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
		}

		if (object->shadow_severed) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), 0 /* arg */);
		}
		return VM_FAULT_MEMORY_ERROR;
	}
	if (page_throttle == TRUE) {
		if ((throttle_delay = vm_page_throttled(FALSE))) {
			/*
			 * we're throttling zero-fills...
			 * treat this as if we couldn't grab a page
			 */
			if (m != VM_PAGE_NULL) {
				VM_PAGE_FREE(m);
			}
			vm_fault_cleanup(object, first_m);

			VM_DEBUG_EVENT(vmf_check_zfdelay, DBG_VM_FAULT_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);

			__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);

			if (current_thread_aborted()) {
				thread_interrupt_level(interruptible_state);
				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
				return VM_FAULT_INTERRUPTED;
			}
			thread_interrupt_level(interruptible_state);

			return VM_FAULT_MEMORY_SHORTAGE;
		}
	}
	return VM_FAULT_SUCCESS;
}

/*
 * Clear the code signing bits on the given page_t
 */
static void
vm_fault_cs_clear(vm_page_t m)
{
	m->vmp_cs_validated = VMP_CS_ALL_FALSE;
	m->vmp_cs_tainted = VMP_CS_ALL_FALSE;
	m->vmp_cs_nx = VMP_CS_ALL_FALSE;
}

/*
 * Enqueues the given page on the throttled queue.
 * The caller must hold the vm_page_queue_lock and it will be held on return.
 */
static void
vm_fault_enqueue_throttled_locked(vm_page_t m)
{
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	assert(!VM_PAGE_WIRED(m));

	/*
	 * can't be on the pageout queue since we don't
	 * have a pager to try and clean to
	 */
	vm_page_queues_remove(m, TRUE);
	vm_page_check_pageable_safe(m);
	vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
	m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
	vm_page_throttled_count++;
}

/*
 * do the work to zero fill a page and
 * inject it into the correct paging queue
 *
 * m->vmp_object must be locked
 * page queue lock must NOT be held
 */
static int
vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
{
	int my_fault = DBG_ZERO_FILL_FAULT;
	vm_object_t object;

	object = VM_PAGE_OBJECT(m);

	/*
	 * This is a zero-fill page fault...
	 *
	 * Checking the page lock is a waste of
	 * time;  this page was absent, so
	 * it can't be page locked by a pager.
	 *
	 * we also consider it undefined
	 * with respect to instruction
	 * execution.  i.e. it is the responsibility
	 * of higher layers to call for an instruction
	 * sync after changing the contents and before
	 * sending a program into this area.  We
	 * choose this approach for performance
	 */
	vm_fault_cs_clear(m);
	m->vmp_pmapped = TRUE;

	if (no_zero_fill == TRUE) {
		my_fault = DBG_NZF_PAGE_FAULT;

		if (m->vmp_absent && m->vmp_busy) {
			return my_fault;
		}
	} else {
		vm_page_zero_fill(
			m
#if HAS_MTE
			, true /* zero_tags */
#endif /* HAS_MTE */
			);

		counter_inc(&vm_statistics_zero_fill_count);
		DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
	}
	assert(!m->vmp_laundry);
	assert(!is_kernel_object(object));
	//assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
	if (!VM_DYNAMIC_PAGING_ENABLED() &&
	    (object->purgable == VM_PURGABLE_DENY ||
	    object->purgable == VM_PURGABLE_NONVOLATILE ||
	    object->purgable == VM_PURGABLE_VOLATILE)) {
		vm_page_lockspin_queues();
		if (!VM_DYNAMIC_PAGING_ENABLED()) {
			vm_fault_enqueue_throttled_locked(m);
		}
		vm_page_unlock_queues();
	}
	return my_fault;
}

/*
 * Recovery actions for vm_fault_page
 */
__attribute__((always_inline))
static void
vm_fault_page_release_page(
	vm_page_t m,                     /* Page to release */
	bool *clear_absent_on_error      /* IN/OUT */)
{
	vm_page_wakeup_done(VM_PAGE_OBJECT(m), m);
	if (!VM_PAGE_PAGEABLE(m)) {
		vm_page_lockspin_queues();
		if (*clear_absent_on_error && m->vmp_absent) {
			vm_page_zero_fill(
				m
#if HAS_MTE
				, false /* zero_tags */
#endif /* HAS_MTE */
				);
			counter_inc(&vm_statistics_zero_fill_count);
			DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
			m->vmp_absent = false;
		}
		if (!VM_PAGE_PAGEABLE(m)) {
			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
				vm_page_deactivate(m);
			} else {
				vm_page_activate(m);
			}
		}
		vm_page_unlock_queues();
	}
	*clear_absent_on_error = false;
}
/*
 *	Routine:	vm_fault_page
 *	Purpose:
 *		Find the resident page for the virtual memory
 *		specified by the given virtual memory object
 *		and offset.
 *	Additional arguments:
 *		The required permissions for the page are given
 *		in "fault_type".  Desired permissions are included
 *		in "protection".
 *		fault_info is passed along to determine pagein cluster
 *		limits... it contains the expected reference pattern,
 *		cluster size if available, etc...
 *
 *		If the desired page is known to be resident (for
 *		example, because it was previously wired down), asserting
 *		the "unwiring" parameter will speed the search.
 *
 *		If the operation can be interrupted (by thread_abort
 *		or thread_terminate), then the "interruptible"
 *		parameter should be asserted.
 *
 *	Results:
 *		The page containing the proper data is returned
 *		in "result_page".
 *
 *	In/out conditions:
 *		The source object must be locked and referenced,
 *		and must donate one paging reference.  The reference
 *		is not affected.  The paging reference and lock are
 *		consumed.
 *
 *		If the call succeeds, the object in which "result_page"
 *		resides is left locked and holding a paging reference.
 *		If this is not the original object, a busy page in the
 *		original object is returned in "top_page", to prevent other
 *		callers from pursuing this same data, along with a paging
 *		reference for the original object.  The "top_page" should
 *		be destroyed when this guarantee is no longer required.
 *		The "result_page" is also left busy.  It is not removed
 *		from the pageout queues.
 *	Special Case:
 *		A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 *		fault succeeded but there's no VM page (i.e. the VM object
 *		does not actually hold VM pages, but device memory or
 *		large pages).  The object is still locked and we still hold a
 *		paging_in_progress reference.
 */
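
/*
 * Typical shape of a call (illustrative sketch only; real callers such as
 * vm_fault_internal also handle wiring, copy-on-write and retry cases):
 *
 *	vm_object_lock(object);
 *	vm_object_paging_begin(object);
 *	error = vm_fault_page(object, offset, VM_PROT_READ, FALSE, FALSE,
 *	    &prot, &result_page, &top_page, &type_of_fault, &error_code,
 *	    FALSE, &fault_info);
 *	if (error == VM_FAULT_SUCCESS && result_page != VM_PAGE_NULL) {
 *		... use result_page, then release its busy state ...
 *		vm_fault_cleanup(VM_PAGE_OBJECT(result_page), top_page);
 *	}
 */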
unsigned int vm_fault_page_blocked_access = 0;
unsigned int vm_fault_page_forced_retry = 0;

vm_fault_return_t
vm_fault_page(
	/* Arguments: */
	vm_object_t first_object,       /* Object to begin search */
	vm_object_offset_t first_offset, /* Offset into object */
	vm_prot_t fault_type,           /* What access is requested */
	boolean_t must_be_resident,     /* Must page be resident? */
	boolean_t caller_lookup,        /* caller looked up page */
	/* Modifies in place: */
	vm_prot_t *protection,          /* Protection for mapping */
	vm_page_t *result_page,         /* Page found, if successful */
	/* Returns: */
	vm_page_t *top_page,            /* Page in top object, if
	                                 * not result_page. */
	int *type_of_fault,             /* if non-null, fill in with type of fault
	                                 * COW, zero-fill, etc... returned in trace point */
	/* More arguments: */
	kern_return_t *error_code,      /* code if page is in error */
	boolean_t no_zero_fill,         /* don't zero fill absent pages */
	vm_object_fault_info_t fault_info)
{
	vm_page_t m;
	vm_object_t object;
	vm_object_offset_t offset;
	vm_page_t first_m;
	vm_object_t next_object;
	vm_object_t copy_object;
	boolean_t look_for_page;
	boolean_t force_fault_retry = FALSE;
	vm_prot_t access_required = fault_type;
	vm_prot_t wants_copy_flag;
	kern_return_t wait_result;
	wait_interrupt_t interruptible_state;
	boolean_t data_already_requested = FALSE;
	vm_behavior_t orig_behavior;
	vm_size_t orig_cluster_size;
	vm_fault_return_t error;
	int my_fault;
	uint32_t try_failed_count;
	wait_interrupt_t interruptible; /* how may fault be interrupted? */
	int external_state = VM_EXTERNAL_STATE_UNKNOWN;
	memory_object_t pager;
	vm_fault_return_t retval;
	vm_grab_options_t grab_options;
	bool clear_absent_on_error = false;

	/*
	 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
	 * marked as paged out in the compressor pager or the pager doesn't exist.
	 * Note also that if the pager for an internal object
	 * has not been created, the pager is not invoked regardless of the value
	 * of MUST_ASK_PAGER().
	 *
	 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
	 * is marked as paged out in the compressor pager.
	 * PAGED_OUT() is used to determine if a page has already been pushed
	 * into a copy object in order to avoid a redundant page out operation.
	 */
#define MUST_ASK_PAGER(o, f, s) \
	((s = vm_object_compressor_pager_state_get((o), (f))) != VM_EXTERNAL_STATE_ABSENT)

#define PAGED_OUT(o, f) \
	(vm_object_compressor_pager_state_get((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
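
/*
 * Put differently (assuming the three external states seen in this file:
 * ABSENT, UNKNOWN and EXISTS):
 *
 *	VM_EXTERNAL_STATE_ABSENT  -> MUST_ASK_PAGER() FALSE, PAGED_OUT() FALSE
 *	VM_EXTERNAL_STATE_UNKNOWN -> MUST_ASK_PAGER() TRUE,  PAGED_OUT() FALSE
 *	VM_EXTERNAL_STATE_EXISTS  -> MUST_ASK_PAGER() TRUE,  PAGED_OUT() TRUE
 */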
1162
1163 #if TRACEFAULTPAGE
1164 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
1165 #endif
1166
1167 interruptible = fault_info->interruptible;
1168 interruptible_state = thread_interrupt_level(interruptible);
1169
1170 /*
1171 * INVARIANTS (through entire routine):
1172 *
1173 * 1) At all times, we must either have the object
1174 * lock or a busy page in some object to prevent
1175 * some other thread from trying to bring in
1176 * the same page.
1177 *
1178 * Note that we cannot hold any locks during the
1179 * pager access or when waiting for memory, so
1180 * we use a busy page then.
1181 *
1182 * 2) To prevent another thread from racing us down the
1183 * shadow chain and entering a new page in the top
1184 * object before we do, we must keep a busy page in
1185 * the top object while following the shadow chain.
1186 *
1187 * 3) We must increment paging_in_progress on any object
1188 * for which we have a busy page before dropping
1189 * the object lock
1190 *
1191 * 4) We leave busy pages on the pageout queues.
1192 * If the pageout daemon comes across a busy page,
1193 * it will remove the page from the pageout queues.
1194 */
1195
1196 object = first_object;
1197 offset = first_offset;
1198 first_m = VM_PAGE_NULL;
1199 access_required = fault_type;
1200
1201 /*
1202 * default type of fault
1203 */
1204 my_fault = DBG_CACHE_HIT_FAULT;
1205 thread_pri_floor_t token;
1206 bool drop_floor = false;
1207
1208 while (TRUE) {
1209 #if TRACEFAULTPAGE
1210 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1211 #endif
1212
1213 grab_options = vm_page_grab_options_for_object(object);
1214 #if HAS_MTE
1215 if (!(grab_options & VM_PAGE_GRAB_MTE) &&
1216 mteinfo_vm_tag_can_use_tag_storage((vm_tag_t)fault_info->user_tag)) {
1217 grab_options |= VM_PAGE_GRAB_ALLOW_TAG_STORAGE;
1218 }
1219 #endif /* HAS_MTE */
1220
1221 if (!object->alive) {
1222 /*
1223 * object is no longer valid
1224 * clean up and return error
1225 */
1226 #if DEVELOPMENT || DEBUG
1227 printf("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->vo_copy, object->shadow, object->alive, object->terminating, object->named, os_ref_get_count_raw(&object->ref_count), object->shadow_severed);
1228 if (panic_object_not_alive) {
1229 panic("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->vo_copy, object->shadow, object->alive, object->terminating, object->named, os_ref_get_count_raw(&object->ref_count), object->shadow_severed);
1230 }
1231 #endif /* DEVELOPMENT || DEBUG */
1232 vm_fault_cleanup(object, first_m);
1233 thread_interrupt_level(interruptible_state);
1234
1235 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_NOT_ALIVE), 0 /* arg */);
1236 return VM_FAULT_MEMORY_ERROR;
1237 }
1238
1239 if (!object->pager_created && object->phys_contiguous) {
1240 /*
1241 * A physically-contiguous object without a pager:
1242 * must be a "large page" object. We do not deal
1243 * with VM pages for this object.
1244 */
1245 caller_lookup = FALSE;
1246 m = VM_PAGE_NULL;
1247 goto phys_contig_object;
1248 }
1249
1250 if (object->blocked_access) {
1251 /*
1252 * Access to this VM object has been blocked.
1253 * Replace our "paging_in_progress" reference with
1254 * a "activity_in_progress" reference and wait for
1255 * access to be unblocked.
1256 */
1257 caller_lookup = FALSE; /* no longer valid after sleep */
1258 vm_object_activity_begin(object);
1259 vm_object_paging_end(object);
1260 while (object->blocked_access) {
1261 vm_object_sleep(object,
1262 VM_OBJECT_EVENT_UNBLOCKED,
1263 THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
1264 }
1265 vm_fault_page_blocked_access++;
1266 vm_object_paging_begin(object);
1267 vm_object_activity_end(object);
1268 }
1269
1270 /*
1271 * See whether the page at 'offset' is resident
1272 */
1273 if (caller_lookup == TRUE) {
1274 /*
1275 * The caller has already looked up the page
1276 * and gave us the result in "result_page".
1277 * We can use this for the first lookup but
1278 * it loses its validity as soon as we unlock
1279 * the object.
1280 */
1281 m = *result_page;
1282 caller_lookup = FALSE; /* no longer valid after that */
1283 } else {
1284 m = vm_page_lookup(object, vm_object_trunc_page(offset));
1285 }
1286 #if TRACEFAULTPAGE
1287 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1288 #endif
1289 if (m != VM_PAGE_NULL) {
1290 if (m->vmp_busy) {
1291 /*
1292 * The page is being brought in,
1293 * wait for it and then retry.
1294 */
1295 #if TRACEFAULTPAGE
1296 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1297 #endif
1298 if (fault_info->fi_no_sleep) {
1299 /* Caller has requested not to sleep on busy pages */
1300 vm_fault_cleanup(object, first_m);
1301 thread_interrupt_level(interruptible_state);
1302 return VM_FAULT_BUSY;
1303 }
1304
1305 wait_result = vm_page_sleep(object, m, interruptible, LCK_SLEEP_DEFAULT);
1306
1307 if (wait_result != THREAD_AWAKENED) {
1308 vm_fault_cleanup(object, first_m);
1309 thread_interrupt_level(interruptible_state);
1310
1311 if (wait_result == THREAD_RESTART) {
1312 return VM_FAULT_RETRY;
1313 } else {
1314 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), 0 /* arg */);
1315 return VM_FAULT_INTERRUPTED;
1316 }
1317 }
1318 continue;
1319 }
1320 if (m->vmp_laundry) {
1321 m->vmp_free_when_done = FALSE;
1322
1323 if (!m->vmp_cleaning) {
1324 vm_pageout_steal_laundry(m, FALSE);
1325 }
1326 }
1327 vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
1328 if (vm_page_is_guard(m)) {
1329 /*
1330 * Guard page: off limits !
1331 */
1332 if (fault_type == VM_PROT_NONE) {
1333 /*
1334 * The fault is not requesting any
1335 * access to the guard page, so it must
1336 * be just to wire or unwire it.
1337 * Let's pretend it succeeded...
1338 */
1339 m->vmp_busy = TRUE;
1340 *result_page = m;
1341 assert(first_m == VM_PAGE_NULL);
1342 *top_page = first_m;
1343 if (type_of_fault) {
1344 *type_of_fault = DBG_GUARD_FAULT;
1345 }
1346 thread_interrupt_level(interruptible_state);
1347 return VM_FAULT_SUCCESS;
1348 } else {
1349 /*
1350 * The fault requests access to the
1351 * guard page: let's deny that !
1352 */
1353 vm_fault_cleanup(object, first_m);
1354 thread_interrupt_level(interruptible_state);
1355 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_GUARDPAGE_FAULT), 0 /* arg */);
1356 return VM_FAULT_MEMORY_ERROR;
1357 }
1358 }
1359
1360
1361 if (m->vmp_error) {
1362 /*
1363 * The page is in error, give up now.
1364 */
1365 #if TRACEFAULTPAGE
1366 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
1367 #endif
1368 if (error_code) {
1369 *error_code = KERN_MEMORY_ERROR;
1370 }
1371 VM_PAGE_FREE(m);
1372
1373 vm_fault_cleanup(object, first_m);
1374 thread_interrupt_level(interruptible_state);
1375
1376 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_ERROR), 0 /* arg */);
1377 return VM_FAULT_MEMORY_ERROR;
1378 }
1379 if (m->vmp_restart) {
1380 /*
1381 * The pager wants us to restart
1382 * at the top of the chain,
1383 * typically because it has moved the
1384 * page to another pager, then do so.
1385 */
1386 #if TRACEFAULTPAGE
1387 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1388 #endif
1389 VM_PAGE_FREE(m);
1390
1391 vm_fault_cleanup(object, first_m);
1392 thread_interrupt_level(interruptible_state);
1393
1394 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_RESTART), 0 /* arg */);
1395 return VM_FAULT_RETRY;
1396 }
1397 if (m->vmp_absent) {
1398 /*
1399 * The page isn't busy, but is absent,
1400 * therefore it's deemed "unavailable".
1401 *
1402 * Remove the non-existent page (unless it's
1403 * in the top object) and move on down to the
1404 * next object (if there is one).
1405 */
1406 #if TRACEFAULTPAGE
1407 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
1408 #endif
1409 next_object = object->shadow;
1410
1411 if (next_object == VM_OBJECT_NULL) {
1412 /*
1413 * Absent page at bottom of shadow
1414 * chain; zero fill the page we left
1415 * busy in the first object, and free
1416 * the absent page.
1417 */
1418 assert(!must_be_resident);
1419
1420 /*
1421 * check for any conditions that prevent
1422 * us from creating a new zero-fill page
1423 * vm_fault_check will do all of the
1424 * fault cleanup in the case of an error condition
1425 * including resetting the thread_interrupt_level
1426 */
1427 error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1428
1429 if (error != VM_FAULT_SUCCESS) {
1430 return error;
1431 }
1432
1433 if (object != first_object) {
1434 /*
1435 * free the absent page we just found
1436 */
1437 VM_PAGE_FREE(m);
1438
1439 /*
1440 * drop reference and lock on current object
1441 */
1442 vm_object_paging_end(object);
1443 vm_object_unlock(object);
1444
1445 /*
1446 * grab the original page we
1447 * 'soldered' in place and
1448 * retake lock on 'first_object'
1449 */
1450 m = first_m;
1451 first_m = VM_PAGE_NULL;
1452
1453 object = first_object;
1454 offset = first_offset;
1455
1456 vm_object_lock(object);
1457 } else {
1458 /*
1459 * we're going to use the absent page we just found
1460 * so convert it to a 'busy' page
1461 */
1462 m->vmp_absent = FALSE;
1463 m->vmp_busy = TRUE;
1464 }
1465 if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1466 m->vmp_absent = TRUE;
1467 clear_absent_on_error = true;
1468 }
1469 /*
1470 * zero-fill the page and put it on
1471 * the correct paging queue
1472 */
1473 my_fault = vm_fault_zero_page(m, no_zero_fill);
1474
1475 break;
1476 } else {
1477 if (must_be_resident) {
1478 vm_object_paging_end(object);
1479 } else if (object != first_object) {
1480 vm_object_paging_end(object);
1481 VM_PAGE_FREE(m);
1482 } else {
1483 first_m = m;
1484 m->vmp_absent = FALSE;
1485 m->vmp_busy = TRUE;
1486
1487 vm_page_lockspin_queues();
1488 vm_page_queues_remove(m, FALSE);
1489 vm_page_unlock_queues();
1490 }
1491
1492 offset += object->vo_shadow_offset;
1493 fault_info->lo_offset += object->vo_shadow_offset;
1494 fault_info->hi_offset += object->vo_shadow_offset;
1495 access_required = VM_PROT_READ;
1496
1497 vm_object_lock(next_object);
1498 vm_object_unlock(object);
1499 object = next_object;
1500 vm_object_paging_begin(object);
1501
1502 /*
1503 * reset to default type of fault
1504 */
1505 my_fault = DBG_CACHE_HIT_FAULT;
1506
1507 continue;
1508 }
1509 }
1510 if ((m->vmp_cleaning)
1511 && ((object != first_object) || (object->vo_copy != VM_OBJECT_NULL))
1512 && (fault_type & VM_PROT_WRITE)) {
1513 /*
1514 * This is a copy-on-write fault that will
1515 * cause us to revoke access to this page, but
1516 * this page is in the process of being cleaned
1517 * in a clustered pageout. We must wait until
1518 * the cleaning operation completes before
1519 * revoking access to the original page,
1520 * otherwise we might attempt to remove a
1521 * wired mapping.
1522 */
1523 #if TRACEFAULTPAGE
1524 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1525 #endif
1526 /*
1527 * take an extra ref so that object won't die
1528 */
1529 vm_object_reference_locked(object);
1530
1531 vm_fault_cleanup(object, first_m);
1532
1533 vm_object_lock(object);
1534 assert(os_ref_get_count_raw(&object->ref_count) > 0);
1535
1536 m = vm_page_lookup(object, vm_object_trunc_page(offset));
1537
1538 if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1539 wait_result = vm_page_sleep(object, m, interruptible, LCK_SLEEP_UNLOCK);
1540 vm_object_deallocate(object);
1541 goto backoff;
1542 } else {
1543 vm_object_unlock(object);
1544
1545 vm_object_deallocate(object);
1546 thread_interrupt_level(interruptible_state);
1547
1548 return VM_FAULT_RETRY;
1549 }
1550 }
1551 if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1552 !(fault_info != NULL && fault_info->stealth)) {
1553 /*
1554 * If we were passed a non-NULL pointer for
1555 * "type_of_fault", than we came from
1556 * vm_fault... we'll let it deal with
1557 * this condition, since it
1558 * needs to see m->vmp_speculative to correctly
1559 * account the pageins, otherwise...
1560 * take it off the speculative queue, we'll
1561 * let the caller of vm_fault_page deal
1562 * with getting it onto the correct queue
1563 *
1564 * If the caller specified in fault_info that
1565 * it wants a "stealth" fault, we also leave
1566 * the page in the speculative queue.
1567 */
1568 vm_page_lockspin_queues();
1569 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
1570 vm_page_queues_remove(m, FALSE);
1571 }
1572 vm_page_unlock_queues();
1573 }
1574 assert(object == VM_PAGE_OBJECT(m));
1575
1576 if (object->code_signed) {
1577 /*
1578 * CODE SIGNING:
1579 * We just paged in a page from a signed
1580 * memory object but we don't need to
1581 * validate it now. We'll validate it if
1582 * when it gets mapped into a user address
1583 * space for the first time or when the page
1584 * gets copied to another object as a result
1585 * of a copy-on-write.
1586 */
1587 }
1588
1589 /*
1590 * We mark the page busy and leave it on
1591 * the pageout queues. If the pageout
1592 * deamon comes across it, then it will
1593 * remove the page from the queue, but not the object
1594 */
1595 #if TRACEFAULTPAGE
1596 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1597 #endif
1598 assert(!m->vmp_busy);
1599 assert(!m->vmp_absent);
1600
1601 m->vmp_busy = TRUE;
1602 break;
1603 }
1604
1605 /*
1606 * we get here when there is no page present in the object at
1607 * the offset we're interested in... we'll allocate a page
1608 * at this point if the pager associated with
1609 * this object can provide the data or we're the top object...
1610 * object is locked; m == NULL
1611 */
1612
1613 if (must_be_resident) {
1614 if (fault_type == VM_PROT_NONE &&
1615 is_kernel_object(object)) {
1616 /*
1617 * We've been called from vm_fault_unwire()
1618 * while removing a map entry that was allocated
1619 * with KMA_KOBJECT and KMA_VAONLY. This page
1620 * is not present and there's nothing more to
1621 * do here (nothing to unwire).
1622 */
1623 vm_fault_cleanup(object, first_m);
1624 thread_interrupt_level(interruptible_state);
1625
1626 return VM_FAULT_MEMORY_ERROR;
1627 }
1628
1629 goto dont_look_for_page;
1630 }
1631
1632 /* Don't expect to fault pages into the kernel object. */
1633 assert(!is_kernel_object(object));
1634
1635 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE));
1636
1637 #if TRACEFAULTPAGE
1638 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1639 #endif
1640 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1641 /*
1642 * Allocate a new page for this object/offset pair as a placeholder
1643 */
1644 m = vm_page_grab_options(grab_options);
1645 #if TRACEFAULTPAGE
1646 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1647 #endif
1648 if (m == VM_PAGE_NULL) {
1649 vm_fault_cleanup(object, first_m);
1650 thread_interrupt_level(interruptible_state);
1651
1652 return VM_FAULT_MEMORY_SHORTAGE;
1653 }
1654
1655 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1656 vm_page_insert_internal(m, object,
1657 vm_object_trunc_page(offset),
1658 VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1659 } else {
1660 vm_page_insert(m, object, vm_object_trunc_page(offset));
1661 }
1662 }
1663 if (look_for_page) {
1664 kern_return_t rc;
1665 int my_fault_type;
1666
1667 /*
1668 * If the memory manager is not ready, we
1669 * cannot make requests.
1670 */
1671 if (!object->pager_ready) {
1672 #if TRACEFAULTPAGE
1673 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1674 #endif
1675 if (m != VM_PAGE_NULL) {
1676 VM_PAGE_FREE(m);
1677 }
1678
1679 /*
1680 * take an extra ref so object won't die
1681 */
1682 vm_object_reference_locked(object);
1683 vm_fault_cleanup(object, first_m);
1684
1685 vm_object_lock(object);
1686 assert(os_ref_get_count_raw(&object->ref_count) > 0);
1687
1688 if (!object->pager_ready) {
1689 wait_result = vm_object_sleep(object, VM_OBJECT_EVENT_PAGER_READY, interruptible, LCK_SLEEP_UNLOCK);
1690 vm_object_deallocate(object);
1691
1692 goto backoff;
1693 } else {
1694 vm_object_unlock(object);
1695 vm_object_deallocate(object);
1696 thread_interrupt_level(interruptible_state);
1697
1698 return VM_FAULT_RETRY;
1699 }
1700 }
1701 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1702 /*
1703 * If there are too many outstanding page
1704 * requests pending on this external object, we
1705 * wait for them to be resolved now.
1706 */
1707 #if TRACEFAULTPAGE
1708 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1709 #endif
1710 if (m != VM_PAGE_NULL) {
1711 VM_PAGE_FREE(m);
1712 }
1713 /*
1714 * take an extra ref so object won't die
1715 */
1716 vm_object_reference_locked(object);
1717
1718 vm_fault_cleanup(object, first_m);
1719
1720 vm_object_lock(object);
1721 assert(os_ref_get_count_raw(&object->ref_count) > 0);
1722
1723 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1724 wait_result = vm_object_paging_throttle_wait(object, interruptible);
1725 vm_object_unlock(object);
1726 vm_object_deallocate(object);
1727 goto backoff;
1728 } else {
1729 vm_object_unlock(object);
1730 vm_object_deallocate(object);
1731 thread_interrupt_level(interruptible_state);
1732
1733 return VM_FAULT_RETRY;
1734 }
1735 }
1736 if (object->internal) {
1737 int compressed_count_delta;
1738 vm_compressor_options_t c_flags = 0;
1739
1740 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
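/*
 * Internal objects are backed by the compressor, so instead of
 * issuing an asynchronous memory_object_data_request() we call
 * the compressor pager synchronously below and decompress
 * straight into a freshly grabbed page.
 */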
1741
1742 if (m == VM_PAGE_NULL) {
1743 /*
1744 * Allocate a new page for this object/offset pair as a placeholder
1745 */
1746 m = vm_page_grab_options(grab_options);
1747 #if TRACEFAULTPAGE
1748 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1749 #endif
1750 if (m == VM_PAGE_NULL) {
1751 vm_fault_cleanup(object, first_m);
1752 thread_interrupt_level(interruptible_state);
1753
1754 return VM_FAULT_MEMORY_SHORTAGE;
1755 }
1756
1757 m->vmp_absent = TRUE;
1758 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1759 vm_page_insert_internal(m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1760 } else {
1761 vm_page_insert(m, object, vm_object_trunc_page(offset));
1762 }
1763 }
1764 assert(m->vmp_busy);
1765
1766 m->vmp_absent = TRUE;
1767 pager = object->pager;
1768
1769 assert(object->paging_in_progress > 0);
1770
1771 page_worker_token_t pw_token;
1772 #if PAGE_SLEEP_WITH_INHERITOR
1773 page_worker_register_worker((event_t)m, &pw_token);
1774 #endif /* PAGE_SLEEP_WITH_INHERITOR */
1775
1776 vm_object_unlock(object);
1777 #if HAS_MTE
1778 if (vm_object_is_mte_mappable(object)) {
1779 c_flags |= C_MTE;
1780 }
1781 #endif /* HAS_MTE */
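/*
 * Decompress the backing data directly into the physical page
 * we inserted above. The pager also reports what kind of fault
 * this turned out to be (my_fault_type) and how many compressed
 * slots were consumed (compressed_count_delta).
 */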
1782 rc = vm_compressor_pager_get(
1783 pager,
1784 offset + object->paging_offset,
1785 VM_PAGE_GET_PHYS_PAGE(m),
1786 &my_fault_type,
1787 c_flags,
1788 &compressed_count_delta);
1789
1790 if (type_of_fault == NULL) {
1791 int throttle_delay;
1792
1793 /*
1794 * we weren't called from vm_fault, so we
1795 * need to apply page creation throttling
1796 * do it before we re-acquire any locks
1797 */
1798 if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1799 if ((throttle_delay = vm_page_throttled(TRUE))) {
1800 VM_DEBUG_EVENT(vmf_compressordelay, DBG_VM_FAULT_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1801 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
1802 }
1803 }
1804 }
1805 vm_object_lock(object);
1806 assert(object->paging_in_progress > 0);
1807
1808 vm_compressor_pager_count(
1809 pager,
1810 compressed_count_delta,
1811 FALSE, /* shared_lock */
1812 object);
1813
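/*
 * Three outcomes are expected from the compressor pager:
 * KERN_SUCCESS (the data now sits in the page, which is dirty),
 * KERN_MEMORY_FAILURE (the compressed data was unrecoverable,
 * so the page is marked in error), or KERN_MEMORY_ERROR (the
 * pager had no data; the page stays absent). Anything else is
 * a coding error.
 */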
1814 switch (rc) {
1815 case KERN_SUCCESS:
1816 m->vmp_absent = FALSE;
1817 m->vmp_dirty = TRUE;
1818 if (!HAS_DEFAULT_CACHEABILITY(object->wimg_bits &
1819 VM_WIMG_MASK)) {
1820 /*
1821 * If the page is not cacheable,
1822 * we can't let its contents
1823 * linger in the data cache
1824 * after the decompression.
1825 */
1826 pmap_sync_page_attributes_phys(
1827 VM_PAGE_GET_PHYS_PAGE(m));
1828 } else {
1829 m->vmp_written_by_kernel = TRUE;
1830 }
1831 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
1832 if ((fault_type & VM_PROT_WRITE) == 0) {
1833 vm_object_lock_assert_exclusive(object);
1834 vm_page_lockspin_queues();
1835 m->vmp_unmodified_ro = true;
1836 vm_page_unlock_queues();
1837 os_atomic_inc(&compressor_ro_uncompressed, relaxed);
1838 *protection &= ~VM_PROT_WRITE;
1839 }
1840 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
1841
1842 /*
1843 * If the object is purgeable, its
1844 * owner's purgeable ledgers have been
1845 * updated in vm_page_insert() but the
1846 * page was also accounted for in a
1847 * "compressed purgeable" ledger, so
1848 * update that now.
1849 */
1850 if (((object->purgable !=
1851 VM_PURGABLE_DENY) ||
1852 object->vo_ledger_tag) &&
1853 (object->vo_owner !=
1854 NULL)) {
1855 /*
1856 * One less compressed
1857 * purgeable/tagged page.
1858 */
1859 if (compressed_count_delta) {
1860 vm_object_owner_compressed_update(
1861 object,
1862 -1);
1863 }
1864 }
1865
1866 break;
1867 case KERN_MEMORY_FAILURE:
1868 m->vmp_unusual = TRUE;
1869 m->vmp_error = TRUE;
1870 m->vmp_absent = FALSE;
1871 break;
1872 case KERN_MEMORY_ERROR:
1873 assert(m->vmp_absent);
1874 break;
1875 default:
1876 panic("vm_fault_page(): unexpected "
1877 "error %d from "
1878 "vm_compressor_pager_get()\n",
1879 rc);
1880 }
1881 vm_page_wakeup_done_with_inheritor(object, m, &pw_token);
1882
1883 rc = KERN_SUCCESS;
1884 goto data_requested;
1885 }
1886 my_fault_type = DBG_PAGEIN_FAULT;
1887
1888 if (m != VM_PAGE_NULL) {
1889 VM_PAGE_FREE(m);
1890 m = VM_PAGE_NULL;
1891 }
1892
1893 #if TRACEFAULTPAGE
1894 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1895 #endif
1896
1897 /*
1898 * It's possible someone called vm_object_destroy while we weren't
1899 * holding the object lock. If that has happened, then bail out
1900 * here.
1901 */
1902
1903 pager = object->pager;
1904
1905 if (pager == MEMORY_OBJECT_NULL) {
1906 vm_fault_cleanup(object, first_m);
1907 thread_interrupt_level(interruptible_state);
1908
1909 static const enum vm_subsys_error_codes object_destroy_errors[VM_OBJECT_DESTROY_MAX + 1] = {
1910 [VM_OBJECT_DESTROY_UNKNOWN_REASON] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER,
1911 [VM_OBJECT_DESTROY_UNMOUNT] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER_UNMOUNT,
1912 [VM_OBJECT_DESTROY_FORCED_UNMOUNT] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER_FORCED_UNMOUNT,
1913 [VM_OBJECT_DESTROY_UNGRAFT] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER_UNGRAFT,
1914 [VM_OBJECT_DESTROY_PAGER] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER_DEALLOC_PAGER,
1915 [VM_OBJECT_DESTROY_RECLAIM] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER_RECLAIM,
1916 };
1917 enum vm_subsys_error_codes kdbg_code = object_destroy_errors[(vm_object_destroy_reason_t)object->no_pager_reason];
1918 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, kdbg_code), 0 /* arg */);
1919 return VM_FAULT_MEMORY_ERROR;
1920 }
1921
1922 /*
1923 * We have an absent page in place for the faulting offset,
1924 * so we can release the object lock.
1925 */
1926
1927 if (object->object_is_shared_cache || pager->mo_pager_ops == &dyld_pager_ops) {
1928 token = thread_priority_floor_start();
1929 /*
1930 * A non-native shared cache object might
1931 * be getting set up in parallel with this
1932 * fault and so we can't assume that this
1933 * check will be valid after we drop the
1934 * object lock below.
1935 *
1936 * FIXME: This should utilize @c page_worker_register_worker()
1937 * (rdar://153586539)
1938 */
1939 drop_floor = true;
1940 }
1941
1942 vm_object_unlock(object);
1943
1944 /*
1945 * If this object uses a copy_call strategy,
1946 * and we are interested in a copy of this object
1947 * (having gotten here only by following a
1948 * shadow chain), then tell the memory manager
1949 * via a flag added to the desired_access
1950 * parameter, so that it can detect a race
1951 * between our walking down the shadow chain
1952 * and its pushing pages up into a copy of
1953 * the object that it manages.
1954 */
1955 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1956 wants_copy_flag = VM_PROT_WANTS_COPY;
1957 } else {
1958 wants_copy_flag = VM_PROT_NONE;
1959 }
1960
1961 if (object->vo_copy == first_object) {
1962 /*
1963 * if we issue the memory_object_data_request in
1964 * this state, we are subject to a deadlock with
1965 * the underlying filesystem if it is trying to
1966 * shrink the file resulting in a push of pages
1967 * into the copy object... that push will stall
1968 * on the placeholder page, and if the pushing thread
1969 * is holding a lock that is required on the pagein
1970 * path (such as a truncate lock), we'll deadlock...
1971 * to avoid this potential deadlock, we throw away
1972 * our placeholder page before calling memory_object_data_request
1973 * and force this thread to retry the vm_fault_page after
1974 * we have issued the I/O. the second time through this path
1975 * we will find the page already in the cache (presumably still
1976 * busy waiting for the I/O to complete) and then complete
1977 * the fault w/o having to go through memory_object_data_request again
1978 */
1979 assert(first_m != VM_PAGE_NULL);
1980 assert(VM_PAGE_OBJECT(first_m) == first_object);
1981
1982 vm_object_lock(first_object);
1983 VM_PAGE_FREE(first_m);
1984 vm_object_paging_end(first_object);
1985 vm_object_unlock(first_object);
1986
1987 first_m = VM_PAGE_NULL;
1988 force_fault_retry = TRUE;
1989
1990 vm_fault_page_forced_retry++;
1991 }
1992
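/*
 * If a previous pass through this loop already issued a data
 * request for this offset, shrink this request to a single page
 * with "random" behavior so we don't re-issue the same
 * speculative cluster.
 */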
1993 if (data_already_requested == TRUE) {
1994 orig_behavior = fault_info->behavior;
1995 orig_cluster_size = fault_info->cluster_size;
1996
1997 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1998 fault_info->cluster_size = PAGE_SIZE;
1999 }
2000 /*
2001 * Call the memory manager to retrieve the data.
2002 */
2003 rc = memory_object_data_request(
2004 pager,
2005 vm_object_trunc_page(offset) + object->paging_offset,
2006 PAGE_SIZE,
2007 access_required | wants_copy_flag,
2008 (memory_object_fault_info_t)fault_info);
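/*
 * memory_object_data_request() is asynchronous: the pager will
 * supply the data later and wake up the busy/absent page, which
 * is why we loop back and look the page up again instead of
 * assuming "m" still describes it.
 */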
2009
2010 if (data_already_requested == TRUE) {
2011 fault_info->behavior = orig_behavior;
2012 fault_info->cluster_size = orig_cluster_size;
2013 } else {
2014 data_already_requested = TRUE;
2015 }
2016
2017 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
2018 #if TRACEFAULTPAGE
2019 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
2020 #endif
2021 vm_object_lock(object);
2022
2023 if (drop_floor) {
2024 thread_priority_floor_end(&token);
2025 drop_floor = false;
2026 }
2027
2028 data_requested:
2029 if (rc != ERR_SUCCESS) {
2030 vm_fault_cleanup(object, first_m);
2031 thread_interrupt_level(interruptible_state);
2032
2033 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NO_DATA), 0 /* arg */);
2034
2035 if (rc == MACH_SEND_INTERRUPTED) {
2036 return VM_FAULT_INTERRUPTED;
2037 } else if (rc == KERN_ALREADY_WAITING) {
2038 return VM_FAULT_BUSY;
2039 } else {
2040 return VM_FAULT_MEMORY_ERROR;
2041 }
2042 } else {
2043 clock_sec_t tv_sec;
2044 clock_usec_t tv_usec;
2045
2046 if (my_fault_type == DBG_PAGEIN_FAULT) {
2047 clock_get_system_microtime(&tv_sec, &tv_usec);
2048 current_thread()->t_page_creation_time = tv_sec;
2049 current_thread()->t_page_creation_count = 0;
2050 }
2051 }
2052 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
2053 vm_fault_cleanup(object, first_m);
2054 thread_interrupt_level(interruptible_state);
2055
2056 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
2057 return VM_FAULT_INTERRUPTED;
2058 }
2059 if (force_fault_retry == TRUE) {
2060 vm_fault_cleanup(object, first_m);
2061 thread_interrupt_level(interruptible_state);
2062
2063 return VM_FAULT_RETRY;
2064 }
2065 if (m == VM_PAGE_NULL && object->phys_contiguous) {
2066 /*
2067 * No page here means that the object we
2068 * initially looked up was "physically
2069 * contiguous" (i.e. device memory). However,
2070 * with Virtual VRAM, the object might not
2071 * be backed by that device memory anymore,
2072 * so we're done here only if the object is
2073 * still "phys_contiguous".
2074 * Otherwise, if the object is no longer
2075 * "phys_contiguous", we need to retry the
2076 * page fault against the object's new backing
2077 * store (different memory object).
2078 */
2079 phys_contig_object:
2080 assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
2081 assert(object == first_object);
2082 goto done;
2083 }
2084 /*
2085 * potentially a pagein fault
2086 * if we make it through the state checks
2087 * above, then we'll count it as such
2088 */
2089 my_fault = my_fault_type;
2090
2091 /*
2092 * Retry with same object/offset, since new data may
2093 * be in a different page (i.e., m is meaningless at
2094 * this point).
2095 */
2096 continue;
2097 }
2098 dont_look_for_page:
2099 /*
2100 * We get here if the object has no pager, or an existence map
2101 * exists and indicates the page isn't present on the pager
2102 * or we're unwiring a page. If a pager exists, but there
2103 * is no existence map, then the m->vmp_absent case above handles
2104 * the ZF case when the pager can't provide the page
2105 */
2106 #if TRACEFAULTPAGE
2107 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
2108 #endif
2109 if (object == first_object) {
2110 first_m = m;
2111 } else {
2112 assert(m == VM_PAGE_NULL);
2113 }
2114
2115 next_object = object->shadow;
2116
2117 if (next_object == VM_OBJECT_NULL) {
2118 /*
2119 * we've hit the bottom of the shadow chain,
2120 * fill the page in the top object with zeros.
2121 */
2122 assert(!must_be_resident);
2123
2124 if (object != first_object) {
2125 vm_object_paging_end(object);
2126 vm_object_unlock(object);
2127
2128 object = first_object;
2129 offset = first_offset;
2130 vm_object_lock(object);
2131 }
2132 m = first_m;
2133 assert(VM_PAGE_OBJECT(m) == object);
2134 first_m = VM_PAGE_NULL;
2135
2136 /*
2137 * check for any conditions that prevent
2138 * us from creating a new zero-fill page
2139 * vm_fault_check will do all of the
2140 * fault cleanup in the case of an error condition
2141 * including resetting the thread_interrupt_level
2142 */
2143 error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
2144
2145 if (error != VM_FAULT_SUCCESS) {
2146 return error;
2147 }
2148
2149 if (m == VM_PAGE_NULL) {
2150 m = vm_page_grab_options(grab_options);
2151
2152 if (m == VM_PAGE_NULL) {
2153 vm_fault_cleanup(object, VM_PAGE_NULL);
2154 thread_interrupt_level(interruptible_state);
2155
2156 return VM_FAULT_MEMORY_SHORTAGE;
2157 }
2158 vm_page_insert(m, object, vm_object_trunc_page(offset));
2159 }
2160 if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
2161 m->vmp_absent = TRUE;
2162 clear_absent_on_error = true;
2163 }
2164
2165 my_fault = vm_fault_zero_page(m, no_zero_fill);
2166
2167 break;
2168 } else {
2169 /*
2170 * Move on to the next object. Lock the next
2171 * object before unlocking the current one.
2172 */
2173 if ((object != first_object) || must_be_resident) {
2174 vm_object_paging_end(object);
2175 }
2176
2177 offset += object->vo_shadow_offset;
2178 fault_info->lo_offset += object->vo_shadow_offset;
2179 fault_info->hi_offset += object->vo_shadow_offset;
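/*
 * Below the top object we only ever need read access: a write
 * fault is satisfied by copying the page up into the top
 * object, never by modifying a backing page.
 */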
2180 access_required = VM_PROT_READ;
2181
2182 vm_object_lock(next_object);
2183 vm_object_unlock(object);
2184
2185 object = next_object;
2186 vm_object_paging_begin(object);
2187 }
2188 }
2189
2190 /*
2191 * PAGE HAS BEEN FOUND.
2192 *
2193 * This page (m) is:
2194 * busy, so that we can play with it;
2195 * not absent, so that nobody else will fill it;
2196 * possibly eligible for pageout;
2197 *
2198 * The top-level page (first_m) is:
2199 * VM_PAGE_NULL if the page was found in the
2200 * top-level object;
2201 * busy, not absent, and ineligible for pageout.
2202 *
2203 * The current object (object) is locked. A paging
2204 * reference is held for the current and top-level
2205 * objects.
2206 */
2207
2208 #if TRACEFAULTPAGE
2209 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
2210 #endif
2211 #if EXTRA_ASSERTIONS
2212 assert(m->vmp_busy && !m->vmp_absent);
2213 assert((first_m == VM_PAGE_NULL) ||
2214 (first_m->vmp_busy && !first_m->vmp_absent &&
2215 !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
2216 #endif /* EXTRA_ASSERTIONS */
2217
2218 /*
2219 * If the page is being written, but isn't
2220 * already owned by the top-level object,
2221 * we have to copy it into a new page owned
2222 * by the top-level object.
2223 */
2224 if (object != first_object) {
2225 #if TRACEFAULTPAGE
2226 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2227 #endif
2228 if (fault_type & VM_PROT_WRITE) {
2229 vm_page_t copy_m;
2230
2231 /*
2232 * We only really need to copy if we
2233 * want to write it.
2234 */
2235 assert(!must_be_resident);
2236
2237 /*
2238 * If we try to collapse first_object at this
2239 * point, we may deadlock when we try to get
2240 * the lock on an intermediate object (since we
2241 * have the bottom object locked). We can't
2242 * unlock the bottom object, because the page
2243 * we found may move (by collapse) if we do.
2244 *
2245 * Instead, we first copy the page. Then, when
2246 * we have no more use for the bottom object,
2247 * we unlock it and try to collapse.
2248 *
2249 * Note that we copy the page even if we didn't
2250 * need to... that's the breaks.
2251 */
2252
2253 /*
2254 * Allocate a page for the copy
2255 */
2256 copy_m = vm_page_grab_options(grab_options);
2257
2258 if (copy_m == VM_PAGE_NULL) {
2259 vm_fault_page_release_page(m, &clear_absent_on_error);
2260
2261 vm_fault_cleanup(object, first_m);
2262 thread_interrupt_level(interruptible_state);
2263
2264 return VM_FAULT_MEMORY_SHORTAGE;
2265 }
2266
2267 vm_page_copy(m, copy_m);
2268
2269 /*
2270 * If another map is truly sharing this
2271 * page with us, we have to flush all
2272 * uses of the original page, since we
2273 * can't distinguish those which want the
2274 * original from those which need the
2275 * new copy.
2276 *
2277 * XXXO If we know that only one map has
2278 * access to this page, then we could
2279 * avoid the pmap_disconnect() call.
2280 */
2281 if (m->vmp_pmapped) {
2282 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2283 }
2284
2285 if (m->vmp_clustered) {
2286 VM_PAGE_COUNT_AS_PAGEIN(m);
2287 VM_PAGE_CONSUME_CLUSTERED(m);
2288 }
2289 assert(!m->vmp_cleaning);
2290
2291 /*
2292 * We no longer need the old page or object.
2293 */
2294 vm_fault_page_release_page(m, &clear_absent_on_error);
2295
2296 /*
2297 * This check helps with marking the object as having a sequential pattern.
2298 * Normally we'll miss doing this below because this fault is about COW to
2299 * the first_object i.e. bring page in from disk, push to object above but
2300 * don't update the file object's sequential pattern.
2301 */
2302 if (object->internal == FALSE) {
2303 vm_fault_is_sequential(object, offset, fault_info->behavior);
2304 }
2305
2306 vm_object_paging_end(object);
2307 vm_object_unlock(object);
2308
2309 my_fault = DBG_COW_FAULT;
2310 counter_inc(&vm_statistics_cow_faults);
2311 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2312 counter_inc(&current_task()->cow_faults);
2313
2314 object = first_object;
2315 offset = first_offset;
2316
2317 vm_object_lock(object);
2318 /*
2319 * get rid of the place holder
2320 * page that we soldered in earlier
2321 */
2322 VM_PAGE_FREE(first_m);
2323 first_m = VM_PAGE_NULL;
2324
2325 /*
2326 * and replace it with the
2327 * page we just copied into
2328 */
2329 assert(copy_m->vmp_busy);
2330 vm_page_insert(copy_m, object, vm_object_trunc_page(offset));
2331 SET_PAGE_DIRTY(copy_m, TRUE);
2332
2333 m = copy_m;
2334 /*
2335 * Now that we've gotten the copy out of the
2336 * way, let's try to collapse the top object.
2337 * But we have to play ugly games with
2338 * paging_in_progress to do that...
2339 */
2340 vm_object_paging_end(object);
2341 vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
2342 vm_object_paging_begin(object);
2343 } else {
2344 *protection &= (~VM_PROT_WRITE);
2345 }
2346 }
2347 /*
2348 * Now check whether the page needs to be pushed into the
2349 * copy object. The use of asymmetric copy on write for
2350 * shared temporary objects means that we may do two copies to
2351 * satisfy the fault; one above to get the page from a
2352 * shadowed object, and one here to push it into the copy.
2353 */
2354 try_failed_count = 0;
2355
2356 while ((copy_object = first_object->vo_copy) != VM_OBJECT_NULL) {
2357 vm_object_offset_t copy_offset;
2358 vm_page_t copy_m;
2359
2360 #if TRACEFAULTPAGE
2361 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2362 #endif
2363 /*
2364 * If the page is being written, but hasn't been
2365 * copied to the copy-object, we have to copy it there.
2366 */
2367 if ((fault_type & VM_PROT_WRITE) == 0) {
2368 *protection &= ~VM_PROT_WRITE;
2369 break;
2370 }
2371
2372 /*
2373 * If the page was guaranteed to be resident,
2374 * we must have already performed the copy.
2375 */
2376 if (must_be_resident) {
2377 break;
2378 }
2379
2380 /*
2381 * Try to get the lock on the copy_object.
2382 */
2383 if (!vm_object_lock_try(copy_object)) {
2384 vm_object_unlock(object);
2385 try_failed_count++;
2386
2387 mutex_pause(try_failed_count); /* wait a bit */
2388 vm_object_lock(object);
2389
2390 continue;
2391 }
2392 try_failed_count = 0;
2393
2394 /*
2395 * Make another reference to the copy-object,
2396 * to keep it from disappearing during the
2397 * copy.
2398 */
2399 vm_object_reference_locked(copy_object);
2400
2401 /*
2402 * Does the page exist in the copy?
2403 */
2404 copy_offset = first_offset - copy_object->vo_shadow_offset;
2405 copy_offset = vm_object_trunc_page(copy_offset);
2406
2407 if (copy_object->vo_size <= copy_offset) {
2408 /*
2409 * Copy object doesn't cover this page -- do nothing.
2410 */
2411 ;
2412 } else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2413 /*
2414 * Page currently exists in the copy object
2415 */
2416 if (copy_m->vmp_busy) {
2417 /*
2418 * If the page is being brought
2419 * in, wait for it and then retry.
2420 */
2421 vm_fault_page_release_page(m, &clear_absent_on_error);
2422
2423 /*
2424 * take an extra ref so object won't die
2425 */
2426 vm_object_reference_locked(copy_object);
2427 vm_object_unlock(copy_object);
2428 vm_fault_cleanup(object, first_m);
2429
2430 vm_object_lock(copy_object);
2431 vm_object_lock_assert_exclusive(copy_object);
2432 os_ref_release_live_locked_raw(&copy_object->ref_count,
2433 &vm_object_refgrp);
2434 copy_m = vm_page_lookup(copy_object, copy_offset);
2435
2436 if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2437 wait_result = vm_page_sleep(copy_object, copy_m, interruptible, LCK_SLEEP_UNLOCK);
2438 vm_object_deallocate(copy_object);
2439
2440 goto backoff;
2441 } else {
2442 vm_object_unlock(copy_object);
2443 vm_object_deallocate(copy_object);
2444 thread_interrupt_level(interruptible_state);
2445
2446 return VM_FAULT_RETRY;
2447 }
2448 }
2449 } else if (!PAGED_OUT(copy_object, copy_offset)) {
2450 /*
2451 * If PAGED_OUT is TRUE, then the page used to exist
2452 * in the copy-object, and has already been paged out.
2453 * We don't need to repeat this. If PAGED_OUT is
2454 * FALSE, then either we don't know (!pager_created,
2455 * for example) or it hasn't been paged out.
2456 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2457 * We must copy the page to the copy object.
2458 *
2459 * Allocate a page for the copy
2460 */
2461 copy_m = vm_page_grab_options(grab_options);
2462
2463 if (copy_m == VM_PAGE_NULL) {
2464 vm_fault_page_release_page(m, &clear_absent_on_error);
2465
2466 vm_object_lock_assert_exclusive(copy_object);
2467 os_ref_release_live_locked_raw(&copy_object->ref_count,
2468 &vm_object_refgrp);
2469
2470 vm_object_unlock(copy_object);
2471 vm_fault_cleanup(object, first_m);
2472 thread_interrupt_level(interruptible_state);
2473
2474 return VM_FAULT_MEMORY_SHORTAGE;
2475 }
2476
2477 /*
2478 * Must copy page into copy-object.
2479 */
2480 vm_page_insert(copy_m, copy_object, copy_offset);
2481 vm_page_copy(m, copy_m);
2482
2483 /*
2484 * If the old page was in use by any users
2485 * of the copy-object, it must be removed
2486 * from all pmaps. (We can't know which
2487 * pmaps use it.)
2488 */
2489 if (m->vmp_pmapped) {
2490 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2491 }
2492
2493 if (m->vmp_clustered) {
2494 VM_PAGE_COUNT_AS_PAGEIN(m);
2495 VM_PAGE_CONSUME_CLUSTERED(m);
2496 }
2497 /*
2498 * If there's a pager, then immediately
2499 * page out this page, using the "initialize"
2500 * option. Else, we use the copy.
2501 */
2502 if ((!copy_object->pager_ready)
2503 || vm_object_compressor_pager_state_get(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2504 ) {
2505 vm_page_lockspin_queues();
2506 assert(!m->vmp_cleaning);
2507 vm_page_activate(copy_m);
2508 vm_page_unlock_queues();
2509
2510 SET_PAGE_DIRTY(copy_m, TRUE);
2511 vm_page_wakeup_done(copy_object, copy_m);
2512 } else {
2513 assert(copy_m->vmp_busy == TRUE);
2514 assert(!m->vmp_cleaning);
2515
2516 /*
2517 * dirty is protected by the object lock
2518 */
2519 SET_PAGE_DIRTY(copy_m, TRUE);
2520
2521 /*
2522 * The page is already ready for pageout:
2523 * not on pageout queues and busy.
2524 * Unlock everything except the
2525 * copy_object itself.
2526 */
2527 vm_object_unlock(object);
2528
2529 /*
2530 * Write the page to the copy-object,
2531 * flushing it from the kernel.
2532 */
2533 vm_pageout_initialize_page(copy_m);
2534
2535 /*
2536 * Since the pageout may have
2537 * temporarily dropped the
2538 * copy_object's lock, we
2539 * check whether we'll have
2540 * to deallocate the hard way.
2541 */
2542 if ((copy_object->shadow != object) ||
2543 (os_ref_get_count_raw(&copy_object->ref_count) == 1)) {
2544 vm_object_unlock(copy_object);
2545 vm_object_deallocate(copy_object);
2546 vm_object_lock(object);
2547
2548 continue;
2549 }
2550 /*
2551 * Pick back up the old object's
2552 * lock. [It is safe to do so,
2553 * since it must be deeper in the
2554 * object tree.]
2555 */
2556 vm_object_lock(object);
2557 }
2558
2559 /*
2560 * Because we're pushing a page upward
2561 * in the object tree, we must restart
2562 * any faults that are waiting here.
2563 * [Note that this is an expansion of
2564 * vm_page_wakeup() that uses the THREAD_RESTART
2565 * wait result]. Can't turn off the page's
2566 * busy bit because we're not done with it.
2567 */
2568 if (m->vmp_wanted) {
2569 m->vmp_wanted = FALSE;
2570 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2571 }
2572 }
2573 /*
2574 * The reference count on copy_object must be
2575 * at least 2: one for our extra reference,
2576 * and at least one from the outside world
2577 * (we checked that when we last locked
2578 * copy_object).
2579 */
2580 vm_object_lock_assert_exclusive(copy_object);
2581 os_ref_release_live_locked_raw(&copy_object->ref_count,
2582 &vm_object_refgrp);
2583
2584 vm_object_unlock(copy_object);
2585
2586 break;
2587 }
2588
2589 done:
2590 *result_page = m;
2591 *top_page = first_m;
2592
2593 if (m != VM_PAGE_NULL) {
2594 assert(VM_PAGE_OBJECT(m) == object);
2595
2596 retval = VM_FAULT_SUCCESS;
2597
2598 if (my_fault == DBG_PAGEIN_FAULT) {
2599 VM_PAGE_COUNT_AS_PAGEIN(m);
2600
2601 if (object->internal) {
2602 my_fault = DBG_PAGEIND_FAULT;
2603 } else {
2604 my_fault = DBG_PAGEINV_FAULT;
2605 }
2606
2607 /*
2608 * evaluate access pattern and update state
2609 * vm_fault_deactivate_behind depends on the
2610 * state being up to date
2611 */
2612 vm_fault_is_sequential(object, offset, fault_info->behavior);
2613 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2614 } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2615 /*
2616 * we weren't called from vm_fault, so handle the
2617 * accounting here for hits in the cache
2618 */
2619 if (m->vmp_clustered) {
2620 VM_PAGE_COUNT_AS_PAGEIN(m);
2621 VM_PAGE_CONSUME_CLUSTERED(m);
2622 }
2623 vm_fault_is_sequential(object, offset, fault_info->behavior);
2624 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2625 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2626 VM_STAT_DECOMPRESSIONS();
2627 }
2628 if (type_of_fault) {
2629 *type_of_fault = my_fault;
2630 }
2631 } else {
2632 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUCCESS_NO_PAGE), 0 /* arg */);
2633 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2634 assert(first_m == VM_PAGE_NULL);
2635 assert(object == first_object);
2636 }
2637
2638 thread_interrupt_level(interruptible_state);
2639
2640 #if TRACEFAULTPAGE
2641 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
2642 #endif
2643 return retval;
2644
2645 backoff:
2646 thread_interrupt_level(interruptible_state);
2647
2648 if (wait_result == THREAD_INTERRUPTED) {
2649 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
2650 return VM_FAULT_INTERRUPTED;
2651 }
2652 return VM_FAULT_RETRY;
2653 }
2654
2655 #if MACH_ASSERT && (XNU_PLATFORM_WatchOS || __x86_64__)
2656 #define PANIC_ON_CS_KILLED_DEFAULT true
2657 #else
2658 #define PANIC_ON_CS_KILLED_DEFAULT false
2659 #endif
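/*
 * TUNABLE means this default can be overridden at boot via the
 * "panic_on_cs_killed" boot-arg; it defaults to true only on
 * MACH_ASSERT kernels for watchOS and x86_64.
 */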
2660 static TUNABLE(bool, panic_on_cs_killed, "panic_on_cs_killed",
2661 PANIC_ON_CS_KILLED_DEFAULT);
2662
2663 extern int proc_selfpid(void);
2664 extern char *proc_name_address(struct proc *p);
2665 extern const char *proc_best_name(struct proc *);
2666 unsigned long cs_enter_tainted_rejected = 0;
2667 unsigned long cs_enter_tainted_accepted = 0;
2668
2669 /*
2670 * CODE SIGNING:
2671 * When soft faulting a page, we have to validate the page if:
2672 * 1. the page is being mapped in user space
2673 * 2. the page hasn't already been found to be "tainted"
2674 * 3. the page belongs to a code-signed object
2675 * 4. the page has not been validated yet or has been mapped for write.
2676 */
2677 static bool
2678 vm_fault_cs_need_validation(
2679 pmap_t pmap,
2680 vm_page_t page,
2681 vm_object_t page_obj,
2682 vm_map_size_t fault_page_size,
2683 vm_map_offset_t fault_phys_offset)
2684 {
2685 if (pmap == kernel_pmap) {
2686 /* 1 - not user space */
2687 return false;
2688 }
2689 if (!page_obj->code_signed) {
2690 /* 3 - page does not belong to a code-signed object */
2691 return false;
2692 }
2693 if (fault_page_size == PAGE_SIZE) {
2694 /* looking at the whole page */
2695 assertf(fault_phys_offset == 0,
2696 "fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
2697 (uint64_t)fault_page_size,
2698 (uint64_t)fault_phys_offset);
2699 if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
2700 /* 2 - page is all tainted */
2701 return false;
2702 }
2703 if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
2704 !page->vmp_wpmapped) {
2705 /* 4 - already fully validated and never mapped writable */
2706 return false;
2707 }
2708 } else {
2709 /* looking at a specific sub-page */
2710 if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
2711 /* 2 - sub-page was already marked as tainted */
2712 return false;
2713 }
2714 if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) &&
2715 !page->vmp_wpmapped) {
2716 /* 4 - already validated and never mapped writable */
2717 return false;
2718 }
2719 }
2720 /* page needs to be validated */
2721 return true;
2722 }
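/*
 * Illustrative example: a page that was fully validated and never
 * mapped writable fails condition 4 above, so a later execute
 * fault on it returns false here and skips revalidation.
 */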
2723
2724
2725 static bool
2726 vm_fault_cs_page_immutable(
2727 vm_page_t m,
2728 vm_map_size_t fault_page_size,
2729 vm_map_offset_t fault_phys_offset,
2730 vm_prot_t prot __unused)
2731 {
2732 if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)
2733 /*&& ((prot) & VM_PROT_EXECUTE)*/) {
2734 return true;
2735 }
2736 return false;
2737 }
2738
2739 static bool
2740 vm_fault_cs_page_nx(
2741 vm_page_t m,
2742 vm_map_size_t fault_page_size,
2743 vm_map_offset_t fault_phys_offset)
2744 {
2745 return VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2746 }
2747
2748 /*
2749 * Check if the page being entered into the pmap violates code signing.
2750 */
2751 static kern_return_t
2752 vm_fault_cs_check_violation(
2753 bool cs_bypass,
2754 vm_object_t object,
2755 vm_page_t m,
2756 pmap_t pmap,
2757 vm_prot_t prot,
2758 vm_prot_t caller_prot,
2759 vm_map_size_t fault_page_size,
2760 vm_map_offset_t fault_phys_offset,
2761 vm_object_fault_info_t fault_info,
2762 bool map_is_switched,
2763 bool map_is_switch_protected,
2764 bool *cs_violation)
2765 {
2766 #if !CODE_SIGNING_MONITOR
2767 #pragma unused(caller_prot)
2768 #pragma unused(fault_info)
2769 #endif /* !CODE_SIGNING_MONITOR */
2770
2771 int cs_enforcement_enabled;
2772 if (!cs_bypass &&
2773 vm_fault_cs_need_validation(pmap, m, object,
2774 fault_page_size, fault_phys_offset)) {
2775 vm_object_lock_assert_exclusive(object);
2776
2777 if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) {
2778 vm_cs_revalidates++;
2779 }
2780
2781 /* VM map is locked, so 1 ref will remain on VM object -
2782 * so no harm if vm_page_validate_cs drops the object lock */
2783
2784 #if CODE_SIGNING_MONITOR
2785 if (fault_info->csm_associated &&
2786 csm_enabled() &&
2787 !VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2788 !VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) &&
2789 !VMP_CS_NX(m, fault_page_size, fault_phys_offset) &&
2790 (prot & VM_PROT_EXECUTE) &&
2791 (caller_prot & VM_PROT_EXECUTE)) {
2792 /*
2793 * When we have a code signing monitor, the monitor will evaluate the code signature
2794 * for any executable page mapping. No need for the VM to also validate the page.
2795 * In the code signing monitor we trust :)
2796 */
2797 vm_cs_defer_to_csm++;
2798 } else {
2799 vm_cs_defer_to_csm_not++;
2800 vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
2801 }
2802 #else /* CODE_SIGNING_MONITOR */
2803 vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
2804 #endif /* CODE_SIGNING_MONITOR */
2805 }
2806
2807 /* If the map is switched, and is switch-protected, we must protect
2808 * some pages from being write-faulted: immutable pages because by
2809 * definition they may not be written, and executable pages because that
2810 * would provide a way to inject unsigned code.
2811 * If the page is immutable, we can simply return. However, we can't
2812 * immediately determine whether a page is executable anywhere. But,
2813 * we can disconnect it everywhere and remove the executable protection
2814 * from the current map. We do that below right before we do the
2815 * PMAP_ENTER.
2816 */
2817 if (pmap == kernel_pmap) {
2818 /* kernel fault: cs_enforcement does not apply */
2819 cs_enforcement_enabled = 0;
2820 } else {
2821 cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap);
2822 }
2823
2824 if (cs_enforcement_enabled && map_is_switched &&
2825 map_is_switch_protected &&
2826 vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2827 (prot & VM_PROT_WRITE)) {
2828 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_IMMUTABLE_PAGE_WRITE), 0 /* arg */);
2829 return KERN_CODESIGN_ERROR;
2830 }
2831
2832 if (cs_enforcement_enabled &&
2833 vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) &&
2834 (prot & VM_PROT_EXECUTE)) {
2835 if (cs_debug) {
2836 printf("page marked to be NX, not letting it be mapped EXEC\n");
2837 }
2838 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_NX_PAGE_EXEC_MAPPING), 0 /* arg */);
2839 return KERN_CODESIGN_ERROR;
2840 }
2841
2842 /* A page could be tainted, or pose a risk of being tainted later.
2843 * Check whether the receiving process wants it, and make it feel
2844 * the consequences (that happens in cs_invalid_page()).
2845 * For CS Enforcement, two other conditions will
2846 * cause that page to be tainted as well:
2847 * - pmapping an unsigned page executable - this means unsigned code;
2848 * - writeable mapping of a validated page - the content of that page
2849 * can be changed without the kernel noticing, therefore unsigned
2850 * code can be created
2851 */
2852 if (cs_bypass) {
2853 /* code-signing is bypassed */
2854 *cs_violation = FALSE;
2855 } else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
2856 /* tainted page */
2857 *cs_violation = TRUE;
2858 } else if (!cs_enforcement_enabled) {
2859 /* no further code-signing enforcement */
2860 *cs_violation = FALSE;
2861 } else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2862 ((prot & VM_PROT_WRITE) ||
2863 m->vmp_wpmapped)) {
2864 /*
2865 * The page should be immutable, but is in danger of being
2866 * modified.
2867 * This is the case where we want policy from the code
2868 * directory - is the page immutable or not? For now we have
2869 * to assume that code pages will be immutable, data pages not.
2870 * We'll assume a page is a code page if it has a code directory
2871 * and we fault for execution.
2872 * That is good enough since if we faulted the code page for
2873 * writing in another map before, it is wpmapped; if we fault
2874 * it for writing in this map later it will also be faulted for
2875 * executing at the same time; and if we fault for writing in
2876 * another map later, we will disconnect it from this pmap so
2877 * we'll notice the change.
2878 */
2879 *cs_violation = TRUE;
2880 } else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2881 (prot & VM_PROT_EXECUTE)
2882 #if CODE_SIGNING_MONITOR
2883 /*
2884 * Executable pages will be validated by the code signing monitor. If the
2885 * code signing monitor is turned off, then this is a code-signing violation.
2886 */
2887 && !csm_enabled()
2888 #endif /* CODE_SIGNING_MONITOR */
2889 ) {
2890 *cs_violation = TRUE;
2891 } else {
2892 *cs_violation = FALSE;
2893 }
2894 return KERN_SUCCESS;
2895 }
2896
2897 /*
2898 * Handles a code signing violation by either rejecting the page or forcing a disconnect.
2899 * @param must_disconnect This value will be set to true if the caller must disconnect
2900 * this page.
2901 * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2902 */
2903 static kern_return_t
2904 vm_fault_cs_handle_violation(
2905 vm_object_t object,
2906 vm_page_t m,
2907 pmap_t pmap,
2908 vm_prot_t prot,
2909 vm_map_offset_t vaddr,
2910 vm_map_size_t fault_page_size,
2911 vm_map_offset_t fault_phys_offset,
2912 bool map_is_switched,
2913 bool map_is_switch_protected,
2914 bool *must_disconnect)
2915 {
2916 #if !MACH_ASSERT
2917 #pragma unused(pmap)
2918 #pragma unused(map_is_switch_protected)
2919 #endif /* !MACH_ASSERT */
2920 /*
2921 * We will have a tainted page. Have to handle the special case
2922 * of a switched map now. If the map is not switched, standard
2923 * procedure applies - call cs_invalid_page().
2924 * If the map is switched, the real owner is invalid already.
2925 * There is no point in invalidating the switching process since
2926 * it will not be executing from the map. So we don't call
2927 * cs_invalid_page() in that case.
2928 */
2929 boolean_t reject_page, cs_killed;
2930 kern_return_t kr;
2931 if (map_is_switched) {
2932 assert(pmap == vm_map_pmap(current_thread()->map));
2933 assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2934 reject_page = FALSE;
2935 } else {
2936 if (cs_debug > 5) {
2937 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2938 object->code_signed ? "yes" : "no",
2939 VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2940 VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2941 m->vmp_wpmapped ? "yes" : "no",
2942 (int)prot);
2943 }
2944 reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2945 }
2946
2947 if (reject_page) {
2948 /* reject the invalid page: abort the page fault */
2949 int pid;
2950 const char *procname;
2951 task_t task;
2952 vm_object_t file_object, shadow;
2953 vm_object_offset_t file_offset;
2954 char *pathname, *filename;
2955 vm_size_t pathname_len, filename_len;
2956 boolean_t truncated_path;
2957 #define __PATH_MAX 1024
2958 struct timespec mtime, cs_mtime;
2959 int shadow_depth;
2960 os_reason_t codesigning_exit_reason = OS_REASON_NULL;
2961
2962 kr = KERN_CODESIGN_ERROR;
2963 cs_enter_tainted_rejected++;
2964
2965 /* get process name and pid */
2966 procname = "?";
2967 task = current_task();
2968 pid = proc_selfpid();
2969 if (get_bsdtask_info(task) != NULL) {
2970 procname = proc_name_address(get_bsdtask_info(task));
2971 }
2972
2973 /* get file's VM object */
2974 file_object = object;
2975 file_offset = m->vmp_offset;
2976 for (shadow = file_object->shadow,
2977 shadow_depth = 0;
2978 shadow != VM_OBJECT_NULL;
2979 shadow = file_object->shadow,
2980 shadow_depth++) {
2981 vm_object_lock_shared(shadow);
2982 if (file_object != object) {
2983 vm_object_unlock(file_object);
2984 }
2985 file_offset += file_object->vo_shadow_offset;
2986 file_object = shadow;
2987 }
2988
2989 mtime.tv_sec = 0;
2990 mtime.tv_nsec = 0;
2991 cs_mtime.tv_sec = 0;
2992 cs_mtime.tv_nsec = 0;
2993
2994 /* get file's pathname and/or filename */
2995 pathname = NULL;
2996 filename = NULL;
2997 pathname_len = 0;
2998 filename_len = 0;
2999 truncated_path = FALSE;
3000 /* no pager -> no file -> no pathname, use "<nil>" in that case */
3001 if (file_object->pager != NULL) {
3002 pathname = kalloc_data(__PATH_MAX * 2, Z_WAITOK);
3003 if (pathname) {
3004 pathname[0] = '\0';
3005 pathname_len = __PATH_MAX;
3006 filename = pathname + pathname_len;
3007 filename_len = __PATH_MAX;
3008
3009 if (vnode_pager_get_object_name(file_object->pager,
3010 pathname,
3011 pathname_len,
3012 filename,
3013 filename_len,
3014 &truncated_path) == KERN_SUCCESS) {
3015 /* safety first... */
3016 pathname[__PATH_MAX - 1] = '\0';
3017 filename[__PATH_MAX - 1] = '\0';
3018
3019 vnode_pager_get_object_mtime(file_object->pager,
3020 &mtime,
3021 &cs_mtime);
3022 } else {
3023 kfree_data(pathname, __PATH_MAX * 2);
3024 pathname = NULL;
3025 filename = NULL;
3026 pathname_len = 0;
3027 filename_len = 0;
3028 truncated_path = FALSE;
3029 }
3030 }
3031 }
3032 printf("CODE SIGNING: process %d[%s]: "
3033 "rejecting invalid page at address 0x%llx "
3034 "from offset 0x%llx in file \"%s%s%s\" "
3035 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
3036 "(signed:%d validated:%d tainted:%d nx:%d "
3037 "wpmapped:%d dirty:%d depth:%d)\n",
3038 pid, procname, (addr64_t) vaddr,
3039 file_offset,
3040 (pathname ? pathname : "<nil>"),
3041 (truncated_path ? "/.../" : ""),
3042 (truncated_path ? filename : ""),
3043 cs_mtime.tv_sec, cs_mtime.tv_nsec,
3044 ((cs_mtime.tv_sec == mtime.tv_sec &&
3045 cs_mtime.tv_nsec == mtime.tv_nsec)
3046 ? "=="
3047 : "!="),
3048 mtime.tv_sec, mtime.tv_nsec,
3049 object->code_signed,
3050 VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
3051 VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
3052 VMP_CS_NX(m, fault_page_size, fault_phys_offset),
3053 m->vmp_wpmapped,
3054 m->vmp_dirty,
3055 shadow_depth);
3056
3057 /*
3058 * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
3059 * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
3060 * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
3061 * will deal with the segmentation fault.
3062 */
3063 if (cs_killed) {
3064 KDBG(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
3065 pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
3066
3067 codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
3068 if (codesigning_exit_reason == NULL) {
3069 printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
3070 } else {
3071 mach_vm_address_t data_addr = 0;
3072 struct codesigning_exit_reason_info *ceri = NULL;
3073 uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
3074
3075 if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
3076 printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
3077 } else {
3078 if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
3079 EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
3080 ceri = (struct codesigning_exit_reason_info *)data_addr;
3081 static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
3082
3083 ceri->ceri_virt_addr = vaddr;
3084 ceri->ceri_file_offset = file_offset;
3085 if (pathname) {
3086 strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
3087 } else {
3088 ceri->ceri_pathname[0] = '\0';
3089 }
3090 if (filename) {
3091 strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
3092 } else {
3093 ceri->ceri_filename[0] = '\0';
3094 }
3095 ceri->ceri_path_truncated = (truncated_path ? 1 : 0);
3096 ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
3097 ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
3098 ceri->ceri_page_modtime_secs = mtime.tv_sec;
3099 ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
3100 ceri->ceri_object_codesigned = (object->code_signed);
3101 ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset);
3102 ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset);
3103 ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset);
3104 ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
3105 ceri->ceri_page_slid = 0;
3106 ceri->ceri_page_dirty = (m->vmp_dirty);
3107 ceri->ceri_page_shadow_depth = shadow_depth;
3108 } else {
3109 #if DEBUG || DEVELOPMENT
3110 panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
3111 #else
3112 printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
3113 #endif /* DEBUG || DEVELOPMENT */
3114 /* Free the buffer */
3115 os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
3116 }
3117 }
3118 }
3119
3120 set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
3121 }
3122 if (panic_on_cs_killed &&
3123 object->object_is_shared_cache) {
3124 char *tainted_contents;
3125 vm_map_offset_t src_vaddr;
3126 src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
3127 tainted_contents = kalloc_data(PAGE_SIZE, Z_WAITOK);
3128 bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
3129 printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
3130 panic("CODE SIGNING: process %d[%s]: "
3131 "rejecting invalid page (phys#0x%x) at address 0x%llx "
3132 "from offset 0x%llx in file \"%s%s%s\" "
3133 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
3134 "(signed:%d validated:%d tainted:%d nx:%d"
3135 "wpmapped:%d dirty:%d depth:%d)\n",
3136 pid, procname,
3137 VM_PAGE_GET_PHYS_PAGE(m),
3138 (addr64_t) vaddr,
3139 file_offset,
3140 (pathname ? pathname : "<nil>"),
3141 (truncated_path ? "/.../" : ""),
3142 (truncated_path ? filename : ""),
3143 cs_mtime.tv_sec, cs_mtime.tv_nsec,
3144 ((cs_mtime.tv_sec == mtime.tv_sec &&
3145 cs_mtime.tv_nsec == mtime.tv_nsec)
3146 ? "=="
3147 : "!="),
3148 mtime.tv_sec, mtime.tv_nsec,
3149 object->code_signed,
3150 VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
3151 VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
3152 VMP_CS_NX(m, fault_page_size, fault_phys_offset),
3153 m->vmp_wpmapped,
3154 m->vmp_dirty,
3155 shadow_depth);
3156 }
3157
3158 if (file_object != object) {
3159 vm_object_unlock(file_object);
3160 }
3161 if (pathname_len != 0) {
3162 kfree_data(pathname, __PATH_MAX * 2);
3163 pathname = NULL;
3164 filename = NULL;
3165 }
3166 } else {
3167 /* proceed with the invalid page */
3168 kr = KERN_SUCCESS;
3169 if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
3170 !object->code_signed) {
3171 /*
3172 * This page has not been (fully) validated but
3173 * does not belong to a code-signed object
3174 * so it should not be forcefully considered
3175 * as tainted.
3176 * We're just concerned about it here because
3177 * we've been asked to "execute" it but that
3178 * does not mean that it should cause other
3179 * accesses to fail.
3180 * This happens when a debugger sets a
3181 * breakpoint and we then execute code in
3182 * that page. Marking the page as "tainted"
3183 * would cause any inspection tool ("leaks",
3184 * "vmmap", "CrashReporter", ...) to get killed
3185 * due to code-signing violation on that page,
3186 * even though they're just reading it and not
3187 * executing from it.
3188 */
3189 } else {
3190 /*
3191 * Page might have been tainted before or not;
3192 * now it definitively is. If the page wasn't
3193 * tainted, we must disconnect it from all
3194 * pmaps later, to force existing mappings
3195 * through that code path for re-consideration
3196 * of the validity of that page.
3197 */
3198 if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
3199 *must_disconnect = TRUE;
3200 VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE);
3201 }
3202 }
3203 cs_enter_tainted_accepted++;
3204 }
3205 if (kr != KERN_SUCCESS) {
3206 if (cs_debug) {
3207 printf("CODESIGNING: vm_fault_enter(0x%llx): "
3208 "*** INVALID PAGE ***\n",
3209 (long long)vaddr);
3210 }
3211 #if !SECURE_KERNEL
3212 if (cs_enforcement_panic) {
3213 panic("CODESIGNING: panicking on invalid page");
3214 }
3215 #endif
3216 }
3217 return kr;
3218 }
3219
3220 /*
3221 * Check that the code signature is valid for the given page being inserted into
3222 * the pmap.
3223 *
3224 * @param must_disconnect This value will be set to true if the caller must disconnect
3225 * this page.
3226 * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
3227 */
3228 static kern_return_t
3229 vm_fault_validate_cs(
3230 bool cs_bypass,
3231 vm_object_t object,
3232 vm_page_t m,
3233 pmap_t pmap,
3234 vm_map_offset_t vaddr,
3235 vm_prot_t prot,
3236 vm_prot_t caller_prot,
3237 vm_map_size_t fault_page_size,
3238 vm_map_offset_t fault_phys_offset,
3239 vm_object_fault_info_t fault_info,
3240 bool *must_disconnect)
3241 {
3242 bool map_is_switched, map_is_switch_protected, cs_violation;
3243 kern_return_t kr;
3244 /* Validate code signature if necessary. */
3245 map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
3246 (pmap == vm_map_pmap(current_thread()->map)));
3247 map_is_switch_protected = current_thread()->map->switch_protect;
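/*
 * "Switched" means the faulting thread is currently running on a
 * map other than its task's: the pmap matches the thread's map
 * but not the task's map (see the expression above).
 */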
3248 kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap,
3249 prot, caller_prot, fault_page_size, fault_phys_offset, fault_info,
3250 map_is_switched, map_is_switch_protected, &cs_violation);
3251 if (kr != KERN_SUCCESS) {
3252 return kr;
3253 }
3254 if (cs_violation) {
3255 kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr,
3256 fault_page_size, fault_phys_offset,
3257 map_is_switched, map_is_switch_protected, must_disconnect);
3258 }
3259 return kr;
3260 }
3261
3262 /*
3263 * Enqueue the page on the appropriate paging queue.
3264 */
3265 static void
3266 vm_fault_enqueue_page(
3267 vm_object_t object,
3268 vm_page_t m,
3269 bool wired,
3270 bool change_wiring,
3271 vm_tag_t wire_tag,
3272 bool no_cache,
3273 int *type_of_fault,
3274 kern_return_t kr)
3275 {
3276 assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
3277 boolean_t page_queues_locked = FALSE;
3278 boolean_t previously_pmapped = m->vmp_pmapped;
3279 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED() \
3280 MACRO_BEGIN \
3281 if (! page_queues_locked) { \
3282 page_queues_locked = TRUE; \
3283 vm_page_lockspin_queues(); \
3284 } \
3285 MACRO_END
3286 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() \
3287 MACRO_BEGIN \
3288 if (page_queues_locked) { \
3289 page_queues_locked = FALSE; \
3290 vm_page_unlock_queues(); \
3291 } \
3292 MACRO_END
3293
3294 vm_page_update_special_state(m);
3295 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
3296 /*
3297 * Compressor pages are neither wired
3298 * nor pageable and should never change.
3299 */
3300 assert(object == compressor_object);
3301 } else if (change_wiring) {
3302 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3303
3304 if (wired) {
3305 if (kr == KERN_SUCCESS) {
3306 vm_page_wire(m, wire_tag, TRUE);
3307 }
3308 } else {
3309 vm_page_unwire(m, TRUE);
3310 }
3311 /* we keep the page queues lock, if we need it later */
3312 } else {
3313 if (object->internal == TRUE) {
3314 /*
3315 * don't allow anonymous pages on
3316 * the speculative queues
3317 */
3318 no_cache = FALSE;
3319 }
3320 if (kr != KERN_SUCCESS) {
3321 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3322 vm_page_deactivate(m);
3323 /* we keep the page queues lock, if we need it later */
3324 } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
3325 (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3326 (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
3327 ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3328 !VM_PAGE_WIRED(m)) {
3329 if (vm_page_local_q &&
3330 (*type_of_fault == DBG_COW_FAULT ||
3331 *type_of_fault == DBG_ZERO_FILL_FAULT)) {
3332 struct vpl *lq;
3333 uint32_t lid;
3334
3335 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3336
3337 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3338 vm_object_lock_assert_exclusive(object);
3339
3340 /*
3341 * we got a local queue to stuff this
3342 * new page on...
3343 * it's safe to manipulate local and
3344 * local_id at this point since we're
3345 * behind an exclusive object lock and
3346 * the page is not on any global queue.
3347 *
3348 * we'll use the current cpu number to
3349 * select the queue. note that we don't
3350 * need to disable preemption... we're
3351 * going to be behind the local queue's
3352 * lock to do the real work
3353 */
3354 lid = cpu_number();
3355
3356 lq = zpercpu_get_cpu(vm_page_local_q, lid);
3357
3358 VPL_LOCK(&lq->vpl_lock);
3359
3360 vm_page_check_pageable_safe(m);
3361 vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
3362 m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3363 m->vmp_local_id = (uint16_t)lid;
3364 lq->vpl_count++;
3365
3366 if (object->internal) {
3367 lq->vpl_internal_count++;
3368 } else {
3369 lq->vpl_external_count++;
3370 }
3371
3372 VPL_UNLOCK(&lq->vpl_lock);
3373
3374 if (lq->vpl_count > vm_page_local_q_soft_limit) {
3375 /*
3376 * we're beyond the soft limit
3377 * for the local queue
3378 * vm_page_reactivate_local will
3379 * 'try' to take the global page
3380 * queue lock... if it can't
3381 * that's ok... we'll let the
3382 * queue continue to grow up
3383 * to the hard limit... at that
3384 * point we'll wait for the
3385 * lock... once we've got the
3386 * lock, we'll transfer all of
3387 * the pages from the local
3388 * queue to the global active
3389 * queue
3390 */
3391 vm_page_reactivate_local(lid, FALSE, FALSE);
3392 }
3393 } else {
3394 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3395
3396 /*
3397 * test again now that we hold the
3398 * page queue lock
3399 */
3400 if (!VM_PAGE_WIRED(m)) {
3401 if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3402 vm_page_queues_remove(m, FALSE);
3403
3404 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3405 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
3406 }
3407
3408 if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
3409 no_cache) {
3410 /*
3411 * If this is a no_cache mapping
3412 * and the page has never been
3413 * mapped before or was
3414 * previously a no_cache page,
3415 * then we want to leave pages
3416 * in the speculative state so
3417 * that they can be readily
3418 * recycled if free memory runs
3419 * low. Otherwise the page is
3420 * activated as normal.
3421 */
3422
3423 if (no_cache &&
3424 (!previously_pmapped ||
3425 m->vmp_no_cache)) {
3426 m->vmp_no_cache = TRUE;
3427
3428 if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
3429 vm_page_speculate(m, FALSE);
3430 }
3431 } else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3432 vm_page_activate(m);
3433 }
3434 }
3435 }
3436 /* we keep the page queues lock, if we need it later */
3437 }
3438 }
3439 }
3440 /* we're done with the page queues lock, if we ever took it */
3441 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3442 }
3443
3444 /*
3445 * Sets the pmapped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
3446 * @return true if the page needs to be sync'ed via pmap_sync_page_data_phys()
3447 * before being inserted into the pmap.
3448 */
3449 static bool
3450 vm_fault_enter_set_mapped(
3451 vm_object_t object,
3452 vm_page_t m,
3453 vm_prot_t prot,
3454 vm_prot_t fault_type)
3455 {
3456 bool page_needs_sync = false;
3457 /*
3458 * NOTE: we may only hold the vm_object lock SHARED
3459 * at this point, so we need the phys_page lock to
3460 * properly serialize updating the pmapped and
3461 * xpmapped bits
3462 */
3463 if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3464 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3465
3466 pmap_lock_phys_page(phys_page);
3467 m->vmp_pmapped = TRUE;
3468
3469 if (!m->vmp_xpmapped) {
3470 m->vmp_xpmapped = TRUE;
3471
3472 pmap_unlock_phys_page(phys_page);
3473
3474 if (!object->internal) {
3475 OSAddAtomic(1, &vm_page_xpmapped_external_count);
3476 }
3477
3478 #if defined(__arm64__)
3479 page_needs_sync = true;
3480 #else
3481 if (object->internal &&
3482 object->pager != NULL) {
3483 /*
3484 * This page could have been
3485 * uncompressed by the
3486 * compressor pager and its
3487 * contents might be only in
3488 * the data cache.
3489 * Since it's being mapped for
3490 * "execute" for the fist time,
3491 * make sure the icache is in
3492 * sync.
3493 */
3494 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3495 page_needs_sync = true;
3496 }
3497 #endif
3498 } else {
3499 pmap_unlock_phys_page(phys_page);
3500 }
3501 } else {
3502 if (m->vmp_pmapped == FALSE) {
3503 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3504
3505 pmap_lock_phys_page(phys_page);
3506 m->vmp_pmapped = TRUE;
3507 pmap_unlock_phys_page(phys_page);
3508 }
3509 }
3510
3511 if (fault_type & VM_PROT_WRITE) {
3512 if (m->vmp_wpmapped == FALSE) {
3513 vm_object_lock_assert_exclusive(object);
3514 if (!object->internal && object->pager) {
3515 task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3516 }
3517 m->vmp_wpmapped = TRUE;
3518 }
3519 }
3520 return page_needs_sync;
3521 }
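/*
 * Caller-side sketch for the boolean returned above (this mirrors how
 * vm_fault_enter_prepare() and vm_fault_enter() consume it later in
 * this file): the data cache must be synchronized before the page is
 * entered in the pmap.
 *
 *	page_needs_data_sync = vm_fault_enter_set_mapped(object, m, prot, fault_type);
 *	...
 *	if (page_needs_data_sync) {
 *		pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
 *	}
 */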
3522
3523 #if HAS_MTE
3524 static bool
3525 vm_should_override_mte_cacheattr(
3526 pmap_t pmap,
3527 vm_object_t obj,
3528 __unused vm_map_address_t va,
3529 pmap_paddr_t pa)
3530 {
3531 /*
3532 * We need to ask whether _any_ tagged mapping exists for this frame,
3533 * rather than asking whether the object we're holding _now_ is tagged.
3534 * This is how we ensure that if an MTE mapping escapes into a non-MTE
3535 * context, shuffles around a bit, then comes back around to the originating
3536 * context, we'll enter it as MTE.
3537 */
3538 if (obj != VM_OBJECT_NULL
3539 && pmap_is_tagged_page((ppnum_t)atop(pa))
3540 && pmap->associated_vm_map_serial_id != obj->vmo_provenance) {
3541 return true;
3542 }
3543
3544 return false;
3545 }
3546 #endif
3547
3548 static inline kern_return_t
3549 vm_fault_pmap_validate_page(
3550 pmap_t pmap __unused,
3551 vm_page_t m __unused,
3552 vm_map_offset_t vaddr __unused,
3553 vm_prot_t prot __unused,
3554 vm_object_fault_info_t fault_info __unused,
3555 bool *page_sleep_needed)
3556 {
3557 assert(page_sleep_needed != NULL);
3558 *page_sleep_needed = false;
3559 #if CONFIG_SPTM
3560 /*
3561 * Reject the executable or debug mapping if the page is already wired for I/O. The SPTM's security
3562 * model doesn't allow us to reliably use executable pages for I/O due to both CS integrity
3563 * protections and the possibility that the pages may be dynamically retyped while wired for I/O.
3564 * This check is required to happen under the VM object lock in order to synchronize with the
3565 * complementary check on the I/O wiring path in vm_page_do_delayed_work().
3566 */
3567 if (__improbable((m->vmp_cleaning || m->vmp_iopl_wired) &&
3568 pmap_will_retype(pmap, vaddr, VM_PAGE_GET_PHYS_PAGE(m), prot, fault_info->pmap_options |
3569 ((fault_info->fi_xnu_user_debug && !VM_PAGE_OBJECT(m)->code_signed) ? PMAP_OPTIONS_XNU_USER_DEBUG : 0),
3570 PMAP_MAPPING_TYPE_INFER))) {
3571 if (__improbable(m->vmp_iopl_wired)) {
3572 vm_map_guard_exception(vaddr, kGUARD_EXC_SEC_EXEC_ON_IOPL_PAGE);
3573 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
3574 KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_EXEC_ON_IOPL_PAGE), (uintptr_t)vaddr);
3575 return KERN_PROTECTION_FAILURE;
3576 }
3577 *page_sleep_needed = m->vmp_cleaning;
3578 }
3579 #endif /* CONFIG_SPTM */
3580 return KERN_SUCCESS;
3581 }
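/*
 * Caller-side sketch for @c *page_sleep_needed (this mirrors the
 * handling in the fast-fault loop of vm_fault_internal() below):
 * instead of failing the fault, the caller waits for the page to
 * finish cleaning and then re-validates it.
 *
 *	if (page_sleep_needed) {
 *		vm_page_sleep(object, m, interruptible, LCK_SLEEP_UNLOCK);
 *		// ...retry and call vm_fault_pmap_validate_page() again...
 *	}
 */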
3582
3583 /*
3584 * wrappers for pmap_enter_options()
3585 */
3586 kern_return_t
3587 pmap_enter_object_options_check(
3588 pmap_t pmap,
3589 vm_map_address_t virtual_address,
3590 vm_map_offset_t fault_phys_offset,
3591 vm_object_t obj,
3592 ppnum_t pn,
3593 vm_prot_t protection,
3594 vm_prot_t fault_type,
3595 boolean_t wired,
3596 unsigned int options)
3597 {
3598 unsigned int flags = 0;
3599 unsigned int extra_options = 0;
3600
3601 if (obj->internal) {
3602 extra_options |= PMAP_OPTIONS_INTERNAL;
3603 }
3604 pmap_paddr_t physical_address = (pmap_paddr_t)ptoa(pn) + fault_phys_offset;
3605
3606 #if HAS_MTE
3607 /*
3608 * By policy we sometimes decide to enter an MTE-capable object
3609 * as non-MTE in a particular map.
3610 *
3611 * Most notably, we enact a general policy that MTE memory which escapes its
3612 * original context will be aliased in other maps as non-MTE (aliasing back
3613 * into the originating map will result in an MTE-enabled mapping).
3614 *
3615 * Using VM_WIMG_DEFAULT for this pmap_enter only sets the PTE values
3616 * correctly *for this mapping only* without changing the MTE-ness
3617 * of the underlying page.
3618 */
3619 if (vm_should_override_mte_cacheattr(pmap, obj, virtual_address, physical_address)) {
3620 /*
3621 * Certain first-party actors (such as WCP and BlastDoor) are modeled untrustworthy, and should never
3622 * be allowed to receive untagged aliases to tagged memory from other actors.
3623 * If we make it this far on a pmap that should never receive untagged aliases, throw a fatal guard.
3624 */
3625 if (pmap->restrict_receiving_aliases_to_tagged_memory) {
3626 /* Immediately send a fatal guard */
3627 uint64_t address_to_report = 0;
3628 #if DEBUG || DEVELOPMENT
3629 /* On internal variants, report the PA we tried to alias */
3630 address_to_report = physical_address;
3631 #endif /* DEBUG || DEVELOPMENT */
3632 mach_exception_code_t code = 0;
3633 EXC_GUARD_ENCODE_TYPE(code, GUARD_TYPE_VIRT_MEMORY);
3634 EXC_GUARD_ENCODE_FLAVOR(code, kGUARD_EXC_SEC_SHARING_DENIED);
3635 thread_guard_violation(
3636 current_thread(),
3637 code,
3638 address_to_report,
3639 /* Fatal */
3640 true);
3641 /* And indicate that something went wrong */
3642 return VM_FAULT_MEMORY_ERROR;
3643 } else {
3644 assert(!(flags & VM_WIMG_MASK));
3645 flags |= VM_WIMG_USE_DEFAULT;
3646 }
3647 }
3648 #endif /* HAS_MTE */
3649
3650 return pmap_enter_options_addr(pmap,
3651 virtual_address,
3652 physical_address,
3653 protection,
3654 fault_type,
3655 flags,
3656 wired,
3657 options | extra_options,
3658 NULL,
3659 PMAP_MAPPING_TYPE_INFER);
3660 }
3661
3662 kern_return_t
3663 pmap_enter_options_check(
3664 pmap_t pmap,
3665 vm_map_address_t virtual_address,
3666 vm_map_offset_t fault_phys_offset,
3667 vm_page_t page,
3668 vm_prot_t protection,
3669 vm_prot_t fault_type,
3670 boolean_t wired,
3671 unsigned int options)
3672 {
3673 if (page->vmp_error) {
3674 return KERN_MEMORY_FAILURE;
3675 }
3676 vm_object_t obj = VM_PAGE_OBJECT(page);
3677 if (page->vmp_reusable || obj->all_reusable) {
3678 options |= PMAP_OPTIONS_REUSABLE;
3679 }
3680 assert(page->vmp_pmapped);
3681 if (fault_type & VM_PROT_WRITE) {
3682 if (pmap == kernel_pmap) {
3683 /*
3684 * The kernel sometimes needs to map a page to provide its
3685 * initial contents but that does not mean that the page is
3686 * actually dirty/modified, so let's not assert that it's been
3687 * "wpmapped".
3688 */
3689 } else {
3690 assert(page->vmp_wpmapped);
3691 }
3692 }
3693 return pmap_enter_object_options_check(
3694 pmap,
3695 virtual_address,
3696 fault_phys_offset,
3697 obj,
3698 VM_PAGE_GET_PHYS_PAGE(page),
3699 protection,
3700 fault_type,
3701 wired,
3702 options);
3703 }
3704
3705 kern_return_t
3706 pmap_enter_check(
3707 pmap_t pmap,
3708 vm_map_address_t virtual_address,
3709 vm_page_t page,
3710 vm_prot_t protection,
3711 vm_prot_t fault_type,
3712 boolean_t wired)
3713 {
3714 return pmap_enter_options_check(pmap,
3715 virtual_address,
3716 0 /* fault_phys_offset */,
3717 page,
3718 protection,
3719 fault_type,
3720 wired,
3721 0 /* options */);
3722 }
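/*
 * Layering of the three wrappers above, from most to least convenient
 * (a summary of this file, not new API):
 *
 *	pmap_enter_check()			// default offset and options
 *	 -> pmap_enter_options_check()		// page-level checks (error, reusable, wpmapped)
 *	  -> pmap_enter_object_options_check()	// object-level flags and MTE policy
 *	   -> pmap_enter_options_addr()		// raw pmap-layer entry point
 *
 * Hypothetical call for a wiring-style fault; VM_PROT_NONE as the
 * fault_type matches the convention vm_fault_enter() uses when
 * fi_change_wiring is set:
 *
 *	kr = pmap_enter_check(pmap, vaddr, page,
 *	    VM_PROT_READ | VM_PROT_WRITE,	// protection
 *	    VM_PROT_NONE,			// fault_type
 *	    TRUE);				// wired
 */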
3723
3724 /*
3725 * Try to enter the given page into the pmap.
3726 * Will retry without execute permission if the code signing monitor is enabled and
3727 * we encounter a codesigning failure on a non-execute fault.
3728 */
3729 static kern_return_t
3730 vm_fault_attempt_pmap_enter(
3731 pmap_t pmap,
3732 vm_map_offset_t vaddr,
3733 vm_map_size_t fault_page_size,
3734 vm_map_offset_t fault_phys_offset,
3735 vm_page_t m,
3736 vm_prot_t *prot,
3737 vm_prot_t caller_prot,
3738 vm_prot_t fault_type,
3739 bool wired,
3740 int pmap_options)
3741 {
3742 #if !CODE_SIGNING_MONITOR
3743 #pragma unused(caller_prot)
3744 #endif /* !CODE_SIGNING_MONITOR */
3745
3746 kern_return_t kr;
3747 if (fault_page_size != PAGE_SIZE) {
3748 DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type);
3749 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
3750 fault_phys_offset < PAGE_SIZE),
3751 "0x%llx\n", (uint64_t)fault_phys_offset);
3752 } else {
3753 assertf(fault_phys_offset == 0,
3754 "0x%llx\n", (uint64_t)fault_phys_offset);
3755 }
3756
3757 kr = pmap_enter_options_check(pmap, vaddr,
3758 fault_phys_offset,
3759 m, *prot, fault_type,
3760 wired, pmap_options);
3761
3762 #if CODE_SIGNING_MONITOR
3763 /*
3764 * Retry without execute permission if we encountered a codesigning
3765 * failure on a non-execute fault. This allows applications which
3766 * don't actually need to execute code to still map it for read access.
3767 */
3768 if (kr == KERN_CODESIGN_ERROR &&
3769 csm_enabled() &&
3770 (*prot & VM_PROT_EXECUTE) &&
3771 !(caller_prot & VM_PROT_EXECUTE)) {
3772 *prot &= ~VM_PROT_EXECUTE;
3773 kr = pmap_enter_options_check(pmap, vaddr,
3774 fault_phys_offset,
3775 m, *prot, fault_type,
3776 wired, pmap_options);
3777 }
3778 #endif /* CODE_SIGNING_MONITOR */
3779
3780 return kr;
3781 }
3782
3783 /*
3784 * Enter the given page into the pmap.
3785 * The map must be locked shared.
3786 * The vm object must NOT be locked.
3787 *
3788 * @param need_retry if not null, avoid making a (potentially) blocking call into
3789 * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3790 */
3791 static kern_return_t
3792 vm_fault_pmap_enter(
3793 pmap_t pmap,
3794 vm_map_offset_t vaddr,
3795 vm_map_size_t fault_page_size,
3796 vm_map_offset_t fault_phys_offset,
3797 vm_page_t m,
3798 vm_prot_t *prot,
3799 vm_prot_t caller_prot,
3800 vm_prot_t fault_type,
3801 bool wired,
3802 int pmap_options,
3803 bool *need_retry)
3804 {
3805 kern_return_t kr;
3806 if (need_retry != NULL) {
3807 /*
3808 * Although we don't hold a lock on this object, we hold a lock
3809 * on the top object in the chain. To prevent a deadlock, we
3810 * can't allow the pmap layer to block.
3811 */
3812 pmap_options |= PMAP_OPTIONS_NOWAIT;
3813 }
3814 kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3815 fault_page_size, fault_phys_offset,
3816 m, prot, caller_prot, fault_type, wired, pmap_options);
3817 if (kr == KERN_RESOURCE_SHORTAGE) {
3818 if (need_retry) {
3819 /*
3820 * There's nothing we can do here since we hold the
3821 * lock on the top object in the chain. The caller
3822 * will need to deal with this by dropping that lock and retrying.
3823 */
3824 *need_retry = true;
3825 vm_pmap_enter_retried++;
3826 }
3827 }
3828 return kr;
3829 }
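/*
 * Sketch of the @c need_retry contract (an assumption consistent with
 * the PMAP_OPTIONS_NOWAIT handling above): a non-NULL pointer turns a
 * potentially blocking pmap_enter() into a try-once operation, so a
 * caller does roughly:
 *
 *	bool need_retry = false;
 *	kr = vm_fault_pmap_enter(pmap, vaddr, fault_page_size,
 *	    fault_phys_offset, m, &prot, caller_prot,
 *	    fault_type, wired, pmap_options, &need_retry);
 *	if (need_retry) {
 *		// drop the object/map locks, then retry the fault
 *		// from the top so the pmap layer can block safely
 *	}
 */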
3830
3831 /*
3832 * Enter the given page into the pmap.
3833 * The vm map must be locked shared.
3834 * The vm object must be locked exclusive, unless this is a soft fault.
3835 * For a soft fault, the object must be locked shared or exclusive.
3836 *
3837 * @param need_retry if not null, avoid making a (potentially) blocking call into
3838 * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3839 */
3840 static kern_return_t
3841 vm_fault_pmap_enter_with_object_lock(
3842 vm_object_t object,
3843 pmap_t pmap,
3844 vm_map_offset_t vaddr,
3845 vm_map_size_t fault_page_size,
3846 vm_map_offset_t fault_phys_offset,
3847 vm_page_t m,
3848 vm_prot_t *prot,
3849 vm_prot_t caller_prot,
3850 vm_prot_t fault_type,
3851 bool wired,
3852 int pmap_options,
3853 bool *need_retry,
3854 uint8_t *object_lock_type)
3855 {
3856 kern_return_t kr;
3857
3858 assert(need_retry != NULL);
3859 *need_retry = false;
3860
3861 /*
3862 * Prevent a deadlock by not
3863 * holding the object lock if we need to wait for a page in
3864 * pmap_enter() - <rdar://problem/7138958>
3865 */
3866 kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3867 fault_page_size, fault_phys_offset,
3868 m, prot, caller_prot, fault_type, wired, pmap_options | PMAP_OPTIONS_NOWAIT);
3869 #if __x86_64__
3870 if (kr == KERN_INVALID_ARGUMENT &&
3871 pmap == PMAP_NULL &&
3872 wired) {
3873 /*
3874 * Wiring a page in a pmap-less VM map:
3875 * VMware's "vmmon" kernel extension does this
3876 * to grab pages.
3877 * Let it proceed even though the PMAP_ENTER() failed.
3878 */
3879 kr = KERN_SUCCESS;
3880 }
3881 #endif /* __x86_64__ */
3882
3883 if (kr == KERN_RESOURCE_SHORTAGE) {
3884 /*
3885 * We can't drop the object lock(s) here to retry the pmap_enter()
3886 * in a blocking way so that it can expand the page table as needed.
3887 * That would allow vm_object_copy_delayed() to create a new copy object
3888 * and change the copy-on-write obligations.
3889 * Our only recourse is to deal with it at a higher level where we can
3890 * drop both locks, expand the page table and retry the fault.
3891 */
3892 *need_retry = true;
3893 vm_pmap_enter_retried++;
3894 goto done;
3895 }
3896
3897 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
3898 if ((*prot & VM_PROT_WRITE) && m->vmp_unmodified_ro) {
3899 if (*object_lock_type == OBJECT_LOCK_SHARED) {
3900 boolean_t was_busy = m->vmp_busy;
3901 m->vmp_busy = TRUE;
3902
3903 *object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3904
3905 if (vm_object_lock_upgrade(object) == FALSE) {
3906 vm_object_lock(object);
3907 }
3908
3909 if (!was_busy) {
3910 vm_page_wakeup_done(object, m);
3911 }
3912 }
3913 vm_object_lock_assert_exclusive(object);
3914 vm_page_lockspin_queues();
3915 m->vmp_unmodified_ro = false;
3916 vm_page_unlock_queues();
3917 os_atomic_dec(&compressor_ro_uncompressed, relaxed);
3918
3919 vm_object_compressor_pager_state_clr(VM_PAGE_OBJECT(m), m->vmp_offset);
3920 }
3921 #else /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
3922 #pragma unused(object)
3923 #pragma unused(object_lock_type)
3924 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
3925
3926 done:
3927 return kr;
3928 }
3929
3930 /*
3931 * Prepare to enter a page into the pmap by checking CS, protection bits,
3932 * and setting mapped bits on the page_t.
3933 * Does not modify the page's paging queue.
3934 *
3935 * page queue lock must NOT be held
3936 * m->vmp_object must be locked
3937 *
3938 * NOTE: m->vmp_object could be locked "shared" only if we are called
3939 * from vm_fault() as part of a soft fault.
3940 */
3941 static kern_return_t
3942 vm_fault_enter_prepare(
3943 vm_page_t m,
3944 pmap_t pmap,
3945 vm_map_offset_t vaddr,
3946 vm_prot_t *prot,
3947 vm_prot_t caller_prot,
3948 vm_map_size_t fault_page_size,
3949 vm_map_offset_t fault_phys_offset,
3950 vm_prot_t fault_type,
3951 vm_object_fault_info_t fault_info,
3952 int *type_of_fault,
3953 bool *page_needs_data_sync,
3954 bool *page_needs_sleep)
3955 {
3956 kern_return_t kr;
3957 bool is_tainted = false;
3958 vm_object_t object;
3959 boolean_t cs_bypass = fault_info->cs_bypass;
3960
3961 object = VM_PAGE_OBJECT(m);
3962
3963 vm_object_lock_assert_held(object);
3964
3965 #if KASAN
3966 if (pmap == kernel_pmap) {
3967 kasan_notify_address(vaddr, PAGE_SIZE);
3968 }
3969 #endif
3970
3971 #if CODE_SIGNING_MONITOR
3972 if (csm_address_space_exempt(pmap) == KERN_SUCCESS) {
3973 cs_bypass = TRUE;
3974 }
3975 #endif
3976
3977 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
3978
3979 if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
3980 vm_object_lock_assert_exclusive(object);
3981 } else if ((fault_type & VM_PROT_WRITE) == 0 &&
3982 !fault_info->fi_change_wiring &&
3983 (!m->vmp_wpmapped
3984 #if VM_OBJECT_ACCESS_TRACKING
3985 || object->access_tracking
3986 #endif /* VM_OBJECT_ACCESS_TRACKING */
3987 )) {
3988 /*
3989 * This is not a "write" fault, so we
3990 * might not have taken the object lock
3991 * exclusively and we might not be able
3992 * to update the "wpmapped" bit in
3993 * vm_fault_enter().
3994 * Let's just grant read access to
3995 * the page for now and we'll
3996 * soft-fault again if we need write
3997 * access later...
3998 */
3999
4000 /* This had better not be a JIT page. */
4001 if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
4002 /*
4003 * This pmap enforces extra constraints for this set of
4004 * protections, so we can't modify them.
4005 */
4006 if (!cs_bypass) {
4007 panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x !cs_bypass",
4008 __FUNCTION__, pmap, (uint64_t)vaddr,
4009 *prot, fault_info->pmap_options);
4010 }
4011 } else {
4012 *prot &= ~VM_PROT_WRITE;
4013 }
4014 }
4015 if (m->vmp_pmapped == FALSE) {
4016 if (m->vmp_clustered) {
4017 if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
4018 /*
4019 * found it in the cache, but this
4020 * is the first fault-in of the page (m->vmp_pmapped == FALSE)
4021 * so it must have come in as part of
4022 * a cluster... account 1 pagein against it
4023 */
4024 if (object->internal) {
4025 *type_of_fault = DBG_PAGEIND_FAULT;
4026 } else {
4027 *type_of_fault = DBG_PAGEINV_FAULT;
4028 }
4029
4030 VM_PAGE_COUNT_AS_PAGEIN(m);
4031 }
4032 VM_PAGE_CONSUME_CLUSTERED(m);
4033 }
4034 }
4035
4036 if (*type_of_fault != DBG_COW_FAULT) {
4037 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
4038
4039 if (pmap == kernel_pmap) {
4040 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
4041 }
4042 }
4043
4044 kr = vm_fault_pmap_validate_page(pmap, m, vaddr, *prot, fault_info, page_needs_sleep);
4045 if (__improbable((kr != KERN_SUCCESS) || *page_needs_sleep)) {
4046 return kr;
4047 }
4048 kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr,
4049 *prot, caller_prot, fault_page_size, fault_phys_offset,
4050 fault_info, &is_tainted);
4051 if (kr == KERN_SUCCESS) {
4052 /*
4053 * We either have a good page, or a tainted page that has been accepted by the process.
4054 * In both cases the page will be entered into the pmap.
4055 */
4056 *page_needs_data_sync = vm_fault_enter_set_mapped(object, m, *prot, fault_type);
4057 if ((fault_type & VM_PROT_WRITE) && is_tainted) {
4058 /*
4059 * This page is tainted but we're inserting it anyway.
4060 * Since it's writeable, we need to disconnect it from other pmaps
4061 * now so those processes can take note.
4062 */
4063
4064 /*
4065 * We can only get here
4066 * because of the CSE logic
4067 */
4068 assert(pmap_get_vm_map_cs_enforced(pmap));
4069 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
4070 /*
4071 * If we are faulting for a write, we can clear
4072 * the execute bit - that will ensure the page is
4073 * checked again before being executable, which
4074 * protects against a map switch.
4075 * This only happens the first time the page
4076 * gets tainted, so we won't get stuck here
4077 * to make an already writeable page executable.
4078 */
4079 if (!cs_bypass) {
4080 if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
4081 /*
4082 * This pmap enforces extra constraints
4083 * for this set of protections, so we
4084 * can't change the protections.
4085 */
4086 panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x",
4087 __FUNCTION__, pmap,
4088 (uint64_t)vaddr, *prot,
4089 fault_info->pmap_options);
4090 }
4091 *prot &= ~VM_PROT_EXECUTE;
4092 }
4093 }
4094 assert(VM_PAGE_OBJECT(m) == object);
4095
4096 #if VM_OBJECT_ACCESS_TRACKING
4097 if (object->access_tracking) {
4098 DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
4099 if (fault_type & VM_PROT_WRITE) {
4100 object->access_tracking_writes++;
4101 vm_object_access_tracking_writes++;
4102 } else {
4103 object->access_tracking_reads++;
4104 vm_object_access_tracking_reads++;
4105 }
4106 }
4107 #endif /* VM_OBJECT_ACCESS_TRACKING */
4108 }
4109
4110 return kr;
4111 }
4112
4113 /*
4114 * page queue lock must NOT be held
4115 * m->vmp_object must be locked
4116 *
4117 * NOTE: m->vmp_object could be locked "shared" only if we are called
4118 * from vm_fault() as part of a soft fault. If so, we must be
4119 * careful not to modify the VM object in any way that is not
4120 * legal under a shared lock...
4121 */
4122 kern_return_t
4123 vm_fault_enter(
4124 vm_page_t m,
4125 pmap_t pmap,
4126 vm_map_offset_t vaddr,
4127 vm_map_size_t fault_page_size,
4128 vm_map_offset_t fault_phys_offset,
4129 vm_prot_t prot,
4130 vm_prot_t caller_prot,
4131 boolean_t wired,
4132 vm_tag_t wire_tag,
4133 vm_object_fault_info_t fault_info,
4134 bool *need_retry,
4135 int *type_of_fault,
4136 uint8_t *object_lock_type,
4137 bool *page_needs_sleep)
4138 {
4139 kern_return_t kr;
4140 vm_object_t object;
4141 bool page_needs_data_sync;
4142 vm_prot_t fault_type;
4143 int pmap_options = fault_info->pmap_options;
4144
4145 assert(need_retry != NULL);
4146
4147 if (vm_page_is_guard(m)) {
4148 return KERN_SUCCESS;
4149 }
4150
4151 fault_type = fault_info->fi_change_wiring ? VM_PROT_NONE : caller_prot;
4152
4153 assertf(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL, "m=%p", m);
4154 kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot,
4155 fault_page_size, fault_phys_offset, fault_type,
4156 fault_info, type_of_fault, &page_needs_data_sync, page_needs_sleep);
4157 object = VM_PAGE_OBJECT(m);
4158
4159 vm_fault_enqueue_page(object, m, wired, fault_info->fi_change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr);
4160
4161 if (__probable((kr == KERN_SUCCESS) && !(*page_needs_sleep))) {
4162 if (page_needs_data_sync) {
4163 pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
4164 }
4165
4166 if (fault_info->fi_xnu_user_debug && !object->code_signed) {
4167 pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
4168 }
4169
4170
4171 kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
4172 fault_page_size, fault_phys_offset, m,
4173 &prot, caller_prot, fault_type, wired, pmap_options, need_retry, object_lock_type);
4174 }
4175
4176 return kr;
4177 }
4178
4179 kern_return_t
4180 vm_pre_fault_with_info(
4181 vm_map_t map,
4182 vm_map_offset_t vaddr,
4183 vm_prot_t prot,
4184 vm_object_fault_info_t fault_info)
4185 {
4186 assert(fault_info != NULL);
4187 if (pmap_find_phys(map->pmap, vaddr) == 0) {
4188 return vm_fault_internal(map,
4189 vaddr, /* vaddr */
4190 prot, /* fault_type */
4191 VM_KERN_MEMORY_NONE, /* tag - not wiring */
4192 NULL, /* caller_pmap */
4193 0, /* caller_pmap_addr */
4194 NULL,
4195 fault_info);
4196 }
4197 return KERN_SUCCESS;
4198 }
4199
4200 /*
4201 * Fault on the given vaddr iff the page is not already entered in the pmap.
4202 */
4203 kern_return_t
4204 vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
4205 {
4206 struct vm_object_fault_info fault_info = {
4207 .interruptible = THREAD_UNINT,
4208 };
4209 return vm_pre_fault_with_info(current_map(), vaddr, prot, &fault_info);
4210 }
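/*
 * Example (a hedged sketch, not an actual call site in this file):
 * warming a user address ahead of a copy-heavy operation. The result
 * can be ignored since the regular fault path still runs on demand.
 *
 *	(void) vm_pre_fault(user_vaddr, VM_PROT_READ | VM_PROT_WRITE);
 */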
4211
4212 /*
4213 * Routine: vm_fault
4214 * Purpose:
4215 * Handle page faults, including pseudo-faults
4216 * used to change the wiring status of pages.
4217 * Returns:
4218 * Explicit continuations have been removed.
4219 * Implementation:
4220 * vm_fault and vm_fault_page save mucho state
4221 * in the moral equivalent of a closure. The state
4222 * structure is allocated when first entering vm_fault
4223 * and deallocated when leaving vm_fault.
4224 */
4225
4226 extern uint64_t get_current_unique_pid(void);
4227
4228 unsigned long vm_fault_collapse_total = 0;
4229 unsigned long vm_fault_collapse_skipped = 0;
4230
4231
4232 kern_return_t
4233 vm_fault_external(
4234 vm_map_t map,
4235 vm_map_offset_t vaddr,
4236 vm_prot_t fault_type,
4237 boolean_t change_wiring,
4238 int interruptible,
4239 pmap_t caller_pmap,
4240 vm_map_offset_t caller_pmap_addr)
4241 {
4242 struct vm_object_fault_info fault_info = {
4243 .interruptible = interruptible,
4244 .fi_change_wiring = change_wiring,
4245 };
4246
4247 return vm_fault_internal(map, vaddr, fault_type,
4248 change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
4249 caller_pmap, caller_pmap_addr,
4250 NULL, &fault_info);
4251 }
4252
4253 kern_return_t
4254 vm_fault(
4255 vm_map_t map,
4256 vm_map_offset_t vaddr,
4257 vm_prot_t fault_type,
4258 boolean_t change_wiring,
4259 vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
4260 int interruptible,
4261 pmap_t caller_pmap,
4262 vm_map_offset_t caller_pmap_addr)
4263 {
4264 struct vm_object_fault_info fault_info = {
4265 .interruptible = interruptible,
4266 .fi_change_wiring = change_wiring,
4267 };
4268
4269 return vm_fault_internal(map, vaddr, fault_type, wire_tag,
4270 caller_pmap, caller_pmap_addr,
4271 NULL, &fault_info);
4272 }
4273
4274 static boolean_t
4275 current_proc_is_privileged(void)
4276 {
4277 return csproc_get_platform_binary(current_proc());
4278 }
4279
4280 uint64_t vm_copied_on_read = 0;
4281 uint64_t vm_copied_on_read_kernel_map = 0;
4282 uint64_t vm_copied_on_read_platform_map = 0;
4283
4284 /*
4285 * Cleanup after a vm_fault_enter.
4286 * At this point, the fault should either have failed (kr != KERN_SUCCESS)
4287 * or the page should be in the pmap and on the correct paging queue.
4288 *
4289 * Precondition:
4290 * map must be locked shared.
4291 * m_object must be locked.
4292 * If top_object != VM_OBJECT_NULL, it must be locked.
4293 * real_map must be locked.
4294 *
4295 * Postcondition:
4296 * map will be unlocked
4297 * m_object will be unlocked
4298 * top_object will be unlocked
4299 * If real_map != map, it will be unlocked
4300 */
4301 static void
4302 vm_fault_complete(
4303 vm_map_t map,
4304 vm_map_t real_map,
4305 vm_object_t object,
4306 vm_object_t m_object,
4307 vm_page_t m,
4308 vm_map_offset_t offset,
4309 vm_map_offset_t trace_real_vaddr,
4310 vm_object_fault_info_t fault_info,
4311 vm_prot_t caller_prot,
4312 #if CONFIG_DTRACE
4313 vm_map_offset_t real_vaddr,
4314 #else
4315 __unused vm_map_offset_t real_vaddr,
4316 #endif /* CONFIG_DTRACE */
4317 int type_of_fault,
4318 bool need_retry,
4319 kern_return_t kr,
4320 ppnum_t *physpage_p,
4321 vm_prot_t prot,
4322 vm_object_t top_object,
4323 boolean_t need_collapse,
4324 vm_map_offset_t cur_offset,
4325 vm_prot_t fault_type,
4326 vm_object_t *written_on_object,
4327 memory_object_t *written_on_pager,
4328 vm_object_offset_t *written_on_offset)
4329 {
4330 int event_code = 0;
4331
4332 vm_map_lock_assert_shared(map);
4333 vm_object_lock_assert_held(m_object);
4334 if (top_object != VM_OBJECT_NULL) {
4335 vm_object_lock_assert_held(top_object);
4336 }
4337 vm_map_lock_assert_held(real_map);
4338
4339 if (m_object->internal) {
4340 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
4341 } else if (m_object->object_is_shared_cache) {
4342 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
4343 } else {
4344 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
4345 }
4346 KDBG_RELEASE(event_code | DBG_FUNC_NONE, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid());
4347 if (!need_retry) {
4348 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid());
4349 }
4350 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
4351 if (kr == KERN_SUCCESS &&
4352 physpage_p != NULL) {
4353 /* for vm_map_wire_and_extract() */
4354 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
4355 if (prot & VM_PROT_WRITE) {
4356 vm_object_lock_assert_exclusive(m_object);
4357 m->vmp_dirty = TRUE;
4358 }
4359 }
4360
4361 if (top_object != VM_OBJECT_NULL) {
4362 /*
4363 * It's safe to drop the top object
4364 * now that we've done our
4365 * vm_fault_enter(). Any other fault
4366 * in progress for that virtual
4367 * address will either find our page
4368 * and translation or put in a new page
4369 * and translation.
4370 */
4371 vm_object_unlock(top_object);
4372 top_object = VM_OBJECT_NULL;
4373 }
4374
4375 if (need_collapse == TRUE) {
4376 vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
4377 }
4378
4379 if (!need_retry &&
4380 (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
4381 /*
4382 * evaluate access pattern and update state
4383 * vm_fault_deactivate_behind depends on the
4384 * state being up to date
4385 */
4386 vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior);
4387
4388 vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior);
4389 }
4390 /*
4391 * That's it, clean up and return.
4392 */
4393 if (m->vmp_busy) {
4394 vm_object_lock_assert_exclusive(m_object);
4395 vm_page_wakeup_done(m_object, m);
4396 }
4397
4398 if (!need_retry && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
4399 vm_object_paging_begin(m_object);
4400
4401 assert3p(*written_on_object, ==, VM_OBJECT_NULL);
4402 *written_on_object = m_object;
4403 *written_on_pager = m_object->pager;
4404 *written_on_offset = m_object->paging_offset + m->vmp_offset;
4405 }
4406 vm_object_unlock(object);
4407
4408 vm_map_unlock_read(map);
4409 if (real_map != map) {
4410 vm_map_unlock(real_map);
4411 }
4412 }
4413
4414 static inline int
4415 vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
4416 {
4417 if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
4418 return DBG_COR_FAULT;
4419 }
4420 return type_of_fault;
4421 }
4422
4423 uint64_t vm_fault_resilient_media_initiate = 0;
4424 uint64_t vm_fault_resilient_media_retry = 0;
4425 uint64_t vm_fault_resilient_media_proceed = 0;
4426 uint64_t vm_fault_resilient_media_release = 0;
4427 uint64_t vm_fault_resilient_media_abort1 = 0;
4428 uint64_t vm_fault_resilient_media_abort2 = 0;
4429
4430 #if MACH_ASSERT
4431 int vm_fault_resilient_media_inject_error1_rate = 0;
4432 int vm_fault_resilient_media_inject_error1 = 0;
4433 int vm_fault_resilient_media_inject_error2_rate = 0;
4434 int vm_fault_resilient_media_inject_error2 = 0;
4435 int vm_fault_resilient_media_inject_error3_rate = 0;
4436 int vm_fault_resilient_media_inject_error3 = 0;
4437 #endif /* MACH_ASSERT */
4438
4439 kern_return_t
4440 vm_fault_internal(
4441 vm_map_t map,
4442 vm_map_offset_t vaddr,
4443 vm_prot_t caller_prot,
4444 vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
4445 pmap_t caller_pmap,
4446 vm_map_offset_t caller_pmap_addr,
4447 ppnum_t *physpage_p,
4448 vm_object_fault_info_t fault_info)
4449 {
4450 vm_map_version_t version; /* Map version for verification */
4451 boolean_t wired; /* Should mapping be wired down? */
4452 vm_object_t object; /* Top-level object */
4453 vm_object_offset_t offset; /* Top-level offset */
4454 vm_prot_t prot; /* Protection for mapping */
4455 vm_object_t old_copy_object; /* Saved copy object */
4456 uint64_t old_copy_version;
4457 vm_page_t result_page; /* Result of vm_fault_page */
4458 vm_page_t top_page; /* Placeholder page */
4459 kern_return_t kr;
4460
4461 vm_page_t m; /* Fast access to result_page */
4462 kern_return_t error_code;
4463 vm_object_t cur_object;
4464 vm_object_t m_object = NULL;
4465 vm_object_offset_t cur_offset;
4466 vm_page_t cur_m;
4467 vm_object_t new_object;
4468 int type_of_fault;
4469 pmap_t pmap;
4470 wait_interrupt_t interruptible_state;
4471 vm_map_t real_map = map;
4472 vm_map_t original_map = map;
4473 bool object_locks_dropped = FALSE;
4474 vm_prot_t fault_type;
4475 vm_prot_t original_fault_type;
4476 bool need_collapse = FALSE;
4477 bool need_retry = false;
4478 uint8_t object_lock_type = 0;
4479 uint8_t cur_object_lock_type;
4480 vm_object_t top_object = VM_OBJECT_NULL;
4481 vm_object_t written_on_object = VM_OBJECT_NULL;
4482 memory_object_t written_on_pager = NULL;
4483 vm_object_offset_t written_on_offset = 0;
4484 int throttle_delay;
4485 int compressed_count_delta;
4486 vm_grab_options_t grab_options;
4487 bool need_copy;
4488 bool need_copy_on_read;
4489 vm_map_offset_t trace_vaddr;
4490 vm_map_offset_t trace_real_vaddr;
4491 vm_map_size_t fault_page_size;
4492 vm_map_size_t fault_page_mask;
4493 int fault_page_shift;
4494 vm_map_offset_t fault_phys_offset;
4495 vm_map_offset_t real_vaddr;
4496 bool resilient_media_retry = false;
4497 bool resilient_media_ref_transfer = false;
4498 vm_object_t resilient_media_object = VM_OBJECT_NULL;
4499 vm_object_offset_t resilient_media_offset = (vm_object_offset_t)-1;
4500 bool page_needs_data_sync = false;
4501 /*
4502 * Was the VM object contended when vm_map_lookup_and_lock_object locked it?
4503 * If so, the zero fill path will drop the lock.
4504 * NB: Ideally we would always drop the lock rather than rely on
4505 * this heuristic, but vm_object_unlock currently takes > 30 cycles.
4506 */
4507 bool object_is_contended = false;
4508
4509 vmlp_api_start(VM_FAULT_INTERNAL);
4510
4511 #if HAS_MTE || HAS_MTE_EMULATION_SHIMS
4512 /*
4513 * We may be faulting on a tagged address. Canonicalize it here so we have
4514 * a chance to find it in the VM map.
4515 */
4516 if (current_task_has_sec_enabled()) {
4517 vaddr = vm_memtag_canonicalize(map, vaddr);
4518 }
4519 #endif /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
4520
4521 real_vaddr = vaddr;
4522 trace_real_vaddr = vaddr;
4523
4524 /*
4525 * Some (kernel) submaps are marked with "should never fault", so that
4526 * guard pages in such submaps do not need to use fictitious
4527 * placeholders at all, while not causing ZFOD pages to be made
4528 * (which is the default behavior otherwise).
4529 *
4530 * We also want to capture the fault address easily so that the zone
4531 * allocator might present an enhanced panic log.
4532 */
4533 if (map->never_faults) {
4534 assert(map->pmap == kernel_pmap);
4535 vmlp_api_end(VM_FAULT_INTERNAL, KERN_INVALID_ADDRESS);
4536 return KERN_INVALID_ADDRESS;
4537 }
4538
4539 if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
4540 fault_phys_offset = (vm_map_offset_t)-1;
4541 fault_page_size = VM_MAP_PAGE_SIZE(original_map);
4542 fault_page_mask = VM_MAP_PAGE_MASK(original_map);
4543 fault_page_shift = VM_MAP_PAGE_SHIFT(original_map);
4544 if (fault_page_size < PAGE_SIZE) {
4545 DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
4546 vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
4547 }
4548 } else {
4549 fault_phys_offset = 0;
4550 fault_page_size = PAGE_SIZE;
4551 fault_page_mask = PAGE_MASK;
4552 fault_page_shift = PAGE_SHIFT;
4553 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
4554 }
4555
4556 if (map == kernel_map) {
4557 trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
4558 trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
4559 } else {
4560 trace_vaddr = vaddr;
4561 }
4562
4563 KDBG_RELEASE(
4564 (VMDBG_CODE(DBG_VM_FAULT_INTERNAL)) | DBG_FUNC_START,
4565 ((uint64_t)trace_vaddr >> 32),
4566 trace_vaddr,
4567 (map == kernel_map));
4568
4569 if (get_preemption_level() != 0) {
4570 KDBG_RELEASE(
4571 (VMDBG_CODE(DBG_VM_FAULT_INTERNAL)) | DBG_FUNC_END,
4572 ((uint64_t)trace_vaddr >> 32),
4573 trace_vaddr,
4574 KERN_FAILURE);
4575
4576 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NONZERO_PREEMPTION_LEVEL), 0 /* arg */);
4577 vmlp_api_end(VM_FAULT_INTERNAL, KERN_FAILURE);
4578 return KERN_FAILURE;
4579 }
4580
4581 thread_t cthread = current_thread();
4582
4583 if (cthread->th_vm_faults_disabled) {
4584 KDBG_RELEASE(
4585 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4586 ((uint64_t)trace_vaddr >> 32),
4587 trace_vaddr,
4588 KERN_FAILURE);
4589 ktriage_record(thread_tid(cthread),
4590 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
4591 KDBG_TRIAGE_RESERVED,
4592 KDBG_TRIAGE_VM_FAULTS_DISABLED),
4593 0 /* arg */);
4594 vmlp_api_end(VM_FAULT_INTERNAL, KERN_FAILURE);
4595 return KERN_FAILURE;
4596 }
4597
4598 bool rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
4599 bool page_sleep_needed = false;
4600 uint64_t fstart = 0;
4601
4602 if (rtfault) {
4603 fstart = mach_continuous_time();
4604 }
4605
4606 assert(fault_info != NULL);
4607 interruptible_state = thread_interrupt_level(fault_info->interruptible);
4608
4609 fault_type = (fault_info->fi_change_wiring ? VM_PROT_NONE : caller_prot);
4610
4611 counter_inc(&vm_statistics_faults);
4612 counter_inc(¤t_task()->faults);
4613 original_fault_type = fault_type;
4614
4615 need_copy = FALSE;
4616 if (fault_type & VM_PROT_WRITE) {
4617 need_copy = TRUE;
4618 }
4619
4620 if (need_copy || fault_info->fi_change_wiring) {
4621 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4622 } else {
4623 object_lock_type = OBJECT_LOCK_SHARED;
4624 }
4625
4626 cur_object_lock_type = OBJECT_LOCK_SHARED;
4627
4628 if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
4629 if (compressor_map) {
4630 if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
4631 panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
4632 }
4633 }
4634 }
4635 RetryFault:
4636 assert3p(written_on_object, ==, VM_OBJECT_NULL);
4637
4638 /*
4639 * assume we will hit a page in the cache
4640 * otherwise, explicitly override with
4641 * the real fault type once we determine it
4642 */
4643 type_of_fault = DBG_CACHE_HIT_FAULT;
4644
4645 /*
4646 * Find the backing store object and offset into
4647 * it to begin the search.
4648 */
4649 fault_type = original_fault_type;
4650 map = original_map;
4651 vm_map_lock_read(map);
4652
4653 if (resilient_media_retry) {
4654 /*
4655 * If we have to insert a fake zero-filled page to hide
4656 * a media failure to provide the real page, we need to
4657 * resolve any pending copy-on-write on this mapping.
4658 * VM_PROT_COPY tells vm_map_lookup_and_lock_object() to deal
4659 * with that even if this is not a "write" fault.
4660 */
4661 need_copy = TRUE;
4662 /*
4663 * If the top object is COPY_DELAYED and has a "copy" object,
4664 * we would have to push our zero-filled page to this copy
4665 * object before allowing it to be modified, so let's consider
4666 * this as a read-only fault for now. If this was a write
4667 * fault, we'll fault again on the read-only zero-filled page
4668 * and fulfill our copy-on-write obligations then.
4669 */
4670 fault_type = VM_PROT_READ;
4671 /*
4672 * We need the object's exclusive lock to insert the
4673 * zero-filled page.
4674 */
4675 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4676 vm_fault_resilient_media_retry++;
4677 }
4678
4679 kr = vm_map_lookup_and_lock_object(&map, vaddr,
4680 (fault_type | (need_copy ? VM_PROT_COPY : 0)),
4681 object_lock_type, &version,
4682 &object, &offset, &prot, &wired,
4683 fault_info,
4684 &real_map,
4685 &object_is_contended);
4686 object_is_contended = false; /* avoid unsafe optimization */
4687
4688 if (kr != KERN_SUCCESS) {
4689 vm_map_unlock_read(map);
4690 /*
4691 * This can be seen in a crash report if indeed the
4692 * thread is crashing due to an invalid access in a non-existent
4693 * range.
4694 * Turning this OFF for now because it is noisy and not always fatal
4695 * e.g. prefaulting.
4696 *
4697 * if (kr == KERN_INVALID_ADDRESS) {
4698 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0);
4699 * }
4700 */
4701 goto done;
4702 }
4703
4704 pmap = real_map->pmap;
4705 fault_info->io_sync = FALSE;
4706 fault_info->mark_zf_absent = FALSE;
4707 fault_info->batch_pmap_op = FALSE;
4708
4709
4710 if (resilient_media_retry) {
4711 /*
4712 * We're retrying this fault after having detected a media
4713 * failure from a "resilient_media" mapping.
4714 * Check that the mapping is still pointing at the object
4715 * that just failed to provide a page.
4716 */
4717 assert(resilient_media_object != VM_OBJECT_NULL);
4718 assert(resilient_media_offset != (vm_object_offset_t)-1);
4719 if ((object != VM_OBJECT_NULL &&
4720 object == resilient_media_object &&
4721 offset == resilient_media_offset &&
4722 fault_info->resilient_media)
4723 #if MACH_ASSERT
4724 && (vm_fault_resilient_media_inject_error1_rate == 0 ||
4725 (++vm_fault_resilient_media_inject_error1 % vm_fault_resilient_media_inject_error1_rate) != 0)
4726 #endif /* MACH_ASSERT */
4727 ) {
4728 /*
4729 * This mapping still points at the same object
4730 * and is still "resilient_media": proceed in
4731 * "recovery-from-media-failure" mode, where we'll
4732 * insert a zero-filled page in the top object.
4733 */
4734 // printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
4735 vm_fault_resilient_media_proceed++;
4736 } else {
4737 /* not recovering: reset state and retry fault */
4738 // printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info->resilient_media, object, resilient_media_object, offset, resilient_media_offset);
4739 vm_object_unlock(object);
4740 if (real_map != map) {
4741 vm_map_unlock(real_map);
4742 }
4743 vm_map_unlock_read(map);
4744 /* release our extra reference on failed object */
4745 // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
4746 vm_object_deallocate(resilient_media_object);
4747 resilient_media_object = VM_OBJECT_NULL;
4748 resilient_media_offset = (vm_object_offset_t)-1;
4749 resilient_media_retry = false;
4750 vm_fault_resilient_media_abort1++;
4751 goto RetryFault;
4752 }
4753 } else {
4754 assert(resilient_media_object == VM_OBJECT_NULL);
4755 resilient_media_offset = (vm_object_offset_t)-1;
4756 }
4757
4758 /*
4759 * If the page is wired, we must fault for the current protection
4760 * value, to avoid further faults.
4761 */
4762 if (wired) {
4763 fault_type = prot | VM_PROT_WRITE;
4764 }
4765 if (wired || need_copy) {
4766 /*
4767 * since we're treating this fault as a 'write'
4768 * we must hold the top object lock exclusively
4769 */
4770 if (object_lock_type == OBJECT_LOCK_SHARED) {
4771 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4772
4773 if (vm_object_lock_upgrade(object) == FALSE) {
4774 /*
4775 * couldn't upgrade, so explicitly
4776 * take the lock exclusively
4777 */
4778 vm_object_lock(object);
4779 }
4780 }
4781 }
4782
4783 #if VM_FAULT_CLASSIFY
4784 /*
4785 * Temporary data gathering code
4786 */
4787 vm_fault_classify(object, offset, fault_type);
4788 #endif
4789 /*
4790 * Fast fault code. The basic idea is to do as much as
4791 * possible while holding the map lock and object locks.
4792 * Busy pages are not used until the object lock has to
4793 * be dropped to do something (copy, zero fill, pmap enter).
4794 * Similarly, paging references aren't acquired until that
4795 * point, and object references aren't used.
4796 *
4797 * If we can figure out what to do
4798 * (zero fill, copy on write, pmap enter) while holding
4799 * the locks, then it gets done. Otherwise, we give up,
4800 * and use the original fault path (which doesn't hold
4801 * the map lock, and relies on busy pages).
4802 * The give up cases include:
4803 * - Have to talk to pager.
4804 * - Page is busy, absent or in error.
4805 * - Pager has locked out desired access.
4806 * - Fault needs to be restarted.
4807 * - Have to push page into copy object.
4808 *
4809 * The code is an infinite loop that moves one level down
4810 * the shadow chain each time. cur_object and cur_offset
4811 * refer to the current object being examined. object and offset
4812 * are the original object from the map. The loop is at the
4813 * top level if and only if object and cur_object are the same.
4814 *
4815 * Invariants: Map lock is held throughout. Lock is held on
4816 * original object and cur_object (if different) when
4817 * continuing or exiting loop.
4818 *
4819 */
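/*
 * Shape of the loop described above (a structural sketch only; the
 * real loop below carries all of the locking and retry detail):
 *
 *	cur_object = object; cur_offset = offset;
 *	while (TRUE) {
 *		m = vm_page_lookup(cur_object, cur_offset);
 *		if (m != VM_PAGE_NULL)
 *			// zero-fill, copy-on-write or FastPmapEnter,
 *			// or break to the slow path for the hard cases
 *		else
 *			// descend one level down the shadow chain and
 *			// adjust cur_offset, or break to the slow path
 *	}
 */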
4820
4821 #if defined(__arm64__)
4822 /*
4823 * Fail if reading an execute-only page in a
4824 * pmap that enforces execute-only protection.
4825 */
4826 if (fault_type == VM_PROT_READ &&
4827 (prot & VM_PROT_EXECUTE) &&
4828 !(prot & VM_PROT_READ) &&
4829 pmap_enforces_execute_only(pmap)) {
4830 vm_object_unlock(object);
4831 vm_map_unlock_read(map);
4832 if (real_map != map) {
4833 vm_map_unlock(real_map);
4834 }
4835 kr = KERN_PROTECTION_FAILURE;
4836 goto done;
4837 }
4838 #endif
4839
4840 fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
4841
4842 /*
4843 * If this page is to be inserted in a copy delay object
4844 * for writing, and if the object has a copy, then the
4845 * copy delay strategy is implemented in the slow fault page.
4846 */
4847 if ((object->copy_strategy == MEMORY_OBJECT_COPY_DELAY ||
4848 object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK) &&
4849 object->vo_copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
4850 assert(!resilient_media_retry); /* should be read-only fault */
4851 goto handle_copy_delay;
4852 }
4853
4854 cur_object = object;
4855 cur_offset = offset;
4856
4857 grab_options = vm_page_grab_options_for_object(object);
4858 #if HAS_MTE
4859 if (!(grab_options & VM_PAGE_GRAB_MTE) &&
4860 mteinfo_vm_tag_can_use_tag_storage((vm_tag_t)fault_info->user_tag)) {
4861 grab_options |= VM_PAGE_GRAB_ALLOW_TAG_STORAGE;
4862 }
4863 #endif /* HAS_MTE */
4864
4865 while (TRUE) {
4866 if (!cur_object->pager_created &&
4867 cur_object->phys_contiguous) { /* superpage */
4868 break;
4869 }
4870
4871 if (cur_object->blocked_access) {
4872 /*
4873 * Access to this VM object has been blocked.
4874 * Let the slow path handle it.
4875 */
4876 break;
4877 }
4878
4879 m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset));
4880 m_object = NULL;
4881
4882 if (m != VM_PAGE_NULL) {
4883 m_object = cur_object;
4884
4885 if (__improbable(page_sleep_needed)) {
4886 /*
4887 * If a prior iteration of the loop requested vm_page_sleep(), re-validate the page
4888 * to see if it's still needed.
4889 */
4890 kr = vm_fault_pmap_validate_page(pmap, m, vaddr, prot, fault_info, &page_sleep_needed);
4891 if (__improbable(kr != KERN_SUCCESS)) {
4892 vm_object_unlock(object);
4893 if (object != cur_object) {
4894 vm_object_unlock(cur_object);
4895 }
4896 vm_map_unlock_read(map);
4897 if (real_map != map) {
4898 vm_map_unlock(real_map);
4899 }
4900 goto done;
4901 }
4902 }
4903 if (m->vmp_busy || page_sleep_needed) {
4904 page_sleep_needed = false;
4905 wait_result_t result;
4906
4907 /*
4908 * in order to call vm_page_sleep(), we must
4909 * have the object that 'm' belongs to locked exclusively
4910 */
4911 if (object != cur_object) {
4912 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4913 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4914
4915 if (vm_object_lock_upgrade(cur_object) == FALSE) {
4916 /*
4917 * couldn't upgrade so go do a full retry
4918 * immediately since we can no longer be
4919 * certain about cur_object (since we
4920 * don't hold a reference on it)...
4921 * first drop the top object lock
4922 */
4923 vm_object_unlock(object);
4924
4925 vm_map_unlock_read(map);
4926 if (real_map != map) {
4927 vm_map_unlock(real_map);
4928 }
4929
4930 goto RetryFault;
4931 }
4932 }
4933 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4934 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4935
4936 if (vm_object_lock_upgrade(object) == FALSE) {
4937 /*
4938 * couldn't upgrade, so explicitly take the lock
4939 * exclusively and go relookup the page since we
4940 * will have dropped the object lock and
4941 * a different thread could have inserted
4942 * a page at this offset
4943 * no need for a full retry since we're
4944 * at the top level of the object chain
4945 */
4946 vm_object_lock(object);
4947
4948 continue;
4949 }
4950 }
4951 if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
4952 /*
4953 * m->vmp_busy == TRUE and the object is locked exclusively
4954 * if m->pageout_queue == TRUE after we acquire the
4955 * queues lock, we are guaranteed that it is stable on
4956 * the pageout queue and therefore reclaimable
4957 *
4958 * NOTE: this is only true for the internal pageout queue
4959 * in the compressor world
4960 */
4961 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
4962
4963 vm_page_lock_queues();
4964
4965 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
4966 vm_pageout_throttle_up(m);
4967 vm_page_unlock_queues();
4968
4969 vm_page_wakeup_done(m_object, m);
4970 goto reclaimed_from_pageout;
4971 }
4972 vm_page_unlock_queues();
4973 }
4974 if (object != cur_object) {
4975 vm_object_unlock(object);
4976 }
4977
4978 vm_map_unlock_read(map);
4979 if (real_map != map) {
4980 vm_map_unlock(real_map);
4981 }
4982
4983 result = vm_page_sleep(cur_object, m, fault_info->interruptible, LCK_SLEEP_UNLOCK);
4984 if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
4985 goto RetryFault;
4986 }
4987
4988 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), 0 /* arg */);
4989 kr = KERN_ABORTED;
4990 goto done;
4991 }
4992 reclaimed_from_pageout:
4993 if (m->vmp_laundry) {
4994 if (object != cur_object) {
4995 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4996 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4997
4998 vm_object_unlock(object);
4999 vm_object_unlock(cur_object);
5000
5001 vm_map_unlock_read(map);
5002 if (real_map != map) {
5003 vm_map_unlock(real_map);
5004 }
5005
5006 goto RetryFault;
5007 }
5008 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
5009 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5010
5011 if (vm_object_lock_upgrade(object) == FALSE) {
5012 /*
5013 * couldn't upgrade, so explicitly take the lock
5014 * exclusively and go relookup the page since we
5015 * will have dropped the object lock and
5016 * a different thread could have inserted
5017 * a page at this offset
5018 * no need for a full retry since we're
5019 * at the top level of the object chain
5020 */
5021 vm_object_lock(object);
5022
5023 continue;
5024 }
5025 }
5026 vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
5027 vm_pageout_steal_laundry(m, FALSE);
5028 }
5029
5030
5031 if (vm_page_is_guard(m)) {
5032 /*
5033 * Guard page: let the slow path deal with it
5034 */
5035 break;
5036 }
5037 if (m->vmp_unusual && (m->vmp_error || m->vmp_restart ||
5038 vm_page_is_private(m) || m->vmp_absent)) {
5039 /*
5040 * Unusual case... let the slow path deal with it
5041 */
5042 break;
5043 }
5044 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
5045 if (object != cur_object) {
5046 vm_object_unlock(object);
5047 }
5048 vm_map_unlock_read(map);
5049 if (real_map != map) {
5050 vm_map_unlock(real_map);
5051 }
5052 vm_object_unlock(cur_object);
5053 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
5054 kr = KERN_MEMORY_ERROR;
5055 goto done;
5056 }
5057 assert(m_object == VM_PAGE_OBJECT(m));
5058
5059 if (vm_fault_cs_need_validation(map->pmap, m, m_object,
5060 PAGE_SIZE, 0) ||
5061 (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
5062 upgrade_lock_and_retry:
5063 /*
5064 * We might need to validate this page
5065 * against its code signature, so we
5066 * want to hold the VM object exclusively.
5067 */
5068 if (object != cur_object) {
5069 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5070 vm_object_unlock(object);
5071 vm_object_unlock(cur_object);
5072
5073 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5074
5075 vm_map_unlock_read(map);
5076 if (real_map != map) {
5077 vm_map_unlock(real_map);
5078 }
5079
5080 goto RetryFault;
5081 }
5082 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
5083 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5084
5085 if (vm_object_lock_upgrade(object) == FALSE) {
5086 /*
5087 * couldn't upgrade, so explicitly take the lock
5088 * exclusively and go relookup the page, since we
5089 * will have dropped the object lock and
5090 * a different thread could have inserted
5091 * a page at this offset.
5092 * No need for a full retry since we're
5093 * at the top level of the object chain.
5094 */
5095 vm_object_lock(object);
5096
5097 continue;
5098 }
5099 }
5100 }
5101 /*
5102 * Two cases of faults we can map in directly here:
5103 * - At top level w/o copy object.
5104 * - Read fault anywhere.
5105 * --> must disallow write.
5106 */
5107
5108 if (object == cur_object && object->vo_copy == VM_OBJECT_NULL) {
5109 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
5110 if ((fault_type & VM_PROT_WRITE) && m->vmp_unmodified_ro) {
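/*
 * First write to a page that was decompressed as "unmodified
 * read-only": clear that state and drop the now-stale compressed
 * copy, since the page contents are about to diverge from it.
 */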
5111 assert(cur_object == VM_PAGE_OBJECT(m));
5112 assert(cur_object->internal);
5113 vm_object_lock_assert_exclusive(cur_object);
5114 vm_page_lockspin_queues();
5115 m->vmp_unmodified_ro = false;
5116 vm_page_unlock_queues();
5117 os_atomic_dec(&compressor_ro_uncompressed, relaxed);
5118 vm_object_compressor_pager_state_clr(cur_object, m->vmp_offset);
5119 }
5120 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
5121 goto FastPmapEnter;
5122 }
5123
5124 if (!need_copy &&
5125 !fault_info->no_copy_on_read &&
5126 cur_object != object &&
5127 !cur_object->internal &&
5128 !cur_object->pager_trusted &&
5129 !cur_object->code_signed &&
5130 vm_protect_privileged_from_untrusted &&
5131 (current_proc_is_privileged() ||
5132 vm_kernel_map_is_kernel(map) ||
5133 vm_map_is_platform_binary(map))) {
5134 /*
5135 * We're faulting on a page in "object" and
5136 * went down the shadow chain to "cur_object"
5137 * to find out that "cur_object"'s pager
5138 * is not "trusted", i.e. we can not trust it
5139 * to always return the same contents.
5140 * Since the target is a "privileged" process,
5141 * let's treat this as a copy-on-read fault, as
5142 * if it were a copy-on-write fault.
5143 * Once "object" gets a copy of this page, it
5144 * won't have to rely on "cur_object" to
5145 * provide the contents again.
5146 *
5147 * This is done by setting "need_copy" and
5148 * retrying the fault from the top with the
5149 * appropriate locking.
5150 *
5151 * Special case: if the mapping is executable
5152 * and the untrusted object is code-signed and
5153 * the process is "cs_enforced", we do not
5154 * copy-on-read because that would break
5155 * code-signing enforcement expectations (an
5156 * executable page must belong to a code-signed
5157 * object) and we can rely on code-signing
5158 * to re-validate the page if it gets evicted
5159 * and paged back in.
5160 */
5161 // printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5162 vm_copied_on_read++;
5163 if (!current_proc_is_privileged()) {
5164 /* not a privileged proc but still copy-on-read... */
5165 if (vm_kernel_map_is_kernel(map)) {
5166 /* ... because target map is a kernel map */
5167 vm_copied_on_read_kernel_map++;
5168 } else {
5169 /* ... because target map is "platform" */
5170 vm_copied_on_read_platform_map++;
5171 }
5172 }
5173 need_copy = TRUE;
5174
5175 vm_object_unlock(object);
5176 vm_object_unlock(cur_object);
5177 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5178 vm_map_unlock_read(map);
5179 if (real_map != map) {
5180 vm_map_unlock(real_map);
5181 }
5182 goto RetryFault;
5183 }
5184
5185 if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
5186 if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
5187 /*
5188 * For a protection that the pmap cares
5189 * about, we must hand over the full
5190 * set of protections (so that the pmap
5191 * layer can apply any desired policy).
5192 * This means that cs_bypass must be
5193 * set, as this can force us to pass
5194 * RWX.
5195 */
5196 if (!fault_info->cs_bypass) {
5197 panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x",
5198 __FUNCTION__, pmap,
5199 (uint64_t)vaddr, prot,
5200 fault_info->pmap_options);
5201 }
5202 } else {
5203 prot &= ~VM_PROT_WRITE;
5204 }
5205
5206 if (object != cur_object) {
5207 /*
5208 * We still need to hold the top object
5209 * lock here to prevent a race between
5210 * a read fault (taking only "shared"
5211 * locks) and a write fault (taking
5212 * an "exclusive" lock on the top
5213 * object).
5214 * Otherwise, as soon as we release the
5215 * top lock, the write fault could
5216 * proceed and actually complete before
5217 * the read fault, and the copied page's
5218 * translation could then be overwritten
5219 * by the read fault's translation for
5220 * the original page.
5221 *
5222 * Let's just record what the top object
5223 * is and we'll release it later.
5224 */
5225 top_object = object;
5226
5227 /*
5228 * switch to the object that has the new page
5229 */
5230 object = cur_object;
5231 object_lock_type = cur_object_lock_type;
5232 }
5233 FastPmapEnter:
5234 assert(m_object == VM_PAGE_OBJECT(m));
5235
5236 if (resilient_media_retry && (prot & VM_PROT_WRITE)) {
5237 /*
5238 * We might have bypassed some copy-on-write
5239 * mechanism to get here (theoretically inserting
5240 * a zero-filled page in the top object to avoid
5241 * raising an exception on an unavailable page at
5242 * the bottom of the shadow chain).
5243 * So let's not grant write access to this page yet.
5244 * If write access is needed, the next fault should
5245 * handle any copy-on-write obligations.
5246 */
5247 if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
5248 /*
5249 * For a protection that the pmap cares
5250 * about, we must hand over the full
5251 * set of protections (so that the pmap
5252 * layer can apply any desired policy).
5253 * This means that cs_bypass must be
5254 * set, as this can force us to pass
5255 * RWX.
5256 */
5257 if (!fault_info->cs_bypass) {
5258 panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x",
5259 __FUNCTION__, pmap,
5260 (uint64_t)vaddr, prot,
5261 fault_info->pmap_options);
5262 }
5263 } else {
5264 prot &= ~VM_PROT_WRITE;
5265 }
5266 }
5267
5268 /*
5269 * prepare for the pmap_enter...
5270 * object and map are both locked
5271 * m contains valid data
5272 * object == m->vmp_object
5273 * cur_object == NULL or it's been unlocked
5274 * no paging references on either object or cur_object
5275 */
5276
5277 if (fault_page_size < PAGE_SIZE) {
5278 DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
5279 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
5280 fault_phys_offset < PAGE_SIZE),
5281 "0x%llx\n", (uint64_t)fault_phys_offset);
5282 } else {
5283 assertf(fault_phys_offset == 0,
5284 "0x%llx\n", (uint64_t)fault_phys_offset);
5285 }
5286
5287 if (__improbable(rtfault &&
5288 !m->vmp_realtime &&
5289 vm_pageout_protect_realtime)) {
5290 vm_page_lock_queues();
5291 if (!m->vmp_realtime) {
5292 m->vmp_realtime = true;
5293 VM_COUNTER_INC(&vm_page_realtime_count);
5294 }
5295 vm_page_unlock_queues();
5296 }
5297 assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p object=%p", m, m_object, object);
5298 assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
5299 need_retry = false;
5300 if (caller_pmap) {
5301 kr = vm_fault_enter(m,
5302 caller_pmap,
5303 caller_pmap_addr,
5304 fault_page_size,
5305 fault_phys_offset,
5306 prot,
5307 caller_prot,
5308 wired,
5309 wire_tag,
5310 fault_info,
5311 &need_retry,
5312 &type_of_fault,
5313 &object_lock_type,
5314 &page_sleep_needed);
5315 } else {
5316 kr = vm_fault_enter(m,
5317 pmap,
5318 vaddr,
5319 fault_page_size,
5320 fault_phys_offset,
5321 prot,
5322 caller_prot,
5323 wired,
5324 wire_tag,
5325 fault_info,
5326 &need_retry,
5327 &type_of_fault,
5328 &object_lock_type,
5329 &page_sleep_needed);
5330 }
5331
5332 vm_fault_complete(
5333 map,
5334 real_map,
5335 object,
5336 m_object,
5337 m,
5338 offset,
5339 trace_real_vaddr,
5340 fault_info,
5341 caller_prot,
5342 real_vaddr,
5343 vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
5344 need_retry || page_sleep_needed,
5345 kr,
5346 physpage_p,
5347 prot,
5348 top_object,
5349 need_collapse,
5350 cur_offset,
5351 fault_type,
5352 &written_on_object,
5353 &written_on_pager,
5354 &written_on_offset);
5355 top_object = VM_OBJECT_NULL;
5356 if (need_retry) {
5357 /*
5358 * vm_fault_enter couldn't complete the PMAP_ENTER...
5359 * at this point we don't hold any locks so it's safe
5360 * to ask the pmap layer to expand the page table to
5361 * accommodate this mapping... once expanded, we'll
5362 * re-drive the fault which should result in vm_fault_enter
5363 * being able to successfully enter the mapping this time around
5364 */
5365 (void)pmap_enter_options(
5366 pmap, vaddr, 0, 0, 0, 0, 0,
5367 PMAP_OPTIONS_NOENTER, NULL, PMAP_MAPPING_TYPE_INFER);
5368
5369 need_retry = false;
5370 goto RetryFault;
5371 }
5372 if (page_sleep_needed) {
5373 goto RetryFault;
5374 }
5375 goto done;
5376 }
5377 /*
5378 * COPY ON WRITE FAULT
5379 */
5380 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
5381
5382 /*
5383 * If objects match, then
5384 * object->vo_copy must not be NULL (else control
5385 * would be in previous code block), and we
5386 * have a potential push into the copy object
5387 * which we can't cope with here.
5388 */
5389 if (cur_object == object) {
5390 /*
5391 * must take the slow path to
5392 * deal with the copy push
5393 */
5394 break;
5395 }
5396
5397 /*
5398 * This is now a shadow based copy on write
5399 * fault -- it requires a copy up the shadow
5400 * chain.
5401 */
5402 assert(m_object == VM_PAGE_OBJECT(m));
5403
5404 if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
5405 vm_fault_cs_need_validation(NULL, m, m_object,
5406 PAGE_SIZE, 0)) {
5407 goto upgrade_lock_and_retry;
5408 }
5409
5410 #if MACH_ASSERT
5411 if (resilient_media_retry &&
5412 vm_fault_resilient_media_inject_error2_rate != 0 &&
5413 (++vm_fault_resilient_media_inject_error2 % vm_fault_resilient_media_inject_error2_rate) == 0) {
5414 /* inject an error */
5415 cur_m = m;
5416 m = VM_PAGE_NULL;
5417 m_object = VM_OBJECT_NULL;
5418 break;
5419 }
5420 #endif /* MACH_ASSERT */
5421 /*
5422 * Allocate a page in the original top level
5423 * object. Give up if allocate fails. Also
5424 * need to remember current page, as it's the
5425 * source of the copy.
5426 *
5427 * at this point we hold locks on both
5428 * object and cur_object... no need to take
5429 * paging refs or mark pages BUSY since
5430 * we don't drop either object lock until
5431 * the page has been copied and inserted
5432 */
5433 cur_m = m;
5434 m = vm_page_grab_options(grab_options);
5435 m_object = NULL;
5436
5437 if (m == VM_PAGE_NULL) {
5438 /*
5439 * no free page currently available...
5440 * must take the slow path
5441 */
5442 break;
5443 }
5444
5445 /*
5446 * Now do the copy; there's no need to mark the source
5446 * page busy since we hold both object locks (see above).
5447 *
5448 * NOTE: This code holds the map lock across
5449 * the page copy.
5450 */
5451 vm_page_copy(cur_m, m);
5452 vm_page_insert(m, object, vm_object_trunc_page(offset));
5453 if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
5454 DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5455 }
5456 m_object = object;
5457 SET_PAGE_DIRTY(m, FALSE);
5458
5459 /*
5460 * Now cope with the source page and object
5461 */
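/*
 * If the top object is mapped by more than one map entry, other
 * pmaps may still hold translations for the source page from
 * earlier read faults through the shadow chain; disconnect them
 * so that those mappings re-fault and observe the page we just
 * copied into the top object.
 */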
5462 if (os_ref_get_count_raw(&object->ref_count) > 1 &&
5463 cur_m->vmp_pmapped) {
5464 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
5465 } else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
5466 /*
5467 * We've copied the full 16K page but we're
5468 * about to call vm_fault_enter() only for
5469 * the 4K chunk we're faulting on. The other
5470 * three 4K chunks in that page could still
5471 * be pmapped in this pmap.
5472 * Since the VM object layer thinks that the
5473 * entire page has been dealt with and the
5474 * original page might no longer be needed,
5475 * it might collapse/bypass the original VM
5476 * object and free its pages, which would be
5477 * bad (and would trigger pmap_verify_free()
5478 * assertions) if the other 4K chunks are still
5479 * pmapped.
5480 */
5481 /*
5482 * XXX FBDP TODO4K: to be revisited
5483 * Technically, we need to pmap_disconnect()
5484 * only the target pmap's mappings for the 4K
5485 * chunks of this 16K VM page. If other pmaps
5486 * have PTEs on these chunks, that means that
5487 * the associated VM map must have a reference
5488 * on the VM object, so no need to worry about
5489 * those.
5490 * pmap_protect() for each 4K chunk would be
5491 * better but we'd have to check which chunks
5492 * are actually mapped before and after this
5493 * one.
5494 * A full-blown pmap_disconnect() is easier
5495 * for now but not efficient.
5496 */
5497 DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
5498 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
5499 }
5500
5501 if (cur_m->vmp_clustered) {
5502 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
5503 VM_PAGE_CONSUME_CLUSTERED(cur_m);
5504 vm_fault_is_sequential(cur_object, cur_offset, fault_info->behavior);
5505 }
5506 need_collapse = TRUE;
5507
5508 if (!cur_object->internal &&
5509 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
5510 /*
5511 * The object from which we've just
5512 * copied a page is most probably backed
5513 * by a vnode. We don't want to waste too
5514 * much time trying to collapse the VM objects
5515 * and create a bottleneck when several tasks
5516 * map the same file.
5517 */
5518 if (cur_object->vo_copy == object) {
5519 /*
5520 * Shared mapping or no COW yet.
5521 * We can never collapse a copy
5522 * object into its backing object.
5523 */
5524 need_collapse = FALSE;
5525 } else if (cur_object->vo_copy == object->shadow &&
5526 object->shadow->resident_page_count == 0) {
5527 /*
5528 * Shared mapping after a COW occurred.
5529 */
5530 need_collapse = FALSE;
5531 }
5532 }
5533 vm_object_unlock(cur_object);
5534
5535 if (need_collapse == FALSE) {
5536 vm_fault_collapse_skipped++;
5537 }
5538 vm_fault_collapse_total++;
5539
5540 type_of_fault = DBG_COW_FAULT;
5541 counter_inc(&vm_statistics_cow_faults);
5542 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
5543 counter_inc(&current_task()->cow_faults);
5544
5545 goto FastPmapEnter;
5546 } else {
5547 /*
5548 * No page at cur_object, cur_offset... m == NULL
5549 */
5550 if (cur_object->pager_created) {
5551 vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
5552
5553 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
5554 int my_fault_type;
5555 vm_compressor_options_t c_flags = C_DONT_BLOCK;
5556 bool insert_cur_object = FALSE;
5557
5558 /*
5559 * May have to talk to a pager...
5560 * if so, take the slow path by
5561 * doing a 'break' from the while (TRUE) loop
5562 *
5563 * compressor_external_state will only be set to VM_EXTERNAL_STATE_EXISTS
5564 * if the compressor is active and the page exists there
5565 */
5566 if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
5567 break;
5568 }
5569
5570 if (map == kernel_map || real_map == kernel_map) {
5571 /*
5572 * can't call into the compressor with the kernel_map
5573 * lock held, since the compressor may try to operate
5574 * on the kernel map in order to return an empty c_segment
5575 */
5576 break;
5577 }
5578 if (object != cur_object) {
5579 if (fault_type & VM_PROT_WRITE) {
5580 c_flags |= C_KEEP;
5581 } else {
5582 insert_cur_object = TRUE;
5583 }
5584 }
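/*
 * On a write fault down the shadow chain, the decompressed
 * page will be copied-on-write into the top object, so we
 * keep the compressed copy in cur_object (C_KEEP); on a read
 * fault we can decompress directly into cur_object and drop
 * the compressed copy.
 */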
5585 if (insert_cur_object == TRUE) {
5586 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5587 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5588
5589 if (vm_object_lock_upgrade(cur_object) == FALSE) {
5590 /*
5591 * couldn't upgrade, so do a full retry
5592 * immediately since we can no longer be
5593 * certain about cur_object (since we
5594 * don't hold a reference on it)...
5595 * first drop the top object lock
5596 */
5597 vm_object_unlock(object);
5598
5599 vm_map_unlock_read(map);
5600 if (real_map != map) {
5601 vm_map_unlock(real_map);
5602 }
5603
5604 goto RetryFault;
5605 }
5606 }
5607 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
5608 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5609
5610 if (object != cur_object) {
5611 /*
5612 * we can't go for the upgrade on the top
5613 * lock since the upgrade may block waiting
5614 * for readers to drain... since we hold
5615 * cur_object locked at this point, waiting
5616 * for the readers to drain would represent
5617 * a lock order inversion since the lock order
5618 * for objects is the reference order in the
5619 * shadow chain
5620 */
5621 vm_object_unlock(object);
5622 vm_object_unlock(cur_object);
5623
5624 vm_map_unlock_read(map);
5625 if (real_map != map) {
5626 vm_map_unlock(real_map);
5627 }
5628
5629 goto RetryFault;
5630 }
5631 if (vm_object_lock_upgrade(object) == FALSE) {
5632 /*
5633 * couldn't upgrade, so explicitly take the lock
5634 * exclusively and go relookup the page, since we
5635 * will have dropped the object lock and
5636 * a different thread could have inserted
5637 * a page at this offset.
5638 * No need for a full retry since we're
5639 * at the top level of the object chain.
5640 */
5641 vm_object_lock(object);
5642
5643 continue;
5644 }
5645 }
5646
5647 #if HAS_MTE
5648 if (vm_object_is_mte_mappable(object)) {
5649 c_flags |= C_MTE;
5650 }
5651 #endif /* HAS_MTE */
5652 m = vm_page_grab_options(grab_options);
5653 m_object = NULL;
5654
5655 if (m == VM_PAGE_NULL) {
5656 /*
5657 * no free page currently available...
5658 * must take the slow path
5659 */
5660 break;
5661 }
5662
5663 /*
5664 * The object is and remains locked
5665 * so no need to take a
5666 * "paging_in_progress" reference.
5667 */
5668 bool shared_lock;
5669 if ((object == cur_object &&
5670 object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
5671 (object != cur_object &&
5672 cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
5673 shared_lock = FALSE;
5674 } else {
5675 shared_lock = TRUE;
5676 }
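/*
 * Record whether we hold the compressed page's object lock
 * exclusively: vm_compressor_pager_count() needs to know so it
 * can update the owner's compressed-page accounting safely when
 * the lock is only held shared.
 */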
5677
5678 kr = vm_compressor_pager_get(
5679 cur_object->pager,
5680 (vm_object_trunc_page(cur_offset)
5681 + cur_object->paging_offset),
5682 VM_PAGE_GET_PHYS_PAGE(m),
5683 &my_fault_type,
5684 c_flags,
5685 &compressed_count_delta);
5686
5687 vm_compressor_pager_count(
5688 cur_object->pager,
5689 compressed_count_delta,
5690 shared_lock,
5691 cur_object);
5692
5693 if (kr != KERN_SUCCESS) {
5694 vm_page_release(m,
5695 VMP_RELEASE_NONE);
5696 m = VM_PAGE_NULL;
5697 }
5698 /*
5699 * If vm_compressor_pager_get() returns
5700 * KERN_MEMORY_FAILURE, then the
5701 * compressed data is permanently lost,
5702 * so return this error immediately.
5703 */
5704 if (kr == KERN_MEMORY_FAILURE) {
5705 if (object != cur_object) {
5706 vm_object_unlock(cur_object);
5707 }
5708 vm_object_unlock(object);
5709 vm_map_unlock_read(map);
5710 if (real_map != map) {
5711 vm_map_unlock(real_map);
5712 }
5713
5714 goto done;
5715 } else if (kr != KERN_SUCCESS) {
5716 break;
5717 }
5718 m->vmp_dirty = TRUE;
5719 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
5720 if ((fault_type & VM_PROT_WRITE) == 0) {
5721 prot &= ~VM_PROT_WRITE;
5722 /*
5723 * The page, m, has yet to be inserted
5724 * into an object. So we are fine with
5725 * the object/cur_object lock being held
5726 * shared.
5727 */
5728 vm_page_lockspin_queues();
5729 m->vmp_unmodified_ro = true;
5730 vm_page_unlock_queues();
5731 os_atomic_inc(&compressor_ro_uncompressed, relaxed);
5732 }
5733 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
5734
5735 /*
5736 * If the object is purgeable, its
5737 * owner's purgeable ledgers will be
5738 * updated in vm_page_insert() but the
5739 * page was also accounted for in a
5740 * "compressed purgeable" ledger, so
5741 * update that now.
5742 */
5743 if (object != cur_object &&
5744 !insert_cur_object) {
5745 /*
5746 * We're not going to insert
5747 * the decompressed page into
5748 * the object it came from.
5749 *
5750 * We're dealing with a
5751 * copy-on-write fault on
5752 * "object".
5753 * We're going to decompress
5754 * the page directly into the
5755 * target "object" while
5756 * keeping the compressed
5757 * page for "cur_object", so
5758 * no ledger update in that
5759 * case.
5760 */
5761 } else if (((cur_object->purgable ==
5762 VM_PURGABLE_DENY) &&
5763 (!cur_object->vo_ledger_tag)) ||
5764 (cur_object->vo_owner ==
5765 NULL)) {
5766 /*
5767 * "cur_object" is not purgeable
5768 * and is not ledger-tagged, or
5769 * there's no owner for it,
5770 * so no owner's ledgers to
5771 * update.
5772 */
5773 } else {
5774 /*
5775 * One less compressed
5776 * purgeable/tagged page for
5777 * cur_object's owner.
5778 */
5779 if (compressed_count_delta) {
5780 vm_object_owner_compressed_update(
5781 cur_object,
5782 -1);
5783 }
5784 }
5785
5786 if (insert_cur_object) {
5787 vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset));
5788 m_object = cur_object;
5789 } else {
5790 vm_page_insert(m, object, vm_object_trunc_page(offset));
5791 m_object = object;
5792 }
5793
5794 if (!HAS_DEFAULT_CACHEABILITY(m_object->wimg_bits & VM_WIMG_MASK)) {
5795 /*
5796 * If the page is not cacheable,
5797 * we can't let its contents
5798 * linger in the data cache
5799 * after the decompression.
5800 */
5801 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
5802 }
5803
5804 type_of_fault = my_fault_type;
5805
5806 VM_STAT_DECOMPRESSIONS();
5807
5808 if (cur_object != object) {
5809 if (insert_cur_object) {
5810 top_object = object;
5811 /*
5812 * switch to the object that has the new page
5813 */
5814 object = cur_object;
5815 object_lock_type = cur_object_lock_type;
5816 } else {
5817 vm_object_unlock(cur_object);
5818 cur_object = object;
5819 }
5820 }
5821 goto FastPmapEnter;
5822 }
5823 /*
5824 * the existence map is present and indicates
5825 * that the pager doesn't have this page
5826 */
5827 }
5828 if (cur_object->shadow == VM_OBJECT_NULL ||
5829 resilient_media_retry) {
5830 /*
5831 * Zero fill fault. Page gets
5832 * inserted into the original object.
5833 */
5834 if (cur_object->shadow_severed ||
5835 VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
5836 cur_object == compressor_object ||
5837 is_kernel_object(cur_object)) {
5838 if (object != cur_object) {
5839 vm_object_unlock(cur_object);
5840 }
5841 vm_object_unlock(object);
5842
5843 vm_map_unlock_read(map);
5844 if (real_map != map) {
5845 vm_map_unlock(real_map);
5846 }
5847 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object)) {
5848 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
5849 }
5850
5851 if (cur_object->shadow_severed) {
5852 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), 0 /* arg */);
5853 }
5854
5855 kr = KERN_MEMORY_ERROR;
5856 goto done;
5857 }
5858 if (cur_object != object) {
5859 vm_object_unlock(cur_object);
5860
5861 cur_object = object;
5862 }
5863 if (object_lock_type == OBJECT_LOCK_SHARED) {
5864 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5865
5866 if (vm_object_lock_upgrade(object) == FALSE) {
5867 /*
5868 * couldn't upgrade, so do a full retry of the fault,
5869 * since we dropped the object lock, which
5870 * could allow another thread to insert
5871 * a page at this offset.
5872 */
5873 vm_map_unlock_read(map);
5874 if (real_map != map) {
5875 vm_map_unlock(real_map);
5876 }
5877
5878 goto RetryFault;
5879 }
5880 }
5881 if (!object->internal) {
5882 panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
5883 }
5884 #if MACH_ASSERT
5885 if (resilient_media_retry &&
5886 vm_fault_resilient_media_inject_error3_rate != 0 &&
5887 (++vm_fault_resilient_media_inject_error3 % vm_fault_resilient_media_inject_error3_rate) == 0) {
5888 /* inject an error */
5889 m_object = NULL;
5890 break;
5891 }
5892 #endif /* MACH_ASSERT */
5893
5894 m = vm_page_grab_options(grab_options);
5895 m_object = NULL;
5896
5897 if (m == VM_PAGE_NULL) {
5898 /*
5899 * no free page currently available...
5900 * must take the slow path
5901 */
5902 break;
5903 }
5904 m_object = object;
5905 vm_page_insert(m, m_object, vm_object_trunc_page(offset));
5906
5907 if ((prot & VM_PROT_WRITE) &&
5908 !(fault_type & VM_PROT_WRITE) &&
5909 object->vo_copy != VM_OBJECT_NULL) {
5910 /*
5911 * This is not a write fault and
5912 * we might have a copy-on-write
5913 * obligation to honor (copy object or
5914 * "needs_copy" map entry), so do not
5915 * give write access yet.
5916 * We'll need to catch the first write
5917 * to resolve the copy-on-write by
5918 * pushing this page to a copy object
5919 * or making a shadow object.
5920 */
5921 if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
5922 /*
5923 * This pmap enforces extra
5924 * constraints for this set of
5925 * protections, so we can't
5926 * change the protections.
5927 * We would expect code-signing
5928 * to be bypassed in this case.
5929 */
5930 if (!fault_info->cs_bypass) {
5931 panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x",
5932 __FUNCTION__,
5933 pmap,
5934 (uint64_t)vaddr,
5935 prot,
5936 fault_info->pmap_options);
5937 }
5938 } else {
5939 prot &= ~VM_PROT_WRITE;
5940 }
5941 }
5942 if (resilient_media_retry) {
5943 /*
5944 * Not a real write, so no reason to assert.
5945 * We've just allocated a new page for this
5946 * <object,offset> so we know nobody has any
5947 * PTE pointing at any previous version of this
5948 * page and no copy-on-write is involved here.
5949 * We're just inserting a page of zeroes at this
5950 * stage of the shadow chain because the pager
5951 * for the lowest object in the shadow chain
5952 * said it could not provide that page and we
5953 * want to avoid failing the fault and causing
5954 * a crash on this "resilient_media" mapping.
5955 */
5956 } else {
5957 assertf(!((fault_type & VM_PROT_WRITE) && object->vo_copy),
5958 "map %p va 0x%llx wrong path for write fault (fault_type 0x%x) on object %p with copy %p\n",
5959 map, (uint64_t)vaddr, fault_type, object, object->vo_copy);
5960 }
5961
5962 vm_object_t saved_copy_object;
5963 uint64_t saved_copy_version;
5964 saved_copy_object = object->vo_copy;
5965 saved_copy_version = object->vo_copy_version;
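/*
 * Snapshot the copy object and its version: if we drop the object
 * lock to zero the page below, we'll compare against this snapshot
 * to detect a copy-on-write situation that arose in the meantime.
 */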
5966
5967 /*
5968 * Zeroing the page and entering it into the pmap
5969 * represent a significant amount of the zero-fill fault handler's work.
5970 *
5971 * To improve fault scalability, we'll drop the object lock, if it appears contended,
5972 * now that we've inserted the page into the vm object.
5973 * Before dropping the lock, we need to check protection bits and set the
5974 * mapped bits on the page. Then we can mark the page busy, drop the lock,
5975 * zero it, and do the pmap enter. We'll need to reacquire the lock
5976 * to clear the busy bit and wake up any waiters.
5977 */
5978 vm_fault_cs_clear(m);
5979 m->vmp_pmapped = TRUE;
5980 if (map->no_zero_fill) {
5981 type_of_fault = DBG_NZF_PAGE_FAULT;
5982 } else {
5983 type_of_fault = DBG_ZERO_FILL_FAULT;
5984 }
5985 {
5986 pmap_t destination_pmap;
5987 vm_map_offset_t destination_pmap_vaddr;
5988 vm_prot_t enter_fault_type;
5989 if (caller_pmap) {
5990 destination_pmap = caller_pmap;
5991 destination_pmap_vaddr = caller_pmap_addr;
5992 } else {
5993 destination_pmap = pmap;
5994 destination_pmap_vaddr = vaddr;
5995 }
5996 if (fault_info->fi_change_wiring) {
5997 enter_fault_type = VM_PROT_NONE;
5998 } else {
5999 enter_fault_type = caller_prot;
6000 }
6001 assertf(VM_PAGE_OBJECT(m) == object, "m=%p object=%p", m, object);
6002 kr = vm_fault_enter_prepare(m,
6003 destination_pmap,
6004 destination_pmap_vaddr,
6005 &prot,
6006 caller_prot,
6007 fault_page_size,
6008 fault_phys_offset,
6009 enter_fault_type,
6010 fault_info,
6011 &type_of_fault,
6012 &page_needs_data_sync,
6013 &page_sleep_needed);
6014
6015 assert(!page_sleep_needed);
6016 if (kr != KERN_SUCCESS) {
6017 goto zero_fill_cleanup;
6018 }
6019
6020 if (object_is_contended) {
6021 /*
6022 * At this point the page is in the vm object, but not on a paging queue.
6023 * Since it's accessible to another thread but its contents are invalid
6024 * (it hasn't been zeroed), mark it busy before dropping the object lock.
6025 */
6026 m->vmp_busy = TRUE;
6027 vm_object_paging_begin(object); /* keep object alive */
6028 vm_object_unlock(object);
6029 }
6030 if (type_of_fault == DBG_ZERO_FILL_FAULT) {
6031 /*
6032 * Now zero fill page...
6033 * the page is probably going to
6034 * be written soon, so don't bother
6035 * to clear the modified bit
6036 *
6037 * NOTE: This code holds the map
6038 * lock across the zero fill.
6039 */
6040 vm_page_zero_fill(
6041 m
6042 #if HAS_MTE
6043 , true /* zero_tags */
6044 #endif /* HAS_MTE */
6045 );
6046 counter_inc(&vm_statistics_zero_fill_count);
6047 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
6048 }
6049
6050 if (object_is_contended) {
6051 /*
6052 * It's not safe to do the pmap_enter() without holding
6053 * the object lock because its "vo_copy" could change.
6054 */
6055 object_is_contended = false; /* get out of that code path */
6056
6057 vm_object_lock(object);
6058 vm_object_paging_end(object);
6059 if (object->vo_copy != saved_copy_object ||
6060 object->vo_copy_version != saved_copy_version) {
6061 /*
6062 * The COPY_DELAY copy-on-write situation for
6063 * this VM object has changed while it was
6064 * unlocked, so do not grant write access to
6065 * this page.
6066 * The write access will fault again and we'll
6067 * resolve the copy-on-write then.
6068 */
6069 if (pmap_has_prot_policy(pmap,
6070 fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE,
6071 prot)) {
6072 /* we should not do CoW on pmap_has_prot_policy mappings */
6073 panic("%s: map %p va 0x%llx obj %p,%llu saved %p,%llu: unexpected CoW",
6074 __FUNCTION__,
6075 map, (uint64_t)vaddr,
6076 object, object->vo_copy_version,
6077 saved_copy_object, saved_copy_version);
6078 } else {
6079 /* the pmap layer is OK with changing the PTE's prot */
6080 prot &= ~VM_PROT_WRITE;
6081 }
6082 }
6083 }
6084
6085 if (page_needs_data_sync) {
6086 pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
6087 }
6088
6089 if (fault_info->fi_xnu_user_debug &&
6090 !object->code_signed) {
6091 fault_info->pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
6092 }
6093 if (object_is_contended) {
6094 panic("object_is_contended");
6095 kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr,
6096 fault_page_size, fault_phys_offset,
6097 m, &prot, caller_prot, enter_fault_type, wired,
6098 fault_info->pmap_options, &need_retry);
6099 vm_object_lock(object);
6100 assertf(!((prot & VM_PROT_WRITE) && object->vo_copy),
6101 "prot 0x%x object %p copy %p\n",
6102 prot, object, object->vo_copy);
6103 } else {
6104 need_retry = false;
6105 kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr,
6106 fault_page_size, fault_phys_offset,
6107 m, &prot, caller_prot, enter_fault_type, wired,
6108 fault_info->pmap_options, &need_retry, &object_lock_type);
6109 }
6110 }
6111 zero_fill_cleanup:
6112 if (!VM_DYNAMIC_PAGING_ENABLED() &&
6113 (object->purgable == VM_PURGABLE_DENY ||
6114 object->purgable == VM_PURGABLE_NONVOLATILE ||
6115 object->purgable == VM_PURGABLE_VOLATILE)) {
6116 vm_page_lockspin_queues();
6117 if (!VM_DYNAMIC_PAGING_ENABLED()) {
6118 vm_fault_enqueue_throttled_locked(m);
6119 }
6120 vm_page_unlock_queues();
6121 }
6122 vm_fault_enqueue_page(object, m, wired, fault_info->fi_change_wiring, wire_tag, fault_info->no_cache, &type_of_fault, kr);
6123
6124 if (__improbable(rtfault &&
6125 !m->vmp_realtime &&
6126 vm_pageout_protect_realtime)) {
6127 vm_page_lock_queues();
6128 if (!m->vmp_realtime) {
6129 m->vmp_realtime = true;
6130 VM_COUNTER_INC(&vm_page_realtime_count);
6131 }
6132 vm_page_unlock_queues();
6133 }
6134 vm_fault_complete(
6135 map,
6136 real_map,
6137 object,
6138 m_object,
6139 m,
6140 offset,
6141 trace_real_vaddr,
6142 fault_info,
6143 caller_prot,
6144 real_vaddr,
6145 type_of_fault,
6146 need_retry,
6147 kr,
6148 physpage_p,
6149 prot,
6150 top_object,
6151 need_collapse,
6152 cur_offset,
6153 fault_type,
6154 &written_on_object,
6155 &written_on_pager,
6156 &written_on_offset);
6157 top_object = VM_OBJECT_NULL;
6158 if (need_retry) {
6159 /*
6160 * vm_fault_enter couldn't complete the PMAP_ENTER...
6161 * at this point we don't hold any locks so it's safe
6162 * to ask the pmap layer to expand the page table to
6163 * accommodate this mapping... once expanded, we'll
6164 * re-drive the fault which should result in vm_fault_enter
6165 * being able to successfully enter the mapping this time around
6166 */
6167 (void)pmap_enter_options(
6168 pmap, vaddr, 0, 0, 0, 0, 0,
6169 PMAP_OPTIONS_NOENTER, NULL, PMAP_MAPPING_TYPE_INFER);
6170
6171 need_retry = FALSE;
6172 goto RetryFault;
6173 }
6174 goto done;
6175 }
6176 /*
6177 * On to the next level in the shadow chain
6178 */
6179 cur_offset += cur_object->vo_shadow_offset;
6180 new_object = cur_object->shadow;
6181 fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset);
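/*
 * cur_offset is now expressed in the shadow (backing) object's
 * space; fault_phys_offset tracks the sub-page offset for maps
 * whose page size is smaller than the kernel page size.
 */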
6182
6183 /*
6184 * take the new_object's lock with the indicated state
6185 */
6186 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
6187 vm_object_lock_shared(new_object);
6188 } else {
6189 vm_object_lock(new_object);
6190 }
6191
6192 if (cur_object != object) {
6193 vm_object_unlock(cur_object);
6194 }
6195
6196 cur_object = new_object;
6197
6198 continue;
6199 }
6200 }
6201 /*
6202 * Cleanup from fast fault failure. Drop any object
6203 * lock other than original and drop map lock.
6204 */
6205 if (object != cur_object) {
6206 vm_object_unlock(cur_object);
6207 }
6208
6209 /*
6210 * must own the object lock exclusively at this point
6211 */
6212 if (object_lock_type == OBJECT_LOCK_SHARED) {
6213 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
6214
6215 if (vm_object_lock_upgrade(object) == FALSE) {
6216 /*
6217 * couldn't upgrade, so explicitly
6218 * take the lock exclusively.
6219 * No need to retry the fault at this
6220 * point since "vm_fault_page" will
6221 * completely re-evaluate the state.
6222 */
6223 vm_object_lock(object);
6224 }
6225 }
6226
6227 handle_copy_delay:
6228 vm_map_unlock_read(map);
6229 if (real_map != map) {
6230 vm_map_unlock(real_map);
6231 }
6232
6233 if (__improbable(object == compressor_object ||
6234 is_kernel_object(object))) {
6235 /*
6236 * These objects are explicitly managed and populated by the
6237 * kernel. The virtual ranges backed by these objects should
6238 * either have wired pages or "holes" that are not supposed to
6239 * be accessed at all until they get explicitly populated.
6240 * We should never have to resolve a fault on a mapping backed
6241 * by one of these VM objects and providing a zero-filled page
6242 * would be wrong here, so let's fail the fault and let the
6243 * caller crash or recover.
6244 */
6245 vm_object_unlock(object);
6246 kr = KERN_MEMORY_ERROR;
6247 goto done;
6248 }
6249
6250 resilient_media_ref_transfer = false;
6251 if (resilient_media_retry) {
6252 /*
6253 * We could get here if we failed to get a free page
6254 * to zero-fill and had to take the slow path again.
6255 * Reset our "recovery-from-failed-media" state.
6256 */
6257 assert(resilient_media_object != VM_OBJECT_NULL);
6258 assert(resilient_media_offset != (vm_object_offset_t)-1);
6259 /* release our extra reference on failed object */
6260 // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
6261 if (object == resilient_media_object) {
6262 /*
6263 * We're holding "object"'s lock, so we can't release
6264 * our extra reference at this point.
6265 * We need an extra reference on "object" anyway
6266 * (see below), so let's just transfer this reference.
6267 */
6268 resilient_media_ref_transfer = true;
6269 } else {
6270 vm_object_deallocate(resilient_media_object);
6271 }
6272 resilient_media_object = VM_OBJECT_NULL;
6273 resilient_media_offset = (vm_object_offset_t)-1;
6274 resilient_media_retry = false;
6275 vm_fault_resilient_media_abort2++;
6276 }
6277
6278 /*
6279 * Make a reference to this object to
6280 * prevent its disposal while we are messing with
6281 * it. Once we have the reference, the map is free
6282 * to be diddled. Since objects reference their
6283 * shadows (and copies), they will stay around as well.
6284 */
6285 if (resilient_media_ref_transfer) {
6286 /* we already have an extra reference on this object */
6287 resilient_media_ref_transfer = false;
6288 } else {
6289 vm_object_reference_locked(object);
6290 }
6291 vm_object_paging_begin(object);
6292
6293 set_thread_pagein_error(cthread, 0);
6294 error_code = 0;
6295
6296 result_page = VM_PAGE_NULL;
6297 vm_fault_return_t err = vm_fault_page(object, offset, fault_type,
6298 (fault_info->fi_change_wiring && !wired),
6299 FALSE, /* page not looked up */
6300 &prot, &result_page, &top_page,
6301 &type_of_fault,
6302 &error_code, map->no_zero_fill,
6303 fault_info);
6304
6305 /*
6306 * if kr != VM_FAULT_SUCCESS, then the paging reference
6307 * has been dropped and the object unlocked... the ref_count
6308 * is still held
6309 *
6310 * if kr == VM_FAULT_SUCCESS, then the paging reference
6311 * is still held along with the ref_count on the original object
6312 *
6313 * the object is returned locked with a paging reference
6314 *
6315 * if top_page != NULL, then it's BUSY and the
6316 * object it belongs to has a paging reference
6317 * but is returned unlocked
6318 */
6319 if (err != VM_FAULT_SUCCESS &&
6320 err != VM_FAULT_SUCCESS_NO_VM_PAGE) {
6321 if (err == VM_FAULT_MEMORY_ERROR &&
6322 fault_info->resilient_media) {
6323 assertf(object->internal, "object %p", object);
6324 /*
6325 * This fault failed but the mapping was
6326 * "media resilient", so we'll retry the fault in
6327 * recovery mode to get a zero-filled page in the
6328 * top object.
6329 * Keep the reference on the failing object so
6330 * that we can check that the mapping is still
6331 * pointing to it when we retry the fault.
6332 */
6333 // printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
6334 assert(!resilient_media_retry); /* no double retry */
6335 assert(resilient_media_object == VM_OBJECT_NULL);
6336 assert(resilient_media_offset == (vm_object_offset_t)-1);
6337 resilient_media_retry = true;
6338 resilient_media_object = object;
6339 resilient_media_offset = offset;
6340 // printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_mmedia_offset);
6341 vm_fault_resilient_media_initiate++;
6342 goto RetryFault;
6343 } else {
6344 /*
6345 * we didn't succeed, lose the object reference
6346 * immediately.
6347 */
6348 vm_object_deallocate(object);
6349 object = VM_OBJECT_NULL; /* no longer valid */
6350 }
6351
6352 /*
6353 * See why we failed, and take corrective action.
6354 */
6355 switch (err) {
6356 case VM_FAULT_SUCCESS:
6357 case VM_FAULT_SUCCESS_NO_VM_PAGE:
6358 /* These aren't possible but needed to make the switch exhaustive */
6359 break;
6360 case VM_FAULT_MEMORY_SHORTAGE:
6361 if (vm_page_wait((fault_info->fi_change_wiring) ?
6362 THREAD_UNINT :
6363 THREAD_ABORTSAFE)) {
6364 goto RetryFault;
6365 }
6366 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_MEMORY_SHORTAGE), 0 /* arg */);
6367 OS_FALLTHROUGH;
6368 case VM_FAULT_INTERRUPTED:
6369 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
6370 kr = KERN_ABORTED;
6371 goto done;
6372 case VM_FAULT_RETRY:
6373 goto RetryFault;
6374 case VM_FAULT_MEMORY_ERROR:
6375 if (error_code) {
6376 kr = error_code;
6377 } else {
6378 kr = KERN_MEMORY_ERROR;
6379 }
6380 goto done;
6381 case VM_FAULT_BUSY:
6382 kr = KERN_ALREADY_WAITING;
6383 goto done;
6384 }
6385 }
6386 m = result_page;
6387 m_object = NULL;
6388
6389 if (m != VM_PAGE_NULL) {
6390 m_object = VM_PAGE_OBJECT(m);
6391 assert((fault_info->fi_change_wiring && !wired) ?
6392 (top_page == VM_PAGE_NULL) :
6393 ((top_page == VM_PAGE_NULL) == (m_object == object)));
6394 }
6395
6396 /*
6397 * What to do with the resulting page from vm_fault_page
6398 * if it doesn't get entered into the physical map:
6399 */
6400 #define RELEASE_PAGE(m) \
6401 MACRO_BEGIN \
6402 vm_page_wakeup_done(VM_PAGE_OBJECT(m), m); \
6403 if ( !VM_PAGE_PAGEABLE(m)) { \
6404 vm_page_lockspin_queues(); \
6405 if ( !VM_PAGE_PAGEABLE(m)) \
6406 vm_page_activate(m); \
6407 vm_page_unlock_queues(); \
6408 } \
6409 MACRO_END
6410
6411
6412 object_locks_dropped = FALSE;
6413 /*
6414 * We must verify that the maps have not changed
6415 * since our last lookup. vm_map_verify() needs the
6416 * map lock (shared) but we are holding object locks.
6417 * So we do a try_lock() first and, if that fails, we
6418 * drop the object locks and go in for the map lock again.
6419 */
6420 if (m != VM_PAGE_NULL) {
6421 old_copy_object = m_object->vo_copy;
6422 old_copy_version = m_object->vo_copy_version;
6423 } else {
6424 old_copy_object = VM_OBJECT_NULL;
6425 old_copy_version = 0;
6426 }
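/*
 * Remember the page's copy object and version: if they change
 * while the object is unlocked, a copy-on-write situation has
 * arisen and we must revoke write access below.
 */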
6427 if (!vm_map_try_lock_read(original_map)) {
6428 if (m != VM_PAGE_NULL) {
6429 vm_object_unlock(m_object);
6430 } else {
6431 vm_object_unlock(object);
6432 }
6433
6434 object_locks_dropped = TRUE;
6435
6436 vm_map_lock_read(original_map);
6437 }
6438
6439 if ((map != original_map) || !vm_map_verify(map, &version)) {
6440 if (object_locks_dropped == FALSE) {
6441 if (m != VM_PAGE_NULL) {
6442 vm_object_unlock(m_object);
6443 } else {
6444 vm_object_unlock(object);
6445 }
6446
6447 object_locks_dropped = TRUE;
6448 }
6449
6450 /*
6451 * no object locks are held at this point
6452 */
6453 vm_object_t retry_object;
6454 vm_object_offset_t retry_offset;
6455 vm_prot_t retry_prot;
6456
6457 /*
6458 * To avoid trying to write_lock the map while another
6459 * thread has it read_locked (in vm_map_pageable), we
6460 * do not try for write permission. If the page is
6461 * still writable, we will get write permission. If it
6462 * is not, or has been marked needs_copy, we enter the
6463 * mapping without write permission, and will merely
6464 * take another fault.
6465 */
6466 map = original_map;
6467
6468 kr = vm_map_lookup_and_lock_object(&map, vaddr,
6469 fault_type & ~VM_PROT_WRITE,
6470 OBJECT_LOCK_EXCLUSIVE, &version,
6471 &retry_object, &retry_offset, &retry_prot,
6472 &wired,
6473 fault_info,
6474 &real_map,
6475 NULL);
6476 pmap = real_map->pmap;
6477
6478 if (kr != KERN_SUCCESS) {
6479 vm_map_unlock_read(map);
6480
6481 if (m != VM_PAGE_NULL) {
6482 assert(VM_PAGE_OBJECT(m) == m_object);
6483
6484 /*
6485 * retake the lock so that
6486 * we can drop the paging reference
6487 * in vm_fault_cleanup and do the
6488 * vm_page_wakeup_done() in RELEASE_PAGE
6489 */
6490 vm_object_lock(m_object);
6491
6492 RELEASE_PAGE(m);
6493
6494 vm_fault_cleanup(m_object, top_page);
6495 } else {
6496 /*
6497 * retake the lock so that
6498 * we can drop the paging reference
6499 * in vm_fault_cleanup
6500 */
6501 vm_object_lock(object);
6502
6503 vm_fault_cleanup(object, top_page);
6504 }
6505 vm_object_deallocate(object);
6506
6507 if (kr == KERN_INVALID_ADDRESS) {
6508 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0 /* arg */);
6509 }
6510 goto done;
6511 }
6512 vm_object_unlock(retry_object);
6513
6514 if ((retry_object != object) || (retry_offset != offset)) {
6515 vm_map_unlock_read(map);
6516 if (real_map != map) {
6517 vm_map_unlock(real_map);
6518 }
6519
6520 if (m != VM_PAGE_NULL) {
6521 assert(VM_PAGE_OBJECT(m) == m_object);
6522
6523 /*
6524 * retake the lock so that
6525 * we can drop the paging reference
6526 * in vm_fault_cleanup and do the
6527 * vm_page_wakeup_done() in RELEASE_PAGE
6528 */
6529 vm_object_lock(m_object);
6530
6531 RELEASE_PAGE(m);
6532
6533 vm_fault_cleanup(m_object, top_page);
6534 } else {
6535 /*
6536 * retake the lock so that
6537 * we can drop the paging reference
6538 * in vm_fault_cleanup
6539 */
6540 vm_object_lock(object);
6541
6542 vm_fault_cleanup(object, top_page);
6543 }
6544 vm_object_deallocate(object);
6545
6546 goto RetryFault;
6547 }
6548 /*
6549 * Check whether the protection has changed or the object
6550 * has been copied while we left the map unlocked.
6551 */
6552 if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) {
6553 /* If the pmap layer cares, pass the full set. */
6554 prot = retry_prot;
6555 } else {
6556 prot &= retry_prot;
6557 }
6558 }
6559
6560 if (object_locks_dropped == TRUE) {
6561 if (m != VM_PAGE_NULL) {
6562 assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
6563 assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
6564 vm_object_lock(m_object);
6565 } else {
6566 vm_object_lock(object);
6567 }
6568
6569 object_locks_dropped = FALSE;
6570 }
6571
6572 if ((prot & VM_PROT_WRITE) &&
6573 m != VM_PAGE_NULL &&
6574 (m_object->vo_copy != old_copy_object ||
6575 m_object->vo_copy_version != old_copy_version)) {
6576 /*
6577 * The copy object changed while the top-level object
6578 * was unlocked, so take away write permission.
6579 */
6580 if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
6581 /*
6582 * This pmap enforces extra constraints for this set
6583 * of protections, so we can't change the protections.
6584 * This mapping should have been setup to avoid
6585 * copy-on-write since that requires removing write
6586 * access.
6587 */
6588 panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x m%p obj %p copyobj %p",
6589 __FUNCTION__, pmap, (uint64_t)vaddr, prot,
6590 fault_info->pmap_options,
6591 m, m_object, m_object->vo_copy);
6592 }
6593 prot &= ~VM_PROT_WRITE;
6594 }
6595
6596 if (!need_copy &&
6597 !fault_info->no_copy_on_read &&
6598 m != VM_PAGE_NULL &&
6599 VM_PAGE_OBJECT(m) != object &&
6600 !VM_PAGE_OBJECT(m)->pager_trusted &&
6601 vm_protect_privileged_from_untrusted &&
6602 !VM_PAGE_OBJECT(m)->code_signed &&
6603 current_proc_is_privileged()) {
6604 /*
6605 * We found the page we want in an "untrusted" VM object
6606 * down the shadow chain. Since the target is "privileged"
6607 * we want to perform a copy-on-read of that page, so that the
6608 * mapped object gets a stable copy and does not have to
6609 * rely on the "untrusted" object to provide the same
6610 * contents if the page gets reclaimed and has to be paged
6611 * in again later on.
6612 *
6613 * Special case: if the mapping is executable and the untrusted
6614 * object is code-signed and the process is "cs_enforced", we
6615 * do not copy-on-read because that would break code-signing
6616 * enforcement expectations (an executable page must belong
6617 * to a code-signed object) and we can rely on code-signing
6618 * to re-validate the page if it gets evicted and paged back in.
6619 */
6620 // printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
6621 vm_copied_on_read++;
6622 need_copy_on_read = TRUE;
6623 need_copy = TRUE;
6624 } else {
6625 need_copy_on_read = FALSE;
6626 }
6627
6628 /*
6629 * If we want to wire down this page, but no longer have
6630 * adequate permissions, we must start all over.
6631 * If we decided to copy-on-read, we must also start all over.
6632 */
6633 if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
6634 need_copy_on_read) {
6635 vm_map_unlock_read(map);
6636 if (real_map != map) {
6637 vm_map_unlock(real_map);
6638 }
6639
6640 if (m != VM_PAGE_NULL) {
6641 assert(VM_PAGE_OBJECT(m) == m_object);
6642
6643 RELEASE_PAGE(m);
6644
6645 vm_fault_cleanup(m_object, top_page);
6646 } else {
6647 vm_fault_cleanup(object, top_page);
6648 }
6649
6650 vm_object_deallocate(object);
6651
6652 goto RetryFault;
6653 }
6654 if (m != VM_PAGE_NULL) {
6655 /*
6656 * Put this page into the physical map.
6657 * We had to do the unlock above because pmap_enter
6658 * may cause other faults. The page may be on
6659 * the pageout queues. If the pageout daemon comes
6660 * across the page, it will remove it from the queues.
6661 */
6662 if (fault_page_size < PAGE_SIZE) {
6663 DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
6664 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
6665 fault_phys_offset < PAGE_SIZE),
6666 "0x%llx\n", (uint64_t)fault_phys_offset);
6667 } else {
6668 assertf(fault_phys_offset == 0,
6669 "0x%llx\n", (uint64_t)fault_phys_offset);
6670 }
6671 assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
6672 assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
6673 need_retry = false;
6674 if (caller_pmap) {
6675 kr = vm_fault_enter(m,
6676 caller_pmap,
6677 caller_pmap_addr,
6678 fault_page_size,
6679 fault_phys_offset,
6680 prot,
6681 caller_prot,
6682 wired,
6683 wire_tag,
6684 fault_info,
6685 &need_retry,
6686 &type_of_fault,
6687 &object_lock_type,
6688 &page_sleep_needed);
6689 } else {
6690 kr = vm_fault_enter(m,
6691 pmap,
6692 vaddr,
6693 fault_page_size,
6694 fault_phys_offset,
6695 prot,
6696 caller_prot,
6697 wired,
6698 wire_tag,
6699 fault_info,
6700 &need_retry,
6701 &type_of_fault,
6702 &object_lock_type,
6703 &page_sleep_needed);
6704 }
6705 assert(VM_PAGE_OBJECT(m) == m_object);
6706
6707 {
6708 int event_code = 0;
6709
6710 if (m_object->internal) {
6711 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
6712 } else if (m_object->object_is_shared_cache) {
6713 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
6714 } else {
6715 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
6716 }
6717
6718 KDBG_RELEASE(event_code | DBG_FUNC_NONE, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid());
6719 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid());
6720
6721 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
6722 }
6723 if ((kr != KERN_SUCCESS) || page_sleep_needed || need_retry) {
6724 /* abort this page fault */
6725 vm_page_wakeup_done(m_object, m);
6726 vm_fault_cleanup(m_object, top_page);
6727 vm_object_deallocate(object);
6728
6729 if (need_retry) {
6730 /*
6731 * We could not expand the page table while holding an
6732 * object lock.
6733 * Expand it now and retry the fault.
6734 */
6735 assert3u(kr, ==, KERN_RESOURCE_SHORTAGE);
6736 if (caller_pmap) {
6737 (void)pmap_enter_options(
6738 caller_pmap, caller_pmap_addr, 0, 0, 0, 0, 0,
6739 PMAP_OPTIONS_NOENTER, NULL,
6740 PMAP_MAPPING_TYPE_INFER);
6741 } else {
6742 (void)pmap_enter_options(
6743 pmap, vaddr, 0, 0, 0, 0, 0,
6744 PMAP_OPTIONS_NOENTER, NULL,
6745 PMAP_MAPPING_TYPE_INFER);
6746 }
6747 need_retry = FALSE;
6748 kr = KERN_SUCCESS; /* retry fault instead of failing below */
6749 }
6750
6751 vm_map_unlock_read(map);
6752 if (real_map != map) {
6753 vm_map_unlock(real_map);
6754 }
6755
6756 if (kr != KERN_SUCCESS) {
6757 goto done;
6758 }
6759 goto RetryFault;
6760 }
6761 if (physpage_p != NULL) {
6762 /* for vm_map_wire_and_extract() */
6763 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6764 if (prot & VM_PROT_WRITE) {
6765 vm_object_lock_assert_exclusive(m_object);
6766 m->vmp_dirty = TRUE;
6767 }
6768 }
6769 } else {
6770 vm_map_entry_t entry;
6771 vm_map_offset_t laddr;
6772 vm_map_offset_t ldelta, hdelta;
6773
6774 /*
6775 * do a pmap block mapping from the physical address
6776 * in the object
6777 */
6778
6779 if (real_map != map) {
6780 vm_map_unlock(real_map);
6781 }
6782
6783 if (original_map != map) {
6784 vm_map_unlock_read(map);
6785 vm_map_lock_read(original_map);
6786 map = original_map;
6787 }
6788 real_map = map;
6789
6790 laddr = vaddr;
6791 hdelta = ldelta = (vm_map_offset_t)0xFFFFFFFFFFFFF000ULL;
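/*
 * ldelta/hdelta start out effectively unbounded and are clamped,
 * at each level of the submap chain, to the distance from vaddr
 * to the start/end of the enclosing map entry; together they
 * bound the size of the block mapping set up below.
 */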
6792
6793 while (vm_map_lookup_entry(map, laddr, &entry)) {
6794 if (ldelta > (laddr - entry->vme_start)) {
6795 ldelta = laddr - entry->vme_start;
6796 }
6797 if (hdelta > (entry->vme_end - laddr)) {
6798 hdelta = entry->vme_end - laddr;
6799 }
6800 if (entry->is_sub_map) {
6801 vm_map_t sub_map;
6802 bool use_pmap;
6803
6804 laddr = ((laddr - entry->vme_start)
6805 + VME_OFFSET(entry));
6806 vm_map_lock_read(VME_SUBMAP(entry));
6807 sub_map = VME_SUBMAP(entry);
6808 use_pmap = entry->use_pmap;
6809 entry = VM_MAP_ENTRY_NULL; /* not valid after unlock */
6810 if (map != real_map) {
6811 vm_map_unlock_read(map);
6812 }
6813 if (use_pmap) {
6814 vm_map_unlock_read(real_map);
6815 real_map = sub_map;
6816 }
6817 map = sub_map;
6818 } else {
6819 break;
6820 }
6821 }
6822
6823 if (vm_map_lookup_entry(map, laddr, &entry) &&
6824 (!entry->is_sub_map) &&
6825 (object != VM_OBJECT_NULL) &&
6826 (VME_OBJECT(entry) == object)) {
6827 uint16_t superpage;
6828
6829 if (!object->pager_created &&
6830 object->phys_contiguous &&
6831 VME_OFFSET(entry) == 0 &&
6832 (entry->vme_end - entry->vme_start == object->vo_size) &&
6833 VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
6834 superpage = VM_MEM_SUPERPAGE;
6835 } else {
6836 superpage = 0;
6837 }
6838
6839 if (superpage && physpage_p) {
6840 /* for vm_map_wire_and_extract() */
6841 *physpage_p = (ppnum_t)
6842 ((((vm_map_offset_t)
6843 object->vo_shadow_offset)
6844 + VME_OFFSET(entry)
6845 + (laddr - entry->vme_start))
6846 >> PAGE_SHIFT);
6847 }
6848
6849 /*
6850 * Set up a block mapped area
6851 */
6852 assert((uint32_t)((ldelta + hdelta) >> fault_page_shift) == ((ldelta + hdelta) >> fault_page_shift));
6853 pmap_t block_map_pmap;
6854 addr64_t block_map_va;
6855 pmap_paddr_t block_map_pa = (pmap_paddr_t)(((vm_map_offset_t)(object->vo_shadow_offset)) +
6856 VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta);
6857 int block_map_wimg = VM_WIMG_MASK & (int)object->wimg_bits;
6858 if (caller_pmap) {
6859 block_map_pmap = caller_pmap;
6860 block_map_va = (addr64_t)(caller_pmap_addr - ldelta);
6861 } else {
6862 block_map_pmap = real_map->pmap;
6863 block_map_va = (addr64_t)(vaddr - ldelta);
6864 }
6865 #if HAS_MTE
6866 /*
6867 * We hit this path if we return SUCCESS from vm_fault_page but don't
6868 * return a page. This happens if we're trying to fault in a
6869 * phys_contiguous object (used by device pagers and superpages), or
6870 * if the page is non-VM managed. Both of these cases are not
6871 * expected to occur with MTE.
6872 */
6873 assert(!vm_should_override_mte_cacheattr(block_map_pmap, object, block_map_va, block_map_pa));
6874 #endif /* HAS_MTE */
6875 kr = pmap_map_block_addr(block_map_pmap,
6876 block_map_va,
6877 block_map_pa,
6878 (uint32_t)((ldelta + hdelta) >> fault_page_shift),
6879 prot,
6880 block_map_wimg | superpage,
6881 0);
6882
6883 if (kr != KERN_SUCCESS) {
6884 goto cleanup;
6885 }
6886 }
6887 }
6888
6889 /*
6890 * Success
6891 */
6892 kr = KERN_SUCCESS;
6893
6894 /*
6895 * TODO: could most of the done cases just use cleanup?
6896 */
6897 cleanup:
6898 /*
6899 * Unlock everything, and return
6900 */
6901 vm_map_unlock_read(map);
6902 if (real_map != map) {
6903 vm_map_unlock(real_map);
6904 }
6905
6906 if (m != VM_PAGE_NULL) {
6907 if (__improbable(rtfault &&
6908 !m->vmp_realtime &&
6909 vm_pageout_protect_realtime)) {
6910 vm_page_lock_queues();
6911 if (!m->vmp_realtime) {
6912 m->vmp_realtime = true;
6913 VM_COUNTER_INC(&vm_page_realtime_count);
6914 }
6915 vm_page_unlock_queues();
6916 }
6917 assert(VM_PAGE_OBJECT(m) == m_object);
6918
6919 if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
6920 vm_object_paging_begin(m_object);
6921
6922 assert3p(written_on_object, ==, VM_OBJECT_NULL);
6923 written_on_object = m_object;
6924 written_on_pager = m_object->pager;
6925 written_on_offset = m_object->paging_offset + m->vmp_offset;
6926 }
6927 vm_page_wakeup_done(m_object, m);
6928
6929 vm_fault_cleanup(m_object, top_page);
6930 } else {
6931 vm_fault_cleanup(object, top_page);
6932 }
6933
6934 vm_object_deallocate(object);
6935
6936 #undef RELEASE_PAGE
6937
6938 done:
6939 thread_interrupt_level(interruptible_state);
6940
6941 if (resilient_media_object != VM_OBJECT_NULL) {
6942 assert(resilient_media_retry);
6943 assert(resilient_media_offset != (vm_object_offset_t)-1);
6944 /* release extra reference on failed object */
6945 // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
6946 vm_object_deallocate(resilient_media_object);
6947 resilient_media_object = VM_OBJECT_NULL;
6948 resilient_media_offset = (vm_object_offset_t)-1;
6949 resilient_media_retry = false;
6950 vm_fault_resilient_media_release++;
6951 }
6952 assert(!resilient_media_retry);
6953
6954 /*
6955 * Only I/O throttle on faults which cause a pagein/swapin.
6956 */
6957 if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
6958 throttle_lowpri_io(1);
6959 } else {
6960 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
6961 if ((throttle_delay = vm_page_throttled(TRUE))) {
6962 if (vm_debug_events) {
6963 if (type_of_fault == DBG_COMPRESSOR_FAULT) {
6964 VM_DEBUG_EVENT(vmf_compressordelay, DBG_VM_FAULT_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6965 } else if (type_of_fault == DBG_COW_FAULT) {
6966 VM_DEBUG_EVENT(vmf_cowdelay, DBG_VM_FAULT_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6967 } else {
6968 VM_DEBUG_EVENT(vmf_zfdelay, DBG_VM_FAULT_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6969 }
6970 }
6971 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
6972 }
6973 }
6974 }
6975
6976 if (written_on_object) {
6977 vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
6978
6979 vm_object_lock(written_on_object);
6980 vm_object_paging_end(written_on_object);
6981 vm_object_unlock(written_on_object);
6982
6983 written_on_object = VM_OBJECT_NULL;
6984 }
6985
6986 if (rtfault) {
6987 vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
6988 }
6989
6990 KDBG_RELEASE(
6991 (VMDBG_CODE(DBG_VM_FAULT_INTERNAL)) | DBG_FUNC_END,
6992 ((uint64_t)trace_vaddr >> 32),
6993 trace_vaddr,
6994 kr,
6995 vm_fault_type_for_tracing(need_copy_on_read, type_of_fault));
6996
6997 if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) {
6998 DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr);
6999 }
7000
7001 vmlp_api_end(VM_FAULT_INTERNAL, KERN_FAILURE);
7002 return kr;
7003 }
7004
7005 /*
7006 * vm_fault_wire:
7007 *
7008 * Wire down a range of virtual addresses in a map.
7009 */
7010 kern_return_t
7011 vm_fault_wire(
7012 vm_map_t map,
7013 vm_map_entry_t entry,
7014 vm_prot_t prot,
7015 vm_tag_t wire_tag,
7016 pmap_t pmap,
7017 vm_map_offset_t pmap_addr,
7018 ppnum_t *physpage_p)
7019 {
7020 vm_map_offset_t va;
7021 vm_map_offset_t end_addr = entry->vme_end;
7022 kern_return_t rc;
7023 vm_map_size_t effective_page_size;
7024
7025 assert(entry->in_transition);
7026
7027 if (!entry->is_sub_map &&
7028 VME_OBJECT(entry) != VM_OBJECT_NULL &&
7029 VME_OBJECT(entry)->phys_contiguous) {
7030 return KERN_SUCCESS;
7031 }
7032
7033 /*
7034 * Inform the physical mapping system that the
7035 * range of addresses may not fault, so that
7036 * page tables and such can be locked down as well.
7037 */
7038
7039 pmap_pageable(pmap, pmap_addr,
7040 pmap_addr + (end_addr - entry->vme_start), FALSE);
7041
7042 /*
7043 * We simulate a fault to get the page and enter it
7044 * in the physical map.
7045 */
7046
7047 effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
7048 for (va = entry->vme_start;
7049 va < end_addr;
7050 va += effective_page_size) {
7051 rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
7052 pmap_addr + (va - entry->vme_start),
7053 physpage_p);
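/*
 * The fast path only handles a page that is already resident in the
 * top-level object; anything more complicated falls back to the
 * full fault path below.
 */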
7054 if (rc != KERN_SUCCESS) {
7055 struct vm_object_fault_info fault_info = {
7056 .interruptible = (pmap == kernel_pmap) ? THREAD_UNINT : THREAD_ABORTSAFE,
7057 .behavior = VM_BEHAVIOR_SEQUENTIAL,
7058 .fi_change_wiring = true,
7059 };
7060 if (os_sub_overflow(end_addr, va, &fault_info.cluster_size)) {
7061 fault_info.cluster_size = UPL_SIZE_MAX;
7062 }
7063 rc = vm_fault_internal(map, va, prot, wire_tag,
7064 pmap,
7065 (pmap_addr +
7066 (va - entry->vme_start)),
7067 physpage_p,
7068 &fault_info);
7069 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
7070 }
7071
7072 if (rc != KERN_SUCCESS) {
7073 struct vm_map_entry tmp_entry = *entry;
7074
7075 /* unwire wired pages */
7076 tmp_entry.vme_end = va;
7077 vm_fault_unwire(map, &tmp_entry, FALSE,
7078 pmap, pmap_addr, tmp_entry.vme_end);
7079
7080 return rc;
7081 }
7082 }
7083 return KERN_SUCCESS;
7084 }
7085
7086 /*
7087 * vm_fault_unwire:
7088 *
7089 * Unwire a range of virtual addresses in a map.
7090 */
7091 void
7092 vm_fault_unwire(
7093 vm_map_t map,
7094 vm_map_entry_t entry,
7095 boolean_t deallocate,
7096 pmap_t pmap,
7097 vm_map_offset_t pmap_addr,
7098 vm_map_offset_t end_addr)
7099 {
7100 vm_map_offset_t va;
7101 vm_object_t object;
7102 struct vm_object_fault_info fault_info = {
7103 .interruptible = THREAD_UNINT,
7104 };
7105 unsigned int unwired_pages;
7106 vm_map_size_t effective_page_size;
7107
7108 object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
7109
7110 /*
7111 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
7112 * do anything since such memory is wired by default. So we don't have
7113 * anything to undo here.
7114 */
7115
7116 if (object != VM_OBJECT_NULL && object->phys_contiguous) {
7117 return;
7118 }
7119
7120 fault_info.interruptible = THREAD_UNINT;
7121 fault_info.behavior = entry->behavior;
7122 fault_info.user_tag = VME_ALIAS(entry);
7123 if (entry->iokit_acct ||
7124 (!entry->is_sub_map && !entry->use_pmap)) {
7125 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
7126 }
7127 fault_info.lo_offset = VME_OFFSET(entry);
7128 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
7129 fault_info.no_cache = entry->no_cache;
7130 fault_info.stealth = TRUE;
7131 if (entry->vme_xnu_user_debug) {
7132 /*
7133 * Modified code-signed executable region: wired pages must
7134 * have been copied, so they should be XNU_USER_DEBUG rather
7135 * than XNU_USER_EXEC.
7136 */
7137 fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
7138 }
7139
7140 unwired_pages = 0;
7141
7142 /*
7143 * Since the pages are wired down, we must be able to
7144 * get their mappings from the physical map system.
7145 */
7146
7147 effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
7148 for (va = entry->vme_start;
7149 va < end_addr;
7150 va += effective_page_size) {
7151 if (object == VM_OBJECT_NULL) {
7152 if (pmap) {
7153 pmap_change_wiring(pmap,
7154 pmap_addr + (va - entry->vme_start), FALSE);
7155 }
7156 (void) vm_fault(map, va, VM_PROT_NONE,
7157 TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
7158 } else {
7159 vm_prot_t prot;
7160 vm_page_t result_page;
7161 vm_page_t top_page;
7162 vm_object_t result_object;
7163 vm_fault_return_t result;
7164
7165 /* cap cluster size at maximum UPL size */
7166 upl_size_t cluster_size;
7167 if (os_sub_overflow(end_addr, va, &cluster_size)) {
7168 cluster_size = UPL_SIZE_MAX;
7169 }
7170 fault_info.cluster_size = cluster_size;
7171
7172 do {
7173 prot = VM_PROT_NONE;
7174
7175 vm_object_lock(object);
7176 vm_object_paging_begin(object);
7177 result_page = VM_PAGE_NULL;
7178 result = vm_fault_page(
7179 object,
7180 (VME_OFFSET(entry) +
7181 (va - entry->vme_start)),
7182 VM_PROT_NONE, TRUE,
7183 FALSE, /* page not looked up */
7184 &prot, &result_page, &top_page,
7185 (int *)0,
7186 NULL, map->no_zero_fill,
7187 &fault_info);
7188 } while (result == VM_FAULT_RETRY);
7189
7190 /*
7191 * If this was a mapping to a file on a device that has been forcibly
7192 * unmounted, then we won't get a page back from vm_fault_page(). Just
7193 * move on to the next one in case the remaining pages are mapped from
7194 * different objects. During a forced unmount, the object is terminated
7195 * so the alive flag will be false if this happens. A forced unmount
7196 * will occur when an external disk is unplugged before the user does an
7197 * eject, so we don't want to panic in that situation.
7198 */
7199
7200 if (result == VM_FAULT_MEMORY_ERROR) {
7201 if (!object->alive) {
7202 continue;
7203 }
7204 if (!object->internal && object->pager == NULL) {
7205 continue;
7206 }
7207 }
7208
7209 if (result == VM_FAULT_MEMORY_ERROR &&
7210 is_kernel_object(object)) {
7211 /*
7212 * This must have been allocated with
7213 * KMA_KOBJECT and KMA_VAONLY and there's
7214 * no physical page at this offset.
7215 * We're done (no page to free).
7216 */
7217 assert(deallocate);
7218 continue;
7219 }
7220
7221 if (result != VM_FAULT_SUCCESS) {
7222 panic("vm_fault_unwire: failure");
7223 }
7224
7225 result_object = VM_PAGE_OBJECT(result_page);
7226
7227 if (deallocate) {
7228 assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
7229 vm_page_fictitious_addr);
7230 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
7231 if (VM_PAGE_WIRED(result_page)) {
7232 unwired_pages++;
7233 }
7234 VM_PAGE_FREE(result_page);
7235 } else {
7236 if (pmap && !vm_page_is_guard(result_page)) {
7237 pmap_change_wiring(pmap,
7238 pmap_addr + (va - entry->vme_start), FALSE);
7239 }
7240
7241
7242 if (VM_PAGE_WIRED(result_page)) {
7243 vm_page_lockspin_queues();
7244 vm_page_unwire(result_page, TRUE);
7245 vm_page_unlock_queues();
7246 unwired_pages++;
7247 }
7248 if (entry->zero_wired_pages &&
7249 (entry->protection & VM_PROT_WRITE) &&
7250 #if __arm64e__
7251 !entry->used_for_tpro &&
7252 #endif /* __arm64e__ */
7253 !entry->used_for_jit) {
7254 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
7255 }
7256
7257 vm_page_wakeup_done(result_object, result_page);
7258 }
7259 vm_fault_cleanup(result_object, top_page);
7260 }
7261 }
7262
7263 /*
7264 * Inform the physical mapping system that the range
7265 * of addresses may fault, so that page tables and
7266 * such may be unwired themselves.
7267 */
7268
7269 pmap_pageable(pmap, pmap_addr,
7270 pmap_addr + (end_addr - entry->vme_start), TRUE);
7271
7272 if (is_kernel_object(object)) {
7273 /*
7274 * Would like to make user_tag in vm_object_fault_info
7275 * vm_tag_t (unsigned short) but user_tag derives its value from
7276 * VME_ALIAS(entry) in a few places and VME_ALIAS, in turn, casts
7277 * to an _unsigned int_, which is used by non-fault_info paths in
7278 * many places throughout the code.
7279 *
7280 * So, for now, an explicit truncation to unsigned short (vm_tag_t).
7281 */
7282 assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag,
7283 "VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK));
7284 vm_tag_update_size((vm_tag_t) fault_info.user_tag, -ptoa_64(unwired_pages), NULL);
7285 }
7286 }
7287
7288 /*
7289 * vm_fault_wire_fast:
7290 *
7291 * Handle common case of a wire down page fault at the given address.
7292 * If successful, the page is inserted into the associated physical map.
7293 * The map entry is passed in to avoid the overhead of a map lookup.
7294 *
7295 * NOTE: the given address should be truncated to the
7296 * proper page address.
7297 *
7298 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
7299 * a standard error specifying why the fault is fatal is returned.
7300 *
7301 * The map in question must be referenced, and remains so.
7302 * Caller has a read lock on the map.
7303 *
7304 * This is a stripped version of vm_fault() for wiring pages. Anything
7305 * other than the common case will return KERN_FAILURE, and the caller
7306 * is expected to call vm_fault().
7307 */
7308 static kern_return_t
7309 vm_fault_wire_fast(
7310 __unused vm_map_t map,
7311 vm_map_offset_t va,
7312 __unused vm_prot_t caller_prot,
7313 vm_tag_t wire_tag,
7314 vm_map_entry_t entry,
7315 pmap_t pmap,
7316 vm_map_offset_t pmap_addr,
7317 ppnum_t *physpage_p)
7318 {
7319 vm_object_t object;
7320 vm_object_offset_t offset;
7321 vm_page_t m;
7322 vm_prot_t prot;
7323 thread_t thread = current_thread();
7324 int type_of_fault;
7325 kern_return_t kr;
7326 vm_map_size_t fault_page_size;
7327 vm_map_offset_t fault_phys_offset;
7328 struct vm_object_fault_info fault_info = {
7329 .interruptible = THREAD_UNINT,
7330 };
7331 uint8_t object_lock_type = 0;
7332
7333 counter_inc(&vm_statistics_faults);
7334
7335 if (thread != THREAD_NULL) {
7336 counter_inc(&get_threadtask(thread)->faults);
7337 }
7338
7339 /*
7340 * Recovery actions
7341 */
7342
7343 #undef RELEASE_PAGE
7344 #define RELEASE_PAGE(m) { \
7345 vm_page_wakeup_done(VM_PAGE_OBJECT(m), m); \
7346 vm_page_lockspin_queues(); \
7347 vm_page_unwire(m, TRUE); \
7348 vm_page_unlock_queues(); \
7349 }
7350
7351
7352 #undef UNLOCK_THINGS
7353 #define UNLOCK_THINGS { \
7354 vm_object_paging_end(object); \
7355 vm_object_unlock(object); \
7356 }
7357
7358 #undef UNLOCK_AND_DEALLOCATE
7359 #define UNLOCK_AND_DEALLOCATE { \
7360 UNLOCK_THINGS; \
7361 vm_object_deallocate(object); \
7362 }
7363 /*
7364 * Give up and have caller do things the hard way.
7365 */
7366
7367 #define GIVE_UP { \
7368 UNLOCK_AND_DEALLOCATE; \
7369 return(KERN_FAILURE); \
7370 }
7371
7372
7373 /*
7374 * If this entry is not directly to a vm_object, bail out.
7375 */
7376 if (entry->is_sub_map) {
7377 assert(physpage_p == NULL);
7378 return KERN_FAILURE;
7379 }
7380
7381 /*
7382 * Find the backing store object and offset into it.
7383 */
7384
7385 object = VME_OBJECT(entry);
7386 offset = (va - entry->vme_start) + VME_OFFSET(entry);
7387 prot = entry->protection;
7388
7389 /*
7390 * Make a reference to this object to prevent its
7391 * disposal while we are messing with it.
7392 */
7393
7394 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
7395 vm_object_lock(object);
7396 vm_object_reference_locked(object);
7397 vm_object_paging_begin(object);
7398
7399 /*
7400 * INVARIANTS (through entire routine):
7401 *
7402 * 1) At all times, we must either have the object
7403 * lock or a busy page in some object to prevent
7404 * some other thread from trying to bring in
7405 * the same page.
7406 *
7407 * 2) Once we have a busy page, we must remove it from
7408 * the pageout queues, so that the pageout daemon
7409 * will not grab it away.
7410 *
7411 */
7412
7413 if (entry->needs_copy) {
7414 panic("attempting to wire needs_copy memory");
7415 }
7416
7417 /*
7418 * Since we don't have the machinery to resolve CoW obligations on the fast
7419 * path, if we might have to push pages to a copy, just give up.
7420 */
7421 if (object->vo_copy != VM_OBJECT_NULL) {
7422 GIVE_UP;
7423 }
7424
7425 /*
7426 * Look for page in top-level object. If it's not there or
7427 * there's something going on, give up.
7428 */
7429 m = vm_page_lookup(object, vm_object_trunc_page(offset));
7430 if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
7431 (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) {
7432 GIVE_UP;
7433 }
7434 if (vm_page_is_guard(m)) {
7435 /*
7436 * Guard pages are fictitious pages and are never
7437 * entered into a pmap, so let's say it's been wired...
7438 */
7439 kr = KERN_SUCCESS;
7440 goto done;
7441 }
7442
7443 /*
7444 * Wire the page down now. All bail outs beyond this
7445 * point must unwire the page.
7446 */
7447
7448 vm_page_lockspin_queues();
7449 vm_page_wire(m, wire_tag, TRUE);
7450 vm_page_unlock_queues();
7451
7452 /*
7453 * Mark page busy for other threads.
7454 */
7455 assert(!m->vmp_busy);
7456 m->vmp_busy = TRUE;
7457 assert(!m->vmp_absent);
7458
7459 fault_info.user_tag = VME_ALIAS(entry);
7460 fault_info.pmap_options = 0;
7461 if (entry->iokit_acct ||
7462 (!entry->is_sub_map && !entry->use_pmap)) {
7463 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
7464 }
7465 if (entry->vme_xnu_user_debug) {
7466 /*
7467 * Modified code-signed executable region: wiring will
7468 * copy the pages, so they should be XNU_USER_DEBUG rather
7469 * than XNU_USER_EXEC.
7470 */
7471 fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
7472 }
7473
7474 if (entry->translated_allow_execute) {
7475 fault_info.pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
7476 }
7477
7478 fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
7479 fault_phys_offset = offset - vm_object_trunc_page(offset);
7480
7481 /*
7482 * Put this page into the physical map.
7483 */
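/* the page was found resident above, so account for it as a cache hit */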
7484 type_of_fault = DBG_CACHE_HIT_FAULT;
7485 assert3p(VM_PAGE_OBJECT(m), ==, object);
7486 bool page_sleep_needed = false;
7487 bool need_retry = false;
7488 kr = vm_fault_enter(m,
7489 pmap,
7490 pmap_addr,
7491 fault_page_size,
7492 fault_phys_offset,
7493 prot,
7494 prot,
7495 TRUE, /* wired */
7496 wire_tag,
7497 &fault_info,
7498 &need_retry,
7499 &type_of_fault,
7500 &object_lock_type, /* Exclusive lock mode. Will remain unchanged.*/
7501 &page_sleep_needed);
7502 if ((kr != KERN_SUCCESS) || page_sleep_needed || need_retry) {
7503 RELEASE_PAGE(m);
7504 GIVE_UP;
7505 }
7506
7507
7508 done:
7509 /*
7510 * Unlock everything, and return
7511 */
7512
7513 if (physpage_p) {
7514 /* for vm_map_wire_and_extract() */
7515 if (kr == KERN_SUCCESS) {
7516 assert3p(object, ==, VM_PAGE_OBJECT(m));
7517 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
7518 if (prot & VM_PROT_WRITE) {
7519 vm_object_lock_assert_exclusive(object);
7520 m->vmp_dirty = TRUE;
7521 }
7522 } else {
7523 *physpage_p = 0;
7524 }
7525 }
7526
7527 if (m->vmp_busy) {
7528 vm_page_wakeup_done(object, m);
7529 }
7530
7531 UNLOCK_AND_DEALLOCATE;
7532
7533 return kr;
7534 }
7535
7536 /*
7537 * Routine: vm_fault_copy_cleanup
7538 * Purpose:
7539 * Release a page used by vm_fault_copy.
7540 */
7541
7542 static void
7543 vm_fault_copy_cleanup(
7544 vm_page_t page,
7545 vm_page_t top_page)
7546 {
7547 vm_object_t object = VM_PAGE_OBJECT(page);
7548
7549 vm_object_lock(object);
7550 vm_page_wakeup_done(object, page);
7551 if (!VM_PAGE_PAGEABLE(page)) {
7552 vm_page_lockspin_queues();
7553 if (!VM_PAGE_PAGEABLE(page)) {
7554 vm_page_activate(page);
7555 }
7556 vm_page_unlock_queues();
7557 }
7558 vm_fault_cleanup(object, top_page);
7559 }
7560
7561 static void
7562 vm_fault_copy_dst_cleanup(
7563 vm_page_t page)
7564 {
7565 vm_object_t object;
7566
7567 if (page != VM_PAGE_NULL) {
7568 object = VM_PAGE_OBJECT(page);
7569 vm_object_lock(object);
7570 vm_page_lockspin_queues();
7571 vm_page_unwire(page, TRUE);
7572 vm_page_unlock_queues();
7573 vm_object_paging_end(object);
7574 vm_object_unlock(object);
7575 }
7576 }
7577
7578 /*
7579 * Routine: vm_fault_copy
7580 *
7581 * Purpose:
7582 * Copy pages from one virtual memory object to another --
7583 * neither the source nor destination pages need be resident.
7584 *
7585 * Before actually copying a page, the version associated with
7586 * the destination address map will be verified.
7587 *
7588 * In/out conditions:
7589 * The caller must hold a reference, but not a lock, to
7590 * each of the source and destination objects and to the
7591 * destination map.
7592 *
7593 * Results:
7594 * Returns KERN_SUCCESS if no errors were encountered in
7595 * reading or writing the data. Returns KERN_INTERRUPTED if
7596 * the operation was interrupted (only possible if the
7597 * "interruptible" argument is asserted). Other return values
7598 * indicate a permanent error in copying the data.
7599 *
7600 * The actual amount of data copied will be returned in the
7601 * "copy_size" argument. In the event that the destination map
7602 * verification failed, this amount may be less than the amount
7603 * requested.
7604 */
7605 kern_return_t
7606 vm_fault_copy(
7607 vm_object_t src_object,
7608 vm_object_offset_t src_offset,
7609 vm_map_size_t *copy_size, /* INOUT */
7610 vm_object_t dst_object,
7611 vm_object_offset_t dst_offset,
7612 vm_map_t dst_map,
7613 vm_map_version_t *dst_version,
7614 int interruptible)
7615 {
7616 vm_page_t result_page;
7617
7618 vm_page_t src_page;
7619 vm_page_t src_top_page;
7620 vm_prot_t src_prot;
7621
7622 vm_page_t dst_page;
7623 vm_page_t dst_top_page;
7624 vm_prot_t dst_prot;
7625
7626 vm_map_size_t amount_left;
7627 vm_object_t old_copy_object;
7628 uint64_t old_copy_version;
7629 vm_object_t result_page_object = NULL;
7630 kern_return_t error = 0;
7631 vm_fault_return_t result;
7632
7633 vm_map_size_t part_size;
7634 struct vm_object_fault_info fault_info_src = {};
7635 struct vm_object_fault_info fault_info_dst = {};
7636
7637 vmlp_api_start(VM_FAULT_COPY);
7638 vmlp_range_event(dst_map, dst_offset, *copy_size);
7639
7640 /*
7641 * In order not to confuse the clustered pageins, align
7642 * the different offsets on a page boundary.
7643 */
7644
7645 #define RETURN(x) \
7646 MACRO_BEGIN \
7647 *copy_size -= amount_left; \
7648 vmlp_api_end(VM_FAULT_COPY, x); \
7649 MACRO_RETURN(x); \
7650 MACRO_END
7651
7652 amount_left = *copy_size;
7653
7654 fault_info_src.interruptible = interruptible;
7655 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
7656 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
7657 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
7658 fault_info_src.stealth = TRUE;
7659
7660 fault_info_dst.interruptible = interruptible;
7661 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
7662 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
7663 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
7664 fault_info_dst.stealth = TRUE;
7665
7666 do { /* while (amount_left > 0) */
7667 /*
7668 * There may be a deadlock if both source and destination
7669 * pages are the same. To avoid this deadlock, the copy must
7670 * start by getting the destination page in order to apply
7671 * COW semantics if any.
7672 */
7673
7674 RetryDestinationFault:;
7675
7676 dst_prot = VM_PROT_WRITE | VM_PROT_READ;
7677
7678 vm_object_lock(dst_object);
7679 vm_object_paging_begin(dst_object);
7680
7681 /* cap cluster size at maximum UPL size */
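/* (on overflow, 0 - PAGE_SIZE wraps to the largest page-aligned upl_size_t value) */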
7682 upl_size_t cluster_size;
7683 if (os_convert_overflow(amount_left, &cluster_size)) {
7684 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
7685 }
7686 fault_info_dst.cluster_size = cluster_size;
7687
7688 dst_page = VM_PAGE_NULL;
7689 result = vm_fault_page(dst_object,
7690 vm_object_trunc_page(dst_offset),
7691 VM_PROT_WRITE | VM_PROT_READ,
7692 FALSE,
7693 FALSE, /* page not looked up */
7694 &dst_prot, &dst_page, &dst_top_page,
7695 (int *)0,
7696 &error,
7697 dst_map->no_zero_fill,
7698 &fault_info_dst);
7699 switch (result) {
7700 case VM_FAULT_SUCCESS:
7701 break;
7702 case VM_FAULT_RETRY:
7703 goto RetryDestinationFault;
7704 case VM_FAULT_MEMORY_SHORTAGE:
7705 if (vm_page_wait(interruptible)) {
7706 goto RetryDestinationFault;
7707 }
7708 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_COPY_MEMORY_SHORTAGE), 0 /* arg */);
7709 OS_FALLTHROUGH;
7710 case VM_FAULT_INTERRUPTED:
7711 RETURN(MACH_SEND_INTERRUPTED);
7712 case VM_FAULT_SUCCESS_NO_VM_PAGE:
7713 /* success but no VM page: fail the copy */
7714 vm_object_paging_end(dst_object);
7715 vm_object_unlock(dst_object);
7716 OS_FALLTHROUGH;
7717 case VM_FAULT_MEMORY_ERROR:
7718 if (error) {
7719 vmlp_api_end(VM_FAULT_COPY, error);
7720 return error;
7721 } else {
7722 vmlp_api_end(VM_FAULT_COPY, KERN_MEMORY_ERROR);
7723 return KERN_MEMORY_ERROR;
7724 }
7725 default:
7726 panic("vm_fault_copy: unexpected error 0x%x from "
7727 "vm_fault_page()\n", result);
7728 }
7729 assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
7730
7731 assert(dst_object == VM_PAGE_OBJECT(dst_page));
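/*
 * Snapshot the copy object and its version so that, after the
 * object lock is dropped and retaken, we can detect a concurrent
 * copy-on-write push and restart the copy for this page.
 */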
7732 old_copy_object = dst_object->vo_copy;
7733 old_copy_version = dst_object->vo_copy_version;
7734
7735 /*
7736 * There exists the possibility that the source and
7737 * destination page are the same. But we can't
7738 * easily determine that now. If they are the
7739 * same, the call to vm_fault_page() for the source
7740 * page would deadlock on the busy destination page. To prevent this we
7741 * wire the page so we can drop busy without having
7742 * the page daemon steal the page. We clean up the
7743 * top page but keep the paging reference on the object
7744 * holding the dest page so it doesn't go away.
7745 */
7746
7747 vm_page_lockspin_queues();
7748 vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
7749 vm_page_unlock_queues();
7750 vm_page_wakeup_done(dst_object, dst_page);
7751 vm_object_unlock(dst_object);
7752
7753 if (dst_top_page != VM_PAGE_NULL) {
7754 vm_object_lock(dst_object);
7755 VM_PAGE_FREE(dst_top_page);
7756 vm_object_paging_end(dst_object);
7757 vm_object_unlock(dst_object);
7758 }
7759
7760 RetrySourceFault:;
7761
7762 if (src_object == VM_OBJECT_NULL) {
7763 /*
7764 * No source object. We will just
7765 * zero-fill the page in dst_object.
7766 */
7767 src_page = VM_PAGE_NULL;
7768 result_page = VM_PAGE_NULL;
7769 } else {
7770 vm_object_lock(src_object);
7771 src_page = vm_page_lookup(src_object,
7772 vm_object_trunc_page(src_offset));
7773 if (src_page == dst_page) {
7774 src_prot = dst_prot;
7775 result_page = VM_PAGE_NULL;
7776 } else {
7777 src_prot = VM_PROT_READ;
7778 vm_object_paging_begin(src_object);
7779
7780 /* cap cluster size at maximum UPL size */
7781 if (os_convert_overflow(amount_left, &cluster_size)) {
7782 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
7783 }
7784 fault_info_src.cluster_size = cluster_size;
7785
7786 result_page = VM_PAGE_NULL;
7787 result = vm_fault_page(
7788 src_object,
7789 vm_object_trunc_page(src_offset),
7790 VM_PROT_READ, FALSE,
7791 FALSE, /* page not looked up */
7792 &src_prot,
7793 &result_page, &src_top_page,
7794 (int *)0, &error, FALSE,
7795 &fault_info_src);
7796
7797 switch (result) {
7798 case VM_FAULT_SUCCESS:
7799 break;
7800 case VM_FAULT_RETRY:
7801 goto RetrySourceFault;
7802 case VM_FAULT_MEMORY_SHORTAGE:
7803 if (vm_page_wait(interruptible)) {
7804 goto RetrySourceFault;
7805 }
7806 OS_FALLTHROUGH;
7807 case VM_FAULT_INTERRUPTED:
7808 vm_fault_copy_dst_cleanup(dst_page);
7809 RETURN(MACH_SEND_INTERRUPTED);
7810 case VM_FAULT_SUCCESS_NO_VM_PAGE:
7811 /* success but no VM page: fail */
7812 vm_object_paging_end(src_object);
7813 vm_object_unlock(src_object);
7814 OS_FALLTHROUGH;
7815 case VM_FAULT_MEMORY_ERROR:
7816 vm_fault_copy_dst_cleanup(dst_page);
7817 if (error) {
7818 vmlp_api_end(VM_FAULT_COPY, error);
7819 return error;
7820 } else {
7821 vmlp_api_end(VM_FAULT_COPY, KERN_MEMORY_ERROR);
7822 return KERN_MEMORY_ERROR;
7823 }
7824 default:
7825 panic("vm_fault_copy(2): unexpected "
7826 "error 0x%x from "
7827 "vm_fault_page()\n", result);
7828 }
7829
7830 result_page_object = VM_PAGE_OBJECT(result_page);
7831 assert((src_top_page == VM_PAGE_NULL) ==
7832 (result_page_object == src_object));
7833 }
7834 assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
7835 vm_object_unlock(result_page_object);
7836 }
7837
7838 vm_map_lock_read(dst_map);
7839
7840 if (!vm_map_verify(dst_map, dst_version)) {
7841 vm_map_unlock_read(dst_map);
7842 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7843 vm_fault_copy_cleanup(result_page, src_top_page);
7844 }
7845 vm_fault_copy_dst_cleanup(dst_page);
7846 break;
7847 }
7848 assert(dst_object == VM_PAGE_OBJECT(dst_page));
7849
7850 vm_object_lock(dst_object);
7851
7852 if ((dst_object->vo_copy != old_copy_object ||
7853 dst_object->vo_copy_version != old_copy_version)) {
7854 vm_object_unlock(dst_object);
7855 vm_map_unlock_read(dst_map);
7856 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7857 vm_fault_copy_cleanup(result_page, src_top_page);
7858 }
7859 vm_fault_copy_dst_cleanup(dst_page);
7860 break;
7861 }
7862
7863 /**
7864 * Avoid overwriting a page that has become busy while dst_object's lock was dropped.
7865 * Re-run the loop at the same position; if necessary, vm_fault_page() will wait
7866 * for the destination page to be unbusied.
7867 */
7868 if (__improbable(dst_page->vmp_busy)) {
7869 vm_object_unlock(dst_object);
7870 vm_map_unlock_read(dst_map);
7871 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7872 vm_fault_copy_cleanup(result_page, src_top_page);
7873 }
7874 vm_fault_copy_dst_cleanup(dst_page);
7875 continue;
7876 }
7877
7878 #if CONFIG_SPTM
7879 if (__improbable(PMAP_PAGE_IS_USER_EXECUTABLE(dst_page))) {
7880 /**
7881 * We've found a page with an executable frame type, which likely means its physical aperture
7882 * mapping is write-protected, so we won't be able to do the copy below. We'll need to remove
7883 * all extant mappings and retype the page, but first we need to make sure we can safely retype.
7884 */
7885 if (__improbable(dst_page->vmp_cleaning || dst_page->vmp_iopl_wired)) {
7886 /**
7887 * Clean up our locking state and source page/object references so that we can safely
7888 * sleep on the destination page.
7889 */
7890 vm_object_unlock(dst_object);
7891 vm_map_unlock_read(dst_map);
7892 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7893 vm_fault_copy_cleanup(result_page, src_top_page);
7894 }
7895 vm_object_lock(dst_object);
7896 assert3p(dst_object, ==, VM_PAGE_OBJECT(dst_page));
7897 if (dst_page->vmp_iopl_wired) {
7898 /**
7899 * If the page is wired for I/O, we can't safely retype and we can't reasonably
7900 * wait for the I/O to finish.
7901 */
7902 vm_object_unlock(dst_object);
7903 vm_fault_copy_dst_cleanup(dst_page);
7904 vmlp_api_end(VM_FAULT_COPY, KERN_MEMORY_ERROR);
7905 return KERN_MEMORY_ERROR;
7906 } else if (dst_page->vmp_cleaning) {
7907 /**
7908 * We can wait for an in-place clean to finish.
7909 * NOTE: The page is still wired and we still hold a paging reference on the object
7910 * at this point, both of which will be undone by vm_fault_copy_dst_cleanup().
7911 * Is it really safe to sleep on the page in that state?
7912 */
7913 wait_result_t wres = vm_page_sleep(dst_object, dst_page, interruptible, LCK_SLEEP_UNLOCK);
7914 vm_fault_copy_dst_cleanup(dst_page);
7915 if (wres == THREAD_AWAKENED || wres == THREAD_RESTART) {
7916 continue;
7917 } else {
7918 vmlp_api_end(VM_FAULT_COPY, KERN_ABORTED);
7919 return KERN_ABORTED;
7920 }
7921 } else {
7922 /**
7923 * The cleaning or I/O state we initially observed went away while the object
7924 * lock was dropped. Since we've torn down much of our state already, we need
7925 * to rerun the copy loop at the same position.
7926 */
7927 vm_object_unlock(dst_object);
7928 vm_fault_copy_dst_cleanup(dst_page);
7929 continue;
7930 }
7931 }
7932 /**
7933 * Remove all existing mappings and retype the page. Consumers of the page will be forced to
7934 * re-fault it and, if necessary, re-validate it for codesigning.
7935 */
7936 pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(dst_page), PMAP_OPTIONS_RETYPE, NULL);
7937 }
7938 #endif /* CONFIG_SPTM */
7939
7940 /**
7941 * Copy the page, and note that it is dirty immediately.
7942 * NOTE: if we're concerned about lock contention due to holding the object lock across the copy,
7943 * we could instead consider marking dst_page busy and dropping the lock, but only if we have some
7944 * other means of preventing a CoW bypass on this path.
7945 */
7946
7947 vm_object_offset_t src_po, dst_po;
7948
7949 src_po = src_offset - vm_object_trunc_page(src_offset);
7950 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
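/*
 * Bound the copy by whichever page has less room left past its
 * intra-page offset; e.g. with src_po = 0x200 and dst_po = 0x600
 * on 4K pages, part_size = PAGE_SIZE - 0x600 = 0xa00 bytes.
 */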
7951
7952 if (dst_po > src_po) {
7953 part_size = PAGE_SIZE - dst_po;
7954 } else {
7955 part_size = PAGE_SIZE - src_po;
7956 }
7957 if (part_size > (amount_left)) {
7958 part_size = amount_left;
7959 }
7960
7961 /**
7962 * For the case in which we're copying a full page, we don't want to use vm_page_copy() here
7963 * because that will do CS validation (unnecessarily in this case) which requires the source
7964 * object lock to be held, which in turn would complicate our locking requirements since we
7965 * already hold the destination object lock. Instead we treat the full-page case as simply
7966 * a zero-offset/PAGE_SIZE variant of the partial-page case, which keeps the code simpler
7967 * anyway.
7968 */
7969 if (result_page == VM_PAGE_NULL) {
7970 assert((vm_offset_t) dst_po == dst_po);
7971 assert((vm_size_t) part_size == part_size);
7972 vm_page_part_zero_fill(dst_page,
7973 (vm_offset_t) dst_po,
7974 (vm_size_t) part_size);
7975 } else {
7976 assert((vm_offset_t) src_po == src_po);
7977 assert((vm_offset_t) dst_po == dst_po);
7978 assert((vm_size_t) part_size == part_size);
7979 vm_page_part_copy(result_page,
7980 (vm_offset_t) src_po,
7981 dst_page,
7982 (vm_offset_t) dst_po,
7983 (vm_size_t)part_size);
7984 if (!dst_page->vmp_dirty) {
7985 SET_PAGE_DIRTY(dst_page, TRUE);
7986 }
7987 }
7988 vm_object_unlock(dst_object);
7989
7990 /*
7991 * Unlock everything, and return
7992 */
7993
7994 vm_map_unlock_read(dst_map);
7995
7996 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7997 vm_fault_copy_cleanup(result_page, src_top_page);
7998 }
7999 vm_fault_copy_dst_cleanup(dst_page);
8000
8001 amount_left -= part_size;
8002 src_offset += part_size;
8003 dst_offset += part_size;
8004 } while (amount_left > 0);
8005
8006 RETURN(KERN_SUCCESS);
8007 #undef RETURN
8008
8009 /*NOTREACHED*/
8010 }
8011
8012 #if VM_FAULT_CLASSIFY
8013 /*
8014 * Temporary statistics gathering support.
8015 */
8016
8017 /*
8018 * Statistics arrays:
8019 */
8020 #define VM_FAULT_TYPES_MAX 5
8021 #define VM_FAULT_LEVEL_MAX 8
8022
8023 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
8024
8025 #define VM_FAULT_TYPE_ZERO_FILL 0
8026 #define VM_FAULT_TYPE_MAP_IN 1
8027 #define VM_FAULT_TYPE_PAGER 2
8028 #define VM_FAULT_TYPE_COPY 3
8029 #define VM_FAULT_TYPE_OTHER 4
8030
8031
8032 void
8033 vm_fault_classify(vm_object_t object,
8034 vm_object_offset_t offset,
8035 vm_prot_t fault_type)
8036 {
8037 int type, level = 0;
8038 vm_page_t m;
8039
8040 while (TRUE) {
8041 m = vm_page_lookup(object, offset);
8042 if (m != VM_PAGE_NULL) {
8043 if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
8044 type = VM_FAULT_TYPE_OTHER;
8045 break;
8046 }
8047 if (((fault_type & VM_PROT_WRITE) == 0) ||
8048 ((level == 0) && object->vo_copy == VM_OBJECT_NULL)) {
8049 type = VM_FAULT_TYPE_MAP_IN;
8050 break;
8051 }
8052 type = VM_FAULT_TYPE_COPY;
8053 break;
8054 } else {
8055 if (object->pager_created) {
8056 type = VM_FAULT_TYPE_PAGER;
8057 break;
8058 }
8059 if (object->shadow == VM_OBJECT_NULL) {
8060 type = VM_FAULT_TYPE_ZERO_FILL;
8061 break;
8062 }
8063
8064 offset += object->vo_shadow_offset;
8065 object = object->shadow;
8066 level++;
8067 continue;
8068 }
8069 }
8070
8071 if (level > VM_FAULT_LEVEL_MAX) {
8072 level = VM_FAULT_LEVEL_MAX;
8073 }
8074
8075 vm_fault_stats[type][level] += 1;
8076
8077 return;
8078 }
8079
8080 /* cleanup routine to call from debugger */
8081
8082 void
8083 vm_fault_classify_init(void)
8084 {
8085 int type, level;
8086
8087 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
8088 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
8089 vm_fault_stats[type][level] = 0;
8090 }
8091 }
8092
8093 return;
8094 }
8095 #endif /* VM_FAULT_CLASSIFY */
8096
8097 static inline bool
8098 object_supports_coredump(const vm_object_t object)
8099 {
8100 switch (object->wimg_bits & VM_WIMG_MASK) {
8101 case VM_WIMG_DEFAULT:
8102 return true;
8103 #if HAS_MTE
8104 case VM_WIMG_MTE:
8105 return true;
8106 #endif /* HAS_MTE */
8107 default:
8108 return false;
8109 }
8110 }
8111
8112 vm_offset_t
8113 kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, bool multi_cpu)
8114 {
8115 vm_map_entry_t entry;
8116 vm_object_t object;
8117 vm_offset_t object_offset;
8118 vm_page_t m;
8119 int compressor_external_state, compressed_count_delta;
8120 vm_compressor_options_t compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
8121 int my_fault_type = VM_PROT_READ;
8122 kern_return_t kr;
8123 int effective_page_mask, effective_page_size;
8124 int my_cpu_no = cpu_number();
8125 ppnum_t decomp_ppnum;
8126 addr64_t decomp_paddr;
8127
8128 vmlp_api_start(KDP_LIGHTWEIGHT_FAULT);
8129
8130 if (multi_cpu) {
8131 compressor_flags |= C_KDP_MULTICPU;
8132 }
8133
8134 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
8135 effective_page_mask = VM_MAP_PAGE_MASK(map);
8136 effective_page_size = VM_MAP_PAGE_SIZE(map);
8137 } else {
8138 effective_page_mask = PAGE_MASK;
8139 effective_page_size = PAGE_SIZE;
8140 }
8141
8142 if (not_in_kdp) {
8143 panic("kdp_lightweight_fault called from outside of debugger context");
8144 }
8145
8146 assert(map != VM_MAP_NULL);
8147
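/*
 * On kernels built without assertions, assert() compiles away, so
 * each sanity check below is paired with a graceful failure path:
 * in debugger context we must not panic on bad input, just fail
 * the fault and return 0.
 */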
8148 assert((cur_target_addr & effective_page_mask) == 0);
8149 if ((cur_target_addr & effective_page_mask) != 0) {
8150 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8151 return 0;
8152 }
8153
8154 if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
8155 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8156 return 0;
8157 }
8158
8159 if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
8160 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8161 return 0;
8162 }
8163
8164 vmlp_range_event_entry(map, entry);
8165
8166 if (entry->is_sub_map) {
8167 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8168 return 0;
8169 }
8170
8171 object = VME_OBJECT(entry);
8172 if (object == VM_OBJECT_NULL) {
8173 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8174 return 0;
8175 }
8176
8177 object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
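/*
 * Walk the shadow chain locklessly: at each level, bail out if the
 * object lock is held exclusively, look for a resident page, then
 * try the compressor, and otherwise descend to the shadow object
 * with the adjusted offset.
 */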
8178
8179 while (TRUE) {
8180 if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
8181 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8182 return 0;
8183 }
8184
8185 if (object->pager_created && (object->paging_in_progress ||
8186 object->activity_in_progress)) {
8187 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8188 return 0;
8189 }
8190
8191 m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset));
8192
8193 if (m != VM_PAGE_NULL) {
8194 if (!object_supports_coredump(object)) {
8195 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8196 return 0;
8197 }
8198
8199 if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done ||
8200 m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_cleaning ||
8201 m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
8202 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8203 return 0;
8204 }
8205
8206 assert(!vm_page_is_private(m));
8207 if (vm_page_is_private(m)) {
8208 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8209 return 0;
8210 }
8211
8212 assert(!vm_page_is_fictitious(m));
8213 if (vm_page_is_fictitious(m)) {
8214 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8215 return 0;
8216 }
8217
8218 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
8219 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
8220 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8221 return 0;
8222 }
8223
8224 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, 0);
8225 return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
8226 }
8227
8228 compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
8229
8230 if (multi_cpu) {
8231 assert(vm_compressor_kdp_state.kc_decompressed_pages_ppnum != NULL);
8232 assert(vm_compressor_kdp_state.kc_decompressed_pages_paddr != NULL);
8233 decomp_ppnum = vm_compressor_kdp_state.kc_decompressed_pages_ppnum[my_cpu_no];
8234 decomp_paddr = vm_compressor_kdp_state.kc_decompressed_pages_paddr[my_cpu_no];
8235 } else {
8236 decomp_ppnum = vm_compressor_kdp_state.kc_panic_decompressed_page_ppnum;
8237 decomp_paddr = vm_compressor_kdp_state.kc_panic_decompressed_page_paddr;
8238 }
8239
8240 if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
8241 if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
8242 #if HAS_MTE
8243 if (vm_object_is_mte_mappable(object)) {
8244 compressor_flags |= C_MTE | C_MTE_DROP_TAGS;
8245 }
8246 #endif /* HAS_MTE */
8247 kr = vm_compressor_pager_get(object->pager,
8248 vm_object_trunc_page(object_offset + object->paging_offset),
8249 decomp_ppnum, &my_fault_type,
8250 compressor_flags, &compressed_count_delta);
8251 if (kr == KERN_SUCCESS) {
8252 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, 0);
8253 return decomp_paddr;
8254 } else {
8255 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8256 return 0;
8257 }
8258 }
8259 }
8260
8261 if (object->shadow == VM_OBJECT_NULL) {
8262 vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8263 return 0;
8264 }
8265
8266 object_offset += object->vo_shadow_offset;
8267 object = object->shadow;
8268 }
8269 }
8270
8271 /*
8272 * vm_page_validate_cs_fast():
8273 * Performs a few quick checks to determine if the page's code signature
8274 * really needs to be fully validated. It could:
8275 * 1. have been modified (i.e. automatically tainted),
8276 * 2. have already been validated,
8277 * 3. have already been found to be tainted,
8278 * 4. no longer have a backing store.
8279 * Returns FALSE if the page needs to be fully validated.
8280 */
8281 static boolean_t
8282 vm_page_validate_cs_fast(
8283 vm_page_t page,
8284 vm_map_size_t fault_page_size,
8285 vm_map_offset_t fault_phys_offset)
8286 {
8287 vm_object_t object;
8288
8289 object = VM_PAGE_OBJECT(page);
8290 vm_object_lock_assert_held(object);
8291
8292 if (page->vmp_wpmapped &&
8293 !VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
8294 /*
8295 * This page was mapped for "write" access sometime in the
8296 * past and could still be modifiable in the future.
8297 * Consider it tainted.
8298 * [ If the page was already found to be "tainted", no
8299 * need to re-validate. ]
8300 */
8301 vm_object_lock_assert_exclusive(object);
8302 VMP_CS_SET_VALIDATED(page, fault_page_size, fault_phys_offset, TRUE);
8303 VMP_CS_SET_TAINTED(page, fault_page_size, fault_phys_offset, TRUE);
8304 if (cs_debug) {
8305 printf("CODESIGNING: %s: "
8306 "page %p obj %p off 0x%llx "
8307 "was modified\n",
8308 __FUNCTION__,
8309 page, object, page->vmp_offset);
8310 }
8311 vm_cs_validated_dirtied++;
8312 }
8313
8314 if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) ||
8315 VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
8316 return TRUE;
8317 }
8318 vm_object_lock_assert_exclusive(object);
8319
8320 #if CHECK_CS_VALIDATION_BITMAP
8321 kern_return_t kr;
8322
8323 kr = vnode_pager_cs_check_validation_bitmap(
8324 object->pager,
8325 page->vmp_offset + object->paging_offset,
8326 CS_BITMAP_CHECK);
8327 if (kr == KERN_SUCCESS) {
8328 page->vmp_cs_validated = VMP_CS_ALL_TRUE;
8329 page->vmp_cs_tainted = VMP_CS_ALL_FALSE;
8330 vm_cs_bitmap_validated++;
8331 return TRUE;
8332 }
8333 #endif /* CHECK_CS_VALIDATION_BITMAP */
8334
8335 if (!object->alive || object->terminating || object->pager == NULL) {
8336 /*
8337 * The object is terminating and we don't have its pager
8338 * so we can't validate the data...
8339 */
8340 return TRUE;
8341 }
8342
8343 /* we need to really validate this page */
8344 vm_object_lock_assert_exclusive(object);
8345 return FALSE;
8346 }
8347
8348 void
8349 vm_page_validate_cs_mapped_slow(
8350 vm_page_t page,
8351 const void *kaddr)
8352 {
8353 vm_object_t object;
8354 memory_object_offset_t mo_offset;
8355 memory_object_t pager;
8356 struct vnode *vnode;
8357 int validated, tainted, nx;
8358
8359 assert(page->vmp_busy);
8360 object = VM_PAGE_OBJECT(page);
8361 vm_object_lock_assert_exclusive(object);
8362
8363 vm_cs_validates++;
8364
8365 /*
8366 * Since we get here to validate a page that was brought in by
8367 * the pager, we know that this pager is all setup and ready
8368 * by now.
8369 */
8370 assert(object->code_signed);
8371 assert(!object->internal);
8372 assert(object->pager != NULL);
8373 assert(object->pager_ready);
8374
8375 pager = object->pager;
8376 assert(object->paging_in_progress);
8377 vnode = vnode_pager_lookup_vnode(pager);
8378 mo_offset = page->vmp_offset + object->paging_offset;
8379
8380 /* verify the SHA1 hash for this page */
8381 validated = 0;
8382 tainted = 0;
8383 nx = 0;
8384 cs_validate_page(vnode,
8385 pager,
8386 mo_offset,
8387 (const void *)((const char *)kaddr),
8388 &validated,
8389 &tainted,
8390 &nx);
8391
8392 page->vmp_cs_validated |= validated;
8393 page->vmp_cs_tainted |= tainted;
8394 page->vmp_cs_nx |= nx;
8395
8396 #if CHECK_CS_VALIDATION_BITMAP
8397 if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
8398 page->vmp_cs_tainted == VMP_CS_ALL_FALSE) {
8399 vnode_pager_cs_check_validation_bitmap(object->pager,
8400 mo_offset,
8401 CS_BITMAP_SET);
8402 }
8403 #endif /* CHECK_CS_VALIDATION_BITMAP */
8404 }
8405
8406 void
8407 vm_page_validate_cs_mapped(
8408 vm_page_t page,
8409 vm_map_size_t fault_page_size,
8410 vm_map_offset_t fault_phys_offset,
8411 const void *kaddr)
8412 {
8413 if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
8414 vm_page_validate_cs_mapped_slow(page, kaddr);
8415 }
8416 }
8417
8418 static void
8419 vm_page_map_and_validate_cs(
8420 vm_object_t object,
8421 vm_page_t page)
8422 {
8423 vm_object_offset_t offset;
8424 vm_map_offset_t koffset;
8425 vm_map_size_t ksize;
8426 vm_offset_t kaddr;
8427 kern_return_t kr;
8428 boolean_t busy_page;
8429 boolean_t need_unmap;
8430
8431 vm_object_lock_assert_exclusive(object);
8432
8433 assert(object->code_signed);
8434 offset = page->vmp_offset;
8435
8436 busy_page = page->vmp_busy;
8437 if (!busy_page) {
8438 /* keep page busy while we map (and unlock) the VM object */
8439 page->vmp_busy = TRUE;
8440 }
8441
8442 /*
8443 * Take a paging reference on the VM object
8444 * to protect it from collapse or bypass,
8445 * and keep it from disappearing too.
8446 */
8447 vm_object_paging_begin(object);
8448
8449 /* map the page in the kernel address space */
8450 ksize = PAGE_SIZE_64;
8451 koffset = 0;
8452 need_unmap = FALSE;
8453 kr = vm_paging_map_object(page,
8454 object,
8455 offset,
8456 VM_PROT_READ,
8457 FALSE, /* can't unlock object ! */
8458 &ksize,
8459 &koffset,
8460 &need_unmap);
8461 if (kr != KERN_SUCCESS) {
8462 panic("%s: could not map page: 0x%x", __FUNCTION__, kr);
8463 }
8464 kaddr = CAST_DOWN(vm_offset_t, koffset);
8465
8466 /* validate the mapped page */
8467 vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
8468
8469 assert(page->vmp_busy);
8470 assert(object == VM_PAGE_OBJECT(page));
8471 vm_object_lock_assert_exclusive(object);
8472
8473 if (!busy_page) {
8474 vm_page_wakeup_done(object, page);
8475 }
8476 if (need_unmap) {
8477 /* unmap the map from the kernel address space */
8478 vm_paging_unmap_object(object, koffset, koffset + ksize);
8479 koffset = 0;
8480 ksize = 0;
8481 kaddr = 0;
8482 }
8483 vm_object_paging_end(object);
8484 }
8485
8486 void
8487 vm_page_validate_cs(
8488 vm_page_t page,
8489 vm_map_size_t fault_page_size,
8490 vm_map_offset_t fault_phys_offset)
8491 {
8492 vm_object_t object;
8493
8494 object = VM_PAGE_OBJECT(page);
8495 vm_object_lock_assert_held(object);
8496
8497 if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
8498 return;
8499 }
8500 vm_page_map_and_validate_cs(object, page);
8501 }
8502
8503 void
8504 vm_page_validate_cs_mapped_chunk(
8505 vm_page_t page,
8506 const void *kaddr,
8507 vm_offset_t chunk_offset,
8508 vm_size_t chunk_size,
8509 boolean_t *validated_p,
8510 unsigned *tainted_p)
8511 {
8512 vm_object_t object;
8513 vm_object_offset_t offset, offset_in_page;
8514 memory_object_t pager;
8515 struct vnode *vnode;
8516 boolean_t validated;
8517 unsigned tainted;
8518
8519 *validated_p = FALSE;
8520 *tainted_p = 0;
8521
8522 assert(page->vmp_busy);
8523 object = VM_PAGE_OBJECT(page);
8524 vm_object_lock_assert_exclusive(object);
8525
8526 assert(object->code_signed);
8527 offset = page->vmp_offset;
8528
8529 if (!object->alive || object->terminating || object->pager == NULL) {
8530 /*
8531 * The object is terminating and we don't have its pager
8532 * so we can't validate the data...
8533 */
8534 return;
8535 }
8536 /*
8537 * Since we get here to validate a page that was brought in by
8538 * the pager, we know that this pager is all setup and ready
8539 * by now.
8540 */
8541 assert(!object->internal);
8542 assert(object->pager != NULL);
8543 assert(object->pager_ready);
8544
8545 pager = object->pager;
8546 assert(object->paging_in_progress);
8547 vnode = vnode_pager_lookup_vnode(pager);
8548
8549 /* verify the signature for this chunk */
8550 offset_in_page = chunk_offset;
8551 assert(offset_in_page < PAGE_SIZE);
8552
8553 tainted = 0;
8554 validated = cs_validate_range(vnode,
8555 pager,
8556 (object->paging_offset +
8557 offset +
8558 offset_in_page),
8559 (const void *)((const char *)kaddr
8560 + offset_in_page),
8561 chunk_size,
8562 &tainted);
8563 if (validated) {
8564 *validated_p = TRUE;
8565 }
8566 if (tainted) {
8567 *tainted_p = tainted;
8568 }
8569 }
8570
8571 static void
8572 vm_rtfrecord_lock(void)
8573 {
8574 lck_spin_lock(&vm_rtfr_slock);
8575 }
8576
8577 static void
8578 vm_rtfrecord_unlock(void)
8579 {
8580 lck_spin_unlock(&vm_rtfr_slock);
8581 }
8582
8583 unsigned int
8584 vmrtfaultinfo_bufsz(void)
8585 {
8586 return vmrtf_num_records * sizeof(vm_rtfault_record_t);
8587 }
8588
8589 #include <kern/backtrace.h>
8590
8591 __attribute__((noinline))
8592 static void
8593 vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
8594 {
8595 uint64_t fend = mach_continuous_time();
8596
8597 uint64_t cfpc = 0;
8598 uint64_t ctid = cthread->thread_id;
8599 uint64_t cupid = get_current_unique_pid();
8600
8601 uintptr_t bpc = 0;
8602 errno_t btr = 0;
8603
8604 /*
8605 * Capture a single-frame backtrace. This extracts just the program
8606 * counter at the point of the fault, and should not use copyin to get
8607 * Rosetta save state.
8608 */
8609 struct backtrace_control ctl = {
8610 .btc_user_thread = cthread,
8611 .btc_user_copy = backtrace_user_copy_error,
8612 };
8613 unsigned int bfrs = backtrace_user(&bpc, 1U, &ctl, NULL);
8614 if ((btr == 0) && (bfrs > 0)) {
8615 cfpc = bpc;
8616 }
8617
8618 assert((fstart != 0) && fend >= fstart);
8619 vm_rtfrecord_lock();
8620 assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
8621
8622 vmrtfrs.vmrtf_total++;
8623 vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
8624
8625 cvmr->rtfabstime = fstart;
8626 cvmr->rtfduration = fend - fstart;
8627 cvmr->rtfaddr = fault_vaddr;
8628 cvmr->rtfpc = cfpc;
8629 cvmr->rtftype = type_of_fault;
8630 cvmr->rtfupid = cupid;
8631 cvmr->rtftid = ctid;
8632
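/* the record array is a ring buffer: wrap the cursor once it passes the end */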
8633 if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
8634 vmrtfrs.vmrtfr_curi = 0;
8635 }
8636
8637 vm_rtfrecord_unlock();
8638 }
8639
8640 int
8641 vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void *vrecords, unsigned long *vmrtfrv)
8642 {
8643 vm_rtfault_record_t *cvmrd = vrecords;
8644 size_t residue = vrecordsz;
8645 size_t numextracted = 0;
8646 boolean_t early_exit = FALSE;
8647
8648 vm_rtfrecord_lock();
8649
8650 for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
8651 if (residue < sizeof(vm_rtfault_record_t)) {
8652 early_exit = TRUE;
8653 break;
8654 }
8655
8656 if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
8657 #if DEVELOPMENT || DEBUG
8658 if (isroot == FALSE) {
8659 continue;
8660 }
8661 #else
8662 continue;
8663 #endif /* DEVELOPMENT || DEBUG */
8664 }
8665
8666 *cvmrd = vmrtfrs.vm_rtf_records[vmfi];
8667 cvmrd++;
8668 residue -= sizeof(vm_rtfault_record_t);
8669 numextracted++;
8670 }
8671
8672 vm_rtfrecord_unlock();
8673
8674 *vmrtfrv = numextracted;
8675 return early_exit;
8676 }
8677
8678 /*
8679 * Only allow one diagnosis to be in flight at a time, to avoid
8680 * creating too much additional memory usage.
8681 */
8682 static volatile uint_t vmtc_diagnosing;
8683 unsigned int vmtc_total = 0;
8684
8685 /*
8686 * Type used to update telemetry for the diagnosis counts.
8687 */
8688 CA_EVENT(vmtc_telemetry,
8689 CA_INT, vmtc_num_byte, /* number of corrupt bytes found */
8690 CA_BOOL, vmtc_undiagnosed, /* undiagnosed because more than 1 at a time */
8691 CA_BOOL, vmtc_not_eligible, /* the page didn't qualify */
8692 CA_BOOL, vmtc_copyin_fail, /* unable to copy in the page */
8693 CA_BOOL, vmtc_not_found, /* no corruption found even though CS failed */
8694 CA_BOOL, vmtc_one_bit_flip, /* single bit flip */
8695 CA_BOOL, vmtc_testing); /* caused on purpose by testing */
8696
8697 #if DEVELOPMENT || DEBUG
8698 /*
8699 * Buffers used to compare before/after page contents.
8700 * Stashed to aid when debugging crashes.
8701 */
8702 static size_t vmtc_last_buffer_size = 0;
8703 static uint64_t *vmtc_last_before_buffer = NULL;
8704 static uint64_t *vmtc_last_after_buffer = NULL;
8705
8706 /*
8707 * Needed to record corruptions due to testing.
8708 */
8709 static uintptr_t corruption_test_va = 0;
8710 #endif /* DEVELOPMENT || DEBUG */
8711
8712 /*
8713 * Stash a copy of data from a possibly corrupt page.
8714 */
8715 static uint64_t *
8716 vmtc_get_page_data(
8717 vm_map_offset_t code_addr,
8718 vm_page_t page)
8719 {
8720 uint64_t *buffer = NULL;
8721 addr64_t buffer_paddr;
8722 addr64_t page_paddr;
8723 extern void bcopy_phys(addr64_t from, addr64_t to, vm_size_t bytes);
8724 uint_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
8725
8726 /*
8727 * Need an aligned buffer to do a physical copy.
8728 */
8729 if (kernel_memory_allocate(kernel_map, (vm_offset_t *)&buffer,
8730 size, size - 1, KMA_KOBJECT, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) {
8731 return NULL;
8732 }
8733 buffer_paddr = kvtophys((vm_offset_t)buffer);
8734 page_paddr = ptoa(VM_PAGE_GET_PHYS_PAGE(page));
8735
8736 /* adjust the page start address if we need only 4K of a 16K page */
8737 if (size < PAGE_SIZE) {
8738 uint_t subpage_start = ((code_addr & (PAGE_SIZE - 1)) & ~(size - 1));
8739 page_paddr += subpage_start;
8740 }
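/* copy via physical addresses, so no kernel virtual mapping of the user page is needed */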
8741
8742 bcopy_phys(page_paddr, buffer_paddr, size);
8743 return buffer;
8744 }
8745
8746 /*
8747 * Set things up so we can diagnose a potential text page corruption.
8748 */
8749 static uint64_t *
8750 vmtc_text_page_diagnose_setup(
8751 vm_map_offset_t code_addr,
8752 vm_page_t page,
8753 CA_EVENT_TYPE(vmtc_telemetry) *event)
8754 {
8755 uint64_t *buffer = NULL;
8756
8757 /*
8758 * If another is being diagnosed, skip this one.
8759 */
8760 if (!OSCompareAndSwap(0, 1, &vmtc_diagnosing)) {
8761 event->vmtc_undiagnosed = true;
8762 return NULL;
8763 }
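	/*
	 * Note: the OSCompareAndSwap(0, 1, ...) above atomically claims the
	 * single diagnosis slot. It returns false when another thread already
	 * holds the slot, in which case this corruption is only counted as
	 * undiagnosed in the telemetry.
	 */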

	/*
	 * Get the contents of the corrupt page.
	 */
	buffer = vmtc_get_page_data(code_addr, page);
	if (buffer == NULL) {
		event->vmtc_copyin_fail = true;
		if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
			panic("Bad compare and swap in setup!");
		}
		return NULL;
	}
	return buffer;
}

/*
 * Diagnose the text page by comparing its contents with
 * the one we've previously saved.
 */
static void
vmtc_text_page_diagnose(
	vm_map_offset_t code_addr,
	uint64_t *old_code_buffer,
	CA_EVENT_TYPE(vmtc_telemetry) *event)
{
	uint64_t *new_code_buffer;
	size_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
	uint_t count = (uint_t)size / sizeof(uint64_t);
	uint_t diff_count = 0;
	bool bit_flip = false;
	uint_t b;
	uint64_t *new;
	uint64_t *old;

	new_code_buffer = kalloc_data(size, Z_WAITOK);
	assert(new_code_buffer != NULL);
	if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), new_code_buffer, size) != 0) {
		/* copyin error, so undo things */
		event->vmtc_copyin_fail = true;
		goto done;
	}

	new = new_code_buffer;
	old = old_code_buffer;
	for (; count-- > 0; ++new, ++old) {
		if (*new == *old) {
			continue;
		}

		/*
		 * On first diff, check for a single bit flip
		 */
		if (diff_count == 0) {
			uint64_t x = (*new ^ *old);
			assert(x != 0);
			if ((x & (x - 1)) == 0) {
				bit_flip = true;
				++diff_count;
				continue;
			}
		}
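		/*
		 * Note on the check above: (x & (x - 1)) clears the lowest set
		 * bit of x, so it is zero exactly when x is a power of two,
		 * i.e. when old and new differ in a single bit. For example,
		 * x == 0x0000000000100000 gives x - 1 == 0x00000000000fffff
		 * and (x & (x - 1)) == 0.
		 */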

		/*
		 * count up the number of different bytes.
		 */
		for (b = 0; b < sizeof(uint64_t); ++b) {
			char *n = (char *)new;
			char *o = (char *)old;
			if (n[b] != o[b]) {
				++diff_count;
			}
		}
	}

	if (diff_count > 1) {
		bit_flip = false;
	}

	if (diff_count == 0) {
		event->vmtc_not_found = true;
	} else {
		event->vmtc_num_byte = diff_count;
	}
	if (bit_flip) {
		event->vmtc_one_bit_flip = true;
	}

done:
	/*
	 * Free up the code copy buffers, but save the last
	 * set on development / debug kernels in case they
	 * can provide evidence for debugging memory stomps.
	 */
#if DEVELOPMENT || DEBUG
	if (vmtc_last_before_buffer != NULL) {
		kmem_free(kernel_map, (vm_offset_t)vmtc_last_before_buffer, vmtc_last_buffer_size);
	}
	if (vmtc_last_after_buffer != NULL) {
		kfree_data(vmtc_last_after_buffer, vmtc_last_buffer_size);
	}
	vmtc_last_before_buffer = old_code_buffer;
	vmtc_last_after_buffer = new_code_buffer;
	vmtc_last_buffer_size = size;
#else /* DEVELOPMENT || DEBUG */
	kfree_data(new_code_buffer, size);
	kmem_free(kernel_map, (vm_offset_t)old_code_buffer, size);
#endif /* DEVELOPMENT || DEBUG */

	/*
	 * We're finished, so clear the diagnosing flag.
	 */
	if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
		panic("Bad compare and swap in diagnose!");
	}
}

/*
 * For the given map, virt address, find the object, offset, and page.
 * This has to look up the map entry, verify protections, and walk any
 * shadow chains. If found, returns with the object locked.
 */
static kern_return_t
vmtc_revalidate_lookup(
	vm_map_t map,
	vm_map_offset_t vaddr,
	vm_object_t *ret_object,
	vm_object_offset_t *ret_offset,
	vm_page_t *ret_page,
	vm_prot_t *ret_prot)
{
	vm_object_t object;
	vm_object_offset_t offset;
	vm_page_t page;
	kern_return_t kr = KERN_SUCCESS;
	uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
	vm_map_version_t version;
	boolean_t wired;
	struct vm_object_fault_info fault_info = {
		.interruptible = THREAD_UNINT
	};
	vm_map_t real_map = NULL;
	vm_prot_t prot;
	vm_object_t shadow;

	vmlp_api_start(VMTC_REVALIDATE_LOOKUP);

	/*
	 * Find the object/offset for the given location/map.
	 * Note this returns with the object locked.
	 */
restart:
	vm_map_lock_read(map);
	object = VM_OBJECT_NULL; /* in case we come around the restart path */
	kr = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
	    object_lock_type, &version, &object, &offset, &prot, &wired,
	    &fault_info, &real_map, NULL);
	vm_map_unlock_read(map);
	if (real_map != NULL && real_map != map) {
		vm_map_unlock(real_map);
	}

	/*
	 * If there's no page here, fail.
	 */
	if (kr != KERN_SUCCESS || object == NULL) {
		kr = KERN_FAILURE;
		goto done;
	}

	/*
	 * Chase down any shadow chains to find the actual page.
	 */
	for (;;) {
		/*
		 * See if the page is on the current object.
		 */
		page = vm_page_lookup(object, vm_object_trunc_page(offset));
		if (page != NULL) {
			/* restart the lookup */
			if (page->vmp_restart) {
				vm_object_unlock(object);
				goto restart;
			}

			/*
			 * If this page is busy, we need to wait for it.
			 */
			if (page->vmp_busy) {
				vm_page_sleep(object, page, THREAD_INTERRUPTIBLE, LCK_SLEEP_UNLOCK);
				goto restart;
			}
			break;
		}

		/*
		 * If the object doesn't have the page and
		 * has no shadow, then we can quit.
		 */
		shadow = object->shadow;
		if (shadow == NULL) {
			kr = KERN_FAILURE;
			goto done;
		}

		/*
		 * Move to the next object
		 */
		offset += object->vo_shadow_offset;
		vm_object_lock(shadow);
		vm_object_unlock(object);
		object = shadow;
		shadow = VM_OBJECT_NULL;
	}
	*ret_object = object;
	*ret_offset = vm_object_trunc_page(offset);
	*ret_page = page;
	*ret_prot = prot;

done:
	if (kr != KERN_SUCCESS && object != NULL) {
		vm_object_unlock(object);
	}
	vmlp_api_end(VMTC_REVALIDATE_LOOKUP, kr);
	return kr;
}

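/*
 * Illustration of the shadow chain walk above (hypothetical numbers):
 * if a copy-on-write object C shadows the original object O with
 * vo_shadow_offset 0x4000, a page missing at offset 0x1000 in C is
 * looked up at offset 0x5000 in O, and so on down the chain until a
 * page is found or an object has no shadow.
 */
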
/*
 * Check if a page is wired; this requires extra locking.
 */
static bool
is_page_wired(vm_page_t page)
{
	bool result;
	vm_page_lock_queues();
	result = VM_PAGE_WIRED(page);
	vm_page_unlock_queues();
	return result;
}

/*
 * A fatal process error has occurred in the given task.
 * Recheck the code signing of the text page at the given
 * address to check for a text page corruption.
 *
 * Returns KERN_FAILURE if a page was found to be corrupt
 * by failing to match its code signature. KERN_SUCCESS
 * means the page is either valid or we don't have the
 * information to say it's corrupt.
 */
kern_return_t
revalidate_text_page(task_t task, vm_map_offset_t code_addr)
{
	kern_return_t kr;
	vm_map_t map;
	vm_object_t object = NULL;
	vm_object_offset_t offset;
	vm_page_t page = NULL;
	struct vnode *vnode;
	uint64_t *diagnose_buffer = NULL;
	CA_EVENT_TYPE(vmtc_telemetry) *event = NULL;
	ca_event_t ca_event = NULL;
	vm_prot_t prot;

	map = task->map;
	if (task->map == NULL) {
		return KERN_SUCCESS;
	}

	kr = vmtc_revalidate_lookup(map, code_addr, &object, &offset, &page, &prot);
	if (kr != KERN_SUCCESS) {
		goto done;
	}

	/*
	 * The page must be executable.
	 */
	if (!(prot & VM_PROT_EXECUTE)) {
		goto done;
	}

	/*
	 * The object needs to have a pager.
	 */
	if (object->pager == NULL) {
		goto done;
	}

	/*
	 * Needs to be a vnode backed page to have a signature.
	 */
	vnode = vnode_pager_lookup_vnode(object->pager);
	if (vnode == NULL) {
		goto done;
	}

	/*
	 * Object checks to see if we should proceed.
	 */
	if (!object->code_signed || /* no code signature to check */
	    object->internal || /* internal objects aren't signed */
	    object->terminating || /* the object and its pages are already going away */
	    !object->pager_ready) { /* this shouldn't happen, but the check can't hurt */
		goto done;
	}

	/*
	 * Check the code signature of the page in question.
	 */
	vm_page_map_and_validate_cs(object, page);

	/*
	 * At this point:
	 * vmp_cs_validated |= validated (set if a code signature exists)
	 * vmp_cs_tainted |= tainted (set if code signature violation)
	 * vmp_cs_nx |= nx; ??
	 *
	 * if vmp_pmapped then have to pmap_disconnect..
	 * other flags to check on object or page?
	 */
	if (page->vmp_cs_tainted != VMP_CS_ALL_FALSE) {
#if DEBUG || DEVELOPMENT
		/*
		 * On development builds, a boot-arg can be used to cause
		 * a panic, instead of a quiet repair.
		 */
		if (vmtc_panic_instead) {
			panic("Text page corruption detected: vm_page_t 0x%llx", (long long)(uintptr_t)page);
		}
#endif /* DEBUG || DEVELOPMENT */

		/*
		 * We're going to invalidate this page. Grab a copy of it for comparison.
		 */
		ca_event = CA_EVENT_ALLOCATE(vmtc_telemetry);
		event = ca_event->data;
		diagnose_buffer = vmtc_text_page_diagnose_setup(code_addr, page, event);

		/*
		 * Invalidate, i.e. toss, the corrupted page.
		 */
		if (!page->vmp_cleaning &&
		    !page->vmp_laundry &&
		    !vm_page_is_fictitious(page) &&
		    !page->vmp_precious &&
		    !page->vmp_absent &&
		    !VMP_ERROR_GET(page) &&
		    !page->vmp_dirty &&
		    !is_page_wired(page)) {
			if (page->vmp_pmapped) {
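				/*
				 * pmap_disconnect() removes every mapping of the
				 * page and returns its accumulated referenced and
				 * modified bits, which are folded back into the
				 * page before deciding whether it is really dirty.
				 */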
				int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
				if (refmod & VM_MEM_MODIFIED) {
					SET_PAGE_DIRTY(page, FALSE);
				}
				if (refmod & VM_MEM_REFERENCED) {
					page->vmp_reference = TRUE;
				}
			}
			/* If the page seems intentionally modified, don't trash it. */
			if (!page->vmp_dirty) {
				VM_PAGE_FREE(page);
			} else {
				event->vmtc_not_eligible = true;
			}
		} else {
			event->vmtc_not_eligible = true;
		}
		vm_object_unlock(object);
		object = VM_OBJECT_NULL;

		/*
		 * Now try to diagnose the type of failure by faulting
		 * in a new copy and diff'ing it with what we saved.
		 */
		if (diagnose_buffer != NULL) {
			vmtc_text_page_diagnose(code_addr, diagnose_buffer, event);
		}
#if DEBUG || DEVELOPMENT
		if (corruption_test_va != 0) {
			corruption_test_va = 0;
			event->vmtc_testing = true;
		}
#endif /* DEBUG || DEVELOPMENT */
		ktriage_record(thread_tid(current_thread()),
		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_TEXT_CORRUPTION),
		    0 /* arg */);
		CA_EVENT_SEND(ca_event);
		printf("Text page corruption detected for pid %d\n", proc_selfpid());
		++vmtc_total;
		return KERN_FAILURE; /* failure means we definitely found a corrupt page */
	}
done:
	if (object != NULL) {
		vm_object_unlock(object);
	}
	return KERN_SUCCESS;
}
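
/*
 * Caller's-eye sketch (hypothetical, simplified; fault_pc is a
 * placeholder): the fatal-error path can use the return value to tell
 * genuine crashes from bit rot.
 *
 *	if (revalidate_text_page(task, fault_pc) == KERN_FAILURE) {
 *		// the text page failed its code signature and was tossed;
 *		// a retry will fault in a clean copy from the pager
 *	}
 */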

#if DEBUG || DEVELOPMENT
/*
 * For implementing unit tests - ask the pmap to corrupt a text page.
 * We have to find the page, to get the physical address, then invoke
 * the pmap.
 */
extern kern_return_t vm_corrupt_text_addr(uintptr_t);

kern_return_t
vm_corrupt_text_addr(uintptr_t va)
{
	task_t task = current_task();
	vm_map_t map;
	kern_return_t kr = KERN_SUCCESS;
	vm_object_t object = VM_OBJECT_NULL;
	vm_object_offset_t offset;
	vm_page_t page = NULL;
	pmap_paddr_t pa;
	vm_prot_t prot;

	map = task->map;
	if (task->map == NULL) {
		printf("corrupt_text_addr: no map\n");
		return KERN_FAILURE;
	}

	kr = vmtc_revalidate_lookup(map, (vm_map_offset_t)va, &object, &offset, &page, &prot);
	if (kr != KERN_SUCCESS) {
		printf("corrupt_text_addr: page lookup failed\n");
		return kr;
	}
	if (!(prot & VM_PROT_EXECUTE)) {
		printf("corrupt_text_addr: page not executable\n");
		vm_object_unlock(object); /* the lookup returned with the object locked */
		return KERN_FAILURE;
	}

	/* get the physical address to use */
	pa = ptoa(VM_PAGE_GET_PHYS_PAGE(page)) + (va - vm_object_trunc_page(va));

	/*
	 * Check we have something we can work with.
	 * Due to racing with pageout as we enter the sysctl,
	 * it's theoretically possible for the page to disappear just
	 * before the lookup.
	 *
	 * That's unlikely to happen often. I've filed radar 72857482
	 * to bubble up the error here to the sysctl result and have the
	 * test not FAIL in that case.
	 */
	if (page->vmp_busy) {
		printf("corrupt_text_addr: vmp_busy\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_cleaning) {
		printf("corrupt_text_addr: vmp_cleaning\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_laundry) {
		printf("corrupt_text_addr: vmp_laundry\n");
		kr = KERN_FAILURE;
	}
	if (vm_page_is_fictitious(page)) {
		printf("corrupt_text_addr: vmp_fictitious\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_precious) {
		printf("corrupt_text_addr: vmp_precious\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_absent) {
		printf("corrupt_text_addr: vmp_absent\n");
		kr = KERN_FAILURE;
	}
	if (VMP_ERROR_GET(page)) {
		printf("corrupt_text_addr: vmp_error\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_dirty) {
		printf("corrupt_text_addr: vmp_dirty\n");
		kr = KERN_FAILURE;
	}
	if (is_page_wired(page)) {
		printf("corrupt_text_addr: wired\n");
		kr = KERN_FAILURE;
	}
	if (!page->vmp_pmapped) {
		printf("corrupt_text_addr: !vmp_pmapped\n");
		kr = KERN_FAILURE;
	}

	if (kr == KERN_SUCCESS) {
		printf("corrupt_text_addr: using physaddr 0x%llx\n", (long long)pa);
		kr = pmap_test_text_corruption(pa);
		if (kr != KERN_SUCCESS) {
			printf("corrupt_text_addr: pmap error %d\n", kr);
		} else {
			corruption_test_va = va;
		}
	} else {
		printf("corrupt_text_addr: object %p\n", object);
		printf("corrupt_text_addr: offset 0x%llx\n", (uint64_t)offset);
		printf("corrupt_text_addr: va 0x%llx\n", (uint64_t)va);
		printf("corrupt_text_addr: vm_object_trunc_page(va) 0x%llx\n", (uint64_t)vm_object_trunc_page(va));
		printf("corrupt_text_addr: vm_page_t %p\n", page);
		printf("corrupt_text_addr: ptoa(PHYS_PAGE) 0x%llx\n", (uint64_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page)));
		printf("corrupt_text_addr: using physaddr 0x%llx\n", (uint64_t)pa);
	}

	if (object != VM_OBJECT_NULL) {
		vm_object_unlock(object);
	}
	return kr;
}
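
/*
 * Test usage sketch (hypothetical; test_text_va is a placeholder): a
 * DEVELOPMENT/DEBUG-only sysctl handler could pass an address inside
 * the test's own __TEXT segment; executing at that address afterwards
 * should then take the revalidate_text_page() path above.
 *
 *	kern_return_t kr = vm_corrupt_text_addr(test_text_va);
 */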

#endif /* DEBUG || DEVELOPMENT */
