1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <libkern/OSAtomic.h>
66
67 #include <mach/mach_types.h>
68 #include <mach/kern_return.h>
69 #include <mach/message.h> /* for error codes */
70 #include <mach/vm_param.h>
71 #include <mach/vm_behavior.h>
72 #include <mach/memory_object.h>
73 /* For memory_object_data_{request,unlock} */
74 #include <mach/sdt.h>
75
76 #include <kern/kern_types.h>
77 #include <kern/host_statistics.h>
78 #include <kern/counter.h>
79 #include <kern/task.h>
80 #include <kern/thread.h>
81 #include <kern/sched_prim.h>
82 #include <kern/host.h>
83 #include <kern/mach_param.h>
84 #include <kern/macro_help.h>
85 #include <kern/zalloc_internal.h>
86 #include <kern/misc_protos.h>
87 #include <kern/policy_internal.h>
88
89 #include <vm/vm_compressor.h>
90 #include <vm/vm_compressor_pager.h>
91 #include <vm/vm_fault.h>
92 #include <vm/vm_map.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_page.h>
95 #include <vm/vm_kern.h>
96 #include <vm/pmap.h>
97 #include <vm/vm_pageout.h>
98 #include <vm/vm_protos.h>
99 #include <vm/vm_external.h>
100 #include <vm/memory_object.h>
101 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
102 #include <vm/vm_shared_region.h>
103
104 #include <sys/codesign.h>
105 #include <sys/reason.h>
106 #include <sys/signalvar.h>
107
108 #include <sys/kdebug_triage.h>
109
110 #include <san/kasan.h>
111 #include <libkern/coreanalytics/coreanalytics.h>
112
113 #define VM_FAULT_CLASSIFY 0
114
115 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
116
117 int vm_protect_privileged_from_untrusted = 1;
118
119 unsigned int vm_object_pagein_throttle = 16;
120
121 /*
122  * We apply a hard throttle to the demand-zero rate of tasks that we believe are running out of control; it
123  * kicks in when swap space runs out.  64-bit programs have massive address spaces and, if they're buggy, can leak
124  * enormous amounts of memory and run the system completely out of swap space. If this happens, we
125 * impose a hard throttle on them to prevent them from taking the last bit of memory left. This helps
126 * keep the UI active so that the user has a chance to kill the offending task before the system
127 * completely hangs.
128 *
129 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
130 * to tasks that appear to be bloated. When swap runs out, any task using more than vm_hard_throttle_threshold
131 * will be throttled. The throttling is done by giving the thread that's trying to demand zero a page a
132 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
133 */
134
135 extern void throttle_lowpri_io(int);
136
137 extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
138
139 uint64_t vm_hard_throttle_threshold;
140
141 #if DEBUG || DEVELOPMENT
142 static bool vmtc_panic_instead = false;
143 int panic_object_not_alive = 1;
144 #endif /* DEBUG || DEVELOPMENT */
145
146 OS_ALWAYS_INLINE
147 boolean_t
148 NEED_TO_HARD_THROTTLE_THIS_TASK(void)
149 {
150 return vm_wants_task_throttled(current_task()) ||
151 ((vm_page_free_count < vm_page_throttle_limit ||
152 HARD_THROTTLE_LIMIT_REACHED()) &&
153 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
154 }
155
156 #define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */
157 #define SOFT_THROTTLE_DELAY 200 /* 200 us == .2 ms */
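/*
 * Illustrative consequence of the delays above (not from the original
 * source): the delay is paid once per throttled fault attempt, so a
 * hard-throttled thread at HARD_THROTTLE_DELAY (10 ms) per fault is
 * capped at roughly 100 demand-zero pages per second, while a
 * soft-throttled thread at 200 us per fault is capped at roughly
 * 5000 pages per second.
 */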
158
159 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS 6
160 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC 20000
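/*
 * Worked example (illustrative only): with the defaults above, a thread
 * must have created more than 6 * 20000 = 120000 pages in its current
 * accounting window before vm_page_throttled() will even consider
 * throttling it (and only when memory is also tight); it is then
 * throttled while the window is still shorter than 6 seconds or its
 * observed rate stays at or above 20000 pages per second.
 */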
161
162
163 #define VM_STAT_DECOMPRESSIONS() \
164 MACRO_BEGIN \
165 counter_inc(&vm_statistics_decompressions); \
166 current_thread()->decompressions++; \
167 MACRO_END
168
169 boolean_t current_thread_aborted(void);
170
171 /* Forward declarations of internal routines. */
172 static kern_return_t vm_fault_wire_fast(
173 vm_map_t map,
174 vm_map_offset_t va,
175 vm_prot_t prot,
176 vm_tag_t wire_tag,
177 vm_map_entry_t entry,
178 pmap_t pmap,
179 vm_map_offset_t pmap_addr,
180 ppnum_t *physpage_p);
181
182 static kern_return_t vm_fault_internal(
183 vm_map_t map,
184 vm_map_offset_t vaddr,
185 vm_prot_t caller_prot,
186 boolean_t change_wiring,
187 vm_tag_t wire_tag,
188 int interruptible,
189 pmap_t pmap,
190 vm_map_offset_t pmap_addr,
191 ppnum_t *physpage_p);
192
193 static void vm_fault_copy_cleanup(
194 vm_page_t page,
195 vm_page_t top_page);
196
197 static void vm_fault_copy_dst_cleanup(
198 vm_page_t page);
199
200 #if VM_FAULT_CLASSIFY
201 extern void vm_fault_classify(vm_object_t object,
202 vm_object_offset_t offset,
203 vm_prot_t fault_type);
204
205 extern void vm_fault_classify_init(void);
206 #endif
207
208 unsigned long vm_pmap_enter_blocked = 0;
209 unsigned long vm_pmap_enter_retried = 0;
210
211 unsigned long vm_cs_validates = 0;
212 unsigned long vm_cs_revalidates = 0;
213 unsigned long vm_cs_query_modified = 0;
214 unsigned long vm_cs_validated_dirtied = 0;
215 unsigned long vm_cs_bitmap_validated = 0;
216
217 void vm_pre_fault(vm_map_offset_t, vm_prot_t);
218
219 extern char *kdp_compressor_decompressed_page;
220 extern addr64_t kdp_compressor_decompressed_page_paddr;
221 extern ppnum_t kdp_compressor_decompressed_page_ppnum;
222
223 struct vmrtfr {
224 int vmrtfr_maxi;
225 int vmrtfr_curi;
226 int64_t vmrtf_total;
227 vm_rtfault_record_t *vm_rtf_records;
228 } vmrtfrs;
229 #define VMRTF_DEFAULT_BUFSIZE (4096)
230 #define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
231 TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT);
232
233 static void vm_rtfrecord_lock(void);
234 static void vm_rtfrecord_unlock(void);
235 static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
236
237 extern lck_grp_t vm_page_lck_grp_bucket;
238 extern lck_attr_t vm_page_lck_attr;
239 LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
240
241 #if DEVELOPMENT || DEBUG
242 extern int madvise_free_debug;
243 #endif /* DEVELOPMENT || DEBUG */
244
245 #if CONFIG_FREEZE
246 #endif /* CONFIG_FREEZE */
247
248 /*
249 * Routine: vm_fault_init
250 * Purpose:
251 * Initialize our private data structures.
252 */
253 __startup_func
254 void
255 vm_fault_init(void)
256 {
257 int i, vm_compressor_temp;
258 boolean_t need_default_val = TRUE;
259 /*
260 * Choose a value for the hard throttle threshold based on the amount of ram. The threshold is
261 * computed as a percentage of available memory, and the percentage used is scaled inversely with
262 * the amount of memory. The percentage runs between 10% and 35%. We use 35% for small memory systems
263 * and reduce the value down to 10% for very large memory configurations. This helps give us a
264 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
265 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
266 */
267
268 vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;
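	/*
	 * Worked examples of the formula above (illustrative, not from the
	 * original source): on a 4 GB system the percentage is 35 - 4 = 31%,
	 * so the threshold is ~1.2 GB; on a 16 GB system it is 35 - 16 = 19%,
	 * ~3.0 GB; and for 25 GB or more the MIN() clamp holds the percentage
	 * at the 10% floor, e.g. ~6.4 GB on a 64 GB system.
	 */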
269
270 /*
271 * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
272 */
273
274 if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
275 for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
276 if (((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
277 need_default_val = FALSE;
278 vm_compressor_mode = vm_compressor_temp;
279 break;
280 }
281 }
282 if (need_default_val) {
283 printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
284 }
285 }
286 #if CONFIG_FREEZE
287 if (need_default_val) {
288 if (osenvironment_is_diagnostics()) {
289 printf("osenvironment == \"diagnostics\". Setting \"vm_compressor_mode\" to in-core compressor only\n");
290 vm_compressor_mode = VM_PAGER_COMPRESSOR_NO_SWAP;
291 need_default_val = false;
292 }
293 }
294 #endif /* CONFIG_FREEZE */
295 if (need_default_val) {
296 /* If no boot arg or incorrect boot arg, try device tree. */
297 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
298 }
299 printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
300 vm_config_init();
301
302 PE_parse_boot_argn("vm_protect_privileged_from_untrusted",
303 &vm_protect_privileged_from_untrusted,
304 sizeof(vm_protect_privileged_from_untrusted));
305
306 #if DEBUG || DEVELOPMENT
307 (void)PE_parse_boot_argn("text_corruption_panic", &vmtc_panic_instead, sizeof(vmtc_panic_instead));
308
309 if (kern_feature_override(KF_MADVISE_FREE_DEBUG_OVRD)) {
310 madvise_free_debug = 0;
311 }
312
313 PE_parse_boot_argn("panic_object_not_alive", &panic_object_not_alive, sizeof(panic_object_not_alive));
314 #endif /* DEBUG || DEVELOPMENT */
315 }
316
317 __startup_func
318 static void
319 vm_rtfault_record_init(void)
320 {
321 size_t size;
322
323 vmrtf_num_records = MAX(vmrtf_num_records, 1);
324 size = vmrtf_num_records * sizeof(vm_rtfault_record_t);
325 vmrtfrs.vm_rtf_records = zalloc_permanent_tag(size,
326 ZALIGN(vm_rtfault_record_t), VM_KERN_MEMORY_DIAG);
327 vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
328 }
329 STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init);
330
331 /*
332 * Routine: vm_fault_cleanup
333 * Purpose:
334 * Clean up the result of vm_fault_page.
335 * Results:
336 * The paging reference for "object" is released.
337 * "object" is unlocked.
338 * If "top_page" is not null, "top_page" is
339 * freed and the paging reference for the object
340 * containing it is released.
341 *
342 * In/out conditions:
343 * "object" must be locked.
344 */
345 void
346 vm_fault_cleanup(
347 vm_object_t object,
348 vm_page_t top_page)
349 {
350 vm_object_paging_end(object);
351 vm_object_unlock(object);
352
353 if (top_page != VM_PAGE_NULL) {
354 object = VM_PAGE_OBJECT(top_page);
355
356 vm_object_lock(object);
357 VM_PAGE_FREE(top_page);
358 vm_object_paging_end(object);
359 vm_object_unlock(object);
360 }
361 }
362
363 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
364
365
366 boolean_t vm_page_deactivate_behind = TRUE;
367 /*
368 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
369 */
370 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128
371 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... */
372 /* we use it to size an array on the stack */
373
374 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
375
376 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
377
378 /*
379 * vm_page_is_sequential
380 *
381 * Determine if sequential access is in progress
382 * in accordance with the behavior specified.
383 * Update state to indicate current access pattern.
384 *
385 * object must have at least the shared lock held
386 */
387 static
388 void
389 vm_fault_is_sequential(
390 vm_object_t object,
391 vm_object_offset_t offset,
392 vm_behavior_t behavior)
393 {
394 vm_object_offset_t last_alloc;
395 int sequential;
396 int orig_sequential;
397
398 last_alloc = object->last_alloc;
399 sequential = object->sequential;
400 orig_sequential = sequential;
401
402 offset = vm_object_trunc_page(offset);
403 if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) {
404 /* re-faulting in the same page: no change in behavior */
405 return;
406 }
407
408 switch (behavior) {
409 case VM_BEHAVIOR_RANDOM:
410 /*
411 * reset indicator of sequential behavior
412 */
413 sequential = 0;
414 break;
415
416 case VM_BEHAVIOR_SEQUENTIAL:
417 if (offset && last_alloc == offset - PAGE_SIZE_64) {
418 /*
419 * advance indicator of sequential behavior
420 */
421 if (sequential < MAX_SEQUENTIAL_RUN) {
422 sequential += PAGE_SIZE;
423 }
424 } else {
425 /*
426 * reset indicator of sequential behavior
427 */
428 sequential = 0;
429 }
430 break;
431
432 case VM_BEHAVIOR_RSEQNTL:
433 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
434 /*
435 * advance indicator of sequential behavior
436 */
437 if (sequential > -MAX_SEQUENTIAL_RUN) {
438 sequential -= PAGE_SIZE;
439 }
440 } else {
441 /*
442 * reset indicator of sequential behavior
443 */
444 sequential = 0;
445 }
446 break;
447
448 case VM_BEHAVIOR_DEFAULT:
449 default:
450 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
451 /*
452 * advance indicator of sequential behavior
453 */
454 if (sequential < 0) {
455 sequential = 0;
456 }
457 if (sequential < MAX_SEQUENTIAL_RUN) {
458 sequential += PAGE_SIZE;
459 }
460 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
461 /*
462 * advance indicator of sequential behavior
463 */
464 if (sequential > 0) {
465 sequential = 0;
466 }
467 if (sequential > -MAX_SEQUENTIAL_RUN) {
468 sequential -= PAGE_SIZE;
469 }
470 } else {
471 /*
472 * reset indicator of sequential behavior
473 */
474 sequential = 0;
475 }
476 break;
477 }
478 if (sequential != orig_sequential) {
479 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
480 /*
481 * if someone else has already updated object->sequential
482 * don't bother trying to update it or object->last_alloc
483 */
484 return;
485 }
486 }
487 /*
488  * I'd like to do this with an OSCompareAndSwap64, but that
489 * doesn't exist for PPC... however, it shouldn't matter
490 * that much... last_alloc is maintained so that we can determine
491 * if a sequential access pattern is taking place... if only
492 * one thread is banging on this object, no problem with the unprotected
493 * update... if 2 or more threads are banging away, we run the risk of
494 * someone seeing a mangled update... however, in the face of multiple
495 * accesses, no sequential access pattern can develop anyway, so we
496 * haven't lost any real info.
497 */
498 object->last_alloc = offset;
499 }
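/*
 * Illustrative trace (not part of the original source), assuming 4K pages
 * and VM_BEHAVIOR_DEFAULT: if last_alloc is 0x1000 and faults arrive at
 * offsets 0x2000, 0x3000, 0x4000, each fault sees last_alloc == offset -
 * PAGE_SIZE_64, so object->sequential grows by PAGE_SIZE per fault
 * (0x1000, 0x2000, 0x3000 if it started at 0) until it saturates at
 * MAX_SEQUENTIAL_RUN; reverse access drives it negative the same way,
 * and a fault at a non-adjacent offset resets it to 0.
 */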
500
501 #if DEVELOPMENT || DEBUG
502 uint64_t vm_page_deactivate_behind_count = 0;
503 #endif /* DEVELOPMENT || DEBUG */
504
505 /*
506 * vm_page_deactivate_behind
507 *
508 * Determine if sequential access is in progress
509 * in accordance with the behavior specified. If
510 * so, compute a potential page to deactivate and
511 * deactivate it.
512 *
513 * object must be locked.
514 *
515 * return TRUE if we actually deactivate a page
516 */
517 static
518 boolean_t
519 vm_fault_deactivate_behind(
520 vm_object_t object,
521 vm_object_offset_t offset,
522 vm_behavior_t behavior)
523 {
524 int n;
525 int pages_in_run = 0;
526 int max_pages_in_run = 0;
527 int sequential_run;
528 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
529 vm_object_offset_t run_offset = 0;
530 vm_object_offset_t pg_offset = 0;
531 vm_page_t m;
532 vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
533
534 pages_in_run = 0;
535 #if TRACEFAULTPAGE
536 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
537 #endif
538 if (object == kernel_object || vm_page_deactivate_behind == FALSE || (vm_object_trunc_page(offset) != offset)) {
539 /*
540  * Bail out if any of the following holds:
541  * - the pages belong to the kernel object: they are not
542  *   intended to become pageable;
543  * - the deactivate-behind mechanism has been disabled;
544  * - the offset is not aligned to the system's PAGE_SIZE:
545  *   in that case we handle the deactivation on the aligned
546  *   offset, and thus the full PAGE_SIZE page, only once, which
547  *   avoids redundant deactivates and extra faults.
548 */
549 return FALSE;
550 }
551 if ((sequential_run = object->sequential)) {
552 if (sequential_run < 0) {
553 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
554 sequential_run = 0 - sequential_run;
555 } else {
556 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
557 }
558 }
559 switch (behavior) {
560 case VM_BEHAVIOR_RANDOM:
561 break;
562 case VM_BEHAVIOR_SEQUENTIAL:
563 if (sequential_run >= (int)PAGE_SIZE) {
564 run_offset = 0 - PAGE_SIZE_64;
565 max_pages_in_run = 1;
566 }
567 break;
568 case VM_BEHAVIOR_RSEQNTL:
569 if (sequential_run >= (int)PAGE_SIZE) {
570 run_offset = PAGE_SIZE_64;
571 max_pages_in_run = 1;
572 }
573 break;
574 case VM_BEHAVIOR_DEFAULT:
575 default:
576 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
577
578 /*
579  * determine if the run of sequential access has been
580 * long enough on an object with default access behavior
581 * to consider it for deactivation
582 */
583 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
584 /*
585 * the comparisons between offset and behind are done
586 * in this kind of odd fashion in order to prevent wrap around
587 * at the end points
588 */
589 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
590 if (offset >= behind) {
591 run_offset = 0 - behind;
592 pg_offset = PAGE_SIZE_64;
593 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
594 }
595 } else {
596 if (offset < -behind) {
597 run_offset = behind;
598 pg_offset = 0 - PAGE_SIZE_64;
599 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
600 }
601 }
602 }
603 break;}
604 }
605 for (n = 0; n < max_pages_in_run; n++) {
606 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
607
608 if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
609 page_run[pages_in_run++] = m;
610
611 /*
612 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
613 *
614 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
615 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
616  * new reference happens. If no further references happen on the page after that remote TLB flushes
617 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
618 * by pageout_scan, which is just fine since the last reference would have happened quite far
619 * in the past (TLB caches don't hang around for very long), and of course could just as easily
620 * have happened before we did the deactivate_behind.
621 */
622 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
623 }
624 }
625 if (pages_in_run) {
626 vm_page_lockspin_queues();
627
628 for (n = 0; n < pages_in_run; n++) {
629 m = page_run[n];
630
631 vm_page_deactivate_internal(m, FALSE);
632
633 #if DEVELOPMENT || DEBUG
634 vm_page_deactivate_behind_count++;
635 #endif /* DEVELOPMENT || DEBUG */
636
637 #if TRACEFAULTPAGE
638 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
639 #endif
640 }
641 vm_page_unlock_queues();
642
643 return TRUE;
644 }
645 return FALSE;
646 }
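/*
 * Illustrative note (not from the original source): for the default
 * behavior case above, deactivate-behind only kicks in once the
 * sequential run reaches vm_default_behind (128) pages, and then fires
 * on every VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER (16) page boundary of
 * the run, deactivating a 16-page cluster that starts 128 pages behind
 * the faulting offset.
 */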
647
648
649 #if (DEVELOPMENT || DEBUG)
650 uint32_t vm_page_creation_throttled_hard = 0;
651 uint32_t vm_page_creation_throttled_soft = 0;
652 uint64_t vm_page_creation_throttle_avoided = 0;
653 #endif /* DEVELOPMENT || DEBUG */
654
655 static int
656 vm_page_throttled(boolean_t page_kept)
657 {
658 clock_sec_t elapsed_sec;
659 clock_sec_t tv_sec;
660 clock_usec_t tv_usec;
661 task_t curtask = current_task_early();
662
663 thread_t thread = current_thread();
664
665 if (thread->options & TH_OPT_VMPRIV) {
666 return 0;
667 }
668
669 if (curtask && !curtask->active) {
670 return 0;
671 }
672
673 if (thread->t_page_creation_throttled) {
674 thread->t_page_creation_throttled = 0;
675
676 if (page_kept == FALSE) {
677 goto no_throttle;
678 }
679 }
680 if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
681 #if (DEVELOPMENT || DEBUG)
682 thread->t_page_creation_throttled_hard++;
683 OSAddAtomic(1, &vm_page_creation_throttled_hard);
684 #endif /* DEVELOPMENT || DEBUG */
685 return HARD_THROTTLE_DELAY;
686 }
687
688 if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
689 thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
690 if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
691 #if (DEVELOPMENT || DEBUG)
692 OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
693 #endif
694 goto no_throttle;
695 }
696 clock_get_system_microtime(&tv_sec, &tv_usec);
697
698 elapsed_sec = tv_sec - thread->t_page_creation_time;
699
700 if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
701 (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
702 if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
703 /*
704 * we'll reset our stats to give a well behaved app
705 * that was unlucky enough to accumulate a bunch of pages
706 * over a long period of time a chance to get out of
707 * the throttled state... we reset the counter and timestamp
708 * so that if it stays under the rate limit for the next second
709 * it will be back in our good graces... if it exceeds it, it
710 * will remain in the throttled state
711 */
712 thread->t_page_creation_time = tv_sec;
713 thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
714 }
715 VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);
716
717 thread->t_page_creation_throttled = 1;
718
719 if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
720 #if (DEVELOPMENT || DEBUG)
721 thread->t_page_creation_throttled_hard++;
722 OSAddAtomic(1, &vm_page_creation_throttled_hard);
723 #endif /* DEVELOPMENT || DEBUG */
724 return HARD_THROTTLE_DELAY;
725 } else {
726 #if (DEVELOPMENT || DEBUG)
727 thread->t_page_creation_throttled_soft++;
728 OSAddAtomic(1, &vm_page_creation_throttled_soft);
729 #endif /* DEVELOPMENT || DEBUG */
730 return SOFT_THROTTLE_DELAY;
731 }
732 }
733 thread->t_page_creation_time = tv_sec;
734 thread->t_page_creation_count = 0;
735 }
736 no_throttle:
737 thread->t_page_creation_count++;
738
739 return 0;
740 }
741
742 extern boolean_t vm_pageout_running;
743 static __attribute__((noinline, not_tail_called)) void
744 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(
745 int throttle_delay)
746 {
747 /* make sure vm_pageout_scan() gets to work while we're throttled */
748 if (!vm_pageout_running) {
749 thread_wakeup((event_t)&vm_page_free_wanted);
750 }
751 delay(throttle_delay);
752 }
753
754
755 /*
756 * check for various conditions that would
757 * prevent us from creating a ZF page...
758 * cleanup is based on being called from vm_fault_page
759 *
760 * object must be locked
761 * object == m->vmp_object
762 */
763 static vm_fault_return_t
764 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
765 {
766 int throttle_delay;
767
768 if (object->shadow_severed ||
769 VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
770 /*
771 * Either:
772 * 1. the shadow chain was severed,
773 * 2. the purgeable object is volatile or empty and is marked
774 * to fault on access while volatile.
775 * Just have to return an error at this point
776 */
777 if (m != VM_PAGE_NULL) {
778 VM_PAGE_FREE(m);
779 }
780 vm_fault_cleanup(object, first_m);
781
782 thread_interrupt_level(interruptible_state);
783
784 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
785 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
786 }
787
788 if (object->shadow_severed) {
789 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), 0 /* arg */);
790 }
791 return VM_FAULT_MEMORY_ERROR;
792 }
793 if (page_throttle == TRUE) {
794 if ((throttle_delay = vm_page_throttled(FALSE))) {
795 /*
796 * we're throttling zero-fills...
797 * treat this as if we couldn't grab a page
798 */
799 if (m != VM_PAGE_NULL) {
800 VM_PAGE_FREE(m);
801 }
802 vm_fault_cleanup(object, first_m);
803
804 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
805
806 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
807
808 if (current_thread_aborted()) {
809 thread_interrupt_level(interruptible_state);
810 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
811 return VM_FAULT_INTERRUPTED;
812 }
813 thread_interrupt_level(interruptible_state);
814
815 return VM_FAULT_MEMORY_SHORTAGE;
816 }
817 }
818 return VM_FAULT_SUCCESS;
819 }
820
821 /*
822 * Clear the code signing bits on the given page_t
823 */
824 static void
825 vm_fault_cs_clear(vm_page_t m)
826 {
827 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
828 m->vmp_cs_tainted = VMP_CS_ALL_FALSE;
829 m->vmp_cs_nx = VMP_CS_ALL_FALSE;
830 }
831
832 /*
833 * Enqueues the given page on the throttled queue.
834 * The caller must hold the vm_page_queue_lock and it will be held on return.
835 */
836 static void
837 vm_fault_enqueue_throttled_locked(vm_page_t m)
838 {
839 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
840 assert(!VM_PAGE_WIRED(m));
841
842 /*
843 * can't be on the pageout queue since we don't
844 * have a pager to try and clean to
845 */
846 vm_page_queues_remove(m, TRUE);
847 vm_page_check_pageable_safe(m);
848 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
849 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
850 vm_page_throttled_count++;
851 }
852
853 /*
854 * do the work to zero fill a page and
855 * inject it into the correct paging queue
856 *
857 * m->vmp_object must be locked
858 * page queue lock must NOT be held
859 */
860 static int
861 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
862 {
863 int my_fault = DBG_ZERO_FILL_FAULT;
864 vm_object_t object;
865
866 object = VM_PAGE_OBJECT(m);
867
868 /*
869  * This is a zero-fill page fault...
870 *
871 * Checking the page lock is a waste of
872 * time; this page was absent, so
873 * it can't be page locked by a pager.
874 *
875 * we also consider it undefined
876 * with respect to instruction
877 * execution. i.e. it is the responsibility
878 * of higher layers to call for an instruction
879 * sync after changing the contents and before
880 * sending a program into this area. We
881 * choose this approach for performance
882 */
883 vm_fault_cs_clear(m);
884 m->vmp_pmapped = TRUE;
885
886 if (no_zero_fill == TRUE) {
887 my_fault = DBG_NZF_PAGE_FAULT;
888
889 if (m->vmp_absent && m->vmp_busy) {
890 return my_fault;
891 }
892 } else {
893 vm_page_zero_fill(m);
894
895 counter_inc(&vm_statistics_zero_fill_count);
896 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
897 }
898 assert(!m->vmp_laundry);
899 assert(object != kernel_object);
900 //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
901 if (!VM_DYNAMIC_PAGING_ENABLED() &&
902 (object->purgable == VM_PURGABLE_DENY ||
903 object->purgable == VM_PURGABLE_NONVOLATILE ||
904 object->purgable == VM_PURGABLE_VOLATILE)) {
905 vm_page_lockspin_queues();
906 if (!VM_DYNAMIC_PAGING_ENABLED()) {
907 vm_fault_enqueue_throttled_locked(m);
908 }
909 vm_page_unlock_queues();
910 }
911 return my_fault;
912 }
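/*
 * Summary of the return value above (illustrative, derived from the code):
 * the result is the fault type used for tracing -- DBG_NZF_PAGE_FAULT when
 * the caller asked to skip the zeroing (no_zero_fill == TRUE),
 * DBG_ZERO_FILL_FAULT when the page was actually zero filled.
 */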
913
914
915 /*
916 * Routine: vm_fault_page
917 * Purpose:
918 * Find the resident page for the virtual memory
919 * specified by the given virtual memory object
920 * and offset.
921 * Additional arguments:
922 * The required permissions for the page is given
923 * in "fault_type". Desired permissions are included
924 * in "protection".
925 * fault_info is passed along to determine pagein cluster
926 * limits... it contains the expected reference pattern,
927 * cluster size if available, etc...
928 *
929 * If the desired page is known to be resident (for
930 * example, because it was previously wired down), asserting
931 * the "unwiring" parameter will speed the search.
932 *
933 * If the operation can be interrupted (by thread_abort
934 * or thread_terminate), then the "interruptible"
935 * parameter should be asserted.
936 *
937 * Results:
938 * The page containing the proper data is returned
939 * in "result_page".
940 *
941 * In/out conditions:
942 * The source object must be locked and referenced,
943 * and must donate one paging reference. The reference
944 * is not affected. The paging reference and lock are
945 * consumed.
946 *
947 * If the call succeeds, the object in which "result_page"
948 * resides is left locked and holding a paging reference.
949 * If this is not the original object, a busy page in the
950 * original object is returned in "top_page", to prevent other
951 * callers from pursuing this same data, along with a paging
952 * reference for the original object. The "top_page" should
953 * be destroyed when this guarantee is no longer required.
954 * The "result_page" is also left busy. It is not removed
955 * from the pageout queues.
956 * Special Case:
957 * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
958 * fault succeeded but there's no VM page (i.e. the VM object
959 * does not actually hold VM pages, but device memory or
960 * large pages). The object is still locked and we still hold a
961 * paging_in_progress reference.
962 */
963 unsigned int vm_fault_page_blocked_access = 0;
964 unsigned int vm_fault_page_forced_retry = 0;
965
966 vm_fault_return_t
967 vm_fault_page(
968 /* Arguments: */
969 vm_object_t first_object, /* Object to begin search */
970 vm_object_offset_t first_offset, /* Offset into object */
971 vm_prot_t fault_type, /* What access is requested */
972 boolean_t must_be_resident,/* Must page be resident? */
973 boolean_t caller_lookup, /* caller looked up page */
974 /* Modifies in place: */
975 vm_prot_t *protection, /* Protection for mapping */
976 vm_page_t *result_page, /* Page found, if successful */
977 /* Returns: */
978 vm_page_t *top_page, /* Page in top object, if
979 * not result_page. */
980 int *type_of_fault, /* if non-null, fill in with type of fault
981 * COW, zero-fill, etc... returned in trace point */
982 /* More arguments: */
983 kern_return_t *error_code, /* code if page is in error */
984 boolean_t no_zero_fill, /* don't zero fill absent pages */
985 vm_object_fault_info_t fault_info)
986 {
987 vm_page_t m;
988 vm_object_t object;
989 vm_object_offset_t offset;
990 vm_page_t first_m;
991 vm_object_t next_object;
992 vm_object_t copy_object;
993 boolean_t look_for_page;
994 boolean_t force_fault_retry = FALSE;
995 vm_prot_t access_required = fault_type;
996 vm_prot_t wants_copy_flag;
997 kern_return_t wait_result;
998 wait_interrupt_t interruptible_state;
999 boolean_t data_already_requested = FALSE;
1000 vm_behavior_t orig_behavior;
1001 vm_size_t orig_cluster_size;
1002 vm_fault_return_t error;
1003 int my_fault;
1004 uint32_t try_failed_count;
1005 int interruptible; /* how may fault be interrupted? */
1006 int external_state = VM_EXTERNAL_STATE_UNKNOWN;
1007 memory_object_t pager;
1008 vm_fault_return_t retval;
1009 int grab_options;
1010 bool clear_absent_on_error = false;
1011
1012 /*
1013 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
1014 * marked as paged out in the compressor pager or the pager doesn't exist.
1015 * Note also that if the pager for an internal object
1016 * has not been created, the pager is not invoked regardless of the value
1017 * of MUST_ASK_PAGER().
1018 *
1019 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
1020 * is marked as paged out in the compressor pager.
1021 * PAGED_OUT() is used to determine if a page has already been pushed
1022 * into a copy object in order to avoid a redundant page out operation.
1023 */
1024 #define MUST_ASK_PAGER(o, f, s) \
1025 ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
1026
1027 #define PAGED_OUT(o, f) \
1028 (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
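/*
 * Quick reference (illustrative, derived from the two macros above):
 *
 *   compressor state             MUST_ASK_PAGER()   PAGED_OUT()
 *   VM_EXTERNAL_STATE_ABSENT         FALSE             FALSE
 *   VM_EXTERNAL_STATE_EXISTS         TRUE              TRUE
 *   VM_EXTERNAL_STATE_UNKNOWN        TRUE              FALSE
 */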
1029
1030 /*
1031 * Recovery actions
1032 */
1033 #define RELEASE_PAGE(m) \
1034 MACRO_BEGIN \
1035 PAGE_WAKEUP_DONE(m); \
1036 if ( !VM_PAGE_PAGEABLE(m)) { \
1037 vm_page_lockspin_queues(); \
1038 if (clear_absent_on_error && m->vmp_absent) {\
1039 vm_page_zero_fill(m); \
1040 counter_inc(&vm_statistics_zero_fill_count);\
1041 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);\
1042 m->vmp_absent = false; \
1043 } \
1044 if ( !VM_PAGE_PAGEABLE(m)) { \
1045 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) \
1046 vm_page_deactivate(m); \
1047 else \
1048 vm_page_activate(m); \
1049 } \
1050 vm_page_unlock_queues(); \
1051 } \
1052 clear_absent_on_error = false; \
1053 MACRO_END
1054
1055 #if TRACEFAULTPAGE
1056 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
1057 #endif
1058
1059 interruptible = fault_info->interruptible;
1060 interruptible_state = thread_interrupt_level(interruptible);
1061
1062 /*
1063 * INVARIANTS (through entire routine):
1064 *
1065 * 1) At all times, we must either have the object
1066 * lock or a busy page in some object to prevent
1067 * some other thread from trying to bring in
1068 * the same page.
1069 *
1070 * Note that we cannot hold any locks during the
1071 * pager access or when waiting for memory, so
1072 * we use a busy page then.
1073 *
1074 * 2) To prevent another thread from racing us down the
1075 * shadow chain and entering a new page in the top
1076 * object before we do, we must keep a busy page in
1077 * the top object while following the shadow chain.
1078 *
1079 * 3) We must increment paging_in_progress on any object
1080 * for which we have a busy page before dropping
1081 * the object lock
1082 *
1083 * 4) We leave busy pages on the pageout queues.
1084 * If the pageout daemon comes across a busy page,
1085 * it will remove the page from the pageout queues.
1086 */
1087
1088 object = first_object;
1089 offset = first_offset;
1090 first_m = VM_PAGE_NULL;
1091 access_required = fault_type;
1092
1093 /*
1094 * default type of fault
1095 */
1096 my_fault = DBG_CACHE_HIT_FAULT;
1097 thread_pri_floor_t token;
1098 bool drop_floor = false;
1099
1100 while (TRUE) {
1101 #if TRACEFAULTPAGE
1102 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1103 #endif
1104
1105 grab_options = 0;
1106 #if CONFIG_SECLUDED_MEMORY
1107 if (object->can_grab_secluded) {
1108 grab_options |= VM_PAGE_GRAB_SECLUDED;
1109 }
1110 #endif /* CONFIG_SECLUDED_MEMORY */
1111
1112 if (!object->alive) {
1113 /*
1114 * object is no longer valid
1115 * clean up and return error
1116 */
1117 #if DEVELOPMENT || DEBUG
1118 printf("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->copy, object->shadow, object->alive, object->terminating, object->named, object->ref_count, object->shadow_severed);
1119 if (panic_object_not_alive) {
1120 panic("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->copy, object->shadow, object->alive, object->terminating, object->named, object->ref_count, object->shadow_severed);
1121 }
1122 #endif /* DEVELOPMENT || DEBUG */
1123 vm_fault_cleanup(object, first_m);
1124 thread_interrupt_level(interruptible_state);
1125
1126 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_NOT_ALIVE), 0 /* arg */);
1127 return VM_FAULT_MEMORY_ERROR;
1128 }
1129
1130 if (!object->pager_created && object->phys_contiguous) {
1131 /*
1132 * A physically-contiguous object without a pager:
1133 * must be a "large page" object. We do not deal
1134 * with VM pages for this object.
1135 */
1136 caller_lookup = FALSE;
1137 m = VM_PAGE_NULL;
1138 goto phys_contig_object;
1139 }
1140
1141 if (object->blocked_access) {
1142 /*
1143 * Access to this VM object has been blocked.
1144 * Replace our "paging_in_progress" reference with
1145 * a "activity_in_progress" reference and wait for
1146 * access to be unblocked.
1147 */
1148 caller_lookup = FALSE; /* no longer valid after sleep */
1149 vm_object_activity_begin(object);
1150 vm_object_paging_end(object);
1151 while (object->blocked_access) {
1152 vm_object_sleep(object,
1153 VM_OBJECT_EVENT_UNBLOCKED,
1154 THREAD_UNINT);
1155 }
1156 vm_fault_page_blocked_access++;
1157 vm_object_paging_begin(object);
1158 vm_object_activity_end(object);
1159 }
1160
1161 /*
1162 * See whether the page at 'offset' is resident
1163 */
1164 if (caller_lookup == TRUE) {
1165 /*
1166 * The caller has already looked up the page
1167 * and gave us the result in "result_page".
1168 * We can use this for the first lookup but
1169 * it loses its validity as soon as we unlock
1170 * the object.
1171 */
1172 m = *result_page;
1173 caller_lookup = FALSE; /* no longer valid after that */
1174 } else {
1175 m = vm_page_lookup(object, vm_object_trunc_page(offset));
1176 }
1177 #if TRACEFAULTPAGE
1178 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1179 #endif
1180 if (m != VM_PAGE_NULL) {
1181 if (m->vmp_busy) {
1182 /*
1183 * The page is being brought in,
1184 * wait for it and then retry.
1185 */
1186 #if TRACEFAULTPAGE
1187 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1188 #endif
1189 wait_result = PAGE_SLEEP(object, m, interruptible);
1190
1191 if (wait_result != THREAD_AWAKENED) {
1192 vm_fault_cleanup(object, first_m);
1193 thread_interrupt_level(interruptible_state);
1194
1195 if (wait_result == THREAD_RESTART) {
1196 return VM_FAULT_RETRY;
1197 } else {
1198 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), 0 /* arg */);
1199 return VM_FAULT_INTERRUPTED;
1200 }
1201 }
1202 continue;
1203 }
1204 if (m->vmp_laundry) {
1205 m->vmp_free_when_done = FALSE;
1206
1207 if (!m->vmp_cleaning) {
1208 vm_pageout_steal_laundry(m, FALSE);
1209 }
1210 }
1211 vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
1212 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1213 /*
1214 * Guard page: off limits !
1215 */
1216 if (fault_type == VM_PROT_NONE) {
1217 /*
1218 * The fault is not requesting any
1219 * access to the guard page, so it must
1220 * be just to wire or unwire it.
1221 * Let's pretend it succeeded...
1222 */
1223 m->vmp_busy = TRUE;
1224 *result_page = m;
1225 assert(first_m == VM_PAGE_NULL);
1226 *top_page = first_m;
1227 if (type_of_fault) {
1228 *type_of_fault = DBG_GUARD_FAULT;
1229 }
1230 thread_interrupt_level(interruptible_state);
1231 return VM_FAULT_SUCCESS;
1232 } else {
1233 /*
1234 * The fault requests access to the
1235 * guard page: let's deny that !
1236 */
1237 vm_fault_cleanup(object, first_m);
1238 thread_interrupt_level(interruptible_state);
1239 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_GUARDPAGE_FAULT), 0 /* arg */);
1240 return VM_FAULT_MEMORY_ERROR;
1241 }
1242 }
1243
1244
1245 if (VMP_ERROR_GET(m)) {
1246 /*
1247 * The page is in error, give up now.
1248 */
1249 #if TRACEFAULTPAGE
1250 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
1251 #endif
1252 if (error_code) {
1253 *error_code = KERN_MEMORY_ERROR;
1254 }
1255 VM_PAGE_FREE(m);
1256
1257 vm_fault_cleanup(object, first_m);
1258 thread_interrupt_level(interruptible_state);
1259
1260 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_ERROR), 0 /* arg */);
1261 return VM_FAULT_MEMORY_ERROR;
1262 }
1263 if (m->vmp_restart) {
1264 /*
1265 * The pager wants us to restart
1266 * at the top of the chain,
1267 * typically because it has moved the
1268 * page to another pager, then do so.
1269 */
1270 #if TRACEFAULTPAGE
1271 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1272 #endif
1273 VM_PAGE_FREE(m);
1274
1275 vm_fault_cleanup(object, first_m);
1276 thread_interrupt_level(interruptible_state);
1277
1278 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_RESTART), 0 /* arg */);
1279 return VM_FAULT_RETRY;
1280 }
1281 if (m->vmp_absent) {
1282 /*
1283 * The page isn't busy, but is absent,
1284 * therefore it's deemed "unavailable".
1285 *
1286 * Remove the non-existent page (unless it's
1287 * in the top object) and move on down to the
1288 * next object (if there is one).
1289 */
1290 #if TRACEFAULTPAGE
1291 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
1292 #endif
1293 next_object = object->shadow;
1294
1295 if (next_object == VM_OBJECT_NULL) {
1296 /*
1297 * Absent page at bottom of shadow
1298 * chain; zero fill the page we left
1299 * busy in the first object, and free
1300 * the absent page.
1301 */
1302 assert(!must_be_resident);
1303
1304 /*
1305 * check for any conditions that prevent
1306 * us from creating a new zero-fill page
1307 * vm_fault_check will do all of the
1308 * fault cleanup in the case of an error condition
1309 * including resetting the thread_interrupt_level
1310 */
1311 error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1312
1313 if (error != VM_FAULT_SUCCESS) {
1314 return error;
1315 }
1316
1317 if (object != first_object) {
1318 /*
1319 * free the absent page we just found
1320 */
1321 VM_PAGE_FREE(m);
1322
1323 /*
1324 * drop reference and lock on current object
1325 */
1326 vm_object_paging_end(object);
1327 vm_object_unlock(object);
1328
1329 /*
1330 * grab the original page we
1331 * 'soldered' in place and
1332 * retake lock on 'first_object'
1333 */
1334 m = first_m;
1335 first_m = VM_PAGE_NULL;
1336
1337 object = first_object;
1338 offset = first_offset;
1339
1340 vm_object_lock(object);
1341 } else {
1342 /*
1343 * we're going to use the absent page we just found
1344 * so convert it to a 'busy' page
1345 */
1346 m->vmp_absent = FALSE;
1347 m->vmp_busy = TRUE;
1348 }
1349 if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1350 m->vmp_absent = TRUE;
1351 clear_absent_on_error = true;
1352 }
1353 /*
1354 * zero-fill the page and put it on
1355 * the correct paging queue
1356 */
1357 my_fault = vm_fault_zero_page(m, no_zero_fill);
1358
1359 break;
1360 } else {
1361 if (must_be_resident) {
1362 vm_object_paging_end(object);
1363 } else if (object != first_object) {
1364 vm_object_paging_end(object);
1365 VM_PAGE_FREE(m);
1366 } else {
1367 first_m = m;
1368 m->vmp_absent = FALSE;
1369 m->vmp_busy = TRUE;
1370
1371 vm_page_lockspin_queues();
1372 vm_page_queues_remove(m, FALSE);
1373 vm_page_unlock_queues();
1374 }
1375
1376 offset += object->vo_shadow_offset;
1377 fault_info->lo_offset += object->vo_shadow_offset;
1378 fault_info->hi_offset += object->vo_shadow_offset;
1379 access_required = VM_PROT_READ;
1380
1381 vm_object_lock(next_object);
1382 vm_object_unlock(object);
1383 object = next_object;
1384 vm_object_paging_begin(object);
1385
1386 /*
1387 * reset to default type of fault
1388 */
1389 my_fault = DBG_CACHE_HIT_FAULT;
1390
1391 continue;
1392 }
1393 }
1394 if ((m->vmp_cleaning)
1395 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1396 && (fault_type & VM_PROT_WRITE)) {
1397 /*
1398 * This is a copy-on-write fault that will
1399 * cause us to revoke access to this page, but
1400 * this page is in the process of being cleaned
1401 * in a clustered pageout. We must wait until
1402 * the cleaning operation completes before
1403 * revoking access to the original page,
1404 * otherwise we might attempt to remove a
1405 * wired mapping.
1406 */
1407 #if TRACEFAULTPAGE
1408 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1409 #endif
1410 /*
1411 * take an extra ref so that object won't die
1412 */
1413 vm_object_reference_locked(object);
1414
1415 vm_fault_cleanup(object, first_m);
1416
1417 vm_object_lock(object);
1418 assert(object->ref_count > 0);
1419
1420 m = vm_page_lookup(object, vm_object_trunc_page(offset));
1421
1422 if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1423 PAGE_ASSERT_WAIT(m, interruptible);
1424
1425 vm_object_unlock(object);
1426 wait_result = thread_block(THREAD_CONTINUE_NULL);
1427 vm_object_deallocate(object);
1428
1429 goto backoff;
1430 } else {
1431 vm_object_unlock(object);
1432
1433 vm_object_deallocate(object);
1434 thread_interrupt_level(interruptible_state);
1435
1436 return VM_FAULT_RETRY;
1437 }
1438 }
1439 if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1440 !(fault_info != NULL && fault_info->stealth)) {
1441 /*
1442 * If we were passed a non-NULL pointer for
1443 * "type_of_fault", than we came from
1444 * vm_fault... we'll let it deal with
1445 * this condition, since it
1446 * needs to see m->vmp_speculative to correctly
1447 * account the pageins, otherwise...
1448 * take it off the speculative queue, we'll
1449 * let the caller of vm_fault_page deal
1450 * with getting it onto the correct queue
1451 *
1452 * If the caller specified in fault_info that
1453 * it wants a "stealth" fault, we also leave
1454 * the page in the speculative queue.
1455 */
1456 vm_page_lockspin_queues();
1457 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
1458 vm_page_queues_remove(m, FALSE);
1459 }
1460 vm_page_unlock_queues();
1461 }
1462 assert(object == VM_PAGE_OBJECT(m));
1463
1464 if (object->code_signed) {
1465 /*
1466 * CODE SIGNING:
1467 * We just paged in a page from a signed
1468 * memory object but we don't need to
1469  * validate it now. We'll validate it if and
1470  * when it gets mapped into a user address
1471 * space for the first time or when the page
1472 * gets copied to another object as a result
1473 * of a copy-on-write.
1474 */
1475 }
1476
1477 /*
1478 * We mark the page busy and leave it on
1479 * the pageout queues. If the pageout
1480  * daemon comes across it, then it will
1481 * remove the page from the queue, but not the object
1482 */
1483 #if TRACEFAULTPAGE
1484 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1485 #endif
1486 assert(!m->vmp_busy);
1487 assert(!m->vmp_absent);
1488
1489 m->vmp_busy = TRUE;
1490 break;
1491 }
1492
1493 /*
1494 * we get here when there is no page present in the object at
1495 * the offset we're interested in... we'll allocate a page
1496 * at this point if the pager associated with
1497 * this object can provide the data or we're the top object...
1498 * object is locked; m == NULL
1499 */
1500
1501 if (must_be_resident) {
1502 if (fault_type == VM_PROT_NONE &&
1503 object == kernel_object) {
1504 /*
1505 * We've been called from vm_fault_unwire()
1506 * while removing a map entry that was allocated
1507 * with KMA_KOBJECT and KMA_VAONLY. This page
1508 * is not present and there's nothing more to
1509 * do here (nothing to unwire).
1510 */
1511 vm_fault_cleanup(object, first_m);
1512 thread_interrupt_level(interruptible_state);
1513
1514 return VM_FAULT_MEMORY_ERROR;
1515 }
1516
1517 goto dont_look_for_page;
1518 }
1519
1520 /* Don't expect to fault pages into the kernel object. */
1521 assert(object != kernel_object);
1522
1523 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE));
1524
1525 #if TRACEFAULTPAGE
1526 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1527 #endif
1528 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1529 /*
1530 * Allocate a new page for this object/offset pair as a placeholder
1531 */
1532 m = vm_page_grab_options(grab_options);
1533 #if TRACEFAULTPAGE
1534 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1535 #endif
1536 if (m == VM_PAGE_NULL) {
1537 vm_fault_cleanup(object, first_m);
1538 thread_interrupt_level(interruptible_state);
1539
1540 return VM_FAULT_MEMORY_SHORTAGE;
1541 }
1542
1543 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1544 vm_page_insert_internal(m, object,
1545 vm_object_trunc_page(offset),
1546 VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1547 } else {
1548 vm_page_insert(m, object, vm_object_trunc_page(offset));
1549 }
1550 }
1551 if (look_for_page) {
1552 kern_return_t rc;
1553 int my_fault_type;
1554
1555 /*
1556 * If the memory manager is not ready, we
1557 * cannot make requests.
1558 */
1559 if (!object->pager_ready) {
1560 #if TRACEFAULTPAGE
1561 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1562 #endif
1563 if (m != VM_PAGE_NULL) {
1564 VM_PAGE_FREE(m);
1565 }
1566
1567 /*
1568 * take an extra ref so object won't die
1569 */
1570 vm_object_reference_locked(object);
1571 vm_fault_cleanup(object, first_m);
1572
1573 vm_object_lock(object);
1574 assert(object->ref_count > 0);
1575
1576 if (!object->pager_ready) {
1577 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1578
1579 vm_object_unlock(object);
1580 if (wait_result == THREAD_WAITING) {
1581 wait_result = thread_block(THREAD_CONTINUE_NULL);
1582 }
1583 vm_object_deallocate(object);
1584
1585 goto backoff;
1586 } else {
1587 vm_object_unlock(object);
1588 vm_object_deallocate(object);
1589 thread_interrupt_level(interruptible_state);
1590
1591 return VM_FAULT_RETRY;
1592 }
1593 }
1594 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1595 /*
1596 * If there are too many outstanding page
1597 * requests pending on this external object, we
1598 * wait for them to be resolved now.
1599 */
1600 #if TRACEFAULTPAGE
1601 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1602 #endif
1603 if (m != VM_PAGE_NULL) {
1604 VM_PAGE_FREE(m);
1605 }
1606 /*
1607 * take an extra ref so object won't die
1608 */
1609 vm_object_reference_locked(object);
1610
1611 vm_fault_cleanup(object, first_m);
1612
1613 vm_object_lock(object);
1614 assert(object->ref_count > 0);
1615
1616 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1617 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1618
1619 vm_object_unlock(object);
1620 wait_result = thread_block(THREAD_CONTINUE_NULL);
1621 vm_object_deallocate(object);
1622
1623 goto backoff;
1624 } else {
1625 vm_object_unlock(object);
1626 vm_object_deallocate(object);
1627 thread_interrupt_level(interruptible_state);
1628
1629 return VM_FAULT_RETRY;
1630 }
1631 }
1632 if (object->internal) {
1633 int compressed_count_delta;
1634
1635 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1636
1637 if (m == VM_PAGE_NULL) {
1638 /*
1639 * Allocate a new page for this object/offset pair as a placeholder
1640 */
1641 m = vm_page_grab_options(grab_options);
1642 #if TRACEFAULTPAGE
1643 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1644 #endif
1645 if (m == VM_PAGE_NULL) {
1646 vm_fault_cleanup(object, first_m);
1647 thread_interrupt_level(interruptible_state);
1648
1649 return VM_FAULT_MEMORY_SHORTAGE;
1650 }
1651
1652 m->vmp_absent = TRUE;
1653 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1654 vm_page_insert_internal(m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1655 } else {
1656 vm_page_insert(m, object, vm_object_trunc_page(offset));
1657 }
1658 }
1659 assert(m->vmp_busy);
1660
1661 m->vmp_absent = TRUE;
1662 pager = object->pager;
1663
1664 assert(object->paging_in_progress > 0);
1665 vm_object_unlock(object);
1666
1667 rc = vm_compressor_pager_get(
1668 pager,
1669 offset + object->paging_offset,
1670 VM_PAGE_GET_PHYS_PAGE(m),
1671 &my_fault_type,
1672 0,
1673 &compressed_count_delta);
1674
1675 if (type_of_fault == NULL) {
1676 int throttle_delay;
1677
1678 /*
1679 * we weren't called from vm_fault, so we
1680  * need to apply page creation throttling...
1681 * do it before we re-acquire any locks
1682 */
1683 if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1684 if ((throttle_delay = vm_page_throttled(TRUE))) {
1685 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1686 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
1687 }
1688 }
1689 }
1690 vm_object_lock(object);
1691 assert(object->paging_in_progress > 0);
1692
1693 vm_compressor_pager_count(
1694 pager,
1695 compressed_count_delta,
1696 FALSE, /* shared_lock */
1697 object);
1698
1699 switch (rc) {
1700 case KERN_SUCCESS:
1701 m->vmp_absent = FALSE;
1702 m->vmp_dirty = TRUE;
1703 if ((object->wimg_bits &
1704 VM_WIMG_MASK) !=
1705 VM_WIMG_USE_DEFAULT) {
1706 /*
1707 * If the page is not cacheable,
1708 * we can't let its contents
1709 * linger in the data cache
1710 * after the decompression.
1711 */
1712 pmap_sync_page_attributes_phys(
1713 VM_PAGE_GET_PHYS_PAGE(m));
1714 } else {
1715 m->vmp_written_by_kernel = TRUE;
1716 }
1717
1718 /*
1719 * If the object is purgeable, its
1720 * owner's purgeable ledgers have been
1721 * updated in vm_page_insert() but the
1722 * page was also accounted for in a
1723 * "compressed purgeable" ledger, so
1724 * update that now.
1725 */
1726 if (((object->purgable !=
1727 VM_PURGABLE_DENY) ||
1728 object->vo_ledger_tag) &&
1729 (object->vo_owner !=
1730 NULL)) {
1731 /*
1732 * One less compressed
1733 * purgeable/tagged page.
1734 */
1735 vm_object_owner_compressed_update(
1736 object,
1737 -1);
1738 }
1739
1740 break;
1741 case KERN_MEMORY_FAILURE:
1742 m->vmp_unusual = TRUE;
1743 m->vmp_error = TRUE;
1744 m->vmp_absent = FALSE;
1745 break;
1746 case KERN_MEMORY_ERROR:
1747 assert(m->vmp_absent);
1748 break;
1749 default:
1750 panic("vm_fault_page(): unexpected "
1751 "error %d from "
1752 "vm_compressor_pager_get()\n",
1753 rc);
1754 }
1755 PAGE_WAKEUP_DONE(m);
1756
1757 rc = KERN_SUCCESS;
1758 goto data_requested;
1759 }
1760 my_fault_type = DBG_PAGEIN_FAULT;
1761
1762 if (m != VM_PAGE_NULL) {
1763 VM_PAGE_FREE(m);
1764 m = VM_PAGE_NULL;
1765 }
1766
1767 #if TRACEFAULTPAGE
1768 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1769 #endif
1770
1771 /*
1772 * It's possible someone called vm_object_destroy while we weren't
1773 * holding the object lock. If that has happened, then bail out
1774 * here.
1775 */
1776
1777 pager = object->pager;
1778
1779 if (pager == MEMORY_OBJECT_NULL) {
1780 vm_fault_cleanup(object, first_m);
1781 thread_interrupt_level(interruptible_state);
1782 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_NO_PAGER), 0 /* arg */);
1783 return VM_FAULT_MEMORY_ERROR;
1784 }
1785
1786 /*
1787 * We have an absent page in place for the faulting offset,
1788 * so we can release the object lock.
1789 */
1790
1791 if (object->object_is_shared_cache) {
1792 token = thread_priority_floor_start();
1793 /*
1794 * A non-native shared cache object might
1795 * be getting set up in parallel with this
1796 * fault and so we can't assume that this
1797 * check will be valid after we drop the
1798 * object lock below.
1799 */
1800 drop_floor = true;
1801 }
1802
1803 vm_object_unlock(object);
1804
1805 /*
1806 * If this object uses a copy_call strategy,
1807 * and we are interested in a copy of this object
1808 * (having gotten here only by following a
1809 * shadow chain), then tell the memory manager
1810 * via a flag added to the desired_access
1811 * parameter, so that it can detect a race
1812 * between our walking down the shadow chain
1813 * and its pushing pages up into a copy of
1814 * the object that it manages.
1815 */
1816 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1817 wants_copy_flag = VM_PROT_WANTS_COPY;
1818 } else {
1819 wants_copy_flag = VM_PROT_NONE;
1820 }
1821
1822 if (object->copy == first_object) {
1823 /*
1824 * if we issue the memory_object_data_request in
1825 * this state, we are subject to a deadlock with
1826 * the underlying filesystem if it is trying to
1827 * shrink the file resulting in a push of pages
1828 * into the copy object... that push will stall
1829 * on the placeholder page, and if the pushing thread
1830 * is holding a lock that is required on the pagein
1831 * path (such as a truncate lock), we'll deadlock...
1832 * to avoid this potential deadlock, we throw away
1833 * our placeholder page before calling memory_object_data_request
1834 * and force this thread to retry the vm_fault_page after
1835 * we have issued the I/O. the second time through this path
1836 * we will find the page already in the cache (presumably still
1837 * busy waiting for the I/O to complete) and then complete
1838 * the fault w/o having to go through memory_object_data_request again
1839 */
1840 assert(first_m != VM_PAGE_NULL);
1841 assert(VM_PAGE_OBJECT(first_m) == first_object);
1842
1843 vm_object_lock(first_object);
1844 VM_PAGE_FREE(first_m);
1845 vm_object_paging_end(first_object);
1846 vm_object_unlock(first_object);
1847
1848 first_m = VM_PAGE_NULL;
1849 force_fault_retry = TRUE;
1850
1851 vm_fault_page_forced_retry++;
1852 }
1853
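/*
 * Added note: if we already issued a data request during an
 * earlier pass of this fault, scale this one back to a single
 * page with "random" behavior so we don't end up asking the
 * pager for a large cluster a second time; the original settings
 * are restored right after the call.
 */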
1854 if (data_already_requested == TRUE) {
1855 orig_behavior = fault_info->behavior;
1856 orig_cluster_size = fault_info->cluster_size;
1857
1858 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1859 fault_info->cluster_size = PAGE_SIZE;
1860 }
1861 /*
1862 * Call the memory manager to retrieve the data.
1863 */
1864 rc = memory_object_data_request(
1865 pager,
1866 vm_object_trunc_page(offset) + object->paging_offset,
1867 PAGE_SIZE,
1868 access_required | wants_copy_flag,
1869 (memory_object_fault_info_t)fault_info);
1870
1871 if (data_already_requested == TRUE) {
1872 fault_info->behavior = orig_behavior;
1873 fault_info->cluster_size = orig_cluster_size;
1874 } else {
1875 data_already_requested = TRUE;
1876 }
1877
1878 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1879 #if TRACEFAULTPAGE
1880 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1881 #endif
1882 vm_object_lock(object);
1883
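/*
 * Added note: drop the priority floor we took before issuing the
 * request for a shared-cache object, now that the request has
 * been sent and we hold the object lock again.
 */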
1884 if (drop_floor && object->object_is_shared_cache) {
1885 thread_priority_floor_end(&token);
1886 drop_floor = false;
1887 }
1888
1889 data_requested:
1890 if (rc != KERN_SUCCESS) {
1891 vm_fault_cleanup(object, first_m);
1892 thread_interrupt_level(interruptible_state);
1893
1894 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NO_DATA), 0 /* arg */);
1895
1896 return (rc == MACH_SEND_INTERRUPTED) ?
1897 VM_FAULT_INTERRUPTED :
1898 VM_FAULT_MEMORY_ERROR;
1899 } else {
1900 clock_sec_t tv_sec;
1901 clock_usec_t tv_usec;
1902
1903 if (my_fault_type == DBG_PAGEIN_FAULT) {
1904 clock_get_system_microtime(&tv_sec, &tv_usec);
1905 current_thread()->t_page_creation_time = tv_sec;
1906 current_thread()->t_page_creation_count = 0;
1907 }
1908 }
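/*
 * Added note: if the fault is interruptible and the thread has
 * been asked to abort, bail out now instead of completing the
 * fault.
 */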
1909 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1910 vm_fault_cleanup(object, first_m);
1911 thread_interrupt_level(interruptible_state);
1912
1913 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
1914 return VM_FAULT_INTERRUPTED;
1915 }
1916 if (force_fault_retry == TRUE) {
1917 vm_fault_cleanup(object, first_m);
1918 thread_interrupt_level(interruptible_state);
1919
1920 return VM_FAULT_RETRY;
1921 }
1922 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1923 /*
1924 * No page here means that the object we
1925 * initially looked up was "physically
1926 * contiguous" (i.e. device memory). However,
1927 * with Virtual VRAM, the object might not
1928 * be backed by that device memory anymore,
1929 * so we're done here only if the object is
1930 * still "phys_contiguous".
1931 * Otherwise, if the object is no longer
1932 * "phys_contiguous", we need to retry the
1933 * page fault against the object's new backing
1934 * store (different memory object).
1935 */
1936 phys_contig_object:
1937 goto done;
1938 }
1939 /*
1940 * potentially a pagein fault
1941 * if we make it through the state checks
1942 * above, then we'll count it as such
1943 */
1944 my_fault = my_fault_type;
1945
1946 /*
1947 * Retry with same object/offset, since new data may
1948 * be in a different page (i.e., m is meaningless at
1949 * this point).
1950 */
1951 continue;
1952 }
1953 dont_look_for_page:
1954 /*
1955 * We get here if the object has no pager, or an existence map
1956 * exists and indicates the page isn't present on the pager,
1957 * or we're unwiring a page. If a pager exists, but there
1958 * is no existence map, then the m->vmp_absent case above handles
1959 * the ZF case when the pager can't provide the page.
1960 */
1961 #if TRACEFAULTPAGE
1962 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1963 #endif
1964 if (object == first_object) {
1965 first_m = m;
1966 } else {
1967 assert(m == VM_PAGE_NULL);
1968 }
1969
1970 next_object = object->shadow;
1971
1972 if (next_object == VM_OBJECT_NULL) {
1973 /*
1974 * we've hit the bottom of the shadow chain,
1975 * fill the page in the top object with zeros.
1976 */
1977 assert(!must_be_resident);
1978
1979 if (object != first_object) {
1980 vm_object_paging_end(object);
1981 vm_object_unlock(object);
1982
1983 object = first_object;
1984 offset = first_offset;
1985 vm_object_lock(object);
1986 }
1987 m = first_m;
1988 assert(VM_PAGE_OBJECT(m) == object);
1989 first_m = VM_PAGE_NULL;
1990
1991 /*
1992 * check for any conditions that prevent
1993 * us from creating a new zero-fill page
1994 * vm_fault_check will do all of the
1995 * fault cleanup in the case of an error condition
1996 * including resetting the thread_interrupt_level
1997 */
1998 error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1999
2000 if (error != VM_FAULT_SUCCESS) {
2001 return error;
2002 }
2003
2004 if (m == VM_PAGE_NULL) {
2005 m = vm_page_grab_options(grab_options);
2006
2007 if (m == VM_PAGE_NULL) {
2008 vm_fault_cleanup(object, VM_PAGE_NULL);
2009 thread_interrupt_level(interruptible_state);
2010
2011 return VM_FAULT_MEMORY_SHORTAGE;
2012 }
2013 vm_page_insert(m, object, vm_object_trunc_page(offset));
2014 }
2015 if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
2016 m->vmp_absent = TRUE;
2017 clear_absent_on_error = true;
2018 }
2019
2020 my_fault = vm_fault_zero_page(m, no_zero_fill);
2021
2022 break;
2023 } else {
2024 /*
2025 * Move on to the next object. Lock the next
2026 * object before unlocking the current one.
2027 */
2028 if ((object != first_object) || must_be_resident) {
2029 vm_object_paging_end(object);
2030 }
2031
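/*
 * Added note: translate the faulting offset (and the clustering
 * window in fault_info) into the shadow object's address space;
 * backing objects only ever need to be read.
 */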
2032 offset += object->vo_shadow_offset;
2033 fault_info->lo_offset += object->vo_shadow_offset;
2034 fault_info->hi_offset += object->vo_shadow_offset;
2035 access_required = VM_PROT_READ;
2036
2037 vm_object_lock(next_object);
2038 vm_object_unlock(object);
2039
2040 object = next_object;
2041 vm_object_paging_begin(object);
2042 }
2043 }
2044
2045 /*
2046 * PAGE HAS BEEN FOUND.
2047 *
2048 * This page (m) is:
2049 * busy, so that we can play with it;
2050 * not absent, so that nobody else will fill it;
2051 * possibly eligible for pageout;
2052 *
2053 * The top-level page (first_m) is:
2054 * VM_PAGE_NULL if the page was found in the
2055 * top-level object;
2056 * busy, not absent, and ineligible for pageout.
2057 *
2058 * The current object (object) is locked. A paging
2059 * reference is held for the current and top-level
2060 * objects.
2061 */
2062
2063 #if TRACEFAULTPAGE
2064 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
2065 #endif
2066 #if EXTRA_ASSERTIONS
2067 assert(m->vmp_busy && !m->vmp_absent);
2068 assert((first_m == VM_PAGE_NULL) ||
2069 (first_m->vmp_busy && !first_m->vmp_absent &&
2070 !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
2071 #endif /* EXTRA_ASSERTIONS */
2072
2073 /*
2074 * If the page is being written, but isn't
2075 * already owned by the top-level object,
2076 * we have to copy it into a new page owned
2077 * by the top-level object.
2078 */
2079 if (object != first_object) {
2080 #if TRACEFAULTPAGE
2081 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2082 #endif
2083 if (fault_type & VM_PROT_WRITE) {
2084 vm_page_t copy_m;
2085
2086 /*
2087 * We only really need to copy if we
2088 * want to write it.
2089 */
2090 assert(!must_be_resident);
2091
2092 /*
2093 * If we try to collapse first_object at this
2094 * point, we may deadlock when we try to get
2095 * the lock on an intermediate object (since we
2096 * have the bottom object locked). We can't
2097 * unlock the bottom object, because the page
2098 * we found may move (by collapse) if we do.
2099 *
2100 * Instead, we first copy the page. Then, when
2101 * we have no more use for the bottom object,
2102 * we unlock it and try to collapse.
2103 *
2104 * Note that we copy the page even if we didn't
2105 * need to... that's the breaks.
2106 */
2107
2108 /*
2109 * Allocate a page for the copy
2110 */
2111 copy_m = vm_page_grab_options(grab_options);
2112
2113 if (copy_m == VM_PAGE_NULL) {
2114 RELEASE_PAGE(m);
2115
2116 vm_fault_cleanup(object, first_m);
2117 thread_interrupt_level(interruptible_state);
2118
2119 return VM_FAULT_MEMORY_SHORTAGE;
2120 }
2121
2122 vm_page_copy(m, copy_m);
2123
2124 /*
2125 * If another map is truly sharing this
2126 * page with us, we have to flush all
2127 * uses of the original page, since we
2128 * can't distinguish those which want the
2129 * original from those which need the
2130 * new copy.
2131 *
2132 * XXXO If we know that only one map has
2133 * access to this page, then we could
2134 * avoid the pmap_disconnect() call.
2135 */
2136 if (m->vmp_pmapped) {
2137 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2138 }
2139
2140 if (m->vmp_clustered) {
2141 VM_PAGE_COUNT_AS_PAGEIN(m);
2142 VM_PAGE_CONSUME_CLUSTERED(m);
2143 }
2144 assert(!m->vmp_cleaning);
2145
2146 /*
2147 * We no longer need the old page or object.
2148 */
2149 RELEASE_PAGE(m);
2150
2151 /*
2152 * This check helps with marking the object as having a sequential pattern.
2153 * Normally we'll miss doing this below because this fault is about COW to
2154 * the first_object, i.e. bring the page in from disk and push it to the
2155 * object above, but don't update the file object's sequential pattern.
2156 */
2157 if (object->internal == FALSE) {
2158 vm_fault_is_sequential(object, offset, fault_info->behavior);
2159 }
2160
2161 vm_object_paging_end(object);
2162 vm_object_unlock(object);
2163
2164 my_fault = DBG_COW_FAULT;
2165 counter_inc(&vm_statistics_cow_faults);
2166 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2167 counter_inc(&current_task()->cow_faults);
2168
2169 object = first_object;
2170 offset = first_offset;
2171
2172 vm_object_lock(object);
2173 /*
2174 * get rid of the place holder
2175 * page that we soldered in earlier
2176 */
2177 VM_PAGE_FREE(first_m);
2178 first_m = VM_PAGE_NULL;
2179
2180 /*
2181 * and replace it with the
2182 * page we just copied into
2183 */
2184 assert(copy_m->vmp_busy);
2185 vm_page_insert(copy_m, object, vm_object_trunc_page(offset));
2186 SET_PAGE_DIRTY(copy_m, TRUE);
2187
2188 m = copy_m;
2189 /*
2190 * Now that we've gotten the copy out of the
2191 * way, let's try to collapse the top object.
2192 * But we have to play ugly games with
2193 * paging_in_progress to do that...
2194 */
2195 vm_object_paging_end(object);
2196 vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
2197 vm_object_paging_begin(object);
2198 } else {
2199 *protection &= (~VM_PROT_WRITE);
2200 }
2201 }
2202 /*
2203 * Now check whether the page needs to be pushed into the
2204 * copy object. The use of asymmetric copy on write for
2205 * shared temporary objects means that we may do two copies to
2206 * satisfy the fault; one above to get the page from a
2207 * shadowed object, and one here to push it into the copy.
2208 */
2209 try_failed_count = 0;
2210
2211 while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2212 vm_object_offset_t copy_offset;
2213 vm_page_t copy_m;
2214
2215 #if TRACEFAULTPAGE
2216 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2217 #endif
2218 /*
2219 * If the page is being written, but hasn't been
2220 * copied to the copy-object, we have to copy it there.
2221 */
2222 if ((fault_type & VM_PROT_WRITE) == 0) {
2223 *protection &= ~VM_PROT_WRITE;
2224 break;
2225 }
2226
2227 /*
2228 * If the page was guaranteed to be resident,
2229 * we must have already performed the copy.
2230 */
2231 if (must_be_resident) {
2232 break;
2233 }
2234
2235 /*
2236 * Try to get the lock on the copy_object.
2237 */
2238 if (!vm_object_lock_try(copy_object)) {
2239 vm_object_unlock(object);
2240 try_failed_count++;
2241
2242 mutex_pause(try_failed_count); /* wait a bit */
2243 vm_object_lock(object);
2244
2245 continue;
2246 }
2247 try_failed_count = 0;
2248
2249 /*
2250 * Make another reference to the copy-object,
2251 * to keep it from disappearing during the
2252 * copy.
2253 */
2254 vm_object_reference_locked(copy_object);
2255
2256 /*
2257 * Does the page exist in the copy?
2258 */
2259 copy_offset = first_offset - copy_object->vo_shadow_offset;
2260 copy_offset = vm_object_trunc_page(copy_offset);
2261
2262 if (copy_object->vo_size <= copy_offset) {
2263 /*
2264 * Copy object doesn't cover this page -- do nothing.
2265 */
2266 ;
2267 } else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2268 /*
2269 * Page currently exists in the copy object
2270 */
2271 if (copy_m->vmp_busy) {
2272 /*
2273 * If the page is being brought
2274 * in, wait for it and then retry.
2275 */
2276 RELEASE_PAGE(m);
2277
2278 /*
2279 * take an extra ref so object won't die
2280 */
2281 vm_object_reference_locked(copy_object);
2282 vm_object_unlock(copy_object);
2283 vm_fault_cleanup(object, first_m);
2284
2285 vm_object_lock(copy_object);
2286 assert(copy_object->ref_count > 0);
2287 vm_object_lock_assert_exclusive(copy_object);
2288 copy_object->ref_count--;
2289 assert(copy_object->ref_count > 0);
2290 copy_m = vm_page_lookup(copy_object, copy_offset);
2291
2292 if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2293 PAGE_ASSERT_WAIT(copy_m, interruptible);
2294
2295 vm_object_unlock(copy_object);
2296 wait_result = thread_block(THREAD_CONTINUE_NULL);
2297 vm_object_deallocate(copy_object);
2298
2299 goto backoff;
2300 } else {
2301 vm_object_unlock(copy_object);
2302 vm_object_deallocate(copy_object);
2303 thread_interrupt_level(interruptible_state);
2304
2305 return VM_FAULT_RETRY;
2306 }
2307 }
2308 } else if (!PAGED_OUT(copy_object, copy_offset)) {
2309 /*
2310 * If PAGED_OUT is TRUE, then the page used to exist
2311 * in the copy-object, and has already been paged out.
2312 * We don't need to repeat this. If PAGED_OUT is
2313 * FALSE, then either we don't know (!pager_created,
2314 * for example) or it hasn't been paged out.
2315 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2316 * We must copy the page to the copy object.
2317 *
2318 * Allocate a page for the copy
2319 */
2320 copy_m = vm_page_alloc(copy_object, copy_offset);
2321
2322 if (copy_m == VM_PAGE_NULL) {
2323 RELEASE_PAGE(m);
2324
2325 vm_object_lock_assert_exclusive(copy_object);
2326 copy_object->ref_count--;
2327 assert(copy_object->ref_count > 0);
2328
2329 vm_object_unlock(copy_object);
2330 vm_fault_cleanup(object, first_m);
2331 thread_interrupt_level(interruptible_state);
2332
2333 return VM_FAULT_MEMORY_SHORTAGE;
2334 }
2335 /*
2336 * Must copy page into copy-object.
2337 */
2338 vm_page_copy(m, copy_m);
2339
2340 /*
2341 * If the old page was in use by any users
2342 * of the copy-object, it must be removed
2343 * from all pmaps. (We can't know which
2344 * pmaps use it.)
2345 */
2346 if (m->vmp_pmapped) {
2347 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2348 }
2349
2350 if (m->vmp_clustered) {
2351 VM_PAGE_COUNT_AS_PAGEIN(m);
2352 VM_PAGE_CONSUME_CLUSTERED(m);
2353 }
2354 /*
2355 * If there's a pager, then immediately
2356 * page out this page, using the "initialize"
2357 * option. Else, we use the copy.
2358 */
2359 if ((!copy_object->pager_ready)
2360 || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2361 ) {
2362 vm_page_lockspin_queues();
2363 assert(!m->vmp_cleaning);
2364 vm_page_activate(copy_m);
2365 vm_page_unlock_queues();
2366
2367 SET_PAGE_DIRTY(copy_m, TRUE);
2368 PAGE_WAKEUP_DONE(copy_m);
2369 } else {
2370 assert(copy_m->vmp_busy == TRUE);
2371 assert(!m->vmp_cleaning);
2372
2373 /*
2374 * dirty is protected by the object lock
2375 */
2376 SET_PAGE_DIRTY(copy_m, TRUE);
2377
2378 /*
2379 * The page is already ready for pageout:
2380 * not on pageout queues and busy.
2381 * Unlock everything except the
2382 * copy_object itself.
2383 */
2384 vm_object_unlock(object);
2385
2386 /*
2387 * Write the page to the copy-object,
2388 * flushing it from the kernel.
2389 */
2390 vm_pageout_initialize_page(copy_m);
2391
2392 /*
2393 * Since the pageout may have
2394 * temporarily dropped the
2395 * copy_object's lock, we
2396 * check whether we'll have
2397 * to deallocate the hard way.
2398 */
2399 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2400 vm_object_unlock(copy_object);
2401 vm_object_deallocate(copy_object);
2402 vm_object_lock(object);
2403
2404 continue;
2405 }
2406 /*
2407 * Pick back up the old object's
2408 * lock. [It is safe to do so,
2409 * since it must be deeper in the
2410 * object tree.]
2411 */
2412 vm_object_lock(object);
2413 }
2414
2415 /*
2416 * Because we're pushing a page upward
2417 * in the object tree, we must restart
2418 * any faults that are waiting here.
2419 * [Note that this is an expansion of
2420 * PAGE_WAKEUP that uses the THREAD_RESTART
2421 * wait result]. Can't turn off the page's
2422 * busy bit because we're not done with it.
2423 */
2424 if (m->vmp_wanted) {
2425 m->vmp_wanted = FALSE;
2426 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2427 }
2428 }
2429 /*
2430 * The reference count on copy_object must be
2431 * at least 2: one for our extra reference,
2432 * and at least one from the outside world
2433 * (we checked that when we last locked
2434 * copy_object).
2435 */
2436 vm_object_lock_assert_exclusive(copy_object);
2437 copy_object->ref_count--;
2438 assert(copy_object->ref_count > 0);
2439
2440 vm_object_unlock(copy_object);
2441
2442 break;
2443 }
2444
2445 done:
2446 *result_page = m;
2447 *top_page = first_m;
2448
2449 if (m != VM_PAGE_NULL) {
2450 assert(VM_PAGE_OBJECT(m) == object);
2451
2452 retval = VM_FAULT_SUCCESS;
2453
2454 if (my_fault == DBG_PAGEIN_FAULT) {
2455 VM_PAGE_COUNT_AS_PAGEIN(m);
2456
2457 if (object->internal) {
2458 my_fault = DBG_PAGEIND_FAULT;
2459 } else {
2460 my_fault = DBG_PAGEINV_FAULT;
2461 }
2462
2463 /*
2464 * evaluate access pattern and update state
2465 * vm_fault_deactivate_behind depends on the
2466 * state being up to date
2467 */
2468 vm_fault_is_sequential(object, offset, fault_info->behavior);
2469 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2470 } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2471 /*
2472 * we weren't called from vm_fault, so handle the
2473 * accounting here for hits in the cache
2474 */
2475 if (m->vmp_clustered) {
2476 VM_PAGE_COUNT_AS_PAGEIN(m);
2477 VM_PAGE_CONSUME_CLUSTERED(m);
2478 }
2479 vm_fault_is_sequential(object, offset, fault_info->behavior);
2480 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2481 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2482 VM_STAT_DECOMPRESSIONS();
2483 }
2484 if (type_of_fault) {
2485 *type_of_fault = my_fault;
2486 }
2487 } else {
2488 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUCCESS_NO_PAGE), 0 /* arg */);
2489 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2490 assert(first_m == VM_PAGE_NULL);
2491 assert(object == first_object);
2492 }
2493
2494 thread_interrupt_level(interruptible_state);
2495
2496 #if TRACEFAULTPAGE
2497 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
2498 #endif
2499 return retval;
2500
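/*
 * Added note: "backoff" is reached after we dropped our locks and
 * blocked waiting for a busy page or for paging activity on the
 * object to subside; report whether the wait was interrupted so
 * the caller can retry.
 */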
2501 backoff:
2502 thread_interrupt_level(interruptible_state);
2503
2504 if (wait_result == THREAD_INTERRUPTED) {
2505 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
2506 return VM_FAULT_INTERRUPTED;
2507 }
2508 return VM_FAULT_RETRY;
2509
2510 #undef RELEASE_PAGE
2511 }
2512
2513 #if MACH_ASSERT && (XNU_PLATFORM_WatchOS || __x86_64__)
2514 #define PANIC_ON_CS_KILLED_DEFAULT true
2515 #else
2516 #define PANIC_ON_CS_KILLED_DEFAULT false
2517 #endif
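/*
 * Added note: "panic_on_cs_killed" boot-arg: when set, a
 * code-signing rejection of a page from the shared cache panics
 * the system (after capturing the tainted page contents) rather
 * than merely rejecting the page.
 */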
2518 static TUNABLE(bool, panic_on_cs_killed, "panic_on_cs_killed",
2519 PANIC_ON_CS_KILLED_DEFAULT);
2520
2521 extern int proc_selfpid(void);
2522 extern char *proc_name_address(void *p);
2523 unsigned long cs_enter_tainted_rejected = 0;
2524 unsigned long cs_enter_tainted_accepted = 0;
2525
2526 /*
2527 * CODE SIGNING:
2528 * When soft faulting a page, we have to validate the page if:
2529 * 1. the page is being mapped in user space
2530 * 2. the page hasn't already been found to be "tainted"
2531 * 3. the page belongs to a code-signed object
2532 * 4. the page has not been validated yet or has been mapped for write.
2533 */
2534 static bool
2535 vm_fault_cs_need_validation(
2536 pmap_t pmap,
2537 vm_page_t page,
2538 vm_object_t page_obj,
2539 vm_map_size_t fault_page_size,
2540 vm_map_offset_t fault_phys_offset)
2541 {
2542 if (pmap == kernel_pmap) {
2543 /* 1 - not user space */
2544 return false;
2545 }
2546 if (!page_obj->code_signed) {
2547 /* 3 - page does not belong to a code-signed object */
2548 return false;
2549 }
2550 if (fault_page_size == PAGE_SIZE) {
2551 /* looking at the whole page */
2552 assertf(fault_phys_offset == 0,
2553 "fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
2554 (uint64_t)fault_page_size,
2555 (uint64_t)fault_phys_offset);
2556 if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
2557 /* 2 - page is all tainted */
2558 return false;
2559 }
2560 if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
2561 !page->vmp_wpmapped) {
2562 /* 4 - already fully validated and never mapped writable */
2563 return false;
2564 }
2565 } else {
2566 /* looking at a specific sub-page */
2567 if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
2568 /* 2 - sub-page was already marked as tainted */
2569 return false;
2570 }
2571 if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) &&
2572 !page->vmp_wpmapped) {
2573 /* 4 - already validated and never mapped writable */
2574 return false;
2575 }
2576 }
2577 /* page needs to be validated */
2578 return true;
2579 }
2580
2581
2582 static bool
2583 vm_fault_cs_page_immutable(
2584 vm_page_t m,
2585 vm_map_size_t fault_page_size,
2586 vm_map_offset_t fault_phys_offset,
2587 vm_prot_t prot __unused)
2588 {
2589 if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)
2590 /*&& ((prot) & VM_PROT_EXECUTE)*/) {
2591 return true;
2592 }
2593 return false;
2594 }
2595
2596 static bool
2597 vm_fault_cs_page_nx(
2598 vm_page_t m,
2599 vm_map_size_t fault_page_size,
2600 vm_map_offset_t fault_phys_offset)
2601 {
2602 return VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2603 }
2604
2605 /*
2606 * Check if the page being entered into the pmap violates code signing.
2607 */
2608 static kern_return_t
2609 vm_fault_cs_check_violation(
2610 bool cs_bypass,
2611 vm_object_t object,
2612 vm_page_t m,
2613 pmap_t pmap,
2614 vm_prot_t prot,
2615 vm_prot_t caller_prot,
2616 vm_map_size_t fault_page_size,
2617 vm_map_offset_t fault_phys_offset,
2618 vm_object_fault_info_t fault_info,
2619 bool map_is_switched,
2620 bool map_is_switch_protected,
2621 bool *cs_violation)
2622 {
2623 #if !PMAP_CS
2624 #pragma unused(caller_prot)
2625 #pragma unused(fault_info)
2626 #endif /* !PMAP_CS */
2627 int cs_enforcement_enabled;
2628 if (!cs_bypass &&
2629 vm_fault_cs_need_validation(pmap, m, object,
2630 fault_page_size, fault_phys_offset)) {
2631 vm_object_lock_assert_exclusive(object);
2632
2633 if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) {
2634 vm_cs_revalidates++;
2635 }
2636
2637 /* VM map is locked, so 1 ref will remain on VM object -
2638 * so no harm if vm_page_validate_cs drops the object lock */
2639
2640 vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
2641 }
2642
2643 /* If the map is switched, and is switch-protected, we must protect
2644 * some pages from being write-faulted: immutable pages because by
2645 * definition they may not be written, and executable pages because that
2646 * would provide a way to inject unsigned code.
2647 * If the page is immutable, we can simply return. However, we can't
2648 * immediately determine whether a page is executable anywhere. But,
2649 * we can disconnect it everywhere and remove the executable protection
2650 * from the current map. We do that below right before we do the
2651 * PMAP_ENTER.
2652 */
2653 if (pmap == kernel_pmap) {
2654 /* kernel fault: cs_enforcement does not apply */
2655 cs_enforcement_enabled = 0;
2656 } else {
2657 cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap);
2658 }
2659
2660 if (cs_enforcement_enabled && map_is_switched &&
2661 map_is_switch_protected &&
2662 vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2663 (prot & VM_PROT_WRITE)) {
2664 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_IMMUTABLE_PAGE_WRITE), 0 /* arg */);
2665 return KERN_CODESIGN_ERROR;
2666 }
2667
2668 if (cs_enforcement_enabled &&
2669 vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) &&
2670 (prot & VM_PROT_EXECUTE)) {
2671 if (cs_debug) {
2672 printf("page marked to be NX, not letting it be mapped EXEC\n");
2673 }
2674 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_NX_PAGE_EXEC_MAPPING), 0 /* arg */);
2675 return KERN_CODESIGN_ERROR;
2676 }
2677
2678 /* A page could be tainted, or pose a risk of being tainted later.
2679 * Check whether the receiving process wants it, and make it feel
2680 * the consequences (that happens in cs_invalid_page()).
2681 * For CS Enforcement, two other conditions will
2682 * cause that page to be tainted as well:
2683 * - pmapping an unsigned page executable - this means unsigned code;
2684 * - writeable mapping of a validated page - the content of that page
2685 * can be changed without the kernel noticing, therefore unsigned
2686 * code can be created
2687 */
2688 if (cs_bypass) {
2689 /* code-signing is bypassed */
2690 *cs_violation = FALSE;
2691 } else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
2692 /* tainted page */
2693 *cs_violation = TRUE;
2694 } else if (!cs_enforcement_enabled) {
2695 /* no further code-signing enforcement */
2696 *cs_violation = FALSE;
2697 } else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2698 ((prot & VM_PROT_WRITE) ||
2699 m->vmp_wpmapped)) {
2700 /*
2701 * The page should be immutable, but is in danger of being
2702 * modified.
2703 * This is the case where we want policy from the code
2704 * directory - is the page immutable or not? For now we have
2705 * to assume that code pages will be immutable, data pages not.
2706 * We'll assume a page is a code page if it has a code directory
2707 * and we fault for execution.
2708 * That is good enough since if we faulted the code page for
2709 * writing in another map before, it is wpmapped; if we fault
2710 * it for writing in this map later it will also be faulted for
2711 * executing at the same time; and if we fault for writing in
2712 * another map later, we will disconnect it from this pmap so
2713 * we'll notice the change.
2714 */
2715 *cs_violation = TRUE;
2716 } else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2717 (prot & VM_PROT_EXECUTE)
2718 ) {
2719 *cs_violation = TRUE;
2720 } else {
2721 *cs_violation = FALSE;
2722 }
2723 return KERN_SUCCESS;
2724 }
2725
2726 /*
2727 * Handles a code signing violation by either rejecting the page or forcing a disconnect.
2728 * @param must_disconnect This value will be set to true if the caller must disconnect
2729 * this page.
2730 * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2731 */
2732 static kern_return_t
2733 vm_fault_cs_handle_violation(
2734 vm_object_t object,
2735 vm_page_t m,
2736 pmap_t pmap,
2737 vm_prot_t prot,
2738 vm_map_offset_t vaddr,
2739 vm_map_size_t fault_page_size,
2740 vm_map_offset_t fault_phys_offset,
2741 bool map_is_switched,
2742 bool map_is_switch_protected,
2743 bool *must_disconnect)
2744 {
2745 #if !MACH_ASSERT
2746 #pragma unused(pmap)
2747 #pragma unused(map_is_switch_protected)
2748 #endif /* !MACH_ASSERT */
2749 /*
2750 * We will have a tainted page. Have to handle the special case
2751 * of a switched map now. If the map is not switched, standard
2752 * procedure applies - call cs_invalid_page().
2753 * If the map is switched, the real owner is invalid already.
2754 * There is no point in invalidating the switching process since
2755 * it will not be executing from the map. So we don't call
2756 * cs_invalid_page() in that case.
2757 */
2758 boolean_t reject_page, cs_killed;
2759 kern_return_t kr;
2760 if (map_is_switched) {
2761 assert(pmap == vm_map_pmap(current_thread()->map));
2762 assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2763 reject_page = FALSE;
2764 } else {
2765 if (cs_debug > 5) {
2766 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2767 object->code_signed ? "yes" : "no",
2768 VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2769 VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2770 m->vmp_wpmapped ? "yes" : "no",
2771 (int)prot);
2772 }
2773 reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2774 }
2775
2776 if (reject_page) {
2777 /* reject the invalid page: abort the page fault */
2778 int pid;
2779 const char *procname;
2780 task_t task;
2781 vm_object_t file_object, shadow;
2782 vm_object_offset_t file_offset;
2783 char *pathname, *filename;
2784 vm_size_t pathname_len, filename_len;
2785 boolean_t truncated_path;
2786 #define __PATH_MAX 1024
2787 struct timespec mtime, cs_mtime;
2788 int shadow_depth;
2789 os_reason_t codesigning_exit_reason = OS_REASON_NULL;
2790
2791 kr = KERN_CODESIGN_ERROR;
2792 cs_enter_tainted_rejected++;
2793
2794 /* get process name and pid */
2795 procname = "?";
2796 task = current_task();
2797 pid = proc_selfpid();
2798 if (get_bsdtask_info(task) != NULL) {
2799 procname = proc_name_address(get_bsdtask_info(task));
2800 }
2801
2802 /* get file's VM object */
2803 file_object = object;
2804 file_offset = m->vmp_offset;
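/*
 * Added note: walk down the shadow chain to the bottom object,
 * which is the one backed by the file's pager, accumulating each
 * shadow offset so that file_offset ends up being the offset
 * within that file object.
 */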
2805 for (shadow = file_object->shadow,
2806 shadow_depth = 0;
2807 shadow != VM_OBJECT_NULL;
2808 shadow = file_object->shadow,
2809 shadow_depth++) {
2810 vm_object_lock_shared(shadow);
2811 if (file_object != object) {
2812 vm_object_unlock(file_object);
2813 }
2814 file_offset += file_object->vo_shadow_offset;
2815 file_object = shadow;
2816 }
2817
2818 mtime.tv_sec = 0;
2819 mtime.tv_nsec = 0;
2820 cs_mtime.tv_sec = 0;
2821 cs_mtime.tv_nsec = 0;
2822
2823 /* get file's pathname and/or filename */
2824 pathname = NULL;
2825 filename = NULL;
2826 pathname_len = 0;
2827 filename_len = 0;
2828 truncated_path = FALSE;
2829 /* no pager -> no file -> no pathname, use "<nil>" in that case */
2830 if (file_object->pager != NULL) {
2831 pathname = kalloc_data(__PATH_MAX * 2, Z_WAITOK);
2832 if (pathname) {
2833 pathname[0] = '\0';
2834 pathname_len = __PATH_MAX;
2835 filename = pathname + pathname_len;
2836 filename_len = __PATH_MAX;
2837
2838 if (vnode_pager_get_object_name(file_object->pager,
2839 pathname,
2840 pathname_len,
2841 filename,
2842 filename_len,
2843 &truncated_path) == KERN_SUCCESS) {
2844 /* safety first... */
2845 pathname[__PATH_MAX - 1] = '\0';
2846 filename[__PATH_MAX - 1] = '\0';
2847
2848 vnode_pager_get_object_mtime(file_object->pager,
2849 &mtime,
2850 &cs_mtime);
2851 } else {
2852 kfree_data(pathname, __PATH_MAX * 2);
2853 pathname = NULL;
2854 filename = NULL;
2855 pathname_len = 0;
2856 filename_len = 0;
2857 truncated_path = FALSE;
2858 }
2859 }
2860 }
2861 printf("CODE SIGNING: process %d[%s]: "
2862 "rejecting invalid page at address 0x%llx "
2863 "from offset 0x%llx in file \"%s%s%s\" "
2864 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2865 "(signed:%d validated:%d tainted:%d nx:%d "
2866 "wpmapped:%d dirty:%d depth:%d)\n",
2867 pid, procname, (addr64_t) vaddr,
2868 file_offset,
2869 (pathname ? pathname : "<nil>"),
2870 (truncated_path ? "/.../" : ""),
2871 (truncated_path ? filename : ""),
2872 cs_mtime.tv_sec, cs_mtime.tv_nsec,
2873 ((cs_mtime.tv_sec == mtime.tv_sec &&
2874 cs_mtime.tv_nsec == mtime.tv_nsec)
2875 ? "=="
2876 : "!="),
2877 mtime.tv_sec, mtime.tv_nsec,
2878 object->code_signed,
2879 VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
2880 VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
2881 VMP_CS_NX(m, fault_page_size, fault_phys_offset),
2882 m->vmp_wpmapped,
2883 m->vmp_dirty,
2884 shadow_depth);
2885
2886 /*
2887 * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2888 * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2889 * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2890 * will deal with the segmentation fault.
2891 */
2892 if (cs_killed) {
2893 KDBG(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
2894 pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2895
2896 codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2897 if (codesigning_exit_reason == NULL) {
2898 printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2899 } else {
2900 mach_vm_address_t data_addr = 0;
2901 struct codesigning_exit_reason_info *ceri = NULL;
2902 uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
2903
2904 if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
2905 printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2906 } else {
2907 if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
2908 EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
2909 ceri = (struct codesigning_exit_reason_info *)data_addr;
2910 static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2911
2912 ceri->ceri_virt_addr = vaddr;
2913 ceri->ceri_file_offset = file_offset;
2914 if (pathname) {
2915 strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
2916 } else {
2917 ceri->ceri_pathname[0] = '\0';
2918 }
2919 if (filename) {
2920 strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
2921 } else {
2922 ceri->ceri_filename[0] = '\0';
2923 }
2924 ceri->ceri_path_truncated = (truncated_path ? 1 : 0);
2925 ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2926 ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2927 ceri->ceri_page_modtime_secs = mtime.tv_sec;
2928 ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2929 ceri->ceri_object_codesigned = (object->code_signed);
2930 ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset);
2931 ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset);
2932 ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2933 ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
2934 ceri->ceri_page_slid = 0;
2935 ceri->ceri_page_dirty = (m->vmp_dirty);
2936 ceri->ceri_page_shadow_depth = shadow_depth;
2937 } else {
2938 #if DEBUG || DEVELOPMENT
2939 panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2940 #else
2941 printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2942 #endif /* DEBUG || DEVELOPMENT */
2943 /* Free the buffer */
2944 os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
2945 }
2946 }
2947 }
2948
2949 set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
2950 }
2951 if (panic_on_cs_killed &&
2952 object->object_is_shared_cache) {
2953 char *tainted_contents;
2954 vm_map_offset_t src_vaddr;
2955 src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
2956 tainted_contents = kalloc_data(PAGE_SIZE, Z_WAITOK);
2957 bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
2958 printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
2959 panic("CODE SIGNING: process %d[%s]: "
2960 "rejecting invalid page (phys#0x%x) at address 0x%llx "
2961 "from offset 0x%llx in file \"%s%s%s\" "
2962 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2963 "(signed:%d validated:%d tainted:%d nx:%d"
2964 "wpmapped:%d dirty:%d depth:%d)\n",
2965 pid, procname,
2966 VM_PAGE_GET_PHYS_PAGE(m),
2967 (addr64_t) vaddr,
2968 file_offset,
2969 (pathname ? pathname : "<nil>"),
2970 (truncated_path ? "/.../" : ""),
2971 (truncated_path ? filename : ""),
2972 cs_mtime.tv_sec, cs_mtime.tv_nsec,
2973 ((cs_mtime.tv_sec == mtime.tv_sec &&
2974 cs_mtime.tv_nsec == mtime.tv_nsec)
2975 ? "=="
2976 : "!="),
2977 mtime.tv_sec, mtime.tv_nsec,
2978 object->code_signed,
2979 VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
2980 VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
2981 VMP_CS_NX(m, fault_page_size, fault_phys_offset),
2982 m->vmp_wpmapped,
2983 m->vmp_dirty,
2984 shadow_depth);
2985 }
2986
2987 if (file_object != object) {
2988 vm_object_unlock(file_object);
2989 }
2990 if (pathname_len != 0) {
2991 kfree_data(pathname, __PATH_MAX * 2);
2992 pathname = NULL;
2993 filename = NULL;
2994 }
2995 } else {
2996 /* proceed with the invalid page */
2997 kr = KERN_SUCCESS;
2998 if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2999 !object->code_signed) {
3000 /*
3001 * This page has not been (fully) validated but
3002 * does not belong to a code-signed object
3003 * so it should not be forcefully considered
3004 * as tainted.
3005 * We're just concerned about it here because
3006 * we've been asked to "execute" it but that
3007 * does not mean that it should cause other
3008 * accesses to fail.
3009 * This happens when a debugger sets a
3010 * breakpoint and we then execute code in
3011 * that page. Marking the page as "tainted"
3012 * would cause any inspection tool ("leaks",
3013 * "vmmap", "CrashReporter", ...) to get killed
3014 * due to code-signing violation on that page,
3015 * even though they're just reading it and not
3016 * executing from it.
3017 */
3018 } else {
3019 /*
3020 * Page might have been tainted before or not;
3021 * now it definitively is. If the page wasn't
3022 * tainted, we must disconnect it from all
3023 * pmaps later, to force existing mappings
3024 * through that code path for re-consideration
3025 * of the validity of that page.
3026 */
3027 if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
3028 *must_disconnect = TRUE;
3029 VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE);
3030 }
3031 }
3032 cs_enter_tainted_accepted++;
3033 }
3034 if (kr != KERN_SUCCESS) {
3035 if (cs_debug) {
3036 printf("CODESIGNING: vm_fault_enter(0x%llx): "
3037 "*** INVALID PAGE ***\n",
3038 (long long)vaddr);
3039 }
3040 #if !SECURE_KERNEL
3041 if (cs_enforcement_panic) {
3042 panic("CODESIGNING: panicking on invalid page");
3043 }
3044 #endif
3045 }
3046 return kr;
3047 }
3048
3049 /*
3050 * Check that the code signature is valid for the given page being inserted into
3051 * the pmap.
3052 *
3053 * @param must_disconnect This value will be set to true if the caller must disconnect
3054 * this page.
3055 * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
3056 */
3057 static kern_return_t
3058 vm_fault_validate_cs(
3059 bool cs_bypass,
3060 vm_object_t object,
3061 vm_page_t m,
3062 pmap_t pmap,
3063 vm_map_offset_t vaddr,
3064 vm_prot_t prot,
3065 vm_prot_t caller_prot,
3066 vm_map_size_t fault_page_size,
3067 vm_map_offset_t fault_phys_offset,
3068 vm_object_fault_info_t fault_info,
3069 bool *must_disconnect)
3070 {
3071 bool map_is_switched, map_is_switch_protected, cs_violation;
3072 kern_return_t kr;
3073 /* Validate code signature if necessary. */
3074 map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
3075 (pmap == vm_map_pmap(current_thread()->map)));
3076 map_is_switch_protected = current_thread()->map->switch_protect;
3077 kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap,
3078 prot, caller_prot, fault_page_size, fault_phys_offset, fault_info,
3079 map_is_switched, map_is_switch_protected, &cs_violation);
3080 if (kr != KERN_SUCCESS) {
3081 return kr;
3082 }
3083 if (cs_violation) {
3084 kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr,
3085 fault_page_size, fault_phys_offset,
3086 map_is_switched, map_is_switch_protected, must_disconnect);
3087 }
3088 return kr;
3089 }
3090
3091 /*
3092 * Enqueue the page on the appropriate paging queue.
3093 */
3094 static void
3095 vm_fault_enqueue_page(
3096 vm_object_t object,
3097 vm_page_t m,
3098 bool wired,
3099 bool change_wiring,
3100 vm_tag_t wire_tag,
3101 bool no_cache,
3102 int *type_of_fault,
3103 kern_return_t kr)
3104 {
3105 assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
3106 boolean_t page_queues_locked = FALSE;
3107 boolean_t previously_pmapped = m->vmp_pmapped;
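/*
 * Added note: helper macros to take and drop the global
 * page-queues lock lazily, so only the paths below that actually
 * touch the paging queues pay for it.
 */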
3108 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED() \
3109 MACRO_BEGIN \
3110 if (! page_queues_locked) { \
3111 page_queues_locked = TRUE; \
3112 vm_page_lockspin_queues(); \
3113 } \
3114 MACRO_END
3115 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() \
3116 MACRO_BEGIN \
3117 if (page_queues_locked) { \
3118 page_queues_locked = FALSE; \
3119 vm_page_unlock_queues(); \
3120 } \
3121 MACRO_END
3122
3123 vm_page_update_special_state(m);
3124 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
3125 /*
3126 * Compressor pages are neither wired
3127 * nor pageable and should never change.
3128 */
3129 assert(object == compressor_object);
3130 } else if (change_wiring) {
3131 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3132
3133 if (wired) {
3134 if (kr == KERN_SUCCESS) {
3135 vm_page_wire(m, wire_tag, TRUE);
3136 }
3137 } else {
3138 vm_page_unwire(m, TRUE);
3139 }
3140 /* we keep the page queues lock, if we need it later */
3141 } else {
3142 if (object->internal == TRUE) {
3143 /*
3144 * don't allow anonymous pages on
3145 * the speculative queues
3146 */
3147 no_cache = FALSE;
3148 }
3149 if (kr != KERN_SUCCESS) {
3150 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3151 vm_page_deactivate(m);
3152 /* we keep the page queues lock, if we need it later */
3153 } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
3154 (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3155 (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
3156 ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3157 !VM_PAGE_WIRED(m)) {
3158 if (vm_page_local_q &&
3159 (*type_of_fault == DBG_COW_FAULT ||
3160 *type_of_fault == DBG_ZERO_FILL_FAULT)) {
3161 struct vpl *lq;
3162 uint32_t lid;
3163
3164 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3165
3166 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3167 vm_object_lock_assert_exclusive(object);
3168
3169 /*
3170 * we got a local queue to stuff this
3171 * new page on...
3172 * it's safe to manipulate local and
3173 * local_id at this point since we're
3174 * behind an exclusive object lock and
3175 * the page is not on any global queue.
3176 *
3177 * we'll use the current cpu number to
3178 * select the queue... note that we don't
3179 * need to disable preemption... we're
3180 * going to be behind the local queue's
3181 * lock to do the real work
3182 */
3183 lid = cpu_number();
3184
3185 lq = zpercpu_get_cpu(vm_page_local_q, lid);
3186
3187 VPL_LOCK(&lq->vpl_lock);
3188
3189 vm_page_check_pageable_safe(m);
3190 vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
3191 m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3192 m->vmp_local_id = lid;
3193 lq->vpl_count++;
3194
3195 if (object->internal) {
3196 lq->vpl_internal_count++;
3197 } else {
3198 lq->vpl_external_count++;
3199 }
3200
3201 VPL_UNLOCK(&lq->vpl_lock);
3202
3203 if (lq->vpl_count > vm_page_local_q_soft_limit) {
3204 /*
3205 * we're beyond the soft limit
3206 * for the local queue
3207 * vm_page_reactivate_local will
3208 * 'try' to take the global page
3209 * queue lock... if it can't
3210 * that's ok... we'll let the
3211 * queue continue to grow up
3212 * to the hard limit... at that
3213 * point we'll wait for the
3214 * lock... once we've got the
3215 * lock, we'll transfer all of
3216 * the pages from the local
3217 * queue to the global active
3218 * queue
3219 */
3220 vm_page_reactivate_local(lid, FALSE, FALSE);
3221 }
3222 } else {
3223 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3224
3225 /*
3226 * test again now that we hold the
3227 * page queue lock
3228 */
3229 if (!VM_PAGE_WIRED(m)) {
3230 if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3231 vm_page_queues_remove(m, FALSE);
3232
3233 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3234 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
3235 }
3236
3237 if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
3238 no_cache) {
3239 /*
3240 * If this is a no_cache mapping
3241 * and the page has never been
3242 * mapped before or was
3243 * previously a no_cache page,
3244 * then we want to leave pages
3245 * in the speculative state so
3246 * that they can be readily
3247 * recycled if free memory runs
3248 * low. Otherwise the page is
3249 * activated as normal.
3250 */
3251
3252 if (no_cache &&
3253 (!previously_pmapped ||
3254 m->vmp_no_cache)) {
3255 m->vmp_no_cache = TRUE;
3256
3257 if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
3258 vm_page_speculate(m, FALSE);
3259 }
3260 } else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3261 vm_page_activate(m);
3262 }
3263 }
3264 }
3265 /* we keep the page queues lock, if we need it later */
3266 }
3267 }
3268 }
3269 /* we're done with the page queues lock, if we ever took it */
3270 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3271 }
3272
3273 /*
3274 * Sets the pmapped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
3275 * @return true if the page needs to be sync'ed via pmap_sync_page_data_phys
3276 * before being inserted into the pmap.
3277 */
3278 static bool
3279 vm_fault_enter_set_mapped(
3280 vm_object_t object,
3281 vm_page_t m,
3282 vm_prot_t prot,
3283 vm_prot_t fault_type)
3284 {
3285 bool page_needs_sync = false;
3286 /*
3287 * NOTE: we may only hold the vm_object lock SHARED
3288 * at this point, so we need the phys_page lock to
3289 * properly serialize updating the pmapped and
3290 * xpmapped bits
3291 */
3292 if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3293 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3294
3295 pmap_lock_phys_page(phys_page);
3296 m->vmp_pmapped = TRUE;
3297
3298 if (!m->vmp_xpmapped) {
3299 m->vmp_xpmapped = TRUE;
3300
3301 pmap_unlock_phys_page(phys_page);
3302
3303 if (!object->internal) {
3304 OSAddAtomic(1, &vm_page_xpmapped_external_count);
3305 }
3306
3307 #if defined(__arm64__)
3308 page_needs_sync = true;
3309 #else
3310 if (object->internal &&
3311 object->pager != NULL) {
3312 /*
3313 * This page could have been
3314 * uncompressed by the
3315 * compressor pager and its
3316 * contents might be only in
3317 * the data cache.
3318 * Since it's being mapped for
3319 * "execute" for the fist time,
3320 * make sure the icache is in
3321 * sync.
3322 */
3323 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3324 page_needs_sync = true;
3325 }
3326 #endif
3327 } else {
3328 pmap_unlock_phys_page(phys_page);
3329 }
3330 } else {
3331 if (m->vmp_pmapped == FALSE) {
3332 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3333
3334 pmap_lock_phys_page(phys_page);
3335 m->vmp_pmapped = TRUE;
3336 pmap_unlock_phys_page(phys_page);
3337 }
3338 }
3339
3340 if (fault_type & VM_PROT_WRITE) {
3341 if (m->vmp_wpmapped == FALSE) {
3342 vm_object_lock_assert_exclusive(object);
3343 if (!object->internal && object->pager) {
3344 task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3345 }
3346 m->vmp_wpmapped = TRUE;
3347 }
3348 }
3349 return page_needs_sync;
3350 }
3351
3352 /*
3353 * Try to enter the given page into the pmap.
3354 * Will retry without execute permission iff PMAP_CS is enabled and we encounter
3355 * a codesigning failure on a non-execute fault.
3356 */
3357 static kern_return_t
3358 vm_fault_attempt_pmap_enter(
3359 pmap_t pmap,
3360 vm_map_offset_t vaddr,
3361 vm_map_size_t fault_page_size,
3362 vm_map_offset_t fault_phys_offset,
3363 vm_page_t m,
3364 vm_prot_t *prot,
3365 vm_prot_t caller_prot,
3366 vm_prot_t fault_type,
3367 bool wired,
3368 int pmap_options)
3369 {
3370 #if !PMAP_CS
3371 #pragma unused(caller_prot)
3372 #endif /* !PMAP_CS */
3373 kern_return_t kr;
3374 if (fault_page_size != PAGE_SIZE) {
3375 DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type);
3376 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
3377 fault_phys_offset < PAGE_SIZE),
3378 "0x%llx\n", (uint64_t)fault_phys_offset);
3379 } else {
3380 assertf(fault_phys_offset == 0,
3381 "0x%llx\n", (uint64_t)fault_phys_offset);
3382 }
3383
3384 PMAP_ENTER_OPTIONS(pmap, vaddr,
3385 fault_phys_offset,
3386 m, *prot, fault_type, 0,
3387 wired,
3388 pmap_options,
3389 kr);
3390 return kr;
3391 }
3392
3393 /*
3394 * Enter the given page into the pmap.
3395 * The map must be locked shared.
3396 * The vm object must NOT be locked.
3397 *
3398 * @param need_retry if not null, avoid making a (potentially) blocking call into
3399 * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3400 */
3401 static kern_return_t
3402 vm_fault_pmap_enter(
3403 pmap_t pmap,
3404 vm_map_offset_t vaddr,
3405 vm_map_size_t fault_page_size,
3406 vm_map_offset_t fault_phys_offset,
3407 vm_page_t m,
3408 vm_prot_t *prot,
3409 vm_prot_t caller_prot,
3410 vm_prot_t fault_type,
3411 bool wired,
3412 int pmap_options,
3413 boolean_t *need_retry)
3414 {
3415 kern_return_t kr;
3416 if (need_retry != NULL) {
3417 /*
3418 * Although we don't hold a lock on this object, we hold a lock
3419 * on the top object in the chain. To prevent a deadlock, we
3420 * can't allow the pmap layer to block.
3421 */
3422 pmap_options |= PMAP_OPTIONS_NOWAIT;
3423 }
3424 kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3425 fault_page_size, fault_phys_offset,
3426 m, prot, caller_prot, fault_type, wired, pmap_options);
3427 if (kr == KERN_RESOURCE_SHORTAGE) {
3428 if (need_retry) {
3429 /*
3430 * There's nothing we can do here since we hold the
3431 * lock on the top object in the chain. The caller
3432 * will need to deal with this by dropping that lock and retrying.
3433 */
3434 *need_retry = TRUE;
3435 vm_pmap_enter_retried++;
3436 }
3437 }
3438 return kr;
3439 }
3440
3441 /*
3442 * Enter the given page into the pmap.
3443 * The vm map must be locked shared.
3444 * The vm object must be locked exclusive, unless this is a soft fault.
3445 * For a soft fault, the object must be locked shared or exclusive.
3446 *
3447 * @param need_retry if not null, avoid making a (potentially) blocking call into
3448 * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3449 */
3450 static kern_return_t
3451 vm_fault_pmap_enter_with_object_lock(
3452 vm_object_t object,
3453 pmap_t pmap,
3454 vm_map_offset_t vaddr,
3455 vm_map_size_t fault_page_size,
3456 vm_map_offset_t fault_phys_offset,
3457 vm_page_t m,
3458 vm_prot_t *prot,
3459 vm_prot_t caller_prot,
3460 vm_prot_t fault_type,
3461 bool wired,
3462 int pmap_options,
3463 boolean_t *need_retry)
3464 {
3465 kern_return_t kr;
3466 /*
3467 * Prevent a deadlock by not
3468 * holding the object lock if we need to wait for a page in
3469 * pmap_enter() - <rdar://problem/7138958>
3470 */
3471 kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3472 fault_page_size, fault_phys_offset,
3473 m, prot, caller_prot, fault_type, wired, pmap_options | PMAP_OPTIONS_NOWAIT);
3474 #if __x86_64__
3475 if (kr == KERN_INVALID_ARGUMENT &&
3476 pmap == PMAP_NULL &&
3477 wired) {
3478 /*
3479 * Wiring a page in a pmap-less VM map:
3480 * VMware's "vmmon" kernel extension does this
3481 * to grab pages.
3482 * Let it proceed even though the PMAP_ENTER() failed.
3483 */
3484 kr = KERN_SUCCESS;
3485 }
3486 #endif /* __x86_64__ */
3487
3488 if (kr == KERN_RESOURCE_SHORTAGE) {
3489 if (need_retry) {
3490 /*
3491 * this will be non-null in the case where we hold the lock
3492 * on the top-object in this chain... we can't just drop
3493 * the lock on the object we're inserting the page into
3494 * and recall the PMAP_ENTER since we can still cause
3495 * a deadlock if one of the critical paths tries to
3496 * acquire the lock on the top-object and we're blocked
3497 * in PMAP_ENTER waiting for memory... our only recourse
3498 * is to deal with it at a higher level where we can
3499 * drop both locks.
3500 */
3501 *need_retry = TRUE;
3502 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PMAP_ENTER_RESOURCE_SHORTAGE), 0 /* arg */);
3503 vm_pmap_enter_retried++;
3504 goto done;
3505 }
3506 /*
3507 * The nonblocking version of pmap_enter did not succeed,
3508 * and we don't need to drop other locks and retry
3509 * at the level above us, so
3510 * use the blocking version instead. This requires marking
3511 * the page busy and unlocking the object.
3512 */
3513 boolean_t was_busy = m->vmp_busy;
3514
3515 vm_object_lock_assert_exclusive(object);
3516
3517 m->vmp_busy = TRUE;
3518 vm_object_unlock(object);
3519
3520 PMAP_ENTER_OPTIONS(pmap, vaddr,
3521 fault_phys_offset,
3522 m, *prot, fault_type,
3523 0, wired,
3524 pmap_options, kr);
3525
3526 assert(VM_PAGE_OBJECT(m) == object);
3527
3528 /* Take the object lock again. */
3529 vm_object_lock(object);
3530
3531 /* If the page was busy, someone else will wake it up.
3532 * Otherwise, we have to do it now. */
3533 assert(m->vmp_busy);
3534 if (!was_busy) {
3535 PAGE_WAKEUP_DONE(m);
3536 }
3537 vm_pmap_enter_blocked++;
3538 }
3539
3540 done:
3541 return kr;
3542 }
3543
3544 /*
3545 * Prepare to enter a page into the pmap by checking CS, protection bits,
3546 * and setting mapped bits on the page_t.
3547 * Does not modify the page's paging queue.
3548 *
3549 * page queue lock must NOT be held
3550 * m->vmp_object must be locked
3551 *
3552 * NOTE: m->vmp_object could be locked "shared" only if we are called
3553 * from vm_fault() as part of a soft fault.
3554 */
3555 static kern_return_t
3556 vm_fault_enter_prepare(
3557 vm_page_t m,
3558 pmap_t pmap,
3559 vm_map_offset_t vaddr,
3560 vm_prot_t *prot,
3561 vm_prot_t caller_prot,
3562 vm_map_size_t fault_page_size,
3563 vm_map_offset_t fault_phys_offset,
3564 boolean_t change_wiring,
3565 vm_prot_t fault_type,
3566 vm_object_fault_info_t fault_info,
3567 int *type_of_fault,
3568 bool *page_needs_data_sync)
3569 {
3570 kern_return_t kr;
3571 bool is_tainted = false;
3572 vm_object_t object;
3573 boolean_t cs_bypass = fault_info->cs_bypass;
3574
3575 object = VM_PAGE_OBJECT(m);
3576
3577 vm_object_lock_assert_held(object);
3578
3579 #if KASAN
3580 if (pmap == kernel_pmap) {
3581 kasan_notify_address(vaddr, PAGE_SIZE);
3582 }
3583 #endif
3584
3585 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
3586
3587 if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
3588 vm_object_lock_assert_exclusive(object);
3589 } else if ((fault_type & VM_PROT_WRITE) == 0 &&
3590 !change_wiring &&
3591 (!m->vmp_wpmapped
3592 #if VM_OBJECT_ACCESS_TRACKING
3593 || object->access_tracking
3594 #endif /* VM_OBJECT_ACCESS_TRACKING */
3595 )) {
3596 /*
3597 * This is not a "write" fault, so we
3598 * might not have taken the object lock
3599 * exclusively and we might not be able
3600 * to update the "wpmapped" bit in
3601 * vm_fault_enter().
3602 * Let's just grant read access to
3603 * the page for now and we'll
3604 * soft-fault again if we need write
3605 * access later...
3606 */
3607
3608 /* This had better not be a JIT page. */
3609 if (!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
3610 *prot &= ~VM_PROT_WRITE;
3611 } else {
3612 assert(cs_bypass);
3613 }
3614 }
3615 if (m->vmp_pmapped == FALSE) {
3616 if (m->vmp_clustered) {
3617 if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
3618 /*
3619 * found it in the cache, but this
3620 * is the first fault-in of the page (m->vmp_pmapped == FALSE)
3621 * so it must have come in as part of
3622 * a cluster... account 1 pagein against it
3623 */
3624 if (object->internal) {
3625 *type_of_fault = DBG_PAGEIND_FAULT;
3626 } else {
3627 *type_of_fault = DBG_PAGEINV_FAULT;
3628 }
3629
3630 VM_PAGE_COUNT_AS_PAGEIN(m);
3631 }
3632 VM_PAGE_CONSUME_CLUSTERED(m);
3633 }
3634 }
3635
3636 if (*type_of_fault != DBG_COW_FAULT) {
3637 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
3638
3639 if (pmap == kernel_pmap) {
3640 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
3641 }
3642 }
3643
3644 kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr,
3645 *prot, caller_prot, fault_page_size, fault_phys_offset,
3646 fault_info, &is_tainted);
3647 if (kr == KERN_SUCCESS) {
3648 /*
3649 * We either have a good page, or a tainted page that has been accepted by the process.
3650 * In both cases the page will be entered into the pmap.
3651 */
3652 *page_needs_data_sync = vm_fault_enter_set_mapped(object, m, *prot, fault_type);
3653 if ((fault_type & VM_PROT_WRITE) && is_tainted) {
3654 /*
3655 * This page is tainted but we're inserting it anyways.
3656 * Since it's writeable, we need to disconnect it from other pmaps
3657 * now so those processes can take note.
3658 */
3659
3660 /*
3661 * We can only get here
3662 * because of the CSE logic
3663 */
3664 assert(pmap_get_vm_map_cs_enforced(pmap));
3665 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3666 /*
3667 * If we are faulting for a write, we can clear
3668 * the execute bit - that will ensure the page is
3669 * checked again before being executable, which
3670 * protects against a map switch.
3671 * This only happens the first time the page
3672 * gets tainted, so we won't get stuck here
3673 * to make an already writeable page executable.
3674 */
3675 if (!cs_bypass) {
3676 assert(!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot));
3677 *prot &= ~VM_PROT_EXECUTE;
3678 }
3679 }
3680 assert(VM_PAGE_OBJECT(m) == object);
3681
3682 #if VM_OBJECT_ACCESS_TRACKING
3683 if (object->access_tracking) {
3684 DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
3685 if (fault_type & VM_PROT_WRITE) {
3686 object->access_tracking_writes++;
3687 vm_object_access_tracking_writes++;
3688 } else {
3689 object->access_tracking_reads++;
3690 vm_object_access_tracking_reads++;
3691 }
3692 }
3693 #endif /* VM_OBJECT_ACCESS_TRACKING */
3694 }
3695
3696 return kr;
3697 }
3698
3699 /*
3700 * page queue lock must NOT be held
3701 * m->vmp_object must be locked
3702 *
3703 * NOTE: m->vmp_object could be locked "shared" only if we are called
3704 * from vm_fault() as part of a soft fault. If so, we must be
3705 * careful not to modify the VM object in any way that is not
3706 * legal under a shared lock...
3707 */
3708 kern_return_t
3709 vm_fault_enter(
3710 vm_page_t m,
3711 pmap_t pmap,
3712 vm_map_offset_t vaddr,
3713 vm_map_size_t fault_page_size,
3714 vm_map_offset_t fault_phys_offset,
3715 vm_prot_t prot,
3716 vm_prot_t caller_prot,
3717 boolean_t wired,
3718 boolean_t change_wiring,
3719 vm_tag_t wire_tag,
3720 vm_object_fault_info_t fault_info,
3721 boolean_t *need_retry,
3722 int *type_of_fault)
3723 {
3724 kern_return_t kr;
3725 vm_object_t object;
3726 bool page_needs_data_sync;
3727 vm_prot_t fault_type;
3728 int pmap_options = fault_info->pmap_options;
3729
3730 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
3731 assert(m->vmp_fictitious);
3732 return KERN_SUCCESS;
3733 }
3734
3735 fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
3736
3737 assertf(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL, "m=%p", m);
3738 kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot,
3739 fault_page_size, fault_phys_offset, change_wiring, fault_type,
3740 fault_info, type_of_fault, &page_needs_data_sync);
3741 object = VM_PAGE_OBJECT(m);
3742
3743 vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr);
3744
3745 if (kr == KERN_SUCCESS) {
3746 if (page_needs_data_sync) {
3747 pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
3748 }
3749
3750 kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
3751 fault_page_size, fault_phys_offset, m,
3752 &prot, caller_prot, fault_type, wired, pmap_options, need_retry);
3753 }
3754
3755 return kr;
3756 }
3757
3758 void
3759 vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
3760 {
3761 if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
3762 vm_fault(current_map(), /* map */
3763 vaddr, /* vaddr */
3764 prot, /* fault_type */
3765 FALSE, /* change_wiring */
3766 VM_KERN_MEMORY_NONE, /* tag - not wiring */
3767 THREAD_UNINT, /* interruptible */
3768 NULL, /* caller_pmap */
3769 0 /* caller_pmap_addr */);
3770 }
3771 }
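/*
 * Illustrative sketch: vm_pre_fault() can be used to warm a range of user
 * addresses ahead of time so that a later access does not have to take the
 * full fault path.  The range bounds below are hypothetical.
 *
 *	for (vm_map_offset_t va = start; va < end; va += PAGE_SIZE) {
 *		vm_pre_fault(va, VM_PROT_READ);
 *	}
 */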
3772
3773
3774 /*
3775 * Routine: vm_fault
3776 * Purpose:
3777 * Handle page faults, including pseudo-faults
3778 * used to change the wiring status of pages.
3779 * Returns:
3780 * Explicit continuations have been removed.
3781 * Implementation:
3782 * vm_fault and vm_fault_page save mucho state
3783 * in the moral equivalent of a closure. The state
3784 * structure is allocated when first entering vm_fault
3785 * and deallocated when leaving vm_fault.
3786 */
3787
3788 extern uint64_t get_current_unique_pid(void);
3789
3790 unsigned long vm_fault_collapse_total = 0;
3791 unsigned long vm_fault_collapse_skipped = 0;
3792
3793
3794 kern_return_t
3795 vm_fault_external(
3796 vm_map_t map,
3797 vm_map_offset_t vaddr,
3798 vm_prot_t fault_type,
3799 boolean_t change_wiring,
3800 int interruptible,
3801 pmap_t caller_pmap,
3802 vm_map_offset_t caller_pmap_addr)
3803 {
3804 return vm_fault_internal(map, vaddr, fault_type, change_wiring,
3805 change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
3806 interruptible, caller_pmap, caller_pmap_addr,
3807 NULL);
3808 }
3809
3810 kern_return_t
3811 vm_fault(
3812 vm_map_t map,
3813 vm_map_offset_t vaddr,
3814 vm_prot_t fault_type,
3815 boolean_t change_wiring,
3816 vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3817 int interruptible,
3818 pmap_t caller_pmap,
3819 vm_map_offset_t caller_pmap_addr)
3820 {
3821 return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
3822 interruptible, caller_pmap, caller_pmap_addr,
3823 NULL);
3824 }
3825
3826 static boolean_t
3827 current_proc_is_privileged(void)
3828 {
3829 return csproc_get_platform_binary(current_proc());
3830 }
3831
3832 uint64_t vm_copied_on_read = 0;
3833
3834 /*
3835 * Cleanup after a vm_fault_enter.
3836 * At this point, the fault should either have failed (kr != KERN_SUCCESS)
3837 * or the page should be in the pmap and on the correct paging queue.
3838 *
3839 * Precondition:
3840 * map must be locked shared.
3841 * m_object must be locked.
3842 * If top_object != VM_OBJECT_NULL, it must be locked.
3843 * real_map must be locked.
3844 *
3845 * Postcondition:
3846 * map will be unlocked
3847 * m_object will be unlocked
3848 * top_object will be unlocked
3849 * If real_map != map, it will be unlocked
3850 */
3851 static void
3852 vm_fault_complete(
3853 vm_map_t map,
3854 vm_map_t real_map,
3855 vm_object_t object,
3856 vm_object_t m_object,
3857 vm_page_t m,
3858 vm_map_offset_t offset,
3859 vm_map_offset_t trace_real_vaddr,
3860 vm_object_fault_info_t fault_info,
3861 vm_prot_t caller_prot,
3862 #if CONFIG_DTRACE
3863 vm_map_offset_t real_vaddr,
3864 #else
3865 __unused vm_map_offset_t real_vaddr,
3866 #endif /* CONFIG_DTRACE */
3867 int type_of_fault,
3868 boolean_t need_retry,
3869 kern_return_t kr,
3870 ppnum_t *physpage_p,
3871 vm_prot_t prot,
3872 vm_object_t top_object,
3873 boolean_t need_collapse,
3874 vm_map_offset_t cur_offset,
3875 vm_prot_t fault_type,
3876 vm_object_t *written_on_object,
3877 memory_object_t *written_on_pager,
3878 vm_object_offset_t *written_on_offset)
3879 {
3880 int event_code = 0;
3881 vm_map_lock_assert_shared(map);
3882 vm_object_lock_assert_held(m_object);
3883 if (top_object != VM_OBJECT_NULL) {
3884 vm_object_lock_assert_held(top_object);
3885 }
3886 vm_map_lock_assert_held(real_map);
3887
3888 if (m_object->internal) {
3889 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
3890 } else if (m_object->object_is_shared_cache) {
3891 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
3892 } else {
3893 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
3894 }
3895 KDBG_RELEASE(event_code | DBG_FUNC_NONE, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid());
3896 if (need_retry == FALSE) {
3897 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid());
3898 }
3899 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
3900 if (kr == KERN_SUCCESS &&
3901 physpage_p != NULL) {
3902 /* for vm_map_wire_and_extract() */
3903 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
3904 if (prot & VM_PROT_WRITE) {
3905 vm_object_lock_assert_exclusive(m_object);
3906 m->vmp_dirty = TRUE;
3907 }
3908 }
3909
3910 if (top_object != VM_OBJECT_NULL) {
3911 /*
3912 * It's safe to drop the top object
3913 * now that we've done our
3914 * vm_fault_enter(). Any other fault
3915 * in progress for that virtual
3916 * address will either find our page
3917 * and translation or put in a new page
3918 * and translation.
3919 */
3920 vm_object_unlock(top_object);
3921 top_object = VM_OBJECT_NULL;
3922 }
3923
3924 if (need_collapse == TRUE) {
3925 vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
3926 }
3927
3928 if (need_retry == FALSE &&
3929 (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3930 /*
3931 * evaluate access pattern and update state
3932 * vm_fault_deactivate_behind depends on the
3933 * state being up to date
3934 */
3935 vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior);
3936
3937 vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior);
3938 }
3939 /*
3940 * That's it, clean up and return.
3941 */
3942 if (m->vmp_busy) {
3943 vm_object_lock_assert_exclusive(m_object);
3944 PAGE_WAKEUP_DONE(m);
3945 }
3946
3947 if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
3948 vm_object_paging_begin(m_object);
3949
3950 assert(*written_on_object == VM_OBJECT_NULL);
3951 *written_on_object = m_object;
3952 *written_on_pager = m_object->pager;
3953 *written_on_offset = m_object->paging_offset + m->vmp_offset;
3954 }
3955 vm_object_unlock(object);
3956
3957 vm_map_unlock_read(map);
3958 if (real_map != map) {
3959 vm_map_unlock(real_map);
3960 }
3961 }
3962
3963 static inline int
3964 vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
3965 {
3966 if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
3967 return DBG_COR_FAULT;
3968 }
3969 return type_of_fault;
3970 }
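/*
 * For example, a copy-on-write fault that was induced by the copy-on-read
 * logic (need_copy_on_read == TRUE) is reported as DBG_COR_FAULT instead of
 * DBG_COW_FAULT, so the two cases can be distinguished in fault traces.
 */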
3971
3972 uint64_t vm_fault_resilient_media_initiate = 0;
3973 uint64_t vm_fault_resilient_media_retry = 0;
3974 uint64_t vm_fault_resilient_media_proceed = 0;
3975 uint64_t vm_fault_resilient_media_release = 0;
3976 uint64_t vm_fault_resilient_media_abort1 = 0;
3977 uint64_t vm_fault_resilient_media_abort2 = 0;
3978
3979 #if MACH_ASSERT
3980 int vm_fault_resilient_media_inject_error1_rate = 0;
3981 int vm_fault_resilient_media_inject_error1 = 0;
3982 int vm_fault_resilient_media_inject_error2_rate = 0;
3983 int vm_fault_resilient_media_inject_error2 = 0;
3984 int vm_fault_resilient_media_inject_error3_rate = 0;
3985 int vm_fault_resilient_media_inject_error3 = 0;
3986 #endif /* MACH_ASSERT */
3987
3988 kern_return_t
3989 vm_fault_internal(
3990 vm_map_t map,
3991 vm_map_offset_t vaddr,
3992 vm_prot_t caller_prot,
3993 boolean_t change_wiring,
3994 vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3995 int interruptible,
3996 pmap_t caller_pmap,
3997 vm_map_offset_t caller_pmap_addr,
3998 ppnum_t *physpage_p)
3999 {
4000 vm_map_version_t version; /* Map version for verification */
4001 boolean_t wired; /* Should mapping be wired down? */
4002 vm_object_t object; /* Top-level object */
4003 vm_object_offset_t offset; /* Top-level offset */
4004 vm_prot_t prot; /* Protection for mapping */
4005 vm_object_t old_copy_object; /* Saved copy object */
4006 vm_page_t result_page; /* Result of vm_fault_page */
4007 vm_page_t top_page; /* Placeholder page */
4008 kern_return_t kr;
4009
4010 vm_page_t m; /* Fast access to result_page */
4011 kern_return_t error_code;
4012 vm_object_t cur_object;
4013 vm_object_t m_object = NULL;
4014 vm_object_offset_t cur_offset;
4015 vm_page_t cur_m;
4016 vm_object_t new_object;
4017 int type_of_fault;
4018 pmap_t pmap;
4019 wait_interrupt_t interruptible_state;
4020 vm_map_t real_map = map;
4021 vm_map_t original_map = map;
4022 bool object_locks_dropped = FALSE;
4023 vm_prot_t fault_type;
4024 vm_prot_t original_fault_type;
4025 struct vm_object_fault_info fault_info = {};
4026 bool need_collapse = FALSE;
4027 boolean_t need_retry = FALSE;
4028 boolean_t *need_retry_ptr = NULL;
4029 uint8_t object_lock_type = 0;
4030 uint8_t cur_object_lock_type;
4031 vm_object_t top_object = VM_OBJECT_NULL;
4032 vm_object_t written_on_object = VM_OBJECT_NULL;
4033 memory_object_t written_on_pager = NULL;
4034 vm_object_offset_t written_on_offset = 0;
4035 int throttle_delay;
4036 int compressed_count_delta;
4037 uint8_t grab_options;
4038 bool need_copy;
4039 bool need_copy_on_read;
4040 vm_map_offset_t trace_vaddr;
4041 vm_map_offset_t trace_real_vaddr;
4042 vm_map_size_t fault_page_size;
4043 vm_map_size_t fault_page_mask;
4044 int fault_page_shift;
4045 vm_map_offset_t fault_phys_offset;
4046 vm_map_offset_t real_vaddr;
4047 bool resilient_media_retry = false;
4048 bool resilient_media_ref_transfer = false;
4049 vm_object_t resilient_media_object = VM_OBJECT_NULL;
4050 vm_object_offset_t resilient_media_offset = (vm_object_offset_t)-1;
4051 bool page_needs_data_sync = false;
4052 /*
4053 * Was the VM object contended when vm_map_lookup_and_lock_object locked it?
4054 * If so, the zero fill path will drop the lock.
4055 * NB: Ideally we would always drop the lock rather than rely on
4056 * this heuristic, but vm_object_unlock currently takes > 30 cycles.
4057 */
4058 bool object_is_contended = false;
4059
4060 real_vaddr = vaddr;
4061 trace_real_vaddr = vaddr;
4062
4063 /*
4064 * Some (kernel) submaps are marked with "should never fault".
4065 *
4066 * We do this for two reasons:
4067 * - PGZ which is inside the zone map range can't go down the normal
4068 * lookup path (vm_map_lookup_entry() would panic).
4069 *
4070 * - we want guard pages to not have to use fictitious pages at all,
4071 * to prevent ZFOD pages from being created.
4072 *
4073 * We also want to capture the fault address easily so that the zone
4074 * allocator might present an enhanced panic log.
4075 */
4076 if (map->never_faults || (pgz_owned(vaddr) && map->pmap == kernel_pmap)) {
4077 assert(map->pmap == kernel_pmap);
4078 panic_fault_address = vaddr;
4079 return KERN_INVALID_ADDRESS;
4080 }
4081
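/*
 * Determine the fault granularity: for maps whose page size is smaller
 * than the kernel's PAGE_SIZE (e.g. 4K mappings on a 16K kernel), faults
 * are resolved on sub-page chunks and fault_phys_offset tracks which
 * chunk of the larger physical page is being mapped.
 */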
4082 if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
4083 fault_phys_offset = (vm_map_offset_t)-1;
4084 fault_page_size = VM_MAP_PAGE_SIZE(original_map);
4085 fault_page_mask = VM_MAP_PAGE_MASK(original_map);
4086 fault_page_shift = VM_MAP_PAGE_SHIFT(original_map);
4087 if (fault_page_size < PAGE_SIZE) {
4088 DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
4089 vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
4090 }
4091 } else {
4092 fault_phys_offset = 0;
4093 fault_page_size = PAGE_SIZE;
4094 fault_page_mask = PAGE_MASK;
4095 fault_page_shift = PAGE_SHIFT;
4096 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
4097 }
4098
4099 if (map == kernel_map) {
4100 trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
4101 trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
4102 } else {
4103 trace_vaddr = vaddr;
4104 }
4105
4106 KDBG_RELEASE(
4107 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
4108 ((uint64_t)trace_vaddr >> 32),
4109 trace_vaddr,
4110 (map == kernel_map));
4111
4112 if (get_preemption_level() != 0) {
4113 KDBG_RELEASE(
4114 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4115 ((uint64_t)trace_vaddr >> 32),
4116 trace_vaddr,
4117 KERN_FAILURE);
4118
4119 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NONZERO_PREEMPTION_LEVEL), 0 /* arg */);
4120 return KERN_FAILURE;
4121 }
4122
4123 thread_t cthread = current_thread();
4124 bool rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
4125 uint64_t fstart = 0;
4126
4127 if (rtfault) {
4128 fstart = mach_continuous_time();
4129 }
4130
4131 interruptible_state = thread_interrupt_level(interruptible);
4132
4133 fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
4134
4135 counter_inc(&vm_statistics_faults);
4136 counter_inc(¤t_task()->faults);
4137 original_fault_type = fault_type;
4138
4139 need_copy = FALSE;
4140 if (fault_type & VM_PROT_WRITE) {
4141 need_copy = TRUE;
4142 }
4143
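/*
 * Writes and wiring changes will modify the top object, so take its
 * lock exclusive from the start; plain reads can begin with a shared
 * lock and upgrade later only if it turns out to be necessary.
 */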
4144 if (need_copy || change_wiring) {
4145 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4146 } else {
4147 object_lock_type = OBJECT_LOCK_SHARED;
4148 }
4149
4150 cur_object_lock_type = OBJECT_LOCK_SHARED;
4151
4152 if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
4153 if (compressor_map) {
4154 if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
4155 panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
4156 }
4157 }
4158 }
4159 RetryFault:
4160 assert(written_on_object == VM_OBJECT_NULL);
4161
4162 /*
4163 * assume we will hit a page in the cache
4164 * otherwise, explicitly override with
4165 * the real fault type once we determine it
4166 */
4167 type_of_fault = DBG_CACHE_HIT_FAULT;
4168
4169 /*
4170 * Find the backing store object and offset into
4171 * it to begin the search.
4172 */
4173 fault_type = original_fault_type;
4174 map = original_map;
4175 vm_map_lock_read(map);
4176
4177 if (resilient_media_retry) {
4178 /*
4179 * If we have to insert a fake zero-filled page to hide
4180 * a media failure to provide the real page, we need to
4181 * resolve any pending copy-on-write on this mapping.
4182 * VM_PROT_COPY tells vm_map_lookup_and_lock_object() to deal
4183 * with that even if this is not a "write" fault.
4184 */
4185 need_copy = TRUE;
4186 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4187 vm_fault_resilient_media_retry++;
4188 }
4189
4190 kr = vm_map_lookup_and_lock_object(&map, vaddr,
4191 (fault_type | (need_copy ? VM_PROT_COPY : 0)),
4192 object_lock_type, &version,
4193 &object, &offset, &prot, &wired,
4194 &fault_info,
4195 &real_map,
4196 &object_is_contended);
4197
4198 if (kr != KERN_SUCCESS) {
4199 vm_map_unlock_read(map);
4200 /*
4201 * This can be seen in a crash report if indeed the
4202 * thread is crashing due to an invalid access in a non-existent
4203 * range.
4204 * Turning this OFF for now because it is noisy and not always fatal,
4205 * e.g. prefaulting.
4206 *
4207 * if (kr == KERN_INVALID_ADDRESS) {
4208 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0);
4209 * }
4210 */
4211 goto done;
4212 }
4213
4214
4215 pmap = real_map->pmap;
4216 fault_info.interruptible = interruptible;
4217 fault_info.stealth = FALSE;
4218 fault_info.io_sync = FALSE;
4219 fault_info.mark_zf_absent = FALSE;
4220 fault_info.batch_pmap_op = FALSE;
4221
4222 if (resilient_media_retry) {
4223 /*
4224 * We're retrying this fault after having detected a media
4225 * failure from a "resilient_media" mapping.
4226 * Check that the mapping is still pointing at the object
4227 * that just failed to provide a page.
4228 */
4229 assert(resilient_media_object != VM_OBJECT_NULL);
4230 assert(resilient_media_offset != (vm_object_offset_t)-1);
4231 if ((object != VM_OBJECT_NULL &&
4232 object == resilient_media_object &&
4233 offset == resilient_media_offset &&
4234 fault_info.resilient_media)
4235 #if MACH_ASSERT
4236 && (vm_fault_resilient_media_inject_error1_rate == 0 ||
4237 (++vm_fault_resilient_media_inject_error1 % vm_fault_resilient_media_inject_error1_rate) != 0)
4238 #endif /* MACH_ASSERT */
4239 ) {
4240 /*
4241 * This mapping still points at the same object
4242 * and is still "resilient_media": proceed in
4243 * "recovery-from-media-failure" mode, where we'll
4244 * insert a zero-filled page in the top object.
4245 */
4246 // printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
4247 vm_fault_resilient_media_proceed++;
4248 } else {
4249 /* not recovering: reset state and retry fault */
4250 // printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
4251 vm_object_unlock(object);
4252 if (real_map != map) {
4253 vm_map_unlock(real_map);
4254 }
4255 vm_map_unlock_read(map);
4256 /* release our extra reference on failed object */
4257 // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
4258 vm_object_lock_assert_notheld(resilient_media_object);
4259 vm_object_deallocate(resilient_media_object);
4260 resilient_media_object = VM_OBJECT_NULL;
4261 resilient_media_offset = (vm_object_offset_t)-1;
4262 resilient_media_retry = false;
4263 vm_fault_resilient_media_abort1++;
4264 goto RetryFault;
4265 }
4266 } else {
4267 assert(resilient_media_object == VM_OBJECT_NULL);
4268 resilient_media_offset = (vm_object_offset_t)-1;
4269 }
4270
4271 /*
4272 * If the page is wired, we must fault for the current protection
4273 * value, to avoid further faults.
4274 */
4275 if (wired) {
4276 fault_type = prot | VM_PROT_WRITE;
4277 }
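/*
 * For example, wiring a read/write mapping via a read fault still enters
 * the page with write permission, so a later write to the wired page does
 * not have to fault again.
 */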
4278 if (wired || need_copy) {
4279 /*
4280 * since we're treating this fault as a 'write'
4281 * we must hold the top object lock exclusively
4282 */
4283 if (object_lock_type == OBJECT_LOCK_SHARED) {
4284 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4285
4286 if (vm_object_lock_upgrade(object) == FALSE) {
4287 /*
4288 * couldn't upgrade, so explicitly
4289 * take the lock exclusively
4290 */
4291 vm_object_lock(object);
4292 }
4293 }
4294 }
4295
4296 #if VM_FAULT_CLASSIFY
4297 /*
4298 * Temporary data gathering code
4299 */
4300 vm_fault_classify(object, offset, fault_type);
4301 #endif
4302 /*
4303 * Fast fault code. The basic idea is to do as much as
4304 * possible while holding the map lock and object locks.
4305 * Busy pages are not used until the object lock has to
4306 * be dropped to do something (copy, zero fill, pmap enter).
4307 * Similarly, paging references aren't acquired until that
4308 * point, and object references aren't used.
4309 *
4310 * If we can figure out what to do
4311 * (zero fill, copy on write, pmap enter) while holding
4312 * the locks, then it gets done. Otherwise, we give up,
4313 * and use the original fault path (which doesn't hold
4314 * the map lock, and relies on busy pages).
4315 * The give up cases include:
4316 * - Have to talk to pager.
4317 * - Page is busy, absent or in error.
4318 * - Pager has locked out desired access.
4319 * - Fault needs to be restarted.
4320 * - Have to push page into copy object.
4321 *
4322 * The code is an infinite loop that moves one level down
4323 * the shadow chain each time. cur_object and cur_offset
4324 * refer to the current object being examined. object and offset
4325 * are the original object from the map. The loop is at the
4326 * top level if and only if object and cur_object are the same.
4327 *
4328 * Invariants: Map lock is held throughout. Lock is held on
4329 * original object and cur_object (if different) when
4330 * continuing or exiting loop.
4331 *
4332 */
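/*
 * In outline (illustrative only), the loop below does:
 *
 *	cur_object = object; cur_offset = offset;
 *	for (;;) {
 *		m = vm_page_lookup(cur_object, cur_offset);
 *		if (m is present and usable)
 *			enter it in the pmap ("FastPmapEnter"), copying it
 *			up to the top object first if this is a COW fault;
 *		else if (the compressor holds the page)
 *			decompress it in place and enter it;
 *		else if (cur_object has no shadow)
 *			zero-fill a new page in the top object;
 *		else
 *			move down to the shadow object and loop;
 *		any complication breaks out to the slow path (vm_fault_page).
 *	}
 */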
4333
4334 #if defined(__arm64__)
4335 /*
4336 * Fail if reading an execute-only page in a
4337 * pmap that enforces execute-only protection.
4338 */
4339 if (fault_type == VM_PROT_READ &&
4340 (prot & VM_PROT_EXECUTE) &&
4341 !(prot & VM_PROT_READ) &&
4342 pmap_enforces_execute_only(pmap)) {
4343 vm_object_unlock(object);
4344 vm_map_unlock_read(map);
4345 if (real_map != map) {
4346 vm_map_unlock(real_map);
4347 }
4348 kr = KERN_PROTECTION_FAILURE;
4349 goto done;
4350 }
4351 #endif
4352
4353 fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
4354
4355 /*
4356 * If this page is to be inserted in a copy delay object
4357 * for writing, and if the object has a copy, then the
4358 * copy delay strategy is handled by the slow fault path (vm_fault_page).
4359 */
4360 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
4361 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
4362 goto handle_copy_delay;
4363 }
4364
4365 cur_object = object;
4366 cur_offset = offset;
4367
4368 grab_options = 0;
4369 #if CONFIG_SECLUDED_MEMORY
4370 if (object->can_grab_secluded) {
4371 grab_options |= VM_PAGE_GRAB_SECLUDED;
4372 }
4373 #endif /* CONFIG_SECLUDED_MEMORY */
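/*
 * grab_options is passed to vm_page_grab_options() below so that the
 * zero-fill and copy-on-write paths may allocate from the secluded pool
 * when the top object allows it.
 */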
4374
4375 while (TRUE) {
4376 if (!cur_object->pager_created &&
4377 cur_object->phys_contiguous) { /* superpage */
4378 break;
4379 }
4380
4381 if (cur_object->blocked_access) {
4382 /*
4383 * Access to this VM object has been blocked.
4384 * Let the slow path handle it.
4385 */
4386 break;
4387 }
4388
4389 m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset));
4390 m_object = NULL;
4391
4392 if (m != VM_PAGE_NULL) {
4393 m_object = cur_object;
4394
4395 if (m->vmp_busy) {
4396 wait_result_t result;
4397
4398 /*
4399 * in order to do the PAGE_ASSERT_WAIT, we must
4400 * have object that 'm' belongs to locked exclusively
4401 */
4402 if (object != cur_object) {
4403 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4404 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4405
4406 if (vm_object_lock_upgrade(cur_object) == FALSE) {
4407 /*
4408 * couldn't upgrade so go do a full retry
4409 * immediately since we can no longer be
4410 * certain about cur_object (since we
4411 * don't hold a reference on it)...
4412 * first drop the top object lock
4413 */
4414 vm_object_unlock(object);
4415
4416 vm_map_unlock_read(map);
4417 if (real_map != map) {
4418 vm_map_unlock(real_map);
4419 }
4420
4421 goto RetryFault;
4422 }
4423 }
4424 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4425 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4426
4427 if (vm_object_lock_upgrade(object) == FALSE) {
4428 /*
4429 * couldn't upgrade, so explicitly take the lock
4430 * exclusively and go relookup the page since we
4431 * will have dropped the object lock and
4432 * a different thread could have inserted
4433 * a page at this offset
4434 * no need for a full retry since we're
4435 * at the top level of the object chain
4436 */
4437 vm_object_lock(object);
4438
4439 continue;
4440 }
4441 }
4442 if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
4443 /*
4444 * m->vmp_busy == TRUE and the object is locked exclusively
4445 * if m->pageout_queue == TRUE after we acquire the
4446 * queues lock, we are guaranteed that it is stable on
4447 * the pageout queue and therefore reclaimable
4448 *
4449 * NOTE: this is only true for the internal pageout queue
4450 * in the compressor world
4451 */
4452 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
4453
4454 vm_page_lock_queues();
4455
4456 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
4457 vm_pageout_throttle_up(m);
4458 vm_page_unlock_queues();
4459
4460 PAGE_WAKEUP_DONE(m);
4461 goto reclaimed_from_pageout;
4462 }
4463 vm_page_unlock_queues();
4464 }
4465 if (object != cur_object) {
4466 vm_object_unlock(object);
4467 }
4468
4469 vm_map_unlock_read(map);
4470 if (real_map != map) {
4471 vm_map_unlock(real_map);
4472 }
4473
4474 result = PAGE_ASSERT_WAIT(m, interruptible);
4475
4476 vm_object_unlock(cur_object);
4477
4478 if (result == THREAD_WAITING) {
4479 result = thread_block(THREAD_CONTINUE_NULL);
4480 }
4481 if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
4482 goto RetryFault;
4483 }
4484
4485 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), 0 /* arg */);
4486 kr = KERN_ABORTED;
4487 goto done;
4488 }
4489 reclaimed_from_pageout:
4490 if (m->vmp_laundry) {
4491 if (object != cur_object) {
4492 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4493 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4494
4495 vm_object_unlock(object);
4496 vm_object_unlock(cur_object);
4497
4498 vm_map_unlock_read(map);
4499 if (real_map != map) {
4500 vm_map_unlock(real_map);
4501 }
4502
4503 goto RetryFault;
4504 }
4505 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4506 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4507
4508 if (vm_object_lock_upgrade(object) == FALSE) {
4509 /*
4510 * couldn't upgrade, so explicitly take the lock
4511 * exclusively and go relookup the page since we
4512 * will have dropped the object lock and
4513 * a different thread could have inserted
4514 * a page at this offset
4515 * no need for a full retry since we're
4516 * at the top level of the object chain
4517 */
4518 vm_object_lock(object);
4519
4520 continue;
4521 }
4522 }
4523 vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
4524 vm_pageout_steal_laundry(m, FALSE);
4525 }
4526
4527
4528 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
4529 /*
4530 * Guard page: let the slow path deal with it
4531 */
4532 break;
4533 }
4534 if (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
4535 /*
4536 * Unusual case... let the slow path deal with it
4537 */
4538 break;
4539 }
4540 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
4541 if (object != cur_object) {
4542 vm_object_unlock(object);
4543 }
4544 vm_map_unlock_read(map);
4545 if (real_map != map) {
4546 vm_map_unlock(real_map);
4547 }
4548 vm_object_unlock(cur_object);
4549 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
4550 kr = KERN_MEMORY_ERROR;
4551 goto done;
4552 }
4553 assert(m_object == VM_PAGE_OBJECT(m));
4554
4555 if (vm_fault_cs_need_validation(map->pmap, m, m_object,
4556 PAGE_SIZE, 0) ||
4557 (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
4558 upgrade_lock_and_retry:
4559 /*
4560 * We might need to validate this page
4561 * against its code signature, so we
4562 * want to hold the VM object exclusively.
4563 */
4564 if (object != cur_object) {
4565 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4566 vm_object_unlock(object);
4567 vm_object_unlock(cur_object);
4568
4569 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4570
4571 vm_map_unlock_read(map);
4572 if (real_map != map) {
4573 vm_map_unlock(real_map);
4574 }
4575
4576 goto RetryFault;
4577 }
4578 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4579 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4580
4581 if (vm_object_lock_upgrade(object) == FALSE) {
4582 /*
4583 * couldn't upgrade, so explicitly take the lock
4584 * exclusively and go relookup the page since we
4585 * will have dropped the object lock and
4586 * a different thread could have inserted
4587 * a page at this offset
4588 * no need for a full retry since we're
4589 * at the top level of the object chain
4590 */
4591 vm_object_lock(object);
4592
4593 continue;
4594 }
4595 }
4596 }
4597 /*
4598 * Two cases of map in faults:
4599 * - At top level w/o copy object.
4600 * - Read fault anywhere.
4601 * --> must disallow write.
4602 */
4603
4604 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
4605 goto FastPmapEnter;
4606 }
4607
4608 if (!need_copy &&
4609 !fault_info.no_copy_on_read &&
4610 cur_object != object &&
4611 !cur_object->internal &&
4612 !cur_object->pager_trusted &&
4613 vm_protect_privileged_from_untrusted &&
4614 !cur_object->code_signed &&
4615 current_proc_is_privileged()) {
4616 /*
4617 * We're faulting on a page in "object" and
4618 * went down the shadow chain to "cur_object"
4619 * to find out that "cur_object"'s pager
4620 * is not "trusted", i.e. we can not trust it
4621 * to always return the same contents.
4622 * Since the target is a "privileged" process,
4623 * let's treat this as a copy-on-read fault, as
4624 * if it was a copy-on-write fault.
4625 * Once "object" gets a copy of this page, it
4626 * won't have to rely on "cur_object" to
4627 * provide the contents again.
4628 *
4629 * This is done by setting "need_copy" and
4630 * retrying the fault from the top with the
4631 * appropriate locking.
4632 *
4633 * Special case: if the mapping is executable
4634 * and the untrusted object is code-signed and
4635 * the process is "cs_enforced", we do not
4636 * copy-on-read because that would break
4637 * code-signing enforcement expectations (an
4638 * executable page must belong to a code-signed
4639 * object) and we can rely on code-signing
4640 * to re-validate the page if it gets evicted
4641 * and paged back in.
4642 */
4643 // printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4644 vm_copied_on_read++;
4645 need_copy = TRUE;
4646
4647 vm_object_unlock(object);
4648 vm_object_unlock(cur_object);
4649 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4650 vm_map_unlock_read(map);
4651 if (real_map != map) {
4652 vm_map_unlock(real_map);
4653 }
4654 goto RetryFault;
4655 }
4656
4657 if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
4658 if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
4659 prot &= ~VM_PROT_WRITE;
4660 } else {
4661 /*
4662 * For a protection that the pmap cares
4663 * about, we must hand over the full
4664 * set of protections (so that the pmap
4665 * layer can apply any desired policy).
4666 * This means that cs_bypass must be
4667 * set, as this can force us to pass
4668 * RWX.
4669 */
4670 assert(fault_info.cs_bypass);
4671 }
4672
4673 if (object != cur_object) {
4674 /*
4675 * We still need to hold the top object
4676 * lock here to prevent a race between
4677 * a read fault (taking only "shared"
4678 * locks) and a write fault (taking
4679 * an "exclusive" lock on the top
4680 * object).
4681 * Otherwise, as soon as we release the
4682 * top lock, the write fault could
4683 * proceed and actually complete before
4684 * the read fault, and the copied page's
4685 * translation could then be overwritten
4686 * by the read fault's translation for
4687 * the original page.
4688 *
4689 * Let's just record what the top object
4690 * is and we'll release it later.
4691 */
4692 top_object = object;
4693
4694 /*
4695 * switch to the object that has the new page
4696 */
4697 object = cur_object;
4698 object_lock_type = cur_object_lock_type;
4699 }
4700 FastPmapEnter:
4701 assert(m_object == VM_PAGE_OBJECT(m));
4702
4703 /*
4704 * prepare for the pmap_enter...
4705 * object and map are both locked
4706 * m contains valid data
4707 * object == m->vmp_object
4708 * cur_object == NULL or it's been unlocked
4709 * no paging references on either object or cur_object
4710 */
4711 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
4712 need_retry_ptr = &need_retry;
4713 } else {
4714 need_retry_ptr = NULL;
4715 }
4716
4717 if (fault_page_size < PAGE_SIZE) {
4718 DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
4719 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
4720 fault_phys_offset < PAGE_SIZE),
4721 "0x%llx\n", (uint64_t)fault_phys_offset);
4722 } else {
4723 assertf(fault_phys_offset == 0,
4724 "0x%llx\n", (uint64_t)fault_phys_offset);
4725 }
4726
4727 assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p object=%p", m, m_object, object);
4728 assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
4729 if (caller_pmap) {
4730 kr = vm_fault_enter(m,
4731 caller_pmap,
4732 caller_pmap_addr,
4733 fault_page_size,
4734 fault_phys_offset,
4735 prot,
4736 caller_prot,
4737 wired,
4738 change_wiring,
4739 wire_tag,
4740 &fault_info,
4741 need_retry_ptr,
4742 &type_of_fault);
4743 } else {
4744 kr = vm_fault_enter(m,
4745 pmap,
4746 vaddr,
4747 fault_page_size,
4748 fault_phys_offset,
4749 prot,
4750 caller_prot,
4751 wired,
4752 change_wiring,
4753 wire_tag,
4754 &fault_info,
4755 need_retry_ptr,
4756 &type_of_fault);
4757 }
4758
4759 vm_fault_complete(
4760 map,
4761 real_map,
4762 object,
4763 m_object,
4764 m,
4765 offset,
4766 trace_real_vaddr,
4767 &fault_info,
4768 caller_prot,
4769 real_vaddr,
4770 vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
4771 need_retry,
4772 kr,
4773 physpage_p,
4774 prot,
4775 top_object,
4776 need_collapse,
4777 cur_offset,
4778 fault_type,
4779 &written_on_object,
4780 &written_on_pager,
4781 &written_on_offset);
4782 top_object = VM_OBJECT_NULL;
4783 if (need_retry == TRUE) {
4784 /*
4785 * vm_fault_enter couldn't complete the PMAP_ENTER...
4786 * at this point we don't hold any locks so it's safe
4787 * to ask the pmap layer to expand the page table to
4788 * accommodate this mapping... once expanded, we'll
4789 * re-drive the fault which should result in vm_fault_enter
4790 * being able to successfully enter the mapping this time around
4791 */
4792 (void)pmap_enter_options(
4793 pmap, vaddr, 0, 0, 0, 0, 0,
4794 PMAP_OPTIONS_NOENTER, NULL);
4795
4796 need_retry = FALSE;
4797 goto RetryFault;
4798 }
4799 goto done;
4800 }
4801 /*
4802 * COPY ON WRITE FAULT
4803 */
4804 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
4805
4806 /*
4807 * If objects match, then
4808 * object->copy must not be NULL (else control
4809 * would be in previous code block), and we
4810 * have a potential push into the copy object
4811 * with which we can't cope with here.
4812 */
4813 if (cur_object == object) {
4814 /*
4815 * must take the slow path to
4816 * deal with the copy push
4817 */
4818 break;
4819 }
4820
4821 /*
4822 * This is now a shadow based copy on write
4823 * fault -- it requires a copy up the shadow
4824 * chain.
4825 */
4826 assert(m_object == VM_PAGE_OBJECT(m));
4827
4828 if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
4829 vm_fault_cs_need_validation(NULL, m, m_object,
4830 PAGE_SIZE, 0)) {
4831 goto upgrade_lock_and_retry;
4832 }
4833
4834 #if MACH_ASSERT
4835 if (resilient_media_retry &&
4836 vm_fault_resilient_media_inject_error2_rate != 0 &&
4837 (++vm_fault_resilient_media_inject_error2 % vm_fault_resilient_media_inject_error2_rate) == 0) {
4838 /* inject an error */
4839 cur_m = m;
4840 m = VM_PAGE_NULL;
4841 m_object = VM_OBJECT_NULL;
4842 break;
4843 }
4844 #endif /* MACH_ASSERT */
4845 /*
4846 * Allocate a page in the original top level
4847 * object. Give up if allocate fails. Also
4848 * need to remember current page, as it's the
4849 * source of the copy.
4850 *
4851 * at this point we hold locks on both
4852 * object and cur_object... no need to take
4853 * paging refs or mark pages BUSY since
4854 * we don't drop either object lock until
4855 * the page has been copied and inserted
4856 */
4857 cur_m = m;
4858 m = vm_page_grab_options(grab_options);
4859 m_object = NULL;
4860
4861 if (m == VM_PAGE_NULL) {
4862 /*
4863 * no free page currently available...
4864 * must take the slow path
4865 */
4866 break;
4867 }
4868
4869 /*
4870 * Now do the copy. Mark the source page busy...
4871 *
4872 * NOTE: This code holds the map lock across
4873 * the page copy.
4874 */
4875 vm_page_copy(cur_m, m);
4876 vm_page_insert(m, object, vm_object_trunc_page(offset));
4877 if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
4878 DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4879 }
4880 m_object = object;
4881 SET_PAGE_DIRTY(m, FALSE);
4882
4883 /*
4884 * Now cope with the source page and object
4885 */
4886 if (object->ref_count > 1 && cur_m->vmp_pmapped) {
4887 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4888 } else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
4889 /*
4890 * We've copied the full 16K page but we're
4891 * about to call vm_fault_enter() only for
4892 * the 4K chunk we're faulting on. The other
4893 * three 4K chunks in that page could still
4894 * be pmapped in this pmap.
4895 * Since the VM object layer thinks that the
4896 * entire page has been dealt with and the
4897 * original page might no longer be needed,
4898 * it might collapse/bypass the original VM
4899 * object and free its pages, which would be
4900 * bad (and would trigger pmap_verify_free()
4901 * assertions) if the other 4K chunks are still
4902 * pmapped.
4903 */
4904 /*
4905 * XXX FBDP TODO4K: to be revisited
4906 * Technically, we need to pmap_disconnect()
4907 * only the target pmap's mappings for the 4K
4908 * chunks of this 16K VM page. If other pmaps
4909 * have PTEs on these chunks, that means that
4910 * the associated VM map must have a reference
4911 * on the VM object, so no need to worry about
4912 * those.
4913 * pmap_protect() for each 4K chunk would be
4914 * better but we'd have to check which chunks
4915 * are actually mapped before and after this
4916 * one.
4917 * A full-blown pmap_disconnect() is easier
4918 * for now but not efficient.
4919 */
4920 DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
4921 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4922 }
4923
4924 if (cur_m->vmp_clustered) {
4925 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4926 VM_PAGE_CONSUME_CLUSTERED(cur_m);
4927 vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
4928 }
4929 need_collapse = TRUE;
4930
4931 if (!cur_object->internal &&
4932 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4933 /*
4934 * The object from which we've just
4935 * copied a page is most probably backed
4936 * by a vnode. We don't want to waste too
4937 * much time trying to collapse the VM objects
4938 * and create a bottleneck when several tasks
4939 * map the same file.
4940 */
4941 if (cur_object->copy == object) {
4942 /*
4943 * Shared mapping or no COW yet.
4944 * We can never collapse a copy
4945 * object into its backing object.
4946 */
4947 need_collapse = FALSE;
4948 } else if (cur_object->copy == object->shadow &&
4949 object->shadow->resident_page_count == 0) {
4950 /*
4951 * Shared mapping after a COW occurred.
4952 */
4953 need_collapse = FALSE;
4954 }
4955 }
4956 vm_object_unlock(cur_object);
4957
4958 if (need_collapse == FALSE) {
4959 vm_fault_collapse_skipped++;
4960 }
4961 vm_fault_collapse_total++;
4962
4963 type_of_fault = DBG_COW_FAULT;
4964 counter_inc(&vm_statistics_cow_faults);
4965 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4966 counter_inc(¤t_task()->cow_faults);
4967
4968 goto FastPmapEnter;
4969 } else {
4970 /*
4971 * No page at cur_object, cur_offset... m == NULL
4972 */
4973 if (cur_object->pager_created) {
4974 vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4975
4976 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4977 int my_fault_type;
4978 uint8_t c_flags = C_DONT_BLOCK;
4979 bool insert_cur_object = FALSE;
4980
4981 /*
4982 * May have to talk to a pager...
4983 * if so, take the slow path by
4984 * doing a 'break' from the while (TRUE) loop
4985 *
4986 * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4987 * if the compressor is active and the page exists there
4988 */
4989 if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
4990 break;
4991 }
4992
4993 if (map == kernel_map || real_map == kernel_map) {
4994 /*
4995 * can't call into the compressor with the kernel_map
4996 * lock held, since the compressor may try to operate
4997 * on the kernel map in order to return an empty c_segment
4998 */
4999 break;
5000 }
5001 if (object != cur_object) {
5002 if (fault_type & VM_PROT_WRITE) {
5003 c_flags |= C_KEEP;
5004 } else {
5005 insert_cur_object = TRUE;
5006 }
5007 }
5008 if (insert_cur_object == TRUE) {
5009 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5010 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5011
5012 if (vm_object_lock_upgrade(cur_object) == FALSE) {
5013 /*
5014 * couldn't upgrade so go do a full retry
5015 * immediately since we can no longer be
5016 * certain about cur_object (since we
5017 * don't hold a reference on it)...
5018 * first drop the top object lock
5019 */
5020 vm_object_unlock(object);
5021
5022 vm_map_unlock_read(map);
5023 if (real_map != map) {
5024 vm_map_unlock(real_map);
5025 }
5026
5027 goto RetryFault;
5028 }
5029 }
5030 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
5031 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5032
5033 if (object != cur_object) {
5034 /*
5035 * we can't go for the upgrade on the top
5036 * lock since the upgrade may block waiting
5037 * for readers to drain... since we hold
5038 * cur_object locked at this point, waiting
5039 * for the readers to drain would represent
5040 * a lock order inversion since the lock order
5041 * for objects is the reference order in the
5042 * shadow chain
5043 */
5044 vm_object_unlock(object);
5045 vm_object_unlock(cur_object);
5046
5047 vm_map_unlock_read(map);
5048 if (real_map != map) {
5049 vm_map_unlock(real_map);
5050 }
5051
5052 goto RetryFault;
5053 }
5054 if (vm_object_lock_upgrade(object) == FALSE) {
5055 /*
5056 * couldn't upgrade, so explicitly take the lock
5057 * exclusively and go relookup the page since we
5058 * will have dropped the object lock and
5059 * a different thread could have inserted
5060 * a page at this offset
5061 * no need for a full retry since we're
5062 * at the top level of the object chain
5063 */
5064 vm_object_lock(object);
5065
5066 continue;
5067 }
5068 }
5069 m = vm_page_grab_options(grab_options);
5070 m_object = NULL;
5071
5072 if (m == VM_PAGE_NULL) {
5073 /*
5074 * no free page currently available...
5075 * must take the slow path
5076 */
5077 break;
5078 }
5079
5080 /*
5081 * The object is and remains locked
5082 * so no need to take a
5083 * "paging_in_progress" reference.
5084 */
5085 bool shared_lock;
5086 if ((object == cur_object &&
5087 object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
5088 (object != cur_object &&
5089 cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
5090 shared_lock = FALSE;
5091 } else {
5092 shared_lock = TRUE;
5093 }
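/*
 * "shared_lock" tells vm_compressor_pager_count() whether the VM object
 * protecting the compressed-page accounting is only locked shared, so it
 * can pick a safe way (e.g. atomic updates) to adjust those counts rather
 * than relying on an exclusive lock being held.
 */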
5094
5095 kr = vm_compressor_pager_get(
5096 cur_object->pager,
5097 (vm_object_trunc_page(cur_offset)
5098 + cur_object->paging_offset),
5099 VM_PAGE_GET_PHYS_PAGE(m),
5100 &my_fault_type,
5101 c_flags,
5102 &compressed_count_delta);
5103
5104 vm_compressor_pager_count(
5105 cur_object->pager,
5106 compressed_count_delta,
5107 shared_lock,
5108 cur_object);
5109
5110 if (kr != KERN_SUCCESS) {
5111 vm_page_release(m, FALSE);
5112 m = VM_PAGE_NULL;
5113 }
5114 /*
5115 * If vm_compressor_pager_get() returns
5116 * KERN_MEMORY_FAILURE, then the
5117 * compressed data is permanently lost,
5118 * so return this error immediately.
5119 */
5120 if (kr == KERN_MEMORY_FAILURE) {
5121 if (object != cur_object) {
5122 vm_object_unlock(cur_object);
5123 }
5124 vm_object_unlock(object);
5125 vm_map_unlock_read(map);
5126 if (real_map != map) {
5127 vm_map_unlock(real_map);
5128 }
5129
5130 goto done;
5131 } else if (kr != KERN_SUCCESS) {
5132 break;
5133 }
5134 m->vmp_dirty = TRUE;
5135
5136 /*
5137 * If the object is purgeable, its
5138 * owner's purgeable ledgers will be
5139 * updated in vm_page_insert() but the
5140 * page was also accounted for in a
5141 * "compressed purgeable" ledger, so
5142 * update that now.
5143 */
5144 if (object != cur_object &&
5145 !insert_cur_object) {
5146 /*
5147 * We're not going to insert
5148 * the decompressed page into
5149 * the object it came from.
5150 *
5151 * We're dealing with a
5152 * copy-on-write fault on
5153 * "object".
5154 * We're going to decompress
5155 * the page directly into the
5156 * target "object" while
5157 * keeping the compressed
5158 * page for "cur_object", so
5159 * no ledger update in that
5160 * case.
5161 */
5162 } else if (((cur_object->purgable ==
5163 VM_PURGABLE_DENY) &&
5164 (!cur_object->vo_ledger_tag)) ||
5165 (cur_object->vo_owner ==
5166 NULL)) {
5167 /*
5168 * "cur_object" is not purgeable
5169 * and is not ledger-tagged, or
5170 * there's no owner for it,
5171 * so no owner's ledgers to
5172 * update.
5173 */
5174 } else {
5175 /*
5176 * One less compressed
5177 * purgeable/tagged page for
5178 * cur_object's owner.
5179 */
5180 vm_object_owner_compressed_update(
5181 cur_object,
5182 -1);
5183 }
5184
5185 if (insert_cur_object) {
5186 vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset));
5187 m_object = cur_object;
5188 } else {
5189 vm_page_insert(m, object, vm_object_trunc_page(offset));
5190 m_object = object;
5191 }
5192
5193 if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
5194 /*
5195 * If the page is not cacheable,
5196 * we can't let its contents
5197 * linger in the data cache
5198 * after the decompression.
5199 */
5200 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
5201 }
5202
5203 type_of_fault = my_fault_type;
5204
5205 VM_STAT_DECOMPRESSIONS();
5206
5207 if (cur_object != object) {
5208 if (insert_cur_object) {
5209 top_object = object;
5210 /*
5211 * switch to the object that has the new page
5212 */
5213 object = cur_object;
5214 object_lock_type = cur_object_lock_type;
5215 } else {
5216 vm_object_unlock(cur_object);
5217 cur_object = object;
5218 }
5219 }
5220 goto FastPmapEnter;
5221 }
5222 /*
5223 * existence map present and indicates
5224 * that the pager doesn't have this page
5225 */
5226 }
5227 if (cur_object->shadow == VM_OBJECT_NULL ||
5228 resilient_media_retry) {
5229 /*
5230 * Zero fill fault. Page gets
5231 * inserted into the original object.
5232 */
5233 if (cur_object->shadow_severed ||
5234 VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
5235 cur_object == compressor_object ||
5236 cur_object == kernel_object) {
5237 if (object != cur_object) {
5238 vm_object_unlock(cur_object);
5239 }
5240 vm_object_unlock(object);
5241
5242 vm_map_unlock_read(map);
5243 if (real_map != map) {
5244 vm_map_unlock(real_map);
5245 }
5246 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object)) {
5247 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
5248 }
5249
5250 if (cur_object->shadow_severed) {
5251 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), 0 /* arg */);
5252 }
5253
5254 kr = KERN_MEMORY_ERROR;
5255 goto done;
5256 }
5257 if (cur_object != object) {
5258 vm_object_unlock(cur_object);
5259
5260 cur_object = object;
5261 }
5262 if (object_lock_type == OBJECT_LOCK_SHARED) {
5263 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5264
5265 if (vm_object_lock_upgrade(object) == FALSE) {
5266 /*
5267 * couldn't upgrade so do a full retry on the fault
5268 * since we dropped the object lock which
5269 * could allow another thread to insert
5270 * a page at this offset
5271 */
5272 vm_map_unlock_read(map);
5273 if (real_map != map) {
5274 vm_map_unlock(real_map);
5275 }
5276
5277 goto RetryFault;
5278 }
5279 }
5280 if (!object->internal) {
5281 panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
5282 }
5283 #if MACH_ASSERT
5284 if (resilient_media_retry &&
5285 vm_fault_resilient_media_inject_error3_rate != 0 &&
5286 (++vm_fault_resilient_media_inject_error3 % vm_fault_resilient_media_inject_error3_rate) == 0) {
5287 /* inject an error */
5288 m_object = NULL;
5289 break;
5290 }
5291 #endif /* MACH_ASSERT */
5292 m = vm_page_alloc(object, vm_object_trunc_page(offset));
5293 m_object = NULL;
5294
5295 if (m == VM_PAGE_NULL) {
5296 /*
5297 * no free page currently available...
5298 * must take the slow path
5299 */
5300 break;
5301 }
5302 m_object = object;
5303
5304 if ((prot & VM_PROT_WRITE) &&
5305 !(fault_type & VM_PROT_WRITE) &&
5306 object->copy != VM_OBJECT_NULL) {
5307 /*
5308 * This is not a write fault and
5309 * we might have a copy-on-write
5310 * obligation to honor (copy object or
5311 * "needs_copy" map entry), so do not
5312 * give write access yet.
5313 * We'll need to catch the first write
5314 * to resolve the copy-on-write by
5315 * pushing this page to a copy object
5316 * or making a shadow object.
5317 */
5318 if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
5319 prot &= ~VM_PROT_WRITE;
5320 } else {
5321 assert(fault_info.cs_bypass);
5322 }
5323 }
5324
5325 /*
5326 * Zeroing the page and entering it into the pmap
5327 * represents a significant amount of the zero fill fault handler's work.
5328 *
5329 * To improve fault scalability, we'll drop the object lock, if it appears contended,
5330 * now that we've inserted the page into the vm object.
5331 * Before dropping the lock, we need to check protection bits and set the
5332 * mapped bits on the page. Then we can mark the page busy, drop the lock,
5333 * zero it, and do the pmap enter. We'll need to reacquire the lock
5334 * to clear the busy bit and wake up any waiters.
5335 */
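/*
 * Sequence sketch (added summary of the code below; the cleanup step
 * lives in helpers defined elsewhere):
 *
 *	vm_fault_enter_prepare(...);	// validate prot, set mapped bits
 *	if (object_is_contended) {
 *		m->vmp_busy = TRUE;	// page is visible but not yet zeroed
 *		vm_object_unlock(object);
 *	}
 *	vm_page_zero_fill(m);		// unless map->no_zero_fill
 *	vm_fault_pmap_enter...(...);	// with or without the object lock
 *	// the object lock is retaken on the contended path; the busy bit
 *	// is cleared and waiters are woken later
 *	// (see zero_fill_cleanup / vm_fault_complete()).
 */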
5336 vm_fault_cs_clear(m);
5337 m->vmp_pmapped = TRUE;
5338 if (map->no_zero_fill) {
5339 type_of_fault = DBG_NZF_PAGE_FAULT;
5340 } else {
5341 type_of_fault = DBG_ZERO_FILL_FAULT;
5342 }
5343 {
5344 pmap_t destination_pmap;
5345 vm_map_offset_t destination_pmap_vaddr;
5346 vm_prot_t enter_fault_type;
5347 if (caller_pmap) {
5348 destination_pmap = caller_pmap;
5349 destination_pmap_vaddr = caller_pmap_addr;
5350 } else {
5351 destination_pmap = pmap;
5352 destination_pmap_vaddr = vaddr;
5353 }
5354 if (change_wiring) {
5355 enter_fault_type = VM_PROT_NONE;
5356 } else {
5357 enter_fault_type = caller_prot;
5358 }
5359 assertf(VM_PAGE_OBJECT(m) == object, "m=%p object=%p", m, object);
5360 kr = vm_fault_enter_prepare(m,
5361 destination_pmap,
5362 destination_pmap_vaddr,
5363 &prot,
5364 caller_prot,
5365 fault_page_size,
5366 fault_phys_offset,
5367 change_wiring,
5368 enter_fault_type,
5369 &fault_info,
5370 &type_of_fault,
5371 &page_needs_data_sync);
5372 if (kr != KERN_SUCCESS) {
5373 goto zero_fill_cleanup;
5374 }
5375
5376 if (object_is_contended) {
5377 /*
5378 * At this point the page is in the vm object, but not on a paging queue.
5379 * Since it's accessible to another thread but its contents are invalid
5380 * (it hasn't been zeroed yet), mark it busy before dropping the object lock.
5381 */
5382 m->vmp_busy = TRUE;
5383 vm_object_unlock(object);
5384 }
5385 if (type_of_fault == DBG_ZERO_FILL_FAULT) {
5386 /*
5387 * Now zero fill page...
5388 * the page is probably going to
5389 * be written soon, so don't bother
5390 * to clear the modified bit
5391 *
5392 * NOTE: This code holds the map
5393 * lock across the zero fill.
5394 */
5395 vm_page_zero_fill(m);
5396 counter_inc(&vm_statistics_zero_fill_count);
5397 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
5398 }
5399 if (page_needs_data_sync) {
5400 pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
5401 }
5402
5403 if (top_object != VM_OBJECT_NULL) {
5404 need_retry_ptr = &need_retry;
5405 } else {
5406 need_retry_ptr = NULL;
5407 }
5408 if (object_is_contended) {
5409 kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr,
5410 fault_page_size, fault_phys_offset,
5411 m, &prot, caller_prot, enter_fault_type, wired,
5412 fault_info.pmap_options, need_retry_ptr);
5413 vm_object_lock(object);
5414 } else {
5415 kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr,
5416 fault_page_size, fault_phys_offset,
5417 m, &prot, caller_prot, enter_fault_type, wired,
5418 fault_info.pmap_options, need_retry_ptr);
5419 }
5420 }
5421 zero_fill_cleanup:
5422 if (!VM_DYNAMIC_PAGING_ENABLED() &&
5423 (object->purgable == VM_PURGABLE_DENY ||
5424 object->purgable == VM_PURGABLE_NONVOLATILE ||
5425 object->purgable == VM_PURGABLE_VOLATILE)) {
5426 vm_page_lockspin_queues();
5427 if (!VM_DYNAMIC_PAGING_ENABLED()) {
5428 vm_fault_enqueue_throttled_locked(m);
5429 }
5430 vm_page_unlock_queues();
5431 }
5432 vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info.no_cache, &type_of_fault, kr);
5433
5434 vm_fault_complete(
5435 map,
5436 real_map,
5437 object,
5438 m_object,
5439 m,
5440 offset,
5441 trace_real_vaddr,
5442 &fault_info,
5443 caller_prot,
5444 real_vaddr,
5445 type_of_fault,
5446 need_retry,
5447 kr,
5448 physpage_p,
5449 prot,
5450 top_object,
5451 need_collapse,
5452 cur_offset,
5453 fault_type,
5454 &written_on_object,
5455 &written_on_pager,
5456 &written_on_offset);
5457 top_object = VM_OBJECT_NULL;
5458 if (need_retry == TRUE) {
5459 /*
5460 * vm_fault_enter couldn't complete the PMAP_ENTER...
5461 * at this point we don't hold any locks so it's safe
5462 * to ask the pmap layer to expand the page table to
5463 * accommodate this mapping... once expanded, we'll
5464 * re-drive the fault which should result in vm_fault_enter
5465 * being able to successfully enter the mapping this time around
5466 */
5467 (void)pmap_enter_options(
5468 pmap, vaddr, 0, 0, 0, 0, 0,
5469 PMAP_OPTIONS_NOENTER, NULL);
5470
5471 need_retry = FALSE;
5472 goto RetryFault;
5473 }
5474 goto done;
5475 }
5476 /*
5477 * On to the next level in the shadow chain
5478 */
5479 cur_offset += cur_object->vo_shadow_offset;
5480 new_object = cur_object->shadow;
5481 fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset);
5482
5483 /*
5484 * take the new_object's lock with the indicated state
5485 */
5486 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5487 vm_object_lock_shared(new_object);
5488 } else {
5489 vm_object_lock(new_object);
5490 }
5491
5492 if (cur_object != object) {
5493 vm_object_unlock(cur_object);
5494 }
5495
5496 cur_object = new_object;
5497
5498 continue;
5499 }
5500 }
5501 /*
5502 * Cleanup from fast fault failure. Drop any object
5503 * lock other than original and drop map lock.
5504 */
5505 if (object != cur_object) {
5506 vm_object_unlock(cur_object);
5507 }
5508
5509 /*
5510 * must own the object lock exclusively at this point
5511 */
5512 if (object_lock_type == OBJECT_LOCK_SHARED) {
5513 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5514
5515 if (vm_object_lock_upgrade(object) == FALSE) {
5516 /*
5517 * couldn't upgrade, so explicitly
5518 * take the lock exclusively
5519 * no need to retry the fault at this
5520 * point since "vm_fault_page" will
5521 * completely re-evaluate the state
5522 */
5523 vm_object_lock(object);
5524 }
5525 }
5526
5527 handle_copy_delay:
5528 vm_map_unlock_read(map);
5529 if (real_map != map) {
5530 vm_map_unlock(real_map);
5531 }
5532
5533 if (__improbable(object == compressor_object ||
5534 object == kernel_object)) {
5535 /*
5536 * These objects are explicitly managed and populated by the
5537 * kernel. The virtual ranges backed by these objects should
5538 * either have wired pages or "holes" that are not supposed to
5539 * be accessed at all until they get explicitly populated.
5540 * We should never have to resolve a fault on a mapping backed
5541 * by one of these VM objects and providing a zero-filled page
5542 * would be wrong here, so let's fail the fault and let the
5543 * caller crash or recover.
5544 */
5545 vm_object_unlock(object);
5546 kr = KERN_MEMORY_ERROR;
5547 goto done;
5548 }
5549
5550 resilient_media_ref_transfer = false;
5551 if (resilient_media_retry) {
5552 /*
5553 * We could get here if we failed to get a free page
5554 * to zero-fill and had to take the slow path again.
5555 * Reset our "recovery-from-failed-media" state.
5556 */
5557 assert(resilient_media_object != VM_OBJECT_NULL);
5558 assert(resilient_media_offset != (vm_object_offset_t)-1);
5559 /* release our extra reference on failed object */
5560 // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
5561 if (object == resilient_media_object) {
5562 /*
5563 * We're holding "object"'s lock, so we can't release
5564 * our extra reference at this point.
5565 * We need an extra reference on "object" anyway
5566 * (see below), so let's just transfer this reference.
5567 */
5568 resilient_media_ref_transfer = true;
5569 } else {
5570 vm_object_lock_assert_notheld(resilient_media_object);
5571 vm_object_deallocate(resilient_media_object);
5572 }
5573 resilient_media_object = VM_OBJECT_NULL;
5574 resilient_media_offset = (vm_object_offset_t)-1;
5575 resilient_media_retry = false;
5576 vm_fault_resilient_media_abort2++;
5577 }
5578
5579 /*
5580 * Make a reference to this object to
5581 * prevent its disposal while we are messing with
5582 * it. Once we have the reference, the map is free
5583 * to be diddled. Since objects reference their
5584 * shadows (and copies), they will stay around as well.
5585 */
5586 if (resilient_media_ref_transfer) {
5587 /* we already have an extra reference on this object */
5588 resilient_media_ref_transfer = false;
5589 } else {
5590 vm_object_reference_locked(object);
5591 }
5592 vm_object_paging_begin(object);
5593
5594 set_thread_pagein_error(cthread, 0);
5595 error_code = 0;
5596
5597 result_page = VM_PAGE_NULL;
5598 kr = vm_fault_page(object, offset, fault_type,
5599 (change_wiring && !wired),
5600 FALSE, /* page not looked up */
5601 &prot, &result_page, &top_page,
5602 &type_of_fault,
5603 &error_code, map->no_zero_fill,
5604 &fault_info);
5605
5606 /*
5607 * if kr != VM_FAULT_SUCCESS, then the paging reference
5608 * has been dropped and the object unlocked... the ref_count
5609 * is still held
5610 *
5611 * if kr == VM_FAULT_SUCCESS, then the paging reference
5612 * is still held along with the ref_count on the original object
5613 *
5614 * the object is returned locked with a paging reference
5615 *
5616 * if top_page != NULL, then it's BUSY and the
5617 * object it belongs to has a paging reference
5618 * but is returned unlocked
5619 */
5620 if (kr != VM_FAULT_SUCCESS &&
5621 kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
5622 if (kr == VM_FAULT_MEMORY_ERROR &&
5623 fault_info.resilient_media) {
5624 assertf(object->internal, "object %p", object);
5625 /*
5626 * This fault failed but the mapping was
5627 * "media resilient", so we'll retry the fault in
5628 * recovery mode to get a zero-filled page in the
5629 * top object.
5630 * Keep the reference on the failing object so
5631 * that we can check that the mapping is still
5632 * pointing to it when we retry the fault.
5633 */
5634 // printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
5635 assert(!resilient_media_retry); /* no double retry */
5636 assert(resilient_media_object == VM_OBJECT_NULL);
5637 assert(resilient_media_offset == (vm_object_offset_t)-1);
5638 resilient_media_retry = true;
5639 resilient_media_object = object;
5640 resilient_media_offset = offset;
5641 // printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_mmedia_offset);
5642 vm_fault_resilient_media_initiate++;
5643 goto RetryFault;
5644 } else {
5645 /*
5646 * we didn't succeed, lose the object reference
5647 * immediately.
5648 */
5649 vm_object_deallocate(object);
5650 object = VM_OBJECT_NULL; /* no longer valid */
5651 }
5652
5653 /*
5654 * See why we failed, and take corrective action.
5655 */
5656 switch (kr) {
5657 case VM_FAULT_MEMORY_SHORTAGE:
5658 if (vm_page_wait((change_wiring) ?
5659 THREAD_UNINT :
5660 THREAD_ABORTSAFE)) {
5661 goto RetryFault;
5662 }
5663 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_MEMORY_SHORTAGE), 0 /* arg */);
5664 OS_FALLTHROUGH;
5665 case VM_FAULT_INTERRUPTED:
5666 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
5667 kr = KERN_ABORTED;
5668 goto done;
5669 case VM_FAULT_RETRY:
5670 goto RetryFault;
5671 case VM_FAULT_MEMORY_ERROR:
5672 if (error_code) {
5673 kr = error_code;
5674 } else {
5675 kr = KERN_MEMORY_ERROR;
5676 }
5677 goto done;
5678 default:
5679 panic("vm_fault: unexpected error 0x%x from "
5680 "vm_fault_page()\n", kr);
5681 }
5682 }
5683 m = result_page;
5684 m_object = NULL;
5685
5686 if (m != VM_PAGE_NULL) {
5687 m_object = VM_PAGE_OBJECT(m);
5688 assert((change_wiring && !wired) ?
5689 (top_page == VM_PAGE_NULL) :
5690 ((top_page == VM_PAGE_NULL) == (m_object == object)));
5691 }
5692
5693 /*
5694 * What to do with the resulting page from vm_fault_page
5695 * if it doesn't get entered into the physical map:
5696 */
5697 #define RELEASE_PAGE(m) \
5698 MACRO_BEGIN \
5699 PAGE_WAKEUP_DONE(m); \
5700 if ( !VM_PAGE_PAGEABLE(m)) { \
5701 vm_page_lockspin_queues(); \
5702 if ( !VM_PAGE_PAGEABLE(m)) \
5703 vm_page_activate(m); \
5704 vm_page_unlock_queues(); \
5705 } \
5706 MACRO_END
5707
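/*
 * Usage sketch (added note): RELEASE_PAGE() is the standard bail-out
 * step in the retry paths below whenever the page returned by
 * vm_fault_page() will not be entered in the pmap after all.  The
 * typical sequence, as used a few lines down, is:
 *
 *	vm_object_lock(m_object);
 *	RELEASE_PAGE(m);			// wake waiters, reactivate page
 *	vm_fault_cleanup(m_object, top_page);
 *	vm_object_deallocate(object);
 *	goto RetryFault;			// or "goto done" on hard errors
 */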
5708
5709 object_locks_dropped = FALSE;
5710 /*
5711 * We must verify that the maps have not changed
5712 * since our last lookup. vm_map_verify() needs the
5713 * map lock (shared) but we are holding object locks.
5714 * So we do a try_lock() first and, if that fails, we
5715 * drop the object locks and go in for the map lock again.
5716 */
5717 if (!vm_map_try_lock_read(original_map)) {
5718 if (m != VM_PAGE_NULL) {
5719 old_copy_object = m_object->copy;
5720 vm_object_unlock(m_object);
5721 } else {
5722 old_copy_object = VM_OBJECT_NULL;
5723 vm_object_unlock(object);
5724 }
5725
5726 object_locks_dropped = TRUE;
5727
5728 vm_map_lock_read(original_map);
5729 }
5730
5731 if ((map != original_map) || !vm_map_verify(map, &version)) {
5732 if (object_locks_dropped == FALSE) {
5733 if (m != VM_PAGE_NULL) {
5734 old_copy_object = m_object->copy;
5735 vm_object_unlock(m_object);
5736 } else {
5737 old_copy_object = VM_OBJECT_NULL;
5738 vm_object_unlock(object);
5739 }
5740
5741 object_locks_dropped = TRUE;
5742 }
5743
5744 /*
5745 * no object locks are held at this point
5746 */
5747 vm_object_t retry_object;
5748 vm_object_offset_t retry_offset;
5749 vm_prot_t retry_prot;
5750
5751 /*
5752 * To avoid trying to write_lock the map while another
5753 * thread has it read_locked (in vm_map_pageable), we
5754 * do not try for write permission. If the page is
5755 * still writable, we will get write permission. If it
5756 * is not, or has been marked needs_copy, we enter the
5757 * mapping without write permission, and will merely
5758 * take another fault.
5759 */
5760 map = original_map;
5761
5762 kr = vm_map_lookup_and_lock_object(&map, vaddr,
5763 fault_type & ~VM_PROT_WRITE,
5764 OBJECT_LOCK_EXCLUSIVE, &version,
5765 &retry_object, &retry_offset, &retry_prot,
5766 &wired,
5767 &fault_info,
5768 &real_map,
5769 NULL);
5770 pmap = real_map->pmap;
5771
5772 if (kr != KERN_SUCCESS) {
5773 vm_map_unlock_read(map);
5774
5775 if (m != VM_PAGE_NULL) {
5776 assert(VM_PAGE_OBJECT(m) == m_object);
5777
5778 /*
5779 * retake the lock so that
5780 * we can drop the paging reference
5781 * in vm_fault_cleanup and do the
5782 * PAGE_WAKEUP_DONE in RELEASE_PAGE
5783 */
5784 vm_object_lock(m_object);
5785
5786 RELEASE_PAGE(m);
5787
5788 vm_fault_cleanup(m_object, top_page);
5789 } else {
5790 /*
5791 * retake the lock so that
5792 * we can drop the paging reference
5793 * in vm_fault_cleanup
5794 */
5795 vm_object_lock(object);
5796
5797 vm_fault_cleanup(object, top_page);
5798 }
5799 vm_object_deallocate(object);
5800
5801 if (kr == KERN_INVALID_ADDRESS) {
5802 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0 /* arg */);
5803 }
5804 goto done;
5805 }
5806 vm_object_unlock(retry_object);
5807
5808 if ((retry_object != object) || (retry_offset != offset)) {
5809 vm_map_unlock_read(map);
5810 if (real_map != map) {
5811 vm_map_unlock(real_map);
5812 }
5813
5814 if (m != VM_PAGE_NULL) {
5815 assert(VM_PAGE_OBJECT(m) == m_object);
5816
5817 /*
5818 * retake the lock so that
5819 * we can drop the paging reference
5820 * in vm_fault_cleanup and do the
5821 * PAGE_WAKEUP_DONE in RELEASE_PAGE
5822 */
5823 vm_object_lock(m_object);
5824
5825 RELEASE_PAGE(m);
5826
5827 vm_fault_cleanup(m_object, top_page);
5828 } else {
5829 /*
5830 * retake the lock so that
5831 * we can drop the paging reference
5832 * in vm_fault_cleanup
5833 */
5834 vm_object_lock(object);
5835
5836 vm_fault_cleanup(object, top_page);
5837 }
5838 vm_object_deallocate(object);
5839
5840 goto RetryFault;
5841 }
5842 /*
5843 * Check whether the protection has changed or the object
5844 * has been copied while we left the map unlocked.
5845 */
5846 if (pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) {
5847 /* If the pmap layer cares, pass the full set. */
5848 prot = retry_prot;
5849 } else {
5850 prot &= retry_prot;
5851 }
5852 }
5853
5854 if (object_locks_dropped == TRUE) {
5855 if (m != VM_PAGE_NULL) {
5856 assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
5857 assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
5858 vm_object_lock(m_object);
5859
5860 if (m_object->copy != old_copy_object) {
5861 /*
5862 * The copy object changed while the top-level object
5863 * was unlocked, so take away write permission.
5864 */
5865 assert(!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot));
5866 prot &= ~VM_PROT_WRITE;
5867 }
5868 } else {
5869 vm_object_lock(object);
5870 }
5871
5872 object_locks_dropped = FALSE;
5873 }
5874
5875 if (!need_copy &&
5876 !fault_info.no_copy_on_read &&
5877 m != VM_PAGE_NULL &&
5878 VM_PAGE_OBJECT(m) != object &&
5879 !VM_PAGE_OBJECT(m)->pager_trusted &&
5880 vm_protect_privileged_from_untrusted &&
5881 !VM_PAGE_OBJECT(m)->code_signed &&
5882 current_proc_is_privileged()) {
5883 /*
5884 * We found the page we want in an "untrusted" VM object
5885 * down the shadow chain. Since the target is "privileged"
5886 * we want to perform a copy-on-read of that page, so that the
5887 * mapped object gets a stable copy and does not have to
5888 * rely on the "untrusted" object to provide the same
5889 * contents if the page gets reclaimed and has to be paged
5890 * in again later on.
5891 *
5892 * Special case: if the mapping is executable and the untrusted
5893 * object is code-signed and the process is "cs_enforced", we
5894 * do not copy-on-read because that would break code-signing
5895 * enforcement expectations (an executable page must belong
5896 * to a code-signed object) and we can rely on code-signing
5897 * to re-validate the page if it gets evicted and paged back in.
5898 */
5899 // printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5900 vm_copied_on_read++;
5901 need_copy_on_read = TRUE;
5902 need_copy = TRUE;
5903 } else {
5904 need_copy_on_read = FALSE;
5905 }
5906
5907 /*
5908 * If we want to wire down this page, but no longer have
5909 * adequate permissions, we must start all over.
5910 * If we decided to copy-on-read, we must also start all over.
5911 */
5912 if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
5913 need_copy_on_read) {
5914 vm_map_unlock_read(map);
5915 if (real_map != map) {
5916 vm_map_unlock(real_map);
5917 }
5918
5919 if (m != VM_PAGE_NULL) {
5920 assert(VM_PAGE_OBJECT(m) == m_object);
5921
5922 RELEASE_PAGE(m);
5923
5924 vm_fault_cleanup(m_object, top_page);
5925 } else {
5926 vm_fault_cleanup(object, top_page);
5927 }
5928
5929 vm_object_deallocate(object);
5930
5931 goto RetryFault;
5932 }
5933 if (m != VM_PAGE_NULL) {
5934 /*
5935 * Put this page into the physical map.
5936 * We had to do the unlock above because pmap_enter
5937 * may cause other faults. The page may be on
5938 * the pageout queues. If the pageout daemon comes
5939 * across the page, it will remove it from the queues.
5940 */
5941 if (fault_page_size < PAGE_SIZE) {
5942 DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
5943 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
5944 fault_phys_offset < PAGE_SIZE),
5945 "0x%llx\n", (uint64_t)fault_phys_offset);
5946 } else {
5947 assertf(fault_phys_offset == 0,
5948 "0x%llx\n", (uint64_t)fault_phys_offset);
5949 }
5950 assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
5951 assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
5952 if (caller_pmap) {
5953 kr = vm_fault_enter(m,
5954 caller_pmap,
5955 caller_pmap_addr,
5956 fault_page_size,
5957 fault_phys_offset,
5958 prot,
5959 caller_prot,
5960 wired,
5961 change_wiring,
5962 wire_tag,
5963 &fault_info,
5964 NULL,
5965 &type_of_fault);
5966 } else {
5967 kr = vm_fault_enter(m,
5968 pmap,
5969 vaddr,
5970 fault_page_size,
5971 fault_phys_offset,
5972 prot,
5973 caller_prot,
5974 wired,
5975 change_wiring,
5976 wire_tag,
5977 &fault_info,
5978 NULL,
5979 &type_of_fault);
5980 }
5981 assert(VM_PAGE_OBJECT(m) == m_object);
5982
5983 {
5984 int event_code = 0;
5985
5986 if (m_object->internal) {
5987 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
5988 } else if (m_object->object_is_shared_cache) {
5989 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
5990 } else {
5991 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
5992 }
5993
5994 KDBG_RELEASE(event_code | DBG_FUNC_NONE, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid());
5995 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid());
5996
5997 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
5998 }
5999 if (kr != KERN_SUCCESS) {
6000 /* abort this page fault */
6001 vm_map_unlock_read(map);
6002 if (real_map != map) {
6003 vm_map_unlock(real_map);
6004 }
6005 PAGE_WAKEUP_DONE(m);
6006 vm_fault_cleanup(m_object, top_page);
6007 vm_object_deallocate(object);
6008 goto done;
6009 }
6010 if (physpage_p != NULL) {
6011 /* for vm_map_wire_and_extract() */
6012 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6013 if (prot & VM_PROT_WRITE) {
6014 vm_object_lock_assert_exclusive(m_object);
6015 m->vmp_dirty = TRUE;
6016 }
6017 }
6018 } else {
6019 vm_map_entry_t entry;
6020 vm_map_offset_t laddr;
6021 vm_map_offset_t ldelta, hdelta;
6022
6023 /*
6024 * do a pmap block mapping from the physical address
6025 * in the object
6026 */
6027
6028 if (real_map != map) {
6029 vm_map_unlock(real_map);
6030 }
6031
6032 if (original_map != map) {
6033 vm_map_unlock_read(map);
6034 vm_map_lock_read(original_map);
6035 map = original_map;
6036 }
6037 real_map = map;
6038
6039 laddr = vaddr;
6040 hdelta = ldelta = (vm_map_offset_t)0xFFFFFFFFFFFFF000ULL;
6041
6042 while (vm_map_lookup_entry(map, laddr, &entry)) {
6043 if (ldelta > (laddr - entry->vme_start)) {
6044 ldelta = laddr - entry->vme_start;
6045 }
6046 if (hdelta > (entry->vme_end - laddr)) {
6047 hdelta = entry->vme_end - laddr;
6048 }
6049 if (entry->is_sub_map) {
6050 laddr = ((laddr - entry->vme_start)
6051 + VME_OFFSET(entry));
6052 vm_map_lock_read(VME_SUBMAP(entry));
6053
6054 if (map != real_map) {
6055 vm_map_unlock_read(map);
6056 }
6057 if (entry->use_pmap) {
6058 vm_map_unlock_read(real_map);
6059 real_map = VME_SUBMAP(entry);
6060 }
6061 map = VME_SUBMAP(entry);
6062 } else {
6063 break;
6064 }
6065 }
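/*
 * Worked example (added, illustrative): assuming a single-level
 * lookup where the entry spans [0x100000, 0x140000) and laddr is
 * 0x128000, the loop above leaves ldelta = 0x28000 and
 * hdelta = 0x18000, so the block mapping set up below covers
 * [vaddr - 0x28000, vaddr + 0x18000), i.e. the portion of the
 * entry that surrounds the faulting address.
 */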
6066
6067 if (vm_map_lookup_entry(map, laddr, &entry) &&
6068 (!entry->is_sub_map) &&
6069 (object != VM_OBJECT_NULL) &&
6070 (VME_OBJECT(entry) == object)) {
6071 uint16_t superpage;
6072
6073 if (!object->pager_created &&
6074 object->phys_contiguous &&
6075 VME_OFFSET(entry) == 0 &&
6076 (entry->vme_end - entry->vme_start == object->vo_size) &&
6077 VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
6078 superpage = VM_MEM_SUPERPAGE;
6079 } else {
6080 superpage = 0;
6081 }
6082
6083 if (superpage && physpage_p) {
6084 /* for vm_map_wire_and_extract() */
6085 *physpage_p = (ppnum_t)
6086 ((((vm_map_offset_t)
6087 object->vo_shadow_offset)
6088 + VME_OFFSET(entry)
6089 + (laddr - entry->vme_start))
6090 >> PAGE_SHIFT);
6091 }
6092
6093 if (caller_pmap) {
6094 /*
6095 * Set up a block mapped area
6096 */
6097 assert((uint32_t)((ldelta + hdelta) >> fault_page_shift) == ((ldelta + hdelta) >> fault_page_shift));
6098 kr = pmap_map_block_addr(caller_pmap,
6099 (addr64_t)(caller_pmap_addr - ldelta),
6100 (pmap_paddr_t)(((vm_map_offset_t) (object->vo_shadow_offset)) +
6101 VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta),
6102 (uint32_t)((ldelta + hdelta) >> fault_page_shift), prot,
6103 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
6104
6105 if (kr != KERN_SUCCESS) {
6106 goto cleanup;
6107 }
6108 } else {
6109 /*
6110 * Set up a block mapped area
6111 */
6112 assert((uint32_t)((ldelta + hdelta) >> fault_page_shift) == ((ldelta + hdelta) >> fault_page_shift));
6113 kr = pmap_map_block_addr(real_map->pmap,
6114 (addr64_t)(vaddr - ldelta),
6115 (pmap_paddr_t)(((vm_map_offset_t)(object->vo_shadow_offset)) +
6116 VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta),
6117 (uint32_t)((ldelta + hdelta) >> fault_page_shift), prot,
6118 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
6119
6120 if (kr != KERN_SUCCESS) {
6121 goto cleanup;
6122 }
6123 }
6124 }
6125 }
6126
6127 /*
6128 * Success
6129 */
6130 kr = KERN_SUCCESS;
6131
6132 /*
6133 * TODO: could most of the done cases just use cleanup?
6134 */
6135 cleanup:
6136 /*
6137 * Unlock everything, and return
6138 */
6139 vm_map_unlock_read(map);
6140 if (real_map != map) {
6141 vm_map_unlock(real_map);
6142 }
6143
6144 if (m != VM_PAGE_NULL) {
6145 assert(VM_PAGE_OBJECT(m) == m_object);
6146
6147 if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
6148 vm_object_paging_begin(m_object);
6149
6150 assert(written_on_object == VM_OBJECT_NULL);
6151 written_on_object = m_object;
6152 written_on_pager = m_object->pager;
6153 written_on_offset = m_object->paging_offset + m->vmp_offset;
6154 }
6155 PAGE_WAKEUP_DONE(m);
6156
6157 vm_fault_cleanup(m_object, top_page);
6158 } else {
6159 vm_fault_cleanup(object, top_page);
6160 }
6161
6162 vm_object_deallocate(object);
6163
6164 #undef RELEASE_PAGE
6165
6166 done:
6167 thread_interrupt_level(interruptible_state);
6168
6169 if (resilient_media_object != VM_OBJECT_NULL) {
6170 assert(resilient_media_retry);
6171 assert(resilient_media_offset != (vm_object_offset_t)-1);
6172 /* release extra reference on failed object */
6173 // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
6174 vm_object_lock_assert_notheld(resilient_media_object);
6175 vm_object_deallocate(resilient_media_object);
6176 resilient_media_object = VM_OBJECT_NULL;
6177 resilient_media_offset = (vm_object_offset_t)-1;
6178 resilient_media_retry = false;
6179 vm_fault_resilient_media_release++;
6180 }
6181 assert(!resilient_media_retry);
6182
6183 /*
6184 * Only throttle I/O on faults which cause a pagein/swapin.
6185 */
6186 if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
6187 throttle_lowpri_io(1);
6188 } else {
6189 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
6190 if ((throttle_delay = vm_page_throttled(TRUE))) {
6191 if (vm_debug_events) {
6192 if (type_of_fault == DBG_COMPRESSOR_FAULT) {
6193 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6194 } else if (type_of_fault == DBG_COW_FAULT) {
6195 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6196 } else {
6197 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6198 }
6199 }
6200 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
6201 }
6202 }
6203 }
6204
6205 if (written_on_object) {
6206 vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
6207
6208 vm_object_lock(written_on_object);
6209 vm_object_paging_end(written_on_object);
6210 vm_object_unlock(written_on_object);
6211
6212 written_on_object = VM_OBJECT_NULL;
6213 }
6214
6215 if (rtfault) {
6216 vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
6217 }
6218
6219 KDBG_RELEASE(
6220 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
6221 ((uint64_t)trace_vaddr >> 32),
6222 trace_vaddr,
6223 kr,
6224 vm_fault_type_for_tracing(need_copy_on_read, type_of_fault));
6225
6226 if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) {
6227 DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr);
6228 }
6229
6230 return kr;
6231 }
6232
6233 /*
6234 * vm_fault_wire:
6235 *
6236 * Wire down a range of virtual addresses in a map.
6237 */
6238 kern_return_t
6239 vm_fault_wire(
6240 vm_map_t map,
6241 vm_map_entry_t entry,
6242 vm_prot_t prot,
6243 vm_tag_t wire_tag,
6244 pmap_t pmap,
6245 vm_map_offset_t pmap_addr,
6246 ppnum_t *physpage_p)
6247 {
6248 vm_map_offset_t va;
6249 vm_map_offset_t end_addr = entry->vme_end;
6250 kern_return_t rc;
6251 vm_map_size_t effective_page_size;
6252
6253 assert(entry->in_transition);
6254
6255 if (!entry->is_sub_map &&
6256 VME_OBJECT(entry) != VM_OBJECT_NULL &&
6257 VME_OBJECT(entry)->phys_contiguous) {
6258 return KERN_SUCCESS;
6259 }
6260
6261 /*
6262 * Inform the physical mapping system that the
6263 * range of addresses may not fault, so that
6264 * page tables and such can be locked down as well.
6265 */
6266
6267 pmap_pageable(pmap, pmap_addr,
6268 pmap_addr + (end_addr - entry->vme_start), FALSE);
6269
6270 /*
6271 * We simulate a fault to get the page and enter it
6272 * in the physical map.
6273 */
6274
6275 effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6276 for (va = entry->vme_start;
6277 va < end_addr;
6278 va += effective_page_size) {
6279 rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
6280 pmap_addr + (va - entry->vme_start),
6281 physpage_p);
6282 if (rc != KERN_SUCCESS) {
6283 rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
6284 ((pmap == kernel_pmap)
6285 ? THREAD_UNINT
6286 : THREAD_ABORTSAFE),
6287 pmap,
6288 (pmap_addr +
6289 (va - entry->vme_start)),
6290 physpage_p);
6291 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
6292 }
6293
6294 if (rc != KERN_SUCCESS) {
6295 struct vm_map_entry tmp_entry = *entry;
6296
6297 /* unwire wired pages */
6298 tmp_entry.vme_end = va;
6299 vm_fault_unwire(map,
6300 &tmp_entry, FALSE, pmap, pmap_addr);
6301
6302 return rc;
6303 }
6304 }
6305 return KERN_SUCCESS;
6306 }
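/*
 * Caller sketch (added; an assumption about callers that live in
 * vm_map.c, not something defined in this file): a wiring path such
 * as vm_map_wire() is expected to mark the entry "in_transition",
 * keep a read lock on the map, and drive this routine once per entry,
 * roughly:
 *
 *	entry->in_transition = TRUE;
 *	rc = vm_fault_wire(map, entry, prot, tag,
 *	                   map->pmap, entry->vme_start, NULL);
 *	if (rc != KERN_SUCCESS) {
 *		// the failing entry was already partially unwired above;
 *		// the caller unwinds any previously wired entries
 *	}
 */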
6307
6308 /*
6309 * vm_fault_unwire:
6310 *
6311 * Unwire a range of virtual addresses in a map.
6312 */
6313 void
6314 vm_fault_unwire(
6315 vm_map_t map,
6316 vm_map_entry_t entry,
6317 boolean_t deallocate,
6318 pmap_t pmap,
6319 vm_map_offset_t pmap_addr)
6320 {
6321 vm_map_offset_t va;
6322 vm_map_offset_t end_addr = entry->vme_end;
6323 vm_object_t object;
6324 struct vm_object_fault_info fault_info = {};
6325 unsigned int unwired_pages;
6326 vm_map_size_t effective_page_size;
6327
6328 object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
6329
6330 /*
6331 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
6332 * do anything since such memory is wired by default. So we don't have
6333 * anything to undo here.
6334 */
6335
6336 if (object != VM_OBJECT_NULL && object->phys_contiguous) {
6337 return;
6338 }
6339
6340 fault_info.interruptible = THREAD_UNINT;
6341 fault_info.behavior = entry->behavior;
6342 fault_info.user_tag = VME_ALIAS(entry);
6343 if (entry->iokit_acct ||
6344 (!entry->is_sub_map && !entry->use_pmap)) {
6345 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
6346 }
6347 fault_info.lo_offset = VME_OFFSET(entry);
6348 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
6349 fault_info.no_cache = entry->no_cache;
6350 fault_info.stealth = TRUE;
6351
6352 unwired_pages = 0;
6353
6354 /*
6355 * Since the pages are wired down, we must be able to
6356 * get their mappings from the physical map system.
6357 */
6358
6359 effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6360 for (va = entry->vme_start;
6361 va < end_addr;
6362 va += effective_page_size) {
6363 if (object == VM_OBJECT_NULL) {
6364 if (pmap) {
6365 pmap_change_wiring(pmap,
6366 pmap_addr + (va - entry->vme_start), FALSE);
6367 }
6368 (void) vm_fault(map, va, VM_PROT_NONE,
6369 TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
6370 } else {
6371 vm_prot_t prot;
6372 vm_page_t result_page;
6373 vm_page_t top_page;
6374 vm_object_t result_object;
6375 vm_fault_return_t result;
6376
6377 /* cap cluster size at maximum UPL size */
6378 upl_size_t cluster_size;
6379 if (os_sub_overflow(end_addr, va, &cluster_size)) {
6380 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6381 }
6382 fault_info.cluster_size = cluster_size;
6383
6384 do {
6385 prot = VM_PROT_NONE;
6386
6387 vm_object_lock(object);
6388 vm_object_paging_begin(object);
6389 result_page = VM_PAGE_NULL;
6390 result = vm_fault_page(
6391 object,
6392 (VME_OFFSET(entry) +
6393 (va - entry->vme_start)),
6394 VM_PROT_NONE, TRUE,
6395 FALSE, /* page not looked up */
6396 &prot, &result_page, &top_page,
6397 (int *)0,
6398 NULL, map->no_zero_fill,
6399 &fault_info);
6400 } while (result == VM_FAULT_RETRY);
6401
6402 /*
6403 * If this was a mapping to a file on a device that has been forcibly
6404 * unmounted, then we won't get a page back from vm_fault_page(). Just
6405 * move on to the next one in case the remaining pages are mapped from
6406 * different objects. During a forced unmount, the object is terminated
6407 * so the alive flag will be false if this happens. A forced unmount
6408 * will occur when an external disk is unplugged before the user does an
6409 * eject, so we don't want to panic in that situation.
6410 */
6411
6412 if (result == VM_FAULT_MEMORY_ERROR) {
6413 if (!object->alive) {
6414 continue;
6415 }
6416 if (!object->internal && object->pager == NULL) {
6417 continue;
6418 }
6419 }
6420
6421 if (result == VM_FAULT_MEMORY_ERROR &&
6422 object == kernel_object) {
6423 /*
6424 * This must have been allocated with
6425 * KMA_KOBJECT and KMA_VAONLY and there's
6426 * no physical page at this offset.
6427 * We're done (no page to free).
6428 */
6429 assert(deallocate);
6430 continue;
6431 }
6432
6433 if (result != VM_FAULT_SUCCESS) {
6434 panic("vm_fault_unwire: failure");
6435 }
6436
6437 result_object = VM_PAGE_OBJECT(result_page);
6438
6439 if (deallocate) {
6440 assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
6441 vm_page_fictitious_addr);
6442 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
6443 if (VM_PAGE_WIRED(result_page)) {
6444 unwired_pages++;
6445 }
6446 VM_PAGE_FREE(result_page);
6447 } else {
6448 if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) {
6449 pmap_change_wiring(pmap,
6450 pmap_addr + (va - entry->vme_start), FALSE);
6451 }
6452
6453
6454 if (VM_PAGE_WIRED(result_page)) {
6455 vm_page_lockspin_queues();
6456 vm_page_unwire(result_page, TRUE);
6457 vm_page_unlock_queues();
6458 unwired_pages++;
6459 }
6460 if (entry->zero_wired_pages) {
6461 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
6462 entry->zero_wired_pages = FALSE;
6463 }
6464
6465 PAGE_WAKEUP_DONE(result_page);
6466 }
6467 vm_fault_cleanup(result_object, top_page);
6468 }
6469 }
6470
6471 /*
6472 * Inform the physical mapping system that the range
6473 * of addresses may fault, so that page tables and
6474 * such may be unwired themselves.
6475 */
6476
6477 pmap_pageable(pmap, pmap_addr,
6478 pmap_addr + (end_addr - entry->vme_start), TRUE);
6479
6480 if (kernel_object == object) {
6481 /*
6482 * We would like to make user_tag in vm_object_fault_info
6483 * a vm_tag_t (unsigned short), but user_tag derives its value from
6484 * VME_ALIAS(entry) in a few places, and VME_ALIAS, in turn, casts
6485 * to an _unsigned int_, which the non-fault_info paths use in many
6486 * places throughout the code.
6487 *
6488 * So, for now, an explicit truncation to unsigned short (vm_tag_t).
6489 */
6490 assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag,
6491 "VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK));
6492 vm_tag_update_size((vm_tag_t) fault_info.user_tag, -ptoa_64(unwired_pages));
6493 }
6494 }
6495
6496 /*
6497 * vm_fault_wire_fast:
6498 *
6499 * Handle common case of a wire down page fault at the given address.
6500 * If successful, the page is inserted into the associated physical map.
6501 * The map entry is passed in to avoid the overhead of a map lookup.
6502 *
6503 * NOTE: the given address should be truncated to the
6504 * proper page address.
6505 *
6506 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
6507 * a standard error specifying why the fault is fatal is returned.
6508 *
6509 * The map in question must be referenced, and remains so.
6510 * Caller has a read lock on the map.
6511 *
6512 * This is a stripped version of vm_fault() for wiring pages. Anything
6513 * other than the common case will return KERN_FAILURE, and the caller
6514 * is expected to call vm_fault().
6515 */
6516 static kern_return_t
6517 vm_fault_wire_fast(
6518 __unused vm_map_t map,
6519 vm_map_offset_t va,
6520 __unused vm_prot_t caller_prot,
6521 vm_tag_t wire_tag,
6522 vm_map_entry_t entry,
6523 pmap_t pmap,
6524 vm_map_offset_t pmap_addr,
6525 ppnum_t *physpage_p)
6526 {
6527 vm_object_t object;
6528 vm_object_offset_t offset;
6529 vm_page_t m;
6530 vm_prot_t prot;
6531 thread_t thread = current_thread();
6532 int type_of_fault;
6533 kern_return_t kr;
6534 vm_map_size_t fault_page_size;
6535 vm_map_offset_t fault_phys_offset;
6536 struct vm_object_fault_info fault_info = {};
6537
6538 counter_inc(&vm_statistics_faults);
6539
6540 if (thread != THREAD_NULL) {
6541 counter_inc(&get_threadtask(thread)->faults);
6542 }
6543
6544 /*
6545 * Recovery actions
6546 */
6547
6548 #undef RELEASE_PAGE
6549 #define RELEASE_PAGE(m) { \
6550 PAGE_WAKEUP_DONE(m); \
6551 vm_page_lockspin_queues(); \
6552 vm_page_unwire(m, TRUE); \
6553 vm_page_unlock_queues(); \
6554 }
6555
6556
6557 #undef UNLOCK_THINGS
6558 #define UNLOCK_THINGS { \
6559 vm_object_paging_end(object); \
6560 vm_object_unlock(object); \
6561 }
6562
6563 #undef UNLOCK_AND_DEALLOCATE
6564 #define UNLOCK_AND_DEALLOCATE { \
6565 UNLOCK_THINGS; \
6566 vm_object_deallocate(object); \
6567 }
6568 /*
6569 * Give up and have caller do things the hard way.
6570 */
6571
6572 #define GIVE_UP { \
6573 UNLOCK_AND_DEALLOCATE; \
6574 return(KERN_FAILURE); \
6575 }
6576
6577
6578 /*
6579 * If this entry is not directly to a vm_object, bail out.
6580 */
6581 if (entry->is_sub_map) {
6582 assert(physpage_p == NULL);
6583 return KERN_FAILURE;
6584 }
6585
6586 /*
6587 * Find the backing store object and offset into it.
6588 */
6589
6590 object = VME_OBJECT(entry);
6591 offset = (va - entry->vme_start) + VME_OFFSET(entry);
6592 prot = entry->protection;
6593
6594 /*
6595 * Make a reference to this object to prevent its
6596 * disposal while we are messing with it.
6597 */
6598
6599 vm_object_lock(object);
6600 vm_object_reference_locked(object);
6601 vm_object_paging_begin(object);
6602
6603 /*
6604 * INVARIANTS (through entire routine):
6605 *
6606 * 1) At all times, we must either have the object
6607 * lock or a busy page in some object to prevent
6608 * some other thread from trying to bring in
6609 * the same page.
6610 *
6611 * 2) Once we have a busy page, we must remove it from
6612 * the pageout queues, so that the pageout daemon
6613 * will not grab it away.
6614 *
6615 */
6616
6617 /*
6618 * Look for page in top-level object. If it's not there or
6619 * there's something going on, give up.
6620 */
6621 m = vm_page_lookup(object, vm_object_trunc_page(offset));
6622 if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
6623 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
6624 GIVE_UP;
6625 }
6626 if (m->vmp_fictitious &&
6627 VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
6628 /*
6629 * Guard pages are fictitious pages and are never
6630 * entered into a pmap, so let's say it's been wired...
6631 */
6632 kr = KERN_SUCCESS;
6633 goto done;
6634 }
6635
6636 /*
6637 * Wire the page down now. All bail outs beyond this
6638 * point must unwire the page.
6639 */
6640
6641 vm_page_lockspin_queues();
6642 vm_page_wire(m, wire_tag, TRUE);
6643 vm_page_unlock_queues();
6644
6645 /*
6646 * Mark page busy for other threads.
6647 */
6648 assert(!m->vmp_busy);
6649 m->vmp_busy = TRUE;
6650 assert(!m->vmp_absent);
6651
6652 /*
6653 * Give up if the page is being written and there's a copy object
6654 */
6655 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
6656 RELEASE_PAGE(m);
6657 GIVE_UP;
6658 }
6659
6660 fault_info.user_tag = VME_ALIAS(entry);
6661 fault_info.pmap_options = 0;
6662 if (entry->iokit_acct ||
6663 (!entry->is_sub_map && !entry->use_pmap)) {
6664 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
6665 }
6666
6667 fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6668 fault_phys_offset = offset - vm_object_trunc_page(offset);
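/*
 * Example (added, illustrative): with a 4K VM map running on a
 * 16K kernel page size (e.g. arm64), an offset of 0x5000 truncates
 * to 0x4000, so fault_page_size is 0x1000 and fault_phys_offset is
 * 0x1000, i.e. the second 4K sub-page of the 16K physical page is
 * the one entered in the pmap below.
 */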
6669
6670 /*
6671 * Put this page into the physical map.
6672 */
6673 type_of_fault = DBG_CACHE_HIT_FAULT;
6674 assertf(VM_PAGE_OBJECT(m) == object, "m=%p object=%p", m, object);
6675 assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
6676 kr = vm_fault_enter(m,
6677 pmap,
6678 pmap_addr,
6679 fault_page_size,
6680 fault_phys_offset,
6681 prot,
6682 prot,
6683 TRUE, /* wired */
6684 FALSE, /* change_wiring */
6685 wire_tag,
6686 &fault_info,
6687 NULL,
6688 &type_of_fault);
6689 if (kr != KERN_SUCCESS) {
6690 RELEASE_PAGE(m);
6691 GIVE_UP;
6692 }
6693
6694 done:
6695 /*
6696 * Unlock everything, and return
6697 */
6698
6699 if (physpage_p) {
6700 /* for vm_map_wire_and_extract() */
6701 if (kr == KERN_SUCCESS) {
6702 assert(object == VM_PAGE_OBJECT(m));
6703 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6704 if (prot & VM_PROT_WRITE) {
6705 vm_object_lock_assert_exclusive(object);
6706 m->vmp_dirty = TRUE;
6707 }
6708 } else {
6709 *physpage_p = 0;
6710 }
6711 }
6712
6713 PAGE_WAKEUP_DONE(m);
6714 UNLOCK_AND_DEALLOCATE;
6715
6716 return kr;
6717 }
6718
6719 /*
6720 * Routine: vm_fault_copy_cleanup
6721 * Purpose:
6722 * Release a page used by vm_fault_copy.
6723 */
6724
6725 static void
6726 vm_fault_copy_cleanup(
6727 vm_page_t page,
6728 vm_page_t top_page)
6729 {
6730 vm_object_t object = VM_PAGE_OBJECT(page);
6731
6732 vm_object_lock(object);
6733 PAGE_WAKEUP_DONE(page);
6734 if (!VM_PAGE_PAGEABLE(page)) {
6735 vm_page_lockspin_queues();
6736 if (!VM_PAGE_PAGEABLE(page)) {
6737 vm_page_activate(page);
6738 }
6739 vm_page_unlock_queues();
6740 }
6741 vm_fault_cleanup(object, top_page);
6742 }
6743
6744 static void
6745 vm_fault_copy_dst_cleanup(
6746 vm_page_t page)
6747 {
6748 vm_object_t object;
6749
6750 if (page != VM_PAGE_NULL) {
6751 object = VM_PAGE_OBJECT(page);
6752 vm_object_lock(object);
6753 vm_page_lockspin_queues();
6754 vm_page_unwire(page, TRUE);
6755 vm_page_unlock_queues();
6756 vm_object_paging_end(object);
6757 vm_object_unlock(object);
6758 }
6759 }
6760
6761 /*
6762 * Routine: vm_fault_copy
6763 *
6764 * Purpose:
6765 * Copy pages from one virtual memory object to another --
6766 * neither the source nor destination pages need be resident.
6767 *
6768 * Before actually copying a page, the version associated with
6769 * the destination address map will be verified.
6770 *
6771 * In/out conditions:
6772 * The caller must hold a reference, but not a lock, to
6773 * each of the source and destination objects and to the
6774 * destination map.
6775 *
6776 * Results:
6777 * Returns KERN_SUCCESS if no errors were encountered in
6778 * reading or writing the data. Returns KERN_INTERRUPTED if
6779 * the operation was interrupted (only possible if the
6780 * "interruptible" argument is asserted). Other return values
6781 * indicate a permanent error in copying the data.
6782 *
6783 * The actual amount of data copied will be returned in the
6784 * "copy_size" argument. In the event that the destination map
6785 * verification failed, this amount may be less than the amount
6786 * requested.
6787 */
6788 kern_return_t
6789 vm_fault_copy(
6790 vm_object_t src_object,
6791 vm_object_offset_t src_offset,
6792 vm_map_size_t *copy_size, /* INOUT */
6793 vm_object_t dst_object,
6794 vm_object_offset_t dst_offset,
6795 vm_map_t dst_map,
6796 vm_map_version_t *dst_version,
6797 int interruptible)
6798 {
6799 vm_page_t result_page;
6800
6801 vm_page_t src_page;
6802 vm_page_t src_top_page;
6803 vm_prot_t src_prot;
6804
6805 vm_page_t dst_page;
6806 vm_page_t dst_top_page;
6807 vm_prot_t dst_prot;
6808
6809 vm_map_size_t amount_left;
6810 vm_object_t old_copy_object;
6811 vm_object_t result_page_object = NULL;
6812 kern_return_t error = 0;
6813 vm_fault_return_t result;
6814
6815 vm_map_size_t part_size;
6816 struct vm_object_fault_info fault_info_src = {};
6817 struct vm_object_fault_info fault_info_dst = {};
6818
6819 /*
6820 * In order not to confuse the clustered pageins, align
6821 * the different offsets on a page boundary.
6822 */
6823
6824 #define RETURN(x) \
6825 MACRO_BEGIN \
6826 *copy_size -= amount_left; \
6827 MACRO_RETURN(x); \
6828 MACRO_END
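/*
 * Note (added): RETURN() reports partial progress through the INOUT
 * "copy_size" argument.  For example, if the caller asked for 3 pages
 * and the destination map version check fails after one page has been
 * copied, amount_left is still 2 pages when the loop breaks, so
 * *copy_size is reduced from 3 pages to 1 page before returning
 * KERN_SUCCESS, and the caller can re-verify the map and retry the
 * remainder.
 */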
6829
6830 amount_left = *copy_size;
6831
6832 fault_info_src.interruptible = interruptible;
6833 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
6834 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
6835 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
6836 fault_info_src.stealth = TRUE;
6837
6838 fault_info_dst.interruptible = interruptible;
6839 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
6840 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
6841 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
6842 fault_info_dst.stealth = TRUE;
6843
6844 do { /* while (amount_left > 0) */
6845 /*
6846 * There may be a deadlock if both source and destination
6847 * pages are the same. To avoid this deadlock, the copy must
6848 * start by getting the destination page in order to apply
6849 * COW semantics if any.
6850 */
6851
6852 RetryDestinationFault:;
6853
6854 dst_prot = VM_PROT_WRITE | VM_PROT_READ;
6855
6856 vm_object_lock(dst_object);
6857 vm_object_paging_begin(dst_object);
6858
6859 /* cap cluster size at maximum UPL size */
6860 upl_size_t cluster_size;
6861 if (os_convert_overflow(amount_left, &cluster_size)) {
6862 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6863 }
6864 fault_info_dst.cluster_size = cluster_size;
6865
6866 dst_page = VM_PAGE_NULL;
6867 result = vm_fault_page(dst_object,
6868 vm_object_trunc_page(dst_offset),
6869 VM_PROT_WRITE | VM_PROT_READ,
6870 FALSE,
6871 FALSE, /* page not looked up */
6872 &dst_prot, &dst_page, &dst_top_page,
6873 (int *)0,
6874 &error,
6875 dst_map->no_zero_fill,
6876 &fault_info_dst);
6877 switch (result) {
6878 case VM_FAULT_SUCCESS:
6879 break;
6880 case VM_FAULT_RETRY:
6881 goto RetryDestinationFault;
6882 case VM_FAULT_MEMORY_SHORTAGE:
6883 if (vm_page_wait(interruptible)) {
6884 goto RetryDestinationFault;
6885 }
6886 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_COPY_MEMORY_SHORTAGE), 0 /* arg */);
6887 OS_FALLTHROUGH;
6888 case VM_FAULT_INTERRUPTED:
6889 RETURN(MACH_SEND_INTERRUPTED);
6890 case VM_FAULT_SUCCESS_NO_VM_PAGE:
6891 /* success but no VM page: fail the copy */
6892 vm_object_paging_end(dst_object);
6893 vm_object_unlock(dst_object);
6894 OS_FALLTHROUGH;
6895 case VM_FAULT_MEMORY_ERROR:
6896 if (error) {
6897 return error;
6898 } else {
6899 return KERN_MEMORY_ERROR;
6900 }
6901 default:
6902 panic("vm_fault_copy: unexpected error 0x%x from "
6903 "vm_fault_page()\n", result);
6904 }
6905 assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
6906
6907 assert(dst_object == VM_PAGE_OBJECT(dst_page));
6908 old_copy_object = dst_object->copy;
6909
6910 /*
6911 * There exists the possibility that the source and
6912 * destination page are the same. But we can't
6913 * easily determine that now. If they are the
6914 * same, the call to vm_fault_page() for the
6915 * destination page will deadlock. To prevent this we
6916 * wire the page so we can drop busy without having
6917 * the page daemon steal the page. We clean up the
6918 * top page but keep the paging reference on the object
6919 * holding the dest page so it doesn't go away.
6920 */
6921
6922 vm_page_lockspin_queues();
6923 vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
6924 vm_page_unlock_queues();
6925 PAGE_WAKEUP_DONE(dst_page);
6926 vm_object_unlock(dst_object);
6927
6928 if (dst_top_page != VM_PAGE_NULL) {
6929 vm_object_lock(dst_object);
6930 VM_PAGE_FREE(dst_top_page);
6931 vm_object_paging_end(dst_object);
6932 vm_object_unlock(dst_object);
6933 }
6934
6935 RetrySourceFault:;
6936
6937 if (src_object == VM_OBJECT_NULL) {
6938 /*
6939 * No source object. We will just
6940 * zero-fill the page in dst_object.
6941 */
6942 src_page = VM_PAGE_NULL;
6943 result_page = VM_PAGE_NULL;
6944 } else {
6945 vm_object_lock(src_object);
6946 src_page = vm_page_lookup(src_object,
6947 vm_object_trunc_page(src_offset));
6948 if (src_page == dst_page) {
6949 src_prot = dst_prot;
6950 result_page = VM_PAGE_NULL;
6951 } else {
6952 src_prot = VM_PROT_READ;
6953 vm_object_paging_begin(src_object);
6954
6955 /* cap cluster size at maximum UPL size */
6956 if (os_convert_overflow(amount_left, &cluster_size)) {
6957 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6958 }
6959 fault_info_src.cluster_size = cluster_size;
6960
6961 result_page = VM_PAGE_NULL;
6962 result = vm_fault_page(
6963 src_object,
6964 vm_object_trunc_page(src_offset),
6965 VM_PROT_READ, FALSE,
6966 FALSE, /* page not looked up */
6967 &src_prot,
6968 &result_page, &src_top_page,
6969 (int *)0, &error, FALSE,
6970 &fault_info_src);
6971
6972 switch (result) {
6973 case VM_FAULT_SUCCESS:
6974 break;
6975 case VM_FAULT_RETRY:
6976 goto RetrySourceFault;
6977 case VM_FAULT_MEMORY_SHORTAGE:
6978 if (vm_page_wait(interruptible)) {
6979 goto RetrySourceFault;
6980 }
6981 OS_FALLTHROUGH;
6982 case VM_FAULT_INTERRUPTED:
6983 vm_fault_copy_dst_cleanup(dst_page);
6984 RETURN(MACH_SEND_INTERRUPTED);
6985 case VM_FAULT_SUCCESS_NO_VM_PAGE:
6986 /* success but no VM page: fail */
6987 vm_object_paging_end(src_object);
6988 vm_object_unlock(src_object);
6989 OS_FALLTHROUGH;
6990 case VM_FAULT_MEMORY_ERROR:
6991 vm_fault_copy_dst_cleanup(dst_page);
6992 if (error) {
6993 return error;
6994 } else {
6995 return KERN_MEMORY_ERROR;
6996 }
6997 default:
6998 panic("vm_fault_copy(2): unexpected "
6999 "error 0x%x from "
7000 "vm_fault_page()\n", result);
7001 }
7002
7003 result_page_object = VM_PAGE_OBJECT(result_page);
7004 assert((src_top_page == VM_PAGE_NULL) ==
7005 (result_page_object == src_object));
7006 }
7007 assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
7008 vm_object_unlock(result_page_object);
7009 }
7010
7011 vm_map_lock_read(dst_map);
7012
7013 if (!vm_map_verify(dst_map, dst_version)) {
7014 vm_map_unlock_read(dst_map);
7015 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7016 vm_fault_copy_cleanup(result_page, src_top_page);
7017 }
7018 vm_fault_copy_dst_cleanup(dst_page);
7019 break;
7020 }
7021 assert(dst_object == VM_PAGE_OBJECT(dst_page));
7022
7023 vm_object_lock(dst_object);
7024
7025 if (dst_object->copy != old_copy_object) {
7026 vm_object_unlock(dst_object);
7027 vm_map_unlock_read(dst_map);
7028 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7029 vm_fault_copy_cleanup(result_page, src_top_page);
7030 }
7031 vm_fault_copy_dst_cleanup(dst_page);
7032 break;
7033 }
7034 vm_object_unlock(dst_object);
7035
7036 /*
7037 * Copy the page, and note that it is dirty
7038 * immediately.
7039 */
7040
7041 if (!page_aligned(src_offset) ||
7042 !page_aligned(dst_offset) ||
7043 !page_aligned(amount_left)) {
7044 vm_object_offset_t src_po,
7045 dst_po;
7046
7047 src_po = src_offset - vm_object_trunc_page(src_offset);
7048 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
7049
7050 if (dst_po > src_po) {
7051 part_size = PAGE_SIZE - dst_po;
7052 } else {
7053 part_size = PAGE_SIZE - src_po;
7054 }
7055 if (part_size > (amount_left)) {
7056 part_size = amount_left;
7057 }
7058
7059 if (result_page == VM_PAGE_NULL) {
7060 assert((vm_offset_t) dst_po == dst_po);
7061 assert((vm_size_t) part_size == part_size);
7062 vm_page_part_zero_fill(dst_page,
7063 (vm_offset_t) dst_po,
7064 (vm_size_t) part_size);
7065 } else {
7066 assert((vm_offset_t) src_po == src_po);
7067 assert((vm_offset_t) dst_po == dst_po);
7068 assert((vm_size_t) part_size == part_size);
7069 vm_page_part_copy(result_page,
7070 (vm_offset_t) src_po,
7071 dst_page,
7072 (vm_offset_t) dst_po,
7073 (vm_size_t)part_size);
7074 if (!dst_page->vmp_dirty) {
7075 vm_object_lock(dst_object);
7076 SET_PAGE_DIRTY(dst_page, TRUE);
7077 vm_object_unlock(dst_object);
7078 }
7079 }
7080 } else {
7081 part_size = PAGE_SIZE;
7082
7083 if (result_page == VM_PAGE_NULL) {
7084 vm_page_zero_fill(dst_page);
7085 } else {
7086 vm_object_lock(result_page_object);
7087 vm_page_copy(result_page, dst_page);
7088 vm_object_unlock(result_page_object);
7089
7090 if (!dst_page->vmp_dirty) {
7091 vm_object_lock(dst_object);
7092 SET_PAGE_DIRTY(dst_page, TRUE);
7093 vm_object_unlock(dst_object);
7094 }
7095 }
7096 }
7097
7098 /*
7099 * Unlock everything, and advance to the next chunk
7100 */
7101
7102 vm_map_unlock_read(dst_map);
7103
7104 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7105 vm_fault_copy_cleanup(result_page, src_top_page);
7106 }
7107 vm_fault_copy_dst_cleanup(dst_page);
7108
7109 amount_left -= part_size;
7110 src_offset += part_size;
7111 dst_offset += part_size;
7112 } while (amount_left > 0);
7113
7114 RETURN(KERN_SUCCESS);
7115 #undef RETURN
7116
7117 /*NOTREACHED*/
7118 }
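/*
 * Illustrative, standalone sketch of the "part_size" computation used by
 * the unaligned-copy path above: the copy is limited to whatever remains
 * of whichever page (source or destination) runs out first, then capped
 * by the amount left to copy. The 4K page size, the example_ names and
 * the main() harness are assumptions for the sake of a self-contained
 * example only; this block is never compiled into the kernel.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SIZE 4096ULL

static uint64_t
example_part_size(uint64_t src_offset, uint64_t dst_offset, uint64_t amount_left)
{
	uint64_t src_po = src_offset & (EX_PAGE_SIZE - 1);  /* offset within source page */
	uint64_t dst_po = dst_offset & (EX_PAGE_SIZE - 1);  /* offset within dest page */
	uint64_t part_size;

	/* copy only up to the end of whichever page is exhausted first */
	if (dst_po > src_po) {
		part_size = EX_PAGE_SIZE - dst_po;
	} else {
		part_size = EX_PAGE_SIZE - src_po;
	}
	if (part_size > amount_left) {
		part_size = amount_left;
	}
	return part_size;
}

int
main(void)
{
	/* src 0x100 into its page, dst 0xe00 into its page, 64K still to copy */
	printf("part_size = %llu\n",
	    (unsigned long long)example_part_size(0x100, 0xe00, 64 * 1024));
	return 0;
}
#endif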
7119
7120 #if VM_FAULT_CLASSIFY
7121 /*
7122 * Temporary statistics gathering support.
7123 */
7124
7125 /*
7126 * Statistics arrays:
7127 */
7128 #define VM_FAULT_TYPES_MAX 5
7129 #define VM_FAULT_LEVEL_MAX 8
7130
7131 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
7132
7133 #define VM_FAULT_TYPE_ZERO_FILL 0
7134 #define VM_FAULT_TYPE_MAP_IN 1
7135 #define VM_FAULT_TYPE_PAGER 2
7136 #define VM_FAULT_TYPE_COPY 3
7137 #define VM_FAULT_TYPE_OTHER 4
7138
7139
7140 void
7141 vm_fault_classify(vm_object_t object,
7142 vm_object_offset_t offset,
7143 vm_prot_t fault_type)
7144 {
7145 int type, level = 0;
7146 vm_page_t m;
7147
7148 while (TRUE) {
7149 m = vm_page_lookup(object, offset);
7150 if (m != VM_PAGE_NULL) {
7151 if (m->vmp_busy || VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent) {
7152 type = VM_FAULT_TYPE_OTHER;
7153 break;
7154 }
7155 if (((fault_type & VM_PROT_WRITE) == 0) ||
7156 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
7157 type = VM_FAULT_TYPE_MAP_IN;
7158 break;
7159 }
7160 type = VM_FAULT_TYPE_COPY;
7161 break;
7162 } else {
7163 if (object->pager_created) {
7164 type = VM_FAULT_TYPE_PAGER;
7165 break;
7166 }
7167 if (object->shadow == VM_OBJECT_NULL) {
7168 type = VM_FAULT_TYPE_ZERO_FILL;
7169 break;
7170 }
7171
7172 offset += object->vo_shadow_offset;
7173 object = object->shadow;
7174 level++;
7175 continue;
7176 }
7177 }
7178
7179 if (level > VM_FAULT_LEVEL_MAX) {
7180 level = VM_FAULT_LEVEL_MAX;
7181 }
7182
7183 vm_fault_stats[type][level] += 1;
7184
7185 return;
7186 }
7187
7188 /* cleanup routine to call from debugger */
7189
7190 void
7191 vm_fault_classify_init(void)
7192 {
7193 int type, level;
7194
7195 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
7196 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
7197 vm_fault_stats[type][level] = 0;
7198 }
7199 }
7200
7201 return;
7202 }
7203 #endif /* VM_FAULT_CLASSIFY */
7204
7205 vm_offset_t
7206 kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
7207 {
7208 vm_map_entry_t entry;
7209 vm_object_t object;
7210 vm_offset_t object_offset;
7211 vm_page_t m;
7212 int compressor_external_state, compressed_count_delta;
7213 int compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
7214 int my_fault_type = VM_PROT_READ;
7215 kern_return_t kr;
7216 int effective_page_mask, effective_page_size;
7217
7218 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
7219 effective_page_mask = VM_MAP_PAGE_MASK(map);
7220 effective_page_size = VM_MAP_PAGE_SIZE(map);
7221 } else {
7222 effective_page_mask = PAGE_MASK;
7223 effective_page_size = PAGE_SIZE;
7224 }
7225
7226 if (not_in_kdp) {
7227 panic("kdp_lightweight_fault called from outside of debugger context");
7228 }
7229
7230 assert(map != VM_MAP_NULL);
7231
7232 assert((cur_target_addr & effective_page_mask) == 0);
7233 if ((cur_target_addr & effective_page_mask) != 0) {
7234 return 0;
7235 }
7236
7237 if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
7238 return 0;
7239 }
7240
7241 if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
7242 return 0;
7243 }
7244
7245 if (entry->is_sub_map) {
7246 return 0;
7247 }
7248
7249 object = VME_OBJECT(entry);
7250 if (object == VM_OBJECT_NULL) {
7251 return 0;
7252 }
7253
7254 object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
7255
7256 while (TRUE) {
7257 if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
7258 return 0;
7259 }
7260
7261 if (object->pager_created && (object->paging_in_progress ||
7262 object->activity_in_progress)) {
7263 return 0;
7264 }
7265
7266 m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset));
7267
7268 if (m != VM_PAGE_NULL) {
7269 if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
7270 return 0;
7271 }
7272
7273 if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_cleaning ||
7274 m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
7275 return 0;
7276 }
7277
7278 assert(!m->vmp_private);
7279 if (m->vmp_private) {
7280 return 0;
7281 }
7282
7283 assert(!m->vmp_fictitious);
7284 if (m->vmp_fictitious) {
7285 return 0;
7286 }
7287
7288 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7289 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7290 return 0;
7291 }
7292
7293 return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
7294 }
7295
7296 compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
7297
7298 if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
7299 if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
7300 kr = vm_compressor_pager_get(object->pager,
7301 vm_object_trunc_page(object_offset + object->paging_offset),
7302 kdp_compressor_decompressed_page_ppnum, &my_fault_type,
7303 compressor_flags, &compressed_count_delta);
7304 if (kr == KERN_SUCCESS) {
7305 return kdp_compressor_decompressed_page_paddr;
7306 } else {
7307 return 0;
7308 }
7309 }
7310 }
7311
7312 if (object->shadow == VM_OBJECT_NULL) {
7313 return 0;
7314 }
7315
7316 object_offset += object->vo_shadow_offset;
7317 object = object->shadow;
7318 }
7319 }
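/*
 * Illustrative, standalone sketch of the shadow-chain walk performed by
 * kdp_lightweight_fault() above: when an object does not have the page
 * resident, the lookup biases the offset by the shadow offset and moves
 * to the backing object until a page is found or the chain ends. The
 * toy_object structure and main() harness are assumptions made only so
 * the walk can be shown self-contained; this block is never compiled
 * into the kernel.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_object {
	struct toy_object *shadow;          /* backing object, or NULL */
	uint64_t           shadow_offset;   /* bias applied when descending */
	uint64_t           resident_offset; /* the one offset this object has */
	bool               has_page;
};

/* Returns true if some object in the chain has a page for "offset". */
static bool
toy_resolve(struct toy_object *object, uint64_t offset)
{
	while (object != NULL) {
		if (object->has_page && object->resident_offset == offset) {
			return true;                 /* resident page found */
		}
		offset += object->shadow_offset;     /* descend to the backing object */
		object = object->shadow;
	}
	return false;                            /* chain exhausted */
}

int
main(void)
{
	struct toy_object backing = { NULL, 0, 0x3000, true };
	struct toy_object top     = { &backing, 0x1000, 0, false };

	/* 0x2000 in "top" resolves to 0x3000 in "backing" after the bias */
	printf("found = %d\n", toy_resolve(&top, 0x2000));
	return 0;
}
#endif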
7320
7321 /*
7322 * vm_page_validate_cs_fast():
7323 * Performs a few quick checks to determine if the page's code signature
7324 * really needs to be fully validated. It could:
7325 * 1. have been modified (i.e. automatically tainted),
7326 * 2. have already been validated,
7327 * 3. have already been found to be tainted,
7328 * 4. no longer have a backing store.
7329 * Returns FALSE if the page needs to be fully validated.
7330 */
7331 static boolean_t
7332 vm_page_validate_cs_fast(
7333 vm_page_t page,
7334 vm_map_size_t fault_page_size,
7335 vm_map_offset_t fault_phys_offset)
7336 {
7337 vm_object_t object;
7338
7339 object = VM_PAGE_OBJECT(page);
7340 vm_object_lock_assert_held(object);
7341
7342 if (page->vmp_wpmapped &&
7343 !VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
7344 /*
7345 * This page was mapped for "write" access sometime in the
7346 * past and could still be modifiable in the future.
7347 * Consider it tainted.
7348 * [ If the page was already found to be "tainted", no
7349 * need to re-validate. ]
7350 */
7351 vm_object_lock_assert_exclusive(object);
7352 VMP_CS_SET_VALIDATED(page, fault_page_size, fault_phys_offset, TRUE);
7353 VMP_CS_SET_TAINTED(page, fault_page_size, fault_phys_offset, TRUE);
7354 if (cs_debug) {
7355 printf("CODESIGNING: %s: "
7356 "page %p obj %p off 0x%llx "
7357 "was modified\n",
7358 __FUNCTION__,
7359 page, object, page->vmp_offset);
7360 }
7361 vm_cs_validated_dirtied++;
7362 }
7363
7364 if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) ||
7365 VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
7366 return TRUE;
7367 }
7368 vm_object_lock_assert_exclusive(object);
7369
7370 #if CHECK_CS_VALIDATION_BITMAP
7371 kern_return_t kr;
7372
7373 kr = vnode_pager_cs_check_validation_bitmap(
7374 object->pager,
7375 page->vmp_offset + object->paging_offset,
7376 CS_BITMAP_CHECK);
7377 if (kr == KERN_SUCCESS) {
7378 page->vmp_cs_validated = VMP_CS_ALL_TRUE;
7379 page->vmp_cs_tainted = VMP_CS_ALL_FALSE;
7380 vm_cs_bitmap_validated++;
7381 return TRUE;
7382 }
7383 #endif /* CHECK_CS_VALIDATION_BITMAP */
7384
7385 if (!object->alive || object->terminating || object->pager == NULL) {
7386 /*
7387 * The object is terminating and we don't have its pager
7388 * so we can't validate the data...
7389 */
7390 return TRUE;
7391 }
7392
7393 /* we need to really validate this page */
7394 vm_object_lock_assert_exclusive(object);
7395 return FALSE;
7396 }
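/*
 * Illustrative, standalone sketch of the decision made by
 * vm_page_validate_cs_fast() above, reduced to its inputs: a page that
 * was ever mapped writable is recorded as tainted, a page already
 * validated or tainted needs no further work, and a page whose object
 * has lost its pager cannot be validated at all; only the remaining
 * case requires the slow path. The example_ names are local to this
 * sketch, which is never compiled into the kernel.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

static bool
example_needs_full_validation(bool was_writable, bool validated,
    bool tainted, bool object_usable)
{
	if (was_writable && !tainted) {
		/* ever-writable page: treat as modified, i.e. validated + tainted */
		validated = true;
		tainted = true;
	}
	if (validated || tainted) {
		return false;           /* already resolved one way or the other */
	}
	if (!object_usable) {
		return false;           /* no pager: nothing to validate against */
	}
	return true;                    /* caller must run the slow path */
}

int
main(void)
{
	/* clean, never-written page backed by a live pager: needs the slow path */
	printf("%d\n", example_needs_full_validation(false, false, false, true));
	return 0;
}
#endif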
7397
7398 void
7399 vm_page_validate_cs_mapped_slow(
7400 vm_page_t page,
7401 const void *kaddr)
7402 {
7403 vm_object_t object;
7404 memory_object_offset_t mo_offset;
7405 memory_object_t pager;
7406 struct vnode *vnode;
7407 int validated, tainted, nx;
7408
7409 assert(page->vmp_busy);
7410 object = VM_PAGE_OBJECT(page);
7411 vm_object_lock_assert_exclusive(object);
7412
7413 vm_cs_validates++;
7414
7415 /*
7416 * Since we get here to validate a page that was brought in by
7417 * the pager, we know that this pager is all set up and ready
7418 * by now.
7419 */
7420 assert(object->code_signed);
7421 assert(!object->internal);
7422 assert(object->pager != NULL);
7423 assert(object->pager_ready);
7424
7425 pager = object->pager;
7426 assert(object->paging_in_progress);
7427 vnode = vnode_pager_lookup_vnode(pager);
7428 mo_offset = page->vmp_offset + object->paging_offset;
7429
7430 /* verify the SHA1 hash for this page */
7431 validated = 0;
7432 tainted = 0;
7433 nx = 0;
7434 cs_validate_page(vnode,
7435 pager,
7436 mo_offset,
7437 (const void *)((const char *)kaddr),
7438 &validated,
7439 &tainted,
7440 &nx);
7441
7442 page->vmp_cs_validated |= validated;
7443 page->vmp_cs_tainted |= tainted;
7444 page->vmp_cs_nx |= nx;
7445
7446 #if CHECK_CS_VALIDATION_BITMAP
7447 if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
7448 page->vmp_cs_tainted == VMP_CS_ALL_FALSE) {
7449 vnode_pager_cs_check_validation_bitmap(object->pager,
7450 mo_offset,
7451 CS_BITMAP_SET);
7452 }
7453 #endif /* CHECK_CS_VALIDATION_BITMAP */
7454 }
7455
7456 void
7457 vm_page_validate_cs_mapped(
7458 vm_page_t page,
7459 vm_map_size_t fault_page_size,
7460 vm_map_offset_t fault_phys_offset,
7461 const void *kaddr)
7462 {
7463 if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7464 vm_page_validate_cs_mapped_slow(page, kaddr);
7465 }
7466 }
7467
7468 static void
7469 vm_page_map_and_validate_cs(
7470 vm_object_t object,
7471 vm_page_t page)
7472 {
7473 vm_object_offset_t offset;
7474 vm_map_offset_t koffset;
7475 vm_map_size_t ksize;
7476 vm_offset_t kaddr;
7477 kern_return_t kr;
7478 boolean_t busy_page;
7479 boolean_t need_unmap;
7480
7481 vm_object_lock_assert_exclusive(object);
7482
7483 assert(object->code_signed);
7484 offset = page->vmp_offset;
7485
7486 busy_page = page->vmp_busy;
7487 if (!busy_page) {
7488 /* keep page busy while we map (and unlock) the VM object */
7489 page->vmp_busy = TRUE;
7490 }
7491
7492 /*
7493 * Take a paging reference on the VM object
7494 * to protect it from collapse or bypass,
7495 * and keep it from disappearing too.
7496 */
7497 vm_object_paging_begin(object);
7498
7499 /* map the page in the kernel address space */
7500 ksize = PAGE_SIZE_64;
7501 koffset = 0;
7502 need_unmap = FALSE;
7503 kr = vm_paging_map_object(page,
7504 object,
7505 offset,
7506 VM_PROT_READ,
7507 FALSE, /* can't unlock object ! */
7508 &ksize,
7509 &koffset,
7510 &need_unmap);
7511 if (kr != KERN_SUCCESS) {
7512 panic("%s: could not map page: 0x%x", __FUNCTION__, kr);
7513 }
7514 kaddr = CAST_DOWN(vm_offset_t, koffset);
7515
7516 /* validate the mapped page */
7517 vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
7518
7519 assert(page->vmp_busy);
7520 assert(object == VM_PAGE_OBJECT(page));
7521 vm_object_lock_assert_exclusive(object);
7522
7523 if (!busy_page) {
7524 PAGE_WAKEUP_DONE(page);
7525 }
7526 if (need_unmap) {
7527 /* unmap the map from the kernel address space */
7528 vm_paging_unmap_object(object, koffset, koffset + ksize);
7529 koffset = 0;
7530 ksize = 0;
7531 kaddr = 0;
7532 }
7533 vm_object_paging_end(object);
7534 }
7535
7536 void
7537 vm_page_validate_cs(
7538 vm_page_t page,
7539 vm_map_size_t fault_page_size,
7540 vm_map_offset_t fault_phys_offset)
7541 {
7542 vm_object_t object;
7543
7544 object = VM_PAGE_OBJECT(page);
7545 vm_object_lock_assert_held(object);
7546
7547 if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7548 return;
7549 }
7550 vm_page_map_and_validate_cs(object, page);
7551 }
7552
7553 void
7554 vm_page_validate_cs_mapped_chunk(
7555 vm_page_t page,
7556 const void *kaddr,
7557 vm_offset_t chunk_offset,
7558 vm_size_t chunk_size,
7559 boolean_t *validated_p,
7560 unsigned *tainted_p)
7561 {
7562 vm_object_t object;
7563 vm_object_offset_t offset, offset_in_page;
7564 memory_object_t pager;
7565 struct vnode *vnode;
7566 boolean_t validated;
7567 unsigned tainted;
7568
7569 *validated_p = FALSE;
7570 *tainted_p = 0;
7571
7572 assert(page->vmp_busy);
7573 object = VM_PAGE_OBJECT(page);
7574 vm_object_lock_assert_exclusive(object);
7575
7576 assert(object->code_signed);
7577 offset = page->vmp_offset;
7578
7579 if (!object->alive || object->terminating || object->pager == NULL) {
7580 /*
7581 * The object is terminating and we don't have its pager
7582 * so we can't validate the data...
7583 */
7584 return;
7585 }
7586 /*
7587 * Since we get here to validate a page that was brought in by
7588 * the pager, we know that this pager is all set up and ready
7589 * by now.
7590 */
7591 assert(!object->internal);
7592 assert(object->pager != NULL);
7593 assert(object->pager_ready);
7594
7595 pager = object->pager;
7596 assert(object->paging_in_progress);
7597 vnode = vnode_pager_lookup_vnode(pager);
7598
7599 /* verify the signature for this chunk */
7600 offset_in_page = chunk_offset;
7601 assert(offset_in_page < PAGE_SIZE);
7602
7603 tainted = 0;
7604 validated = cs_validate_range(vnode,
7605 pager,
7606 (object->paging_offset +
7607 offset +
7608 offset_in_page),
7609 (const void *)((const char *)kaddr
7610 + offset_in_page),
7611 chunk_size,
7612 &tainted);
7613 if (validated) {
7614 *validated_p = TRUE;
7615 }
7616 if (tainted) {
7617 *tainted_p = tainted;
7618 }
7619 }
7620
7621 static void
7622 vm_rtfrecord_lock(void)
7623 {
7624 lck_spin_lock(&vm_rtfr_slock);
7625 }
7626
7627 static void
7628 vm_rtfrecord_unlock(void)
7629 {
7630 lck_spin_unlock(&vm_rtfr_slock);
7631 }
7632
7633 unsigned int
7634 vmrtfaultinfo_bufsz(void)
7635 {
7636 return vmrtf_num_records * sizeof(vm_rtfault_record_t);
7637 }
7638
7639 #include <kern/backtrace.h>
7640
7641 __attribute__((noinline))
7642 static void
7643 vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
7644 {
7645 uint64_t fend = mach_continuous_time();
7646
7647 uint64_t cfpc = 0;
7648 uint64_t ctid = cthread->thread_id;
7649 uint64_t cupid = get_current_unique_pid();
7650
7651 uintptr_t bpc = 0;
7652 errno_t btr = 0;
7653
7654 /*
7655 * Capture a single-frame backtrace. This extracts just the program
7656 * counter at the point of the fault, and should not use copyin to get
7657 * Rosetta save state.
7658 */
7659 struct backtrace_control ctl = {
7660 .btc_user_thread = cthread,
7661 .btc_user_copy = backtrace_user_copy_error,
7662 };
7663 unsigned int bfrs = backtrace_user(&bpc, 1U, &ctl, NULL);
7664 if ((btr == 0) && (bfrs > 0)) {
7665 cfpc = bpc;
7666 }
7667
7668 assert((fstart != 0) && fend >= fstart);
7669 vm_rtfrecord_lock();
7670 assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
7671
7672 vmrtfrs.vmrtf_total++;
7673 vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
7674
7675 cvmr->rtfabstime = fstart;
7676 cvmr->rtfduration = fend - fstart;
7677 cvmr->rtfaddr = fault_vaddr;
7678 cvmr->rtfpc = cfpc;
7679 cvmr->rtftype = type_of_fault;
7680 cvmr->rtfupid = cupid;
7681 cvmr->rtftid = ctid;
7682
7683 if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
7684 vmrtfrs.vmrtfr_curi = 0;
7685 }
7686
7687 vm_rtfrecord_unlock();
7688 }
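/*
 * Illustrative, standalone sketch of the fixed-size ring of fault
 * records appended to by vm_record_rtfault() above: the index advances
 * on every record and wraps past the last slot, so the buffer always
 * holds the most recent faults. The record layout, the four-slot size
 * and the main() harness are assumptions for the example only; this
 * block is never compiled into the kernel.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_NUM_RECORDS 4

struct ex_record {
	uint64_t start;
	uint64_t duration;
	uint64_t addr;
};

static struct ex_record ex_records[EX_NUM_RECORDS];
static int ex_cur;

static void
ex_record_fault(uint64_t start, uint64_t end, uint64_t addr)
{
	struct ex_record *r = &ex_records[ex_cur++];

	r->start = start;
	r->duration = end - start;
	r->addr = addr;

	if (ex_cur > EX_NUM_RECORDS - 1) {
		ex_cur = 0;             /* wrap: the oldest entry is overwritten next */
	}
}

int
main(void)
{
	for (uint64_t i = 0; i < 6; i++) {      /* six records into four slots */
		ex_record_fault(i, i + 10, 0x1000 + i);
	}
	printf("newest record is in slot %d\n",
	    (ex_cur + EX_NUM_RECORDS - 1) % EX_NUM_RECORDS);
	return 0;
}
#endif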
7689
7690 int
7691 vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void *vrecords, unsigned long *vmrtfrv)
7692 {
7693 vm_rtfault_record_t *cvmrd = vrecords;
7694 size_t residue = vrecordsz;
7695 size_t numextracted = 0;
7696 boolean_t early_exit = FALSE;
7697
7698 vm_rtfrecord_lock();
7699
7700 for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
7701 if (residue < sizeof(vm_rtfault_record_t)) {
7702 early_exit = TRUE;
7703 break;
7704 }
7705
7706 if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
7707 #if DEVELOPMENT || DEBUG
7708 if (isroot == FALSE) {
7709 continue;
7710 }
7711 #else
7712 continue;
7713 #endif /* DEVELOPMENT || DEBUG */
7714 }
7715
7716 *cvmrd = vmrtfrs.vm_rtf_records[vmfi];
7717 cvmrd++;
7718 residue -= sizeof(vm_rtfault_record_t);
7719 numextracted++;
7720 }
7721
7722 vm_rtfrecord_unlock();
7723
7724 *vmrtfrv = numextracted;
7725 return early_exit;
7726 }
7727
7728 /*
7729 * Only allow one diagnosis to be in flight at a time, to avoid
7730 * creating too much additional memory usage.
7731 */
7732 static volatile uint_t vmtc_diagnosing;
7733 unsigned int vmtc_total = 0;
7734
7735 /*
7736 * Type used to update telemetry for the diagnosis counts.
7737 */
7738 CA_EVENT(vmtc_telemetry,
7739 CA_INT, vmtc_num_byte, /* number of corrupt bytes found */
7740 CA_BOOL, vmtc_undiagnosed, /* undiagnosed because more than 1 at a time */
7741 CA_BOOL, vmtc_not_eligible, /* the page didn't qualify */
7742 CA_BOOL, vmtc_copyin_fail, /* unable to copy in the page */
7743 CA_BOOL, vmtc_not_found, /* no corruption found even though CS failed */
7744 CA_BOOL, vmtc_one_bit_flip, /* single bit flip */
7745 CA_BOOL, vmtc_testing); /* caused on purpose by testing */
7746
7747 #if DEVELOPMENT || DEBUG
7748 /*
7749 * Buffers used to compare before/after page contents.
7750 * Stashed to aid when debugging crashes.
7751 */
7752 static size_t vmtc_last_buffer_size = 0;
7753 static uint64_t *vmtc_last_before_buffer = NULL;
7754 static uint64_t *vmtc_last_after_buffer = NULL;
7755
7756 /*
7757 * Needed to record corruptions due to testing.
7758 */
7759 static uintptr_t corruption_test_va = 0;
7760 #endif /* DEVELOPMENT || DEBUG */
7761
7762 /*
7763 * Stash a copy of data from a possibly corrupt page.
7764 */
7765 static uint64_t *
7766 vmtc_get_page_data(
7767 vm_map_offset_t code_addr,
7768 vm_page_t page)
7769 {
7770 uint64_t *buffer = NULL;
7771 addr64_t buffer_paddr;
7772 addr64_t page_paddr;
7773 extern void bcopy_phys(addr64_t from, addr64_t to, vm_size_t bytes);
7774 uint_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
7775
7776 /*
7777 * Need an aligned buffer to do a physical copy.
7778 */
7779 if (kernel_memory_allocate(kernel_map, (vm_offset_t *)&buffer,
7780 size, size - 1, KMA_KOBJECT, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) {
7781 return NULL;
7782 }
7783 buffer_paddr = kvtophys((vm_offset_t)buffer);
7784 page_paddr = ptoa(VM_PAGE_GET_PHYS_PAGE(page));
7785
7786 /* adjust the page start address if we need only 4K of a 16K page */
7787 if (size < PAGE_SIZE) {
7788 uint_t subpage_start = ((code_addr & (PAGE_SIZE - 1)) & ~(size - 1));
7789 page_paddr += subpage_start;
7790 }
7791
7792 bcopy_phys(page_paddr, buffer_paddr, size);
7793 return buffer;
7794 }
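/*
 * Illustrative, standalone sketch of the sub-page selection done by
 * vmtc_get_page_data() above when the faulting task uses 4K pages but
 * the kernel page is 16K: the physical copy starts at the 4K boundary
 * that contains the faulting address. The page sizes and main() harness
 * are hard-coded assumptions for the example only; this block is never
 * compiled into the kernel.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_KERNEL_PAGE_SIZE 16384ULL
#define EX_TASK_PAGE_SIZE    4096ULL

int
main(void)
{
	uint64_t code_addr = 0x100007008ULL;    /* hypothetical faulting address */
	uint64_t subpage_start =
	    (code_addr & (EX_KERNEL_PAGE_SIZE - 1)) & ~(EX_TASK_PAGE_SIZE - 1);

	/* 0x100007008 is 0x3008 into its 16K page, so the copy starts 0x3000 in */
	printf("subpage_start = 0x%llx\n", (unsigned long long)subpage_start);
	return 0;
}
#endif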
7795
7796 /*
7797 * Set things up so we can diagnose a potential text page corruption.
7798 */
7799 static uint64_t *
7800 vmtc_text_page_diagnose_setup(
7801 vm_map_offset_t code_addr,
7802 vm_page_t page,
7803 CA_EVENT_TYPE(vmtc_telemetry) *event)
7804 {
7805 uint64_t *buffer = NULL;
7806
7807 /*
7808 * If another is being diagnosed, skip this one.
7809 */
7810 if (!OSCompareAndSwap(0, 1, &vmtc_diagnosing)) {
7811 event->vmtc_undiagnosed = true;
7812 return NULL;
7813 }
7814
7815 /*
7816 * Get the contents of the corrupt page.
7817 */
7818 buffer = vmtc_get_page_data(code_addr, page);
7819 if (buffer == NULL) {
7820 event->vmtc_copyin_fail = true;
7821 if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
7822 panic("Bad compare and swap in setup!");
7823 }
7824 return NULL;
7825 }
7826 return buffer;
7827 }
7828
7829 /*
7830 * Diagnose the text page by comparing its contents with
7831 * the one we've previously saved.
7832 */
7833 static void
7834 vmtc_text_page_diagnose(
7835 vm_map_offset_t code_addr,
7836 uint64_t *old_code_buffer,
7837 CA_EVENT_TYPE(vmtc_telemetry) *event)
7838 {
7839 uint64_t *new_code_buffer;
7840 size_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
7841 uint_t count = (uint_t)size / sizeof(uint64_t);
7842 uint_t diff_count = 0;
7843 bool bit_flip = false;
7844 uint_t b;
7845 uint64_t *new;
7846 uint64_t *old;
7847
7848 new_code_buffer = kalloc_data(size, Z_WAITOK);
7849 assert(new_code_buffer != NULL);
7850 if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), new_code_buffer, size) != 0) {
7851 /* copyin error, so undo things */
7852 event->vmtc_copyin_fail = true;
7853 goto done;
7854 }
7855
7856 new = new_code_buffer;
7857 old = old_code_buffer;
7858 for (; count-- > 0; ++new, ++old) {
7859 if (*new == *old) {
7860 continue;
7861 }
7862
7863 /*
7864 * On first diff, check for a single bit flip
7865 */
7866 if (diff_count == 0) {
7867 uint64_t x = (*new ^ *old);
7868 assert(x != 0);
7869 if ((x & (x - 1)) == 0) {
7870 bit_flip = true;
7871 ++diff_count;
7872 continue;
7873 }
7874 }
7875
7876 /*
7877 * count up the number of different bytes.
7878 */
7879 for (b = 0; b < sizeof(uint64_t); ++b) {
7880 char *n = (char *)new;
7881 char *o = (char *)old;
7882 if (n[b] != o[b]) {
7883 ++diff_count;
7884 }
7885 }
7886 }
7887
7888 if (diff_count > 1) {
7889 bit_flip = false;
7890 }
7891
7892 if (diff_count == 0) {
7893 event->vmtc_not_found = true;
7894 } else {
7895 event->vmtc_num_byte = diff_count;
7896 }
7897 if (bit_flip) {
7898 event->vmtc_one_bit_flip = true;
7899 }
7900
7901 done:
7902 /*
7903 * Free up the code copy buffers, but save the last
7904 * set on development / debug kernels in case they
7905 * can provide evidence for debugging memory stomps.
7906 */
7907 #if DEVELOPMENT || DEBUG
7908 if (vmtc_last_before_buffer != NULL) {
7909 kmem_free(kernel_map, (vm_offset_t)vmtc_last_before_buffer, vmtc_last_buffer_size);
7910 }
7911 if (vmtc_last_after_buffer != NULL) {
7912 kfree_data(vmtc_last_after_buffer, vmtc_last_buffer_size);
7913 }
7914 vmtc_last_before_buffer = old_code_buffer;
7915 vmtc_last_after_buffer = new_code_buffer;
7916 vmtc_last_buffer_size = size;
7917 #else /* DEVELOPMENT || DEBUG */
7918 kfree_data(new_code_buffer, size);
7919 kmem_free(kernel_map, (vm_offset_t)old_code_buffer, size);
7920 #endif /* DEVELOPMENT || DEBUG */
7921
7922 /*
7923 * We're finished, so clear the diagnosing flag.
7924 */
7925 if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
7926 panic("Bad compare and swap in diagnose!");
7927 }
7928 }
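/*
 * Illustrative, standalone sketch of the single-bit-flip test used by
 * vmtc_text_page_diagnose() above: when a 64-bit word differs and the
 * XOR of old and new is a nonzero power of two, exactly one bit
 * changed; the power-of-two test is the usual (x & (x - 1)) == 0 idiom.
 * The example_ names and main() harness are assumptions for the sketch
 * only; this block is never compiled into the kernel.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
example_is_single_bit_flip(uint64_t old_word, uint64_t new_word)
{
	uint64_t x = old_word ^ new_word;        /* bits that differ */

	return x != 0 && (x & (x - 1)) == 0;     /* nonzero power of two: one bit */
}

int
main(void)
{
	printf("%d\n", example_is_single_bit_flip(0x10, 0x30));  /* 1: one bit flipped */
	printf("%d\n", example_is_single_bit_flip(0x10, 0x33));  /* 0: several bits differ */
	return 0;
}
#endif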
7929
7930 /*
7931 * For the given map, virt address, find the object, offset, and page.
7932 * This has to lookup the map entry, verify protections, walk any shadow chains.
7933 * If found, returns with the object locked.
7934 */
7935 static kern_return_t
7936 vmtc_revalidate_lookup(
7937 vm_map_t map,
7938 vm_map_offset_t vaddr,
7939 vm_object_t *ret_object,
7940 vm_object_offset_t *ret_offset,
7941 vm_page_t *ret_page,
7942 vm_prot_t *ret_prot)
7943 {
7944 vm_object_t object;
7945 vm_object_offset_t offset;
7946 vm_page_t page;
7947 kern_return_t kr = KERN_SUCCESS;
7948 uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
7949 vm_map_version_t version;
7950 boolean_t wired;
7951 struct vm_object_fault_info fault_info = {};
7952 vm_map_t real_map = NULL;
7953 vm_prot_t prot;
7954 vm_object_t shadow;
7955
7956 /*
7957 * Find the object/offset for the given location/map.
7958 * Note this returns with the object locked.
7959 */
7960 restart:
7961 vm_map_lock_read(map);
7962 object = VM_OBJECT_NULL; /* in case we come around the restart path */
7963 kr = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
7964 object_lock_type, &version, &object, &offset, &prot, &wired,
7965 &fault_info, &real_map, NULL);
7966 vm_map_unlock_read(map);
7967 if (real_map != NULL && real_map != map) {
7968 vm_map_unlock(real_map);
7969 }
7970
7971 /*
7972 * If there's no page here, fail.
7973 */
7974 if (kr != KERN_SUCCESS || object == NULL) {
7975 kr = KERN_FAILURE;
7976 goto done;
7977 }
7978
7979 /*
7980 * Chase down any shadow chains to find the actual page.
7981 */
7982 for (;;) {
7983 /*
7984 * See if the page is on the current object.
7985 */
7986 page = vm_page_lookup(object, vm_object_trunc_page(offset));
7987 if (page != NULL) {
7988 /* restart the lookup */
7989 if (page->vmp_restart) {
7990 vm_object_unlock(object);
7991 goto restart;
7992 }
7993
7994 /*
7995 * If this page is busy, we need to wait for it.
7996 */
7997 if (page->vmp_busy) {
7998 PAGE_SLEEP(object, page, TRUE);
7999 vm_object_unlock(object);
8000 goto restart;
8001 }
8002 break;
8003 }
8004
8005 /*
8006 * If the object doesn't have the page and
8007 * has no shadow, then we can quit.
8008 */
8009 shadow = object->shadow;
8010 if (shadow == NULL) {
8011 kr = KERN_FAILURE;
8012 goto done;
8013 }
8014
8015 /*
8016 * Move to the next object
8017 */
8018 offset += object->vo_shadow_offset;
8019 vm_object_lock(shadow);
8020 vm_object_unlock(object);
8021 object = shadow;
8022 shadow = VM_OBJECT_NULL;
8023 }
8024 *ret_object = object;
8025 *ret_offset = vm_object_trunc_page(offset);
8026 *ret_page = page;
8027 *ret_prot = prot;
8028
8029 done:
8030 if (kr != KERN_SUCCESS && object != NULL) {
8031 vm_object_unlock(object);
8032 }
8033 return kr;
8034 }
8035
8036 /*
8037 * Check if a page is wired, needs extra locking.
8038 */
8039 static bool
8040 is_page_wired(vm_page_t page)
8041 {
8042 bool result;
8043 vm_page_lock_queues();
8044 result = VM_PAGE_WIRED(page);
8045 vm_page_unlock_queues();
8046 return result;
8047 }
8048
8049 /*
8050 * A fatal process error has occurred in the given task.
8051 * Recheck the code signing of the text page at the given
8052 * address to check for a text page corruption.
8053 *
8054 * Returns KERN_FAILURE if a page was found to be corrupt
8055 * by failing to match its code signature. KERN_SUCCESS
8056 * means the page is either valid or we don't have the
8057 * information to say it's corrupt.
8058 */
8059 kern_return_t
8060 revalidate_text_page(task_t task, vm_map_offset_t code_addr)
8061 {
8062 kern_return_t kr;
8063 vm_map_t map;
8064 vm_object_t object = NULL;
8065 vm_object_offset_t offset;
8066 vm_page_t page = NULL;
8067 struct vnode *vnode;
8068 uint64_t *diagnose_buffer = NULL;
8069 CA_EVENT_TYPE(vmtc_telemetry) * event = NULL;
8070 ca_event_t ca_event = NULL;
8071 vm_prot_t prot;
8072
8073 map = task->map;
8074 if (task->map == NULL) {
8075 return KERN_SUCCESS;
8076 }
8077
8078 kr = vmtc_revalidate_lookup(map, code_addr, &object, &offset, &page, &prot);
8079 if (kr != KERN_SUCCESS) {
8080 goto done;
8081 }
8082
8083 /*
8084 * The page must be executable.
8085 */
8086 if (!(prot & VM_PROT_EXECUTE)) {
8087 goto done;
8088 }
8089
8090 /*
8091 * The object needs to have a pager.
8092 */
8093 if (object->pager == NULL) {
8094 goto done;
8095 }
8096
8097 /*
8098 * Needs to be a vnode backed page to have a signature.
8099 */
8100 vnode = vnode_pager_lookup_vnode(object->pager);
8101 if (vnode == NULL) {
8102 goto done;
8103 }
8104
8105 /*
8106 * Object checks to see if we should proceed.
8107 */
8108 if (!object->code_signed || /* no code signature to check */
8109 object->internal || /* internal objects aren't signed */
8110 object->terminating || /* the object and its pages are already going away */
8111 !object->pager_ready) { /* this shouldn't happen, but checking doesn't hurt */
8112 goto done;
8113 }
8114
8115
8116 /*
8117 * Check the code signature of the page in question.
8118 */
8119 vm_page_map_and_validate_cs(object, page);
8120
8121 /*
8122 * At this point:
8123 * vmp_cs_validated |= validated (set if a code signature exists)
8124 * vmp_cs_tainted |= tainted (set if code signature violation)
8125 * vmp_cs_nx |= nx; ??
8126 *
8127 * if vmp_pmapped then have to pmap_disconnect..
8128 * other flags to check on object or page?
8129 */
8130 if (page->vmp_cs_tainted != VMP_CS_ALL_FALSE) {
8131 #if DEBUG || DEVELOPMENT
8132 /*
8133 * On development builds, a boot-arg can be used to cause
8134 * a panic, instead of a quiet repair.
8135 */
8136 if (vmtc_panic_instead) {
8137 panic("Text page corruption detected: vm_page_t 0x%llx", (long long)(uintptr_t)page);
8138 }
8139 #endif /* DEBUG || DEVELOPMENT */
8140
8141 /*
8142 * We're going to invalidate this page. Grab a copy of it for comparison.
8143 */
8144 ca_event = CA_EVENT_ALLOCATE(vmtc_telemetry);
8145 event = ca_event->data;
8146 diagnose_buffer = vmtc_text_page_diagnose_setup(code_addr, page, event);
8147
8148 /*
8149 * Invalidate, i.e. toss, the corrupted page.
8150 */
8151 if (!page->vmp_cleaning &&
8152 !page->vmp_laundry &&
8153 !page->vmp_fictitious &&
8154 !page->vmp_precious &&
8155 !page->vmp_absent &&
8156 !VMP_ERROR_GET(page) &&
8157 !page->vmp_dirty &&
8158 !is_page_wired(page)) {
8159 if (page->vmp_pmapped) {
8160 int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
8161 if (refmod & VM_MEM_MODIFIED) {
8162 SET_PAGE_DIRTY(page, FALSE);
8163 }
8164 if (refmod & VM_MEM_REFERENCED) {
8165 page->vmp_reference = TRUE;
8166 }
8167 }
8168 /* If the page seems intentionally modified, don't trash it. */
8169 if (!page->vmp_dirty) {
8170 VM_PAGE_FREE(page);
8171 } else {
8172 event->vmtc_not_eligible = true;
8173 }
8174 } else {
8175 event->vmtc_not_eligible = true;
8176 }
8177 vm_object_unlock(object);
8178 object = VM_OBJECT_NULL;
8179
8180 /*
8181 * Now try to diagnose the type of failure by faulting
8182 * in a new copy and diff'ing it with what we saved.
8183 */
8184 if (diagnose_buffer != NULL) {
8185 vmtc_text_page_diagnose(code_addr, diagnose_buffer, event);
8186 }
8187 #if DEBUG || DEVELOPMENT
8188 if (corruption_test_va != 0) {
8189 corruption_test_va = 0;
8190 event->vmtc_testing = true;
8191 }
8192 #endif /* DEBUG || DEVELOPMENT */
8193 ktriage_record(thread_tid(current_thread()),
8194 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_TEXT_CORRUPTION),
8195 0 /* arg */);
8196 CA_EVENT_SEND(ca_event);
8197 printf("Text page corruption detected for pid %d\n", proc_selfpid());
8198 ++vmtc_total;
8199 return KERN_FAILURE; /* failure means we definitely found a corrupt page */
8200 }
8201 done:
8202 if (object != NULL) {
8203 vm_object_unlock(object);
8204 }
8205 return KERN_SUCCESS;
8206 }
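/*
 * Illustrative, standalone sketch of the eligibility test applied by
 * revalidate_text_page() above before tossing a page that failed its
 * code-signature check: only a clean, unwired, otherwise ordinary page
 * is discarded, anything else is merely reported. The condensed flag
 * list and example_ names are assumptions for the sketch only; this
 * block is never compiled into the kernel.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

static bool
example_page_can_be_tossed(bool busy_with_io, bool dirty, bool wired)
{
	/*
	 * "busy_with_io" stands in for the cleaning/laundry/fictitious/
	 * precious/absent/error checks performed by the real code.
	 */
	return !busy_with_io && !dirty && !wired;
}

int
main(void)
{
	printf("%d\n", example_page_can_be_tossed(false, false, false)); /* 1: toss it */
	printf("%d\n", example_page_can_be_tossed(false, true, false));  /* 0: keep it */
	return 0;
}
#endif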
8207
8208 #if DEBUG || DEVELOPMENT
8209 /*
8210 * For implementing unit tests - ask the pmap to corrupt a text page.
8211 * We have to find the page, to get the physical address, then invoke
8212 * the pmap.
8213 */
8214 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
8215
8216 kern_return_t
8217 vm_corrupt_text_addr(uintptr_t va)
8218 {
8219 task_t task = current_task();
8220 vm_map_t map;
8221 kern_return_t kr = KERN_SUCCESS;
8222 vm_object_t object = VM_OBJECT_NULL;
8223 vm_object_offset_t offset;
8224 vm_page_t page = NULL;
8225 pmap_paddr_t pa;
8226 vm_prot_t prot;
8227
8228 map = task->map;
8229 if (task->map == NULL) {
8230 printf("corrupt_text_addr: no map\n");
8231 return KERN_FAILURE;
8232 }
8233
8234 kr = vmtc_revalidate_lookup(map, (vm_map_offset_t)va, &object, &offset, &page, &prot);
8235 if (kr != KERN_SUCCESS) {
8236 printf("corrupt_text_addr: page lookup failed\n");
8237 return kr;
8238 }
8239 if (!(prot & VM_PROT_EXECUTE)) {
8240 printf("corrupt_text_addr: page not executable\n");
8241 return KERN_FAILURE;
8242 }
8243
8244 /* get the physical address to use */
8245 pa = ptoa(VM_PAGE_GET_PHYS_PAGE(page)) + (va - vm_object_trunc_page(va));
8246
8247 /*
8248 * Check we have something we can work with.
8249 * Due to racing with pageout as we enter the sysctl,
8250 * it's theoretically possible to have the page disappear, just
8251 * before the lookup.
8252 *
8253 * That's unlikely to happen often. I've filed a radar 72857482
8254 * to bubble up the error here to the sysctl result and have the
8255 * test not FAIL in that case.
8256 */
8257 if (page->vmp_busy) {
8258 printf("corrupt_text_addr: vmp_busy\n");
8259 kr = KERN_FAILURE;
8260 }
8261 if (page->vmp_cleaning) {
8262 printf("corrupt_text_addr: vmp_cleaning\n");
8263 kr = KERN_FAILURE;
8264 }
8265 if (page->vmp_laundry) {
8266 printf("corrupt_text_addr: vmp_cleaning\n");
8267 kr = KERN_FAILURE;
8268 }
8269 if (page->vmp_fictitious) {
8270 printf("corrupt_text_addr: vmp_fictitious\n");
8271 kr = KERN_FAILURE;
8272 }
8273 if (page->vmp_precious) {
8274 printf("corrupt_text_addr: vmp_precious\n");
8275 kr = KERN_FAILURE;
8276 }
8277 if (page->vmp_absent) {
8278 printf("corrupt_text_addr: vmp_absent\n");
8279 kr = KERN_FAILURE;
8280 }
8281 if (VMP_ERROR_GET(page)) {
8282 printf("corrupt_text_addr: vmp_error\n");
8283 kr = KERN_FAILURE;
8284 }
8285 if (page->vmp_dirty) {
8286 printf("corrupt_text_addr: vmp_dirty\n");
8287 kr = KERN_FAILURE;
8288 }
8289 if (is_page_wired(page)) {
8290 printf("corrupt_text_addr: wired\n");
8291 kr = KERN_FAILURE;
8292 }
8293 if (!page->vmp_pmapped) {
8294 printf("corrupt_text_addr: !vmp_pmapped\n");
8295 kr = KERN_FAILURE;
8296 }
8297
8298 if (kr == KERN_SUCCESS) {
8299 printf("corrupt_text_addr: using physaddr 0x%llx\n", (long long)pa);
8300 kr = pmap_test_text_corruption(pa);
8301 if (kr != KERN_SUCCESS) {
8302 printf("corrupt_text_addr: pmap error %d\n", kr);
8303 } else {
8304 corruption_test_va = va;
8305 }
8306 } else {
8307 printf("corrupt_text_addr: object %p\n", object);
8308 printf("corrupt_text_addr: offset 0x%llx\n", (uint64_t)offset);
8309 printf("corrupt_text_addr: va 0x%llx\n", (uint64_t)va);
8310 printf("corrupt_text_addr: vm_object_trunc_page(va) 0x%llx\n", (uint64_t)vm_object_trunc_page(va));
8311 printf("corrupt_text_addr: vm_page_t %p\n", page);
8312 printf("corrupt_text_addr: ptoa(PHYS_PAGE) 0x%llx\n", (uint64_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page)));
8313 printf("corrupt_text_addr: using physaddr 0x%llx\n", (uint64_t)pa);
8314 }
8315
8316 if (object != VM_OBJECT_NULL) {
8317 vm_object_unlock(object);
8318 }
8319 return kr;
8320 }
8321
8322 #endif /* DEBUG || DEVELOPMENT */
8323