1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <libkern/OSAtomic.h>
66
67 #include <mach/mach_types.h>
68 #include <mach/kern_return.h>
69 #include <mach/message.h> /* for error codes */
70 #include <mach/vm_param.h>
71 #include <mach/vm_behavior.h>
72 #include <mach/memory_object.h>
73 /* For memory_object_data_{request,unlock} */
74 #include <mach/sdt.h>
75
76 #include <kern/kern_types.h>
77 #include <kern/host_statistics.h>
78 #include <kern/counter.h>
79 #include <kern/task.h>
80 #include <kern/thread.h>
81 #include <kern/sched_prim.h>
82 #include <kern/host.h>
83 #include <kern/mach_param.h>
84 #include <kern/macro_help.h>
85 #include <kern/zalloc_internal.h>
86 #include <kern/misc_protos.h>
87 #include <kern/policy_internal.h>
88
89 #include <vm/vm_compressor.h>
90 #include <vm/vm_compressor_pager.h>
91 #include <vm/vm_fault.h>
92 #include <vm/vm_map.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_page.h>
95 #include <vm/vm_kern.h>
96 #include <vm/pmap.h>
97 #include <vm/vm_pageout.h>
98 #include <vm/vm_protos.h>
99 #include <vm/vm_external.h>
100 #include <vm/memory_object.h>
101 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
102 #include <vm/vm_shared_region.h>
103
104 #include <sys/codesign.h>
105 #include <sys/reason.h>
106 #include <sys/signalvar.h>
107
108 #include <sys/kdebug_triage.h>
109
110 #include <san/kasan.h>
111 #include <libkern/coreanalytics/coreanalytics.h>
112
#define VM_FAULT_CLASSIFY 0

#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */

/*
 * Non-zero by default; overridable via the
 * "vm_protect_privileged_from_untrusted" boot-arg (parsed in vm_fault_init()).
 * NOTE(review): presumably gates extra fault-time checks when privileged
 * tasks map untrusted code — consumers are not visible in this chunk.
 */
int vm_protect_privileged_from_untrusted = 1;

/* tunable pagein throttle; consumers are not visible in this chunk */
unsigned int vm_object_pagein_throttle = 16;

/*
 * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 * keep the UI active so that the user has a chance to kill the offending task before the system
 * completely hangs.
 *
 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 */

extern void throttle_lowpri_io(int);

extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);

/* footprint threshold for the hard throttle; computed in vm_fault_init() */
uint64_t vm_hard_throttle_threshold;

#if DEBUG || DEVELOPMENT
/* "text_corruption_panic" boot-arg: panic instead of recovering (see vm_fault_init()) */
static bool vmtc_panic_instead = false;
/* "panic_object_not_alive" boot-arg (see vm_fault_init()) */
int panic_object_not_alive = 1;
#endif /* DEBUG || DEVELOPMENT */
145
146 OS_ALWAYS_INLINE
147 boolean_t
NEED_TO_HARD_THROTTLE_THIS_TASK(void)148 NEED_TO_HARD_THROTTLE_THIS_TASK(void)
149 {
150 return vm_wants_task_throttled(current_task()) ||
151 ((vm_page_free_count < vm_page_throttle_limit ||
152 HARD_THROTTLE_LIMIT_REACHED()) &&
153 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
154 }
155
/* delays (in microseconds) returned by vm_page_throttled() */
#define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
#define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */

/* page-creation rate-limit window and per-second rate (see vm_page_throttled()) */
#define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
#define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000


/* account one decompression both globally and against the current thread */
#define VM_STAT_DECOMPRESSIONS()        \
MACRO_BEGIN                             \
	counter_inc(&vm_statistics_decompressions); \
	current_thread()->decompressions++;     \
MACRO_END
168
boolean_t current_thread_aborted(void);

/* Forward declarations of internal routines. */
static kern_return_t vm_fault_wire_fast(
	vm_map_t map,
	vm_map_offset_t va,
	vm_prot_t prot,
	vm_tag_t wire_tag,
	vm_map_entry_t entry,
	pmap_t pmap,
	vm_map_offset_t pmap_addr,
	ppnum_t *physpage_p);

static kern_return_t vm_fault_internal(
	vm_map_t map,
	vm_map_offset_t vaddr,
	vm_prot_t caller_prot,
	boolean_t change_wiring,
	vm_tag_t wire_tag,
	int interruptible,
	pmap_t pmap,
	vm_map_offset_t pmap_addr,
	ppnum_t *physpage_p);

static void vm_fault_copy_cleanup(
	vm_page_t page,
	vm_page_t top_page);

static void vm_fault_copy_dst_cleanup(
	vm_page_t page);

#if VM_FAULT_CLASSIFY
extern void vm_fault_classify(vm_object_t object,
    vm_object_offset_t offset,
    vm_prot_t fault_type);

extern void vm_fault_classify_init(void);
#endif

/*
 * NOTE(review): counters updated elsewhere in this file — presumably
 * pmap_enter() blocked/retried accounting; confirm against users.
 */
unsigned long vm_pmap_enter_blocked = 0;
unsigned long vm_pmap_enter_retried = 0;

/* code-signing validation statistics (updated by the CS fault paths) */
unsigned long vm_cs_validates = 0;
unsigned long vm_cs_revalidates = 0;
unsigned long vm_cs_query_modified = 0;
unsigned long vm_cs_validated_dirtied = 0;
unsigned long vm_cs_bitmap_validated = 0;

void vm_pre_fault(vm_map_offset_t, vm_prot_t);

/* scratch page used by the kernel debugger (kdp) to decompress pages */
extern char *kdp_compressor_decompressed_page;
extern addr64_t kdp_compressor_decompressed_page_paddr;
extern ppnum_t kdp_compressor_decompressed_page_ppnum;

/*
 * Ring buffer of real-time-thread fault records, sized by the
 * "vm_rtfault_records" tunable and allocated in vm_rtfault_record_init().
 * vmrtfr_maxi is the highest valid index; vmrtfr_curi is presumably the
 * next slot to fill (writer is vm_record_rtfault(), not in this chunk).
 */
struct vmrtfr {
	int vmrtfr_maxi;
	int vmrtfr_curi;
	int64_t vmrtf_total;
	vm_rtfault_record_t *vm_rtf_records;
} vmrtfrs;
#define VMRTF_DEFAULT_BUFSIZE (4096)
#define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT);

static void vm_rtfrecord_lock(void);
static void vm_rtfrecord_unlock(void);
static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);

/* spin lock protecting the vmrtfrs ring buffer */
extern lck_grp_t vm_page_lck_grp_bucket;
extern lck_attr_t vm_page_lck_attr;
LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);

#if DEVELOPMENT || DEBUG
extern int madvise_free_debug;
#endif /* DEVELOPMENT || DEBUG */

extern int vm_pageout_protect_realtime;

#if CONFIG_FREEZE
#endif /* CONFIG_FREEZE */
249
250 /*
251 * Routine: vm_fault_init
252 * Purpose:
253 * Initialize our private data structures.
254 */
255 __startup_func
256 void
vm_fault_init(void)257 vm_fault_init(void)
258 {
259 int i, vm_compressor_temp;
260 boolean_t need_default_val = TRUE;
261 /*
262 * Choose a value for the hard throttle threshold based on the amount of ram. The threshold is
263 * computed as a percentage of available memory, and the percentage used is scaled inversely with
264 * the amount of memory. The percentage runs between 10% and 35%. We use 35% for small memory systems
265 * and reduce the value down to 10% for very large memory configurations. This helps give us a
266 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
267 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
268 */
269
270 vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;
271
272 /*
273 * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
274 */
275
276 if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
277 for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
278 if (((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
279 need_default_val = FALSE;
280 vm_compressor_mode = vm_compressor_temp;
281 break;
282 }
283 }
284 if (need_default_val) {
285 printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
286 }
287 }
288 #if CONFIG_FREEZE
289 if (need_default_val) {
290 if (osenvironment_is_diagnostics()) {
291 printf("osenvironment == \"diagnostics\". Setting \"vm_compressor_mode\" to in-core compressor only\n");
292 vm_compressor_mode = VM_PAGER_COMPRESSOR_NO_SWAP;
293 need_default_val = false;
294 }
295 }
296 #endif /* CONFIG_FREEZE */
297 if (need_default_val) {
298 /* If no boot arg or incorrect boot arg, try device tree. */
299 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
300 }
301 printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
302 vm_config_init();
303
304 PE_parse_boot_argn("vm_protect_privileged_from_untrusted",
305 &vm_protect_privileged_from_untrusted,
306 sizeof(vm_protect_privileged_from_untrusted));
307
308 #if DEBUG || DEVELOPMENT
309 (void)PE_parse_boot_argn("text_corruption_panic", &vmtc_panic_instead, sizeof(vmtc_panic_instead));
310
311 if (kern_feature_override(KF_MADVISE_FREE_DEBUG_OVRD)) {
312 madvise_free_debug = 0;
313 }
314
315 PE_parse_boot_argn("panic_object_not_alive", &panic_object_not_alive, sizeof(panic_object_not_alive));
316 #endif /* DEBUG || DEVELOPMENT */
317 }
318
319 __startup_func
320 static void
vm_rtfault_record_init(void)321 vm_rtfault_record_init(void)
322 {
323 size_t size;
324
325 vmrtf_num_records = MAX(vmrtf_num_records, 1);
326 size = vmrtf_num_records * sizeof(vm_rtfault_record_t);
327 vmrtfrs.vm_rtf_records = zalloc_permanent_tag(size,
328 ZALIGN(vm_rtfault_record_t), VM_KERN_MEMORY_DIAG);
329 vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
330 }
331 STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init);
332
333 /*
334 * Routine: vm_fault_cleanup
335 * Purpose:
336 * Clean up the result of vm_fault_page.
337 * Results:
338 * The paging reference for "object" is released.
339 * "object" is unlocked.
340 * If "top_page" is not null, "top_page" is
341 * freed and the paging reference for the object
342 * containing it is released.
343 *
344 * In/out conditions:
345 * "object" must be locked.
346 */
347 void
vm_fault_cleanup(vm_object_t object,vm_page_t top_page)348 vm_fault_cleanup(
349 vm_object_t object,
350 vm_page_t top_page)
351 {
352 vm_object_paging_end(object);
353 vm_object_unlock(object);
354
355 if (top_page != VM_PAGE_NULL) {
356 object = VM_PAGE_OBJECT(top_page);
357
358 vm_object_lock(object);
359 VM_PAGE_FREE(top_page);
360 vm_object_paging_end(object);
361 vm_object_unlock(object);
362 }
363 }
364
/* TRUE iff "x" is aligned to the system page size */
#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)


/* master switch for the deactivate-behind mechanism (vm_fault_deactivate_behind) */
boolean_t vm_page_deactivate_behind = TRUE;
/*
 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 */
#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
                                                                /* we use it to size an array on the stack */

int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;

/* cap (in bytes) on the tracked length of a sequential run */
#define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
379
380 /*
381 * vm_page_is_sequential
382 *
383 * Determine if sequential access is in progress
384 * in accordance with the behavior specified.
385 * Update state to indicate current access pattern.
386 *
387 * object must have at least the shared lock held
388 */
389 static
390 void
vm_fault_is_sequential(vm_object_t object,vm_object_offset_t offset,vm_behavior_t behavior)391 vm_fault_is_sequential(
392 vm_object_t object,
393 vm_object_offset_t offset,
394 vm_behavior_t behavior)
395 {
396 vm_object_offset_t last_alloc;
397 int sequential;
398 int orig_sequential;
399
400 last_alloc = object->last_alloc;
401 sequential = object->sequential;
402 orig_sequential = sequential;
403
404 offset = vm_object_trunc_page(offset);
405 if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) {
406 /* re-faulting in the same page: no change in behavior */
407 return;
408 }
409
410 switch (behavior) {
411 case VM_BEHAVIOR_RANDOM:
412 /*
413 * reset indicator of sequential behavior
414 */
415 sequential = 0;
416 break;
417
418 case VM_BEHAVIOR_SEQUENTIAL:
419 if (offset && last_alloc == offset - PAGE_SIZE_64) {
420 /*
421 * advance indicator of sequential behavior
422 */
423 if (sequential < MAX_SEQUENTIAL_RUN) {
424 sequential += PAGE_SIZE;
425 }
426 } else {
427 /*
428 * reset indicator of sequential behavior
429 */
430 sequential = 0;
431 }
432 break;
433
434 case VM_BEHAVIOR_RSEQNTL:
435 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
436 /*
437 * advance indicator of sequential behavior
438 */
439 if (sequential > -MAX_SEQUENTIAL_RUN) {
440 sequential -= PAGE_SIZE;
441 }
442 } else {
443 /*
444 * reset indicator of sequential behavior
445 */
446 sequential = 0;
447 }
448 break;
449
450 case VM_BEHAVIOR_DEFAULT:
451 default:
452 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
453 /*
454 * advance indicator of sequential behavior
455 */
456 if (sequential < 0) {
457 sequential = 0;
458 }
459 if (sequential < MAX_SEQUENTIAL_RUN) {
460 sequential += PAGE_SIZE;
461 }
462 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
463 /*
464 * advance indicator of sequential behavior
465 */
466 if (sequential > 0) {
467 sequential = 0;
468 }
469 if (sequential > -MAX_SEQUENTIAL_RUN) {
470 sequential -= PAGE_SIZE;
471 }
472 } else {
473 /*
474 * reset indicator of sequential behavior
475 */
476 sequential = 0;
477 }
478 break;
479 }
480 if (sequential != orig_sequential) {
481 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
482 /*
483 * if someone else has already updated object->sequential
484 * don't bother trying to update it or object->last_alloc
485 */
486 return;
487 }
488 }
489 /*
490 * I'd like to do this with a OSCompareAndSwap64, but that
491 * doesn't exist for PPC... however, it shouldn't matter
492 * that much... last_alloc is maintained so that we can determine
493 * if a sequential access pattern is taking place... if only
494 * one thread is banging on this object, no problem with the unprotected
495 * update... if 2 or more threads are banging away, we run the risk of
496 * someone seeing a mangled update... however, in the face of multiple
497 * accesses, no sequential access pattern can develop anyway, so we
498 * haven't lost any real info.
499 */
500 object->last_alloc = offset;
501 }
502
#if DEVELOPMENT || DEBUG
/* total pages deactivated by the deactivate-behind mechanism */
uint64_t vm_page_deactivate_behind_count = 0;
#endif /* DEVELOPMENT || DEBUG */

/*
 * vm_page_deactivate_behind
 *
 * Determine if sequential access is in progress
 * in accordance with the behavior specified.  If
 * so, compute a potential page to deactivate and
 * deactivate it.
 *
 * The pages "behind" a detected sequential run are unlikely to be
 * touched again soon, so they are moved to the inactive queue to
 * make them early candidates for reclamation.
 *
 * object must be locked.
 *
 * return TRUE if we actually deactivate a page
 */
static
boolean_t
vm_fault_deactivate_behind(
	vm_object_t object,
	vm_object_offset_t offset,
	vm_behavior_t behavior)
{
	int n;
	int pages_in_run = 0;
	int max_pages_in_run = 0;
	int sequential_run;
	int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
	vm_object_offset_t run_offset = 0;
	vm_object_offset_t pg_offset = 0;
	vm_page_t m;
	vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];

	pages_in_run = 0;
#if TRACEFAULTPAGE
	dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind);    /* (TEST/DEBUG) */
#endif
	if (object == kernel_object || vm_page_deactivate_behind == FALSE || (vm_object_trunc_page(offset) != offset)) {
		/*
		 * Do not deactivate pages from the kernel object: they
		 * are not intended to become pageable.
		 * or we've disabled the deactivate behind mechanism
		 * or we are dealing with an offset that is not aligned to
		 * the system's PAGE_SIZE because in that case we will
		 * handle the deactivation on the aligned offset and, thus,
		 * the full PAGE_SIZE page once. This helps us avoid the redundant
		 * deactivates and the extra faults.
		 */
		return FALSE;
	}
	/*
	 * decode the run state maintained by vm_fault_is_sequential():
	 * a negative value means accesses are moving backward through
	 * the object, so normalize to a positive length + direction.
	 */
	if ((sequential_run = object->sequential)) {
		if (sequential_run < 0) {
			sequential_behavior = VM_BEHAVIOR_RSEQNTL;
			sequential_run = 0 - sequential_run;
		} else {
			sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
		}
	}
	switch (behavior) {
	case VM_BEHAVIOR_RANDOM:
		break;
	case VM_BEHAVIOR_SEQUENTIAL:
		/* deactivate the single page immediately behind this fault */
		if (sequential_run >= (int)PAGE_SIZE) {
			run_offset = 0 - PAGE_SIZE_64;
			max_pages_in_run = 1;
		}
		break;
	case VM_BEHAVIOR_RSEQNTL:
		/* reverse run: the page "behind" is at the next higher offset */
		if (sequential_run >= (int)PAGE_SIZE) {
			run_offset = PAGE_SIZE_64;
			max_pages_in_run = 1;
		}
		break;
	case VM_BEHAVIOR_DEFAULT:
	default:
	{       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;

		/*
		 * determine if the run of sequential accesses has been
		 * long enough on an object with default access behavior
		 * to consider it for deactivation
		 */
		if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
			/*
			 * the comparisons between offset and behind are done
			 * in this kind of odd fashion in order to prevent wrap around
			 * at the end points
			 */
			if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
				if (offset >= behind) {
					run_offset = 0 - behind;
					pg_offset = PAGE_SIZE_64;
					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
				}
			} else {
				if (offset < -behind) {
					run_offset = behind;
					pg_offset = 0 - PAGE_SIZE_64;
					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
				}
			}
		}
		break;}
	}
	/*
	 * first pass: collect the resident, reclaimable pages of the run
	 * and clear their referenced bits (without the page-queues lock)
	 */
	for (n = 0; n < max_pages_in_run; n++) {
		m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));

		if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
			page_run[pages_in_run++] = m;

			/*
			 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
			 *
			 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
			 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
			 * new reference happens. If no further references happen on the page after that remote TLB flushes
			 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
			 * by pageout_scan, which is just fine since the last reference would have happened quite far
			 * in the past (TLB caches don't hang around for very long), and of course could just as easily
			 * have happened before we did the deactivate_behind.
			 */
			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
		}
	}
	/*
	 * second pass: take the page-queues lock once and deactivate the
	 * whole collected run
	 */
	if (pages_in_run) {
		vm_page_lockspin_queues();

		for (n = 0; n < pages_in_run; n++) {
			m = page_run[n];

			vm_page_deactivate_internal(m, FALSE);

#if DEVELOPMENT || DEBUG
			vm_page_deactivate_behind_count++;
#endif /* DEVELOPMENT || DEBUG */

#if TRACEFAULTPAGE
			dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
#endif
		}
		vm_page_unlock_queues();

		return TRUE;
	}
	return FALSE;
}
649
650
#if (DEVELOPMENT || DEBUG)
/* counters: how often page creation was hard/soft throttled, or avoided */
uint32_t vm_page_creation_throttled_hard = 0;
uint32_t vm_page_creation_throttled_soft = 0;
uint64_t vm_page_creation_throttle_avoided = 0;
#endif /* DEVELOPMENT || DEBUG */

/*
 * Decide whether the current thread should be delayed before it is
 * allowed to create another page.
 *
 * Returns 0 when no throttling is needed, otherwise the delay in
 * microseconds the caller should impose (HARD_THROTTLE_DELAY or
 * SOFT_THROTTLE_DELAY).
 *
 * page_kept: FALSE means the page created under the previous
 * (throttled) fault was not kept; a previously-throttled thread is
 * then given a pass (no_throttle) this time around.
 */
static int
vm_page_throttled(boolean_t page_kept)
{
	clock_sec_t elapsed_sec;
	clock_sec_t tv_sec;
	clock_usec_t tv_usec;
	task_t curtask = current_task_early();

	thread_t thread = current_thread();

	/* privileged VM threads are never throttled */
	if (thread->options & TH_OPT_VMPRIV) {
		return 0;
	}

	/* no point throttling a task that is no longer active */
	if (curtask && !curtask->active) {
		return 0;
	}

	if (thread->t_page_creation_throttled) {
		thread->t_page_creation_throttled = 0;

		if (page_kept == FALSE) {
			goto no_throttle;
		}
	}
	if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
#if (DEVELOPMENT || DEBUG)
		thread->t_page_creation_throttled_hard++;
		OSAddAtomic(1, &vm_page_creation_throttled_hard);
#endif /* DEVELOPMENT || DEBUG */
		return HARD_THROTTLE_DELAY;
	}

	/*
	 * Rate-limit page creation when free pages are scarce (or the
	 * swapper needs room) AND this thread has already created more
	 * pages than a full throttle period allows.
	 */
	if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
	    thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
		if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
			/* nobody is waiting for free pages: skip the throttle */
#if (DEVELOPMENT || DEBUG)
			OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
#endif
			goto no_throttle;
		}
		clock_get_system_microtime(&tv_sec, &tv_usec);

		elapsed_sec = tv_sec - thread->t_page_creation_time;

		/* throttle if still within the window, or the average rate is too high */
		if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
		    (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
			if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
				/*
				 * we'll reset our stats to give a well behaved app
				 * that was unlucky enough to accumulate a bunch of pages
				 * over a long period of time a chance to get out of
				 * the throttled state... we reset the counter and timestamp
				 * so that if it stays under the rate limit for the next second
				 * it will be back in our good graces... if it exceeds it, it
				 * will remain in the throttled state
				 */
				thread->t_page_creation_time = tv_sec;
				thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
			}
			VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);

			thread->t_page_creation_throttled = 1;

			if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
#if (DEVELOPMENT || DEBUG)
				thread->t_page_creation_throttled_hard++;
				OSAddAtomic(1, &vm_page_creation_throttled_hard);
#endif /* DEVELOPMENT || DEBUG */
				return HARD_THROTTLE_DELAY;
			} else {
#if (DEVELOPMENT || DEBUG)
				thread->t_page_creation_throttled_soft++;
				OSAddAtomic(1, &vm_page_creation_throttled_soft);
#endif /* DEVELOPMENT || DEBUG */
				return SOFT_THROTTLE_DELAY;
			}
		}
		/* under the rate limit: restart the measurement window */
		thread->t_page_creation_time = tv_sec;
		thread->t_page_creation_count = 0;
	}
no_throttle:
	thread->t_page_creation_count++;

	return 0;
}
743
744 extern boolean_t vm_pageout_running;
745 static __attribute__((noinline, not_tail_called)) void
__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(int throttle_delay)746 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(
747 int throttle_delay)
748 {
749 /* make sure vm_pageout_scan() gets to work while we're throttled */
750 if (!vm_pageout_running) {
751 thread_wakeup((event_t)&vm_page_free_wanted);
752 }
753 delay(throttle_delay);
754 }
755
756
757 /*
758 * check for various conditions that would
759 * prevent us from creating a ZF page...
760 * cleanup is based on being called from vm_fault_page
761 *
762 * object must be locked
763 * object == m->vmp_object
764 */
765 static vm_fault_return_t
vm_fault_check(vm_object_t object,vm_page_t m,vm_page_t first_m,wait_interrupt_t interruptible_state,boolean_t page_throttle)766 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
767 {
768 int throttle_delay;
769
770 if (object->shadow_severed ||
771 VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
772 /*
773 * Either:
774 * 1. the shadow chain was severed,
775 * 2. the purgeable object is volatile or empty and is marked
776 * to fault on access while volatile.
777 * Just have to return an error at this point
778 */
779 if (m != VM_PAGE_NULL) {
780 VM_PAGE_FREE(m);
781 }
782 vm_fault_cleanup(object, first_m);
783
784 thread_interrupt_level(interruptible_state);
785
786 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
787 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
788 }
789
790 if (object->shadow_severed) {
791 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), 0 /* arg */);
792 }
793 return VM_FAULT_MEMORY_ERROR;
794 }
795 if (page_throttle == TRUE) {
796 if ((throttle_delay = vm_page_throttled(FALSE))) {
797 /*
798 * we're throttling zero-fills...
799 * treat this as if we couldn't grab a page
800 */
801 if (m != VM_PAGE_NULL) {
802 VM_PAGE_FREE(m);
803 }
804 vm_fault_cleanup(object, first_m);
805
806 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
807
808 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
809
810 if (current_thread_aborted()) {
811 thread_interrupt_level(interruptible_state);
812 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
813 return VM_FAULT_INTERRUPTED;
814 }
815 thread_interrupt_level(interruptible_state);
816
817 return VM_FAULT_MEMORY_SHORTAGE;
818 }
819 }
820 return VM_FAULT_SUCCESS;
821 }
822
823 /*
824 * Clear the code signing bits on the given page_t
825 */
826 static void
vm_fault_cs_clear(vm_page_t m)827 vm_fault_cs_clear(vm_page_t m)
828 {
829 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
830 m->vmp_cs_tainted = VMP_CS_ALL_FALSE;
831 m->vmp_cs_nx = VMP_CS_ALL_FALSE;
832 }
833
834 /*
835 * Enqueues the given page on the throttled queue.
836 * The caller must hold the vm_page_queue_lock and it will be held on return.
837 */
838 static void
vm_fault_enqueue_throttled_locked(vm_page_t m)839 vm_fault_enqueue_throttled_locked(vm_page_t m)
840 {
841 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
842 assert(!VM_PAGE_WIRED(m));
843
844 /*
845 * can't be on the pageout queue since we don't
846 * have a pager to try and clean to
847 */
848 vm_page_queues_remove(m, TRUE);
849 vm_page_check_pageable_safe(m);
850 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
851 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
852 vm_page_throttled_count++;
853 }
854
855 /*
856 * do the work to zero fill a page and
857 * inject it into the correct paging queue
858 *
859 * m->vmp_object must be locked
860 * page queue lock must NOT be held
861 */
862 static int
vm_fault_zero_page(vm_page_t m,boolean_t no_zero_fill)863 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
864 {
865 int my_fault = DBG_ZERO_FILL_FAULT;
866 vm_object_t object;
867
868 object = VM_PAGE_OBJECT(m);
869
870 /*
871 * This is is a zero-fill page fault...
872 *
873 * Checking the page lock is a waste of
874 * time; this page was absent, so
875 * it can't be page locked by a pager.
876 *
877 * we also consider it undefined
878 * with respect to instruction
879 * execution. i.e. it is the responsibility
880 * of higher layers to call for an instruction
881 * sync after changing the contents and before
882 * sending a program into this area. We
883 * choose this approach for performance
884 */
885 vm_fault_cs_clear(m);
886 m->vmp_pmapped = TRUE;
887
888 if (no_zero_fill == TRUE) {
889 my_fault = DBG_NZF_PAGE_FAULT;
890
891 if (m->vmp_absent && m->vmp_busy) {
892 return my_fault;
893 }
894 } else {
895 vm_page_zero_fill(m);
896
897 counter_inc(&vm_statistics_zero_fill_count);
898 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
899 }
900 assert(!m->vmp_laundry);
901 assert(object != kernel_object);
902 //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
903 if (!VM_DYNAMIC_PAGING_ENABLED() &&
904 (object->purgable == VM_PURGABLE_DENY ||
905 object->purgable == VM_PURGABLE_NONVOLATILE ||
906 object->purgable == VM_PURGABLE_VOLATILE)) {
907 vm_page_lockspin_queues();
908 if (!VM_DYNAMIC_PAGING_ENABLED()) {
909 vm_fault_enqueue_throttled_locked(m);
910 }
911 vm_page_unlock_queues();
912 }
913 return my_fault;
914 }
915
916
917 /*
918 * Routine: vm_fault_page
919 * Purpose:
920 * Find the resident page for the virtual memory
921 * specified by the given virtual memory object
922 * and offset.
923 * Additional arguments:
924 * The required permissions for the page is given
925 * in "fault_type". Desired permissions are included
926 * in "protection".
927 * fault_info is passed along to determine pagein cluster
928 * limits... it contains the expected reference pattern,
929 * cluster size if available, etc...
930 *
931 * If the desired page is known to be resident (for
932 * example, because it was previously wired down), asserting
933 * the "unwiring" parameter will speed the search.
934 *
935 * If the operation can be interrupted (by thread_abort
936 * or thread_terminate), then the "interruptible"
937 * parameter should be asserted.
938 *
939 * Results:
940 * The page containing the proper data is returned
941 * in "result_page".
942 *
943 * In/out conditions:
944 * The source object must be locked and referenced,
945 * and must donate one paging reference. The reference
946 * is not affected. The paging reference and lock are
947 * consumed.
948 *
949 * If the call succeeds, the object in which "result_page"
950 * resides is left locked and holding a paging reference.
951 * If this is not the original object, a busy page in the
952 * original object is returned in "top_page", to prevent other
953 * callers from pursuing this same data, along with a paging
954 * reference for the original object. The "top_page" should
955 * be destroyed when this guarantee is no longer required.
956 * The "result_page" is also left busy. It is not removed
957 * from the pageout queues.
958 * Special Case:
959 * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
960 * fault succeeded but there's no VM page (i.e. the VM object
961 * does not actually hold VM pages, but device memory or
962 * large pages). The object is still locked and we still hold a
963 * paging_in_progress reference.
964 */
/* Diagnostic counter: number of times vm_fault_page() had to wait because
 * access to the VM object was blocked (object->blocked_access). */
965 unsigned int vm_fault_page_blocked_access = 0;
/* Diagnostic counter: number of times vm_fault_page() discarded its
 * placeholder page and forced a retry to avoid a deadlock when the
 * object being paged in is the backing store of the first object's copy. */
966 unsigned int vm_fault_page_forced_retry = 0;
967
968 vm_fault_return_t
969 vm_fault_page(
970 /* Arguments: */
971 vm_object_t first_object, /* Object to begin search */
972 vm_object_offset_t first_offset, /* Offset into object */
973 vm_prot_t fault_type, /* What access is requested */
974 boolean_t must_be_resident,/* Must page be resident? */
975 boolean_t caller_lookup, /* caller looked up page */
976 /* Modifies in place: */
977 vm_prot_t *protection, /* Protection for mapping */
978 vm_page_t *result_page, /* Page found, if successful */
979 /* Returns: */
980 vm_page_t *top_page, /* Page in top object, if
981 * not result_page. */
982 int *type_of_fault, /* if non-null, fill in with type of fault
983 * COW, zero-fill, etc... returned in trace point */
984 /* More arguments: */
985 kern_return_t *error_code, /* code if page is in error */
986 boolean_t no_zero_fill, /* don't zero fill absent pages */
987 vm_object_fault_info_t fault_info)
988 {
989 vm_page_t m;
990 vm_object_t object;
991 vm_object_offset_t offset;
992 vm_page_t first_m;
993 vm_object_t next_object;
994 vm_object_t copy_object;
995 boolean_t look_for_page;
996 boolean_t force_fault_retry = FALSE;
997 vm_prot_t access_required = fault_type;
998 vm_prot_t wants_copy_flag;
999 kern_return_t wait_result;
1000 wait_interrupt_t interruptible_state;
1001 boolean_t data_already_requested = FALSE;
1002 vm_behavior_t orig_behavior;
1003 vm_size_t orig_cluster_size;
1004 vm_fault_return_t error;
1005 int my_fault;
1006 uint32_t try_failed_count;
1007 int interruptible; /* how may fault be interrupted? */
1008 int external_state = VM_EXTERNAL_STATE_UNKNOWN;
1009 memory_object_t pager;
1010 vm_fault_return_t retval;
1011 int grab_options;
1012 bool clear_absent_on_error = false;
1013
1014 /*
1015 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
1016 * marked as paged out in the compressor pager or the pager doesn't exist.
1017 * Note also that if the pager for an internal object
1018 * has not been created, the pager is not invoked regardless of the value
1019 * of MUST_ASK_PAGER().
1020 *
1021 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
1022 * is marked as paged out in the compressor pager.
1023 * PAGED_OUT() is used to determine if a page has already been pushed
1024 * into a copy object in order to avoid a redundant page out operation.
1025 */
1026 #define MUST_ASK_PAGER(o, f, s) \
1027 ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
1028
1029 #define PAGED_OUT(o, f) \
1030 (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
1031
1032 /*
1033 * Recovery actions
1034 */
1035 #define RELEASE_PAGE(m) \
1036 MACRO_BEGIN \
1037 PAGE_WAKEUP_DONE(m); \
1038 if ( !VM_PAGE_PAGEABLE(m)) { \
1039 vm_page_lockspin_queues(); \
1040 if (clear_absent_on_error && m->vmp_absent) {\
1041 vm_page_zero_fill(m); \
1042 counter_inc(&vm_statistics_zero_fill_count);\
1043 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);\
1044 m->vmp_absent = false; \
1045 } \
1046 if ( !VM_PAGE_PAGEABLE(m)) { \
1047 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) \
1048 vm_page_deactivate(m); \
1049 else \
1050 vm_page_activate(m); \
1051 } \
1052 vm_page_unlock_queues(); \
1053 } \
1054 clear_absent_on_error = false; \
1055 MACRO_END
1056
1057 #if TRACEFAULTPAGE
1058 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
1059 #endif
1060
1061 interruptible = fault_info->interruptible;
1062 interruptible_state = thread_interrupt_level(interruptible);
1063
1064 /*
1065 * INVARIANTS (through entire routine):
1066 *
1067 * 1) At all times, we must either have the object
1068 * lock or a busy page in some object to prevent
1069 * some other thread from trying to bring in
1070 * the same page.
1071 *
1072 * Note that we cannot hold any locks during the
1073 * pager access or when waiting for memory, so
1074 * we use a busy page then.
1075 *
1076 * 2) To prevent another thread from racing us down the
1077 * shadow chain and entering a new page in the top
1078 * object before we do, we must keep a busy page in
1079 * the top object while following the shadow chain.
1080 *
1081 * 3) We must increment paging_in_progress on any object
1082 * for which we have a busy page before dropping
1083 * the object lock
1084 *
1085 * 4) We leave busy pages on the pageout queues.
1086 * If the pageout daemon comes across a busy page,
1087 * it will remove the page from the pageout queues.
1088 */
1089
1090 object = first_object;
1091 offset = first_offset;
1092 first_m = VM_PAGE_NULL;
1093 access_required = fault_type;
1094
1095 /*
1096 * default type of fault
1097 */
1098 my_fault = DBG_CACHE_HIT_FAULT;
1099 thread_pri_floor_t token;
1100 bool drop_floor = false;
1101
1102 while (TRUE) {
1103 #if TRACEFAULTPAGE
1104 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1105 #endif
1106
1107 grab_options = 0;
1108 #if CONFIG_SECLUDED_MEMORY
1109 if (object->can_grab_secluded) {
1110 grab_options |= VM_PAGE_GRAB_SECLUDED;
1111 }
1112 #endif /* CONFIG_SECLUDED_MEMORY */
1113
1114 if (!object->alive) {
1115 /*
1116 * object is no longer valid
1117 * clean up and return error
1118 */
1119 #if DEVELOPMENT || DEBUG
1120 printf("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->copy, object->shadow, object->alive, object->terminating, object->named, object->ref_count, object->shadow_severed);
1121 if (panic_object_not_alive) {
1122 panic("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->copy, object->shadow, object->alive, object->terminating, object->named, object->ref_count, object->shadow_severed);
1123 }
1124 #endif /* DEVELOPMENT || DEBUG */
1125 vm_fault_cleanup(object, first_m);
1126 thread_interrupt_level(interruptible_state);
1127
1128 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_NOT_ALIVE), 0 /* arg */);
1129 return VM_FAULT_MEMORY_ERROR;
1130 }
1131
1132 if (!object->pager_created && object->phys_contiguous) {
1133 /*
1134 * A physically-contiguous object without a pager:
1135 * must be a "large page" object. We do not deal
1136 * with VM pages for this object.
1137 */
1138 caller_lookup = FALSE;
1139 m = VM_PAGE_NULL;
1140 goto phys_contig_object;
1141 }
1142
1143 if (object->blocked_access) {
1144 /*
1145 * Access to this VM object has been blocked.
1146 * Replace our "paging_in_progress" reference with
1147 * a "activity_in_progress" reference and wait for
1148 * access to be unblocked.
1149 */
1150 caller_lookup = FALSE; /* no longer valid after sleep */
1151 vm_object_activity_begin(object);
1152 vm_object_paging_end(object);
1153 while (object->blocked_access) {
1154 vm_object_sleep(object,
1155 VM_OBJECT_EVENT_UNBLOCKED,
1156 THREAD_UNINT);
1157 }
1158 vm_fault_page_blocked_access++;
1159 vm_object_paging_begin(object);
1160 vm_object_activity_end(object);
1161 }
1162
1163 /*
1164 * See whether the page at 'offset' is resident
1165 */
1166 if (caller_lookup == TRUE) {
1167 /*
1168 * The caller has already looked up the page
1169 * and gave us the result in "result_page".
1170 * We can use this for the first lookup but
1171 * it loses its validity as soon as we unlock
1172 * the object.
1173 */
1174 m = *result_page;
1175 caller_lookup = FALSE; /* no longer valid after that */
1176 } else {
1177 m = vm_page_lookup(object, vm_object_trunc_page(offset));
1178 }
1179 #if TRACEFAULTPAGE
1180 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1181 #endif
1182 if (m != VM_PAGE_NULL) {
1183 if (m->vmp_busy) {
1184 /*
1185 * The page is being brought in,
1186 * wait for it and then retry.
1187 */
1188 #if TRACEFAULTPAGE
1189 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1190 #endif
1191 wait_result = PAGE_SLEEP(object, m, interruptible);
1192
1193 if (wait_result != THREAD_AWAKENED) {
1194 vm_fault_cleanup(object, first_m);
1195 thread_interrupt_level(interruptible_state);
1196
1197 if (wait_result == THREAD_RESTART) {
1198 return VM_FAULT_RETRY;
1199 } else {
1200 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), 0 /* arg */);
1201 return VM_FAULT_INTERRUPTED;
1202 }
1203 }
1204 continue;
1205 }
1206 if (m->vmp_laundry) {
1207 m->vmp_free_when_done = FALSE;
1208
1209 if (!m->vmp_cleaning) {
1210 vm_pageout_steal_laundry(m, FALSE);
1211 }
1212 }
1213 vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
1214 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1215 /*
1216 * Guard page: off limits !
1217 */
1218 if (fault_type == VM_PROT_NONE) {
1219 /*
1220 * The fault is not requesting any
1221 * access to the guard page, so it must
1222 * be just to wire or unwire it.
1223 * Let's pretend it succeeded...
1224 */
1225 m->vmp_busy = TRUE;
1226 *result_page = m;
1227 assert(first_m == VM_PAGE_NULL);
1228 *top_page = first_m;
1229 if (type_of_fault) {
1230 *type_of_fault = DBG_GUARD_FAULT;
1231 }
1232 thread_interrupt_level(interruptible_state);
1233 return VM_FAULT_SUCCESS;
1234 } else {
1235 /*
1236 * The fault requests access to the
1237 * guard page: let's deny that !
1238 */
1239 vm_fault_cleanup(object, first_m);
1240 thread_interrupt_level(interruptible_state);
1241 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_GUARDPAGE_FAULT), 0 /* arg */);
1242 return VM_FAULT_MEMORY_ERROR;
1243 }
1244 }
1245
1246
1247 if (VMP_ERROR_GET(m)) {
1248 /*
1249 * The page is in error, give up now.
1250 */
1251 #if TRACEFAULTPAGE
1252 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
1253 #endif
1254 if (error_code) {
1255 *error_code = KERN_MEMORY_ERROR;
1256 }
1257 VM_PAGE_FREE(m);
1258
1259 vm_fault_cleanup(object, first_m);
1260 thread_interrupt_level(interruptible_state);
1261
1262 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_ERROR), 0 /* arg */);
1263 return VM_FAULT_MEMORY_ERROR;
1264 }
1265 if (m->vmp_restart) {
1266 /*
1267 * The pager wants us to restart
1268 * at the top of the chain,
1269 * typically because it has moved the
1270 * page to another pager, then do so.
1271 */
1272 #if TRACEFAULTPAGE
1273 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1274 #endif
1275 VM_PAGE_FREE(m);
1276
1277 vm_fault_cleanup(object, first_m);
1278 thread_interrupt_level(interruptible_state);
1279
1280 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_RESTART), 0 /* arg */);
1281 return VM_FAULT_RETRY;
1282 }
1283 if (m->vmp_absent) {
1284 /*
1285 * The page isn't busy, but is absent,
1286 * therefore it's deemed "unavailable".
1287 *
1288 * Remove the non-existent page (unless it's
1289 * in the top object) and move on down to the
1290 * next object (if there is one).
1291 */
1292 #if TRACEFAULTPAGE
1293 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
1294 #endif
1295 next_object = object->shadow;
1296
1297 if (next_object == VM_OBJECT_NULL) {
1298 /*
1299 * Absent page at bottom of shadow
1300 * chain; zero fill the page we left
1301 * busy in the first object, and free
1302 * the absent page.
1303 */
1304 assert(!must_be_resident);
1305
1306 /*
1307 * check for any conditions that prevent
1308 * us from creating a new zero-fill page
1309 * vm_fault_check will do all of the
1310 * fault cleanup in the case of an error condition
1311 * including resetting the thread_interrupt_level
1312 */
1313 error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1314
1315 if (error != VM_FAULT_SUCCESS) {
1316 return error;
1317 }
1318
1319 if (object != first_object) {
1320 /*
1321 * free the absent page we just found
1322 */
1323 VM_PAGE_FREE(m);
1324
1325 /*
1326 * drop reference and lock on current object
1327 */
1328 vm_object_paging_end(object);
1329 vm_object_unlock(object);
1330
1331 /*
1332 * grab the original page we
1333 * 'soldered' in place and
1334 * retake lock on 'first_object'
1335 */
1336 m = first_m;
1337 first_m = VM_PAGE_NULL;
1338
1339 object = first_object;
1340 offset = first_offset;
1341
1342 vm_object_lock(object);
1343 } else {
1344 /*
1345 * we're going to use the absent page we just found
1346 * so convert it to a 'busy' page
1347 */
1348 m->vmp_absent = FALSE;
1349 m->vmp_busy = TRUE;
1350 }
1351 if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1352 m->vmp_absent = TRUE;
1353 clear_absent_on_error = true;
1354 }
1355 /*
1356 * zero-fill the page and put it on
1357 * the correct paging queue
1358 */
1359 my_fault = vm_fault_zero_page(m, no_zero_fill);
1360
1361 break;
1362 } else {
1363 if (must_be_resident) {
1364 vm_object_paging_end(object);
1365 } else if (object != first_object) {
1366 vm_object_paging_end(object);
1367 VM_PAGE_FREE(m);
1368 } else {
1369 first_m = m;
1370 m->vmp_absent = FALSE;
1371 m->vmp_busy = TRUE;
1372
1373 vm_page_lockspin_queues();
1374 vm_page_queues_remove(m, FALSE);
1375 vm_page_unlock_queues();
1376 }
1377
1378 offset += object->vo_shadow_offset;
1379 fault_info->lo_offset += object->vo_shadow_offset;
1380 fault_info->hi_offset += object->vo_shadow_offset;
1381 access_required = VM_PROT_READ;
1382
1383 vm_object_lock(next_object);
1384 vm_object_unlock(object);
1385 object = next_object;
1386 vm_object_paging_begin(object);
1387
1388 /*
1389 * reset to default type of fault
1390 */
1391 my_fault = DBG_CACHE_HIT_FAULT;
1392
1393 continue;
1394 }
1395 }
1396 if ((m->vmp_cleaning)
1397 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1398 && (fault_type & VM_PROT_WRITE)) {
1399 /*
1400 * This is a copy-on-write fault that will
1401 * cause us to revoke access to this page, but
1402 * this page is in the process of being cleaned
1403 * in a clustered pageout. We must wait until
1404 * the cleaning operation completes before
1405 * revoking access to the original page,
1406 * otherwise we might attempt to remove a
1407 * wired mapping.
1408 */
1409 #if TRACEFAULTPAGE
1410 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1411 #endif
1412 /*
1413 * take an extra ref so that object won't die
1414 */
1415 vm_object_reference_locked(object);
1416
1417 vm_fault_cleanup(object, first_m);
1418
1419 vm_object_lock(object);
1420 assert(object->ref_count > 0);
1421
1422 m = vm_page_lookup(object, vm_object_trunc_page(offset));
1423
1424 if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1425 PAGE_ASSERT_WAIT(m, interruptible);
1426
1427 vm_object_unlock(object);
1428 wait_result = thread_block(THREAD_CONTINUE_NULL);
1429 vm_object_deallocate(object);
1430
1431 goto backoff;
1432 } else {
1433 vm_object_unlock(object);
1434
1435 vm_object_deallocate(object);
1436 thread_interrupt_level(interruptible_state);
1437
1438 return VM_FAULT_RETRY;
1439 }
1440 }
1441 if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1442 !(fault_info != NULL && fault_info->stealth)) {
1443 /*
1444 * If we were passed a non-NULL pointer for
1445 * "type_of_fault", than we came from
1446 * vm_fault... we'll let it deal with
1447 * this condition, since it
1448 * needs to see m->vmp_speculative to correctly
1449 * account the pageins, otherwise...
1450 * take it off the speculative queue, we'll
1451 * let the caller of vm_fault_page deal
1452 * with getting it onto the correct queue
1453 *
1454 * If the caller specified in fault_info that
1455 * it wants a "stealth" fault, we also leave
1456 * the page in the speculative queue.
1457 */
1458 vm_page_lockspin_queues();
1459 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
1460 vm_page_queues_remove(m, FALSE);
1461 }
1462 vm_page_unlock_queues();
1463 }
1464 assert(object == VM_PAGE_OBJECT(m));
1465
1466 if (object->code_signed) {
1467 /*
1468 * CODE SIGNING:
1469 * We just paged in a page from a signed
1470 * memory object but we don't need to
1471 * validate it now. We'll validate it if
1472 * when it gets mapped into a user address
1473 * space for the first time or when the page
1474 * gets copied to another object as a result
1475 * of a copy-on-write.
1476 */
1477 }
1478
1479 /*
1480 * We mark the page busy and leave it on
1481 * the pageout queues. If the pageout
1482 * deamon comes across it, then it will
1483 * remove the page from the queue, but not the object
1484 */
1485 #if TRACEFAULTPAGE
1486 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1487 #endif
1488 assert(!m->vmp_busy);
1489 assert(!m->vmp_absent);
1490
1491 m->vmp_busy = TRUE;
1492 break;
1493 }
1494
1495 /*
1496 * we get here when there is no page present in the object at
1497 * the offset we're interested in... we'll allocate a page
1498 * at this point if the pager associated with
1499 * this object can provide the data or we're the top object...
1500 * object is locked; m == NULL
1501 */
1502
1503 if (must_be_resident) {
1504 if (fault_type == VM_PROT_NONE &&
1505 object == kernel_object) {
1506 /*
1507 * We've been called from vm_fault_unwire()
1508 * while removing a map entry that was allocated
1509 * with KMA_KOBJECT and KMA_VAONLY. This page
1510 * is not present and there's nothing more to
1511 * do here (nothing to unwire).
1512 */
1513 vm_fault_cleanup(object, first_m);
1514 thread_interrupt_level(interruptible_state);
1515
1516 return VM_FAULT_MEMORY_ERROR;
1517 }
1518
1519 goto dont_look_for_page;
1520 }
1521
1522 /* Don't expect to fault pages into the kernel object. */
1523 assert(object != kernel_object);
1524
1525 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE));
1526
1527 #if TRACEFAULTPAGE
1528 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1529 #endif
1530 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1531 /*
1532 * Allocate a new page for this object/offset pair as a placeholder
1533 */
1534 m = vm_page_grab_options(grab_options);
1535 #if TRACEFAULTPAGE
1536 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1537 #endif
1538 if (m == VM_PAGE_NULL) {
1539 vm_fault_cleanup(object, first_m);
1540 thread_interrupt_level(interruptible_state);
1541
1542 return VM_FAULT_MEMORY_SHORTAGE;
1543 }
1544
1545 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1546 vm_page_insert_internal(m, object,
1547 vm_object_trunc_page(offset),
1548 VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1549 } else {
1550 vm_page_insert(m, object, vm_object_trunc_page(offset));
1551 }
1552 }
1553 if (look_for_page) {
1554 kern_return_t rc;
1555 int my_fault_type;
1556
1557 /*
1558 * If the memory manager is not ready, we
1559 * cannot make requests.
1560 */
1561 if (!object->pager_ready) {
1562 #if TRACEFAULTPAGE
1563 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1564 #endif
1565 if (m != VM_PAGE_NULL) {
1566 VM_PAGE_FREE(m);
1567 }
1568
1569 /*
1570 * take an extra ref so object won't die
1571 */
1572 vm_object_reference_locked(object);
1573 vm_fault_cleanup(object, first_m);
1574
1575 vm_object_lock(object);
1576 assert(object->ref_count > 0);
1577
1578 if (!object->pager_ready) {
1579 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1580
1581 vm_object_unlock(object);
1582 if (wait_result == THREAD_WAITING) {
1583 wait_result = thread_block(THREAD_CONTINUE_NULL);
1584 }
1585 vm_object_deallocate(object);
1586
1587 goto backoff;
1588 } else {
1589 vm_object_unlock(object);
1590 vm_object_deallocate(object);
1591 thread_interrupt_level(interruptible_state);
1592
1593 return VM_FAULT_RETRY;
1594 }
1595 }
1596 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1597 /*
1598 * If there are too many outstanding page
1599 * requests pending on this external object, we
1600 * wait for them to be resolved now.
1601 */
1602 #if TRACEFAULTPAGE
1603 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1604 #endif
1605 if (m != VM_PAGE_NULL) {
1606 VM_PAGE_FREE(m);
1607 }
1608 /*
1609 * take an extra ref so object won't die
1610 */
1611 vm_object_reference_locked(object);
1612
1613 vm_fault_cleanup(object, first_m);
1614
1615 vm_object_lock(object);
1616 assert(object->ref_count > 0);
1617
1618 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1619 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1620
1621 vm_object_unlock(object);
1622 wait_result = thread_block(THREAD_CONTINUE_NULL);
1623 vm_object_deallocate(object);
1624
1625 goto backoff;
1626 } else {
1627 vm_object_unlock(object);
1628 vm_object_deallocate(object);
1629 thread_interrupt_level(interruptible_state);
1630
1631 return VM_FAULT_RETRY;
1632 }
1633 }
1634 if (object->internal) {
1635 int compressed_count_delta;
1636
1637 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1638
1639 if (m == VM_PAGE_NULL) {
1640 /*
1641 * Allocate a new page for this object/offset pair as a placeholder
1642 */
1643 m = vm_page_grab_options(grab_options);
1644 #if TRACEFAULTPAGE
1645 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1646 #endif
1647 if (m == VM_PAGE_NULL) {
1648 vm_fault_cleanup(object, first_m);
1649 thread_interrupt_level(interruptible_state);
1650
1651 return VM_FAULT_MEMORY_SHORTAGE;
1652 }
1653
1654 m->vmp_absent = TRUE;
1655 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1656 vm_page_insert_internal(m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1657 } else {
1658 vm_page_insert(m, object, vm_object_trunc_page(offset));
1659 }
1660 }
1661 assert(m->vmp_busy);
1662
1663 m->vmp_absent = TRUE;
1664 pager = object->pager;
1665
1666 assert(object->paging_in_progress > 0);
1667 vm_object_unlock(object);
1668
1669 rc = vm_compressor_pager_get(
1670 pager,
1671 offset + object->paging_offset,
1672 VM_PAGE_GET_PHYS_PAGE(m),
1673 &my_fault_type,
1674 0,
1675 &compressed_count_delta);
1676
1677 if (type_of_fault == NULL) {
1678 int throttle_delay;
1679
1680 /*
1681 * we weren't called from vm_fault, so we
1682 * need to apply page creation throttling
1683 * do it before we re-acquire any locks
1684 */
1685 if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1686 if ((throttle_delay = vm_page_throttled(TRUE))) {
1687 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1688 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
1689 }
1690 }
1691 }
1692 vm_object_lock(object);
1693 assert(object->paging_in_progress > 0);
1694
1695 vm_compressor_pager_count(
1696 pager,
1697 compressed_count_delta,
1698 FALSE, /* shared_lock */
1699 object);
1700
1701 switch (rc) {
1702 case KERN_SUCCESS:
1703 m->vmp_absent = FALSE;
1704 m->vmp_dirty = TRUE;
1705 if ((object->wimg_bits &
1706 VM_WIMG_MASK) !=
1707 VM_WIMG_USE_DEFAULT) {
1708 /*
1709 * If the page is not cacheable,
1710 * we can't let its contents
1711 * linger in the data cache
1712 * after the decompression.
1713 */
1714 pmap_sync_page_attributes_phys(
1715 VM_PAGE_GET_PHYS_PAGE(m));
1716 } else {
1717 m->vmp_written_by_kernel = TRUE;
1718 }
1719
1720 /*
1721 * If the object is purgeable, its
1722 * owner's purgeable ledgers have been
1723 * updated in vm_page_insert() but the
1724 * page was also accounted for in a
1725 * "compressed purgeable" ledger, so
1726 * update that now.
1727 */
1728 if (((object->purgable !=
1729 VM_PURGABLE_DENY) ||
1730 object->vo_ledger_tag) &&
1731 (object->vo_owner !=
1732 NULL)) {
1733 /*
1734 * One less compressed
1735 * purgeable/tagged page.
1736 */
1737 vm_object_owner_compressed_update(
1738 object,
1739 -1);
1740 }
1741
1742 break;
1743 case KERN_MEMORY_FAILURE:
1744 m->vmp_unusual = TRUE;
1745 m->vmp_error = TRUE;
1746 m->vmp_absent = FALSE;
1747 break;
1748 case KERN_MEMORY_ERROR:
1749 assert(m->vmp_absent);
1750 break;
1751 default:
1752 panic("vm_fault_page(): unexpected "
1753 "error %d from "
1754 "vm_compressor_pager_get()\n",
1755 rc);
1756 }
1757 PAGE_WAKEUP_DONE(m);
1758
1759 rc = KERN_SUCCESS;
1760 goto data_requested;
1761 }
1762 my_fault_type = DBG_PAGEIN_FAULT;
1763
1764 if (m != VM_PAGE_NULL) {
1765 VM_PAGE_FREE(m);
1766 m = VM_PAGE_NULL;
1767 }
1768
1769 #if TRACEFAULTPAGE
1770 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1771 #endif
1772
1773 /*
1774 * It's possible someone called vm_object_destroy while we weren't
1775 * holding the object lock. If that has happened, then bail out
1776 * here.
1777 */
1778
1779 pager = object->pager;
1780
1781 if (pager == MEMORY_OBJECT_NULL) {
1782 vm_fault_cleanup(object, first_m);
1783 thread_interrupt_level(interruptible_state);
1784 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_NO_PAGER), 0 /* arg */);
1785 return VM_FAULT_MEMORY_ERROR;
1786 }
1787
1788 /*
1789 * We have an absent page in place for the faulting offset,
1790 * so we can release the object lock.
1791 */
1792
1793 if (object->object_is_shared_cache) {
1794 token = thread_priority_floor_start();
1795 /*
1796 * A non-native shared cache object might
1797 * be getting set up in parallel with this
1798 * fault and so we can't assume that this
1799 * check will be valid after we drop the
1800 * object lock below.
1801 */
1802 drop_floor = true;
1803 }
1804
1805 vm_object_unlock(object);
1806
1807 /*
1808 * If this object uses a copy_call strategy,
1809 * and we are interested in a copy of this object
1810 * (having gotten here only by following a
1811 * shadow chain), then tell the memory manager
1812 * via a flag added to the desired_access
1813 * parameter, so that it can detect a race
1814 * between our walking down the shadow chain
1815 * and its pushing pages up into a copy of
1816 * the object that it manages.
1817 */
1818 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1819 wants_copy_flag = VM_PROT_WANTS_COPY;
1820 } else {
1821 wants_copy_flag = VM_PROT_NONE;
1822 }
1823
1824 if (object->copy == first_object) {
1825 /*
1826 * if we issue the memory_object_data_request in
1827 * this state, we are subject to a deadlock with
1828 * the underlying filesystem if it is trying to
1829 * shrink the file resulting in a push of pages
1830 * into the copy object... that push will stall
1831 * on the placeholder page, and if the pushing thread
1832 * is holding a lock that is required on the pagein
1833 * path (such as a truncate lock), we'll deadlock...
1834 * to avoid this potential deadlock, we throw away
1835 * our placeholder page before calling memory_object_data_request
1836 * and force this thread to retry the vm_fault_page after
1837 * we have issued the I/O. the second time through this path
1838 * we will find the page already in the cache (presumably still
1839 * busy waiting for the I/O to complete) and then complete
1840 * the fault w/o having to go through memory_object_data_request again
1841 */
1842 assert(first_m != VM_PAGE_NULL);
1843 assert(VM_PAGE_OBJECT(first_m) == first_object);
1844
1845 vm_object_lock(first_object);
1846 VM_PAGE_FREE(first_m);
1847 vm_object_paging_end(first_object);
1848 vm_object_unlock(first_object);
1849
1850 first_m = VM_PAGE_NULL;
1851 force_fault_retry = TRUE;
1852
1853 vm_fault_page_forced_retry++;
1854 }
1855
1856 if (data_already_requested == TRUE) {
1857 orig_behavior = fault_info->behavior;
1858 orig_cluster_size = fault_info->cluster_size;
1859
1860 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1861 fault_info->cluster_size = PAGE_SIZE;
1862 }
1863 /*
1864 * Call the memory manager to retrieve the data.
1865 */
1866 rc = memory_object_data_request(
1867 pager,
1868 vm_object_trunc_page(offset) + object->paging_offset,
1869 PAGE_SIZE,
1870 access_required | wants_copy_flag,
1871 (memory_object_fault_info_t)fault_info);
1872
1873 if (data_already_requested == TRUE) {
1874 fault_info->behavior = orig_behavior;
1875 fault_info->cluster_size = orig_cluster_size;
1876 } else {
1877 data_already_requested = TRUE;
1878 }
1879
1880 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1881 #if TRACEFAULTPAGE
1882 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1883 #endif
1884 vm_object_lock(object);
1885
1886 if (drop_floor && object->object_is_shared_cache) {
1887 thread_priority_floor_end(&token);
1888 drop_floor = false;
1889 }
1890
1891 data_requested:
1892 if (rc != KERN_SUCCESS) {
1893 vm_fault_cleanup(object, first_m);
1894 thread_interrupt_level(interruptible_state);
1895
1896 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NO_DATA), 0 /* arg */);
1897
1898 return (rc == MACH_SEND_INTERRUPTED) ?
1899 VM_FAULT_INTERRUPTED :
1900 VM_FAULT_MEMORY_ERROR;
1901 } else {
1902 clock_sec_t tv_sec;
1903 clock_usec_t tv_usec;
1904
1905 if (my_fault_type == DBG_PAGEIN_FAULT) {
1906 clock_get_system_microtime(&tv_sec, &tv_usec);
1907 current_thread()->t_page_creation_time = tv_sec;
1908 current_thread()->t_page_creation_count = 0;
1909 }
1910 }
1911 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1912 vm_fault_cleanup(object, first_m);
1913 thread_interrupt_level(interruptible_state);
1914
1915 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
1916 return VM_FAULT_INTERRUPTED;
1917 }
1918 if (force_fault_retry == TRUE) {
1919 vm_fault_cleanup(object, first_m);
1920 thread_interrupt_level(interruptible_state);
1921
1922 return VM_FAULT_RETRY;
1923 }
1924 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1925 /*
1926 * No page here means that the object we
1927 * initially looked up was "physically
1928 * contiguous" (i.e. device memory). However,
1929 * with Virtual VRAM, the object might not
1930 * be backed by that device memory anymore,
1931 * so we're done here only if the object is
1932 * still "phys_contiguous".
1933 * Otherwise, if the object is no longer
1934 * "phys_contiguous", we need to retry the
1935 * page fault against the object's new backing
1936 * store (different memory object).
1937 */
1938 phys_contig_object:
1939 goto done;
1940 }
1941 /*
1942 * potentially a pagein fault
1943 * if we make it through the state checks
1944 * above, than we'll count it as such
1945 */
1946 my_fault = my_fault_type;
1947
1948 /*
1949 * Retry with same object/offset, since new data may
1950 * be in a different page (i.e., m is meaningless at
1951 * this point).
1952 */
1953 continue;
1954 }
1955 dont_look_for_page:
1956 /*
1957 * We get here if the object has no pager, or an existence map
1958 * exists and indicates the page isn't present on the pager
1959 * or we're unwiring a page. If a pager exists, but there
1960 * is no existence map, then the m->vmp_absent case above handles
1961 * the ZF case when the pager can't provide the page
1962 */
1963 #if TRACEFAULTPAGE
1964 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1965 #endif
1966 if (object == first_object) {
1967 first_m = m;
1968 } else {
1969 assert(m == VM_PAGE_NULL);
1970 }
1971
1972 next_object = object->shadow;
1973
1974 if (next_object == VM_OBJECT_NULL) {
1975 /*
1976 * we've hit the bottom of the shadown chain,
1977 * fill the page in the top object with zeros.
1978 */
1979 assert(!must_be_resident);
1980
1981 if (object != first_object) {
1982 vm_object_paging_end(object);
1983 vm_object_unlock(object);
1984
1985 object = first_object;
1986 offset = first_offset;
1987 vm_object_lock(object);
1988 }
1989 m = first_m;
1990 assert(VM_PAGE_OBJECT(m) == object);
1991 first_m = VM_PAGE_NULL;
1992
1993 /*
1994 * check for any conditions that prevent
1995 * us from creating a new zero-fill page
1996 * vm_fault_check will do all of the
1997 * fault cleanup in the case of an error condition
1998 * including resetting the thread_interrupt_level
1999 */
2000 error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
2001
2002 if (error != VM_FAULT_SUCCESS) {
2003 return error;
2004 }
2005
2006 if (m == VM_PAGE_NULL) {
2007 m = vm_page_grab_options(grab_options);
2008
2009 if (m == VM_PAGE_NULL) {
2010 vm_fault_cleanup(object, VM_PAGE_NULL);
2011 thread_interrupt_level(interruptible_state);
2012
2013 return VM_FAULT_MEMORY_SHORTAGE;
2014 }
2015 vm_page_insert(m, object, vm_object_trunc_page(offset));
2016 }
2017 if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
2018 m->vmp_absent = TRUE;
2019 clear_absent_on_error = true;
2020 }
2021
2022 my_fault = vm_fault_zero_page(m, no_zero_fill);
2023
2024 break;
2025 } else {
2026 /*
2027 * Move on to the next object. Lock the next
2028 * object before unlocking the current one.
2029 */
2030 if ((object != first_object) || must_be_resident) {
2031 vm_object_paging_end(object);
2032 }
2033
2034 offset += object->vo_shadow_offset;
2035 fault_info->lo_offset += object->vo_shadow_offset;
2036 fault_info->hi_offset += object->vo_shadow_offset;
2037 access_required = VM_PROT_READ;
2038
2039 vm_object_lock(next_object);
2040 vm_object_unlock(object);
2041
2042 object = next_object;
2043 vm_object_paging_begin(object);
2044 }
2045 }
2046
2047 /*
2048 * PAGE HAS BEEN FOUND.
2049 *
2050 * This page (m) is:
2051 * busy, so that we can play with it;
2052 * not absent, so that nobody else will fill it;
2053 * possibly eligible for pageout;
2054 *
2055 * The top-level page (first_m) is:
2056 * VM_PAGE_NULL if the page was found in the
2057 * top-level object;
2058 * busy, not absent, and ineligible for pageout.
2059 *
2060 * The current object (object) is locked. A paging
2061 * reference is held for the current and top-level
2062 * objects.
2063 */
2064
2065 #if TRACEFAULTPAGE
2066 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
2067 #endif
2068 #if EXTRA_ASSERTIONS
2069 assert(m->vmp_busy && !m->vmp_absent);
2070 assert((first_m == VM_PAGE_NULL) ||
2071 (first_m->vmp_busy && !first_m->vmp_absent &&
2072 !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
2073 #endif /* EXTRA_ASSERTIONS */
2074
2075 /*
2076 * If the page is being written, but isn't
2077 * already owned by the top-level object,
2078 * we have to copy it into a new page owned
2079 * by the top-level object.
2080 */
2081 if (object != first_object) {
2082 #if TRACEFAULTPAGE
2083 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2084 #endif
2085 if (fault_type & VM_PROT_WRITE) {
2086 vm_page_t copy_m;
2087
2088 /*
2089 * We only really need to copy if we
2090 * want to write it.
2091 */
2092 assert(!must_be_resident);
2093
2094 /*
2095 * If we try to collapse first_object at this
2096 * point, we may deadlock when we try to get
2097 * the lock on an intermediate object (since we
2098 * have the bottom object locked). We can't
2099 * unlock the bottom object, because the page
2100 * we found may move (by collapse) if we do.
2101 *
2102 * Instead, we first copy the page. Then, when
2103 * we have no more use for the bottom object,
2104 * we unlock it and try to collapse.
2105 *
2106 * Note that we copy the page even if we didn't
2107 * need to... that's the breaks.
2108 */
2109
2110 /*
2111 * Allocate a page for the copy
2112 */
2113 copy_m = vm_page_grab_options(grab_options);
2114
2115 if (copy_m == VM_PAGE_NULL) {
2116 RELEASE_PAGE(m);
2117
2118 vm_fault_cleanup(object, first_m);
2119 thread_interrupt_level(interruptible_state);
2120
2121 return VM_FAULT_MEMORY_SHORTAGE;
2122 }
2123
2124 vm_page_copy(m, copy_m);
2125
2126 /*
2127 * If another map is truly sharing this
2128 * page with us, we have to flush all
2129 * uses of the original page, since we
2130 * can't distinguish those which want the
2131 * original from those which need the
2132 * new copy.
2133 *
2134 * XXXO If we know that only one map has
2135 * access to this page, then we could
2136 * avoid the pmap_disconnect() call.
2137 */
2138 if (m->vmp_pmapped) {
2139 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2140 }
2141
2142 if (m->vmp_clustered) {
2143 VM_PAGE_COUNT_AS_PAGEIN(m);
2144 VM_PAGE_CONSUME_CLUSTERED(m);
2145 }
2146 assert(!m->vmp_cleaning);
2147
2148 /*
2149 * We no longer need the old page or object.
2150 */
2151 RELEASE_PAGE(m);
2152
2153 /*
2154 * This check helps with marking the object as having a sequential pattern
2155 * Normally we'll miss doing this below because this fault is about COW to
2156 * the first_object i.e. bring page in from disk, push to object above but
2157 * don't update the file object's sequential pattern.
2158 */
2159 if (object->internal == FALSE) {
2160 vm_fault_is_sequential(object, offset, fault_info->behavior);
2161 }
2162
2163 vm_object_paging_end(object);
2164 vm_object_unlock(object);
2165
2166 my_fault = DBG_COW_FAULT;
2167 counter_inc(&vm_statistics_cow_faults);
2168 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2169 counter_inc(¤t_task()->cow_faults);
2170
2171 object = first_object;
2172 offset = first_offset;
2173
2174 vm_object_lock(object);
2175 /*
2176 * get rid of the place holder
2177 * page that we soldered in earlier
2178 */
2179 VM_PAGE_FREE(first_m);
2180 first_m = VM_PAGE_NULL;
2181
2182 /*
2183 * and replace it with the
2184 * page we just copied into
2185 */
2186 assert(copy_m->vmp_busy);
2187 vm_page_insert(copy_m, object, vm_object_trunc_page(offset));
2188 SET_PAGE_DIRTY(copy_m, TRUE);
2189
2190 m = copy_m;
2191 /*
2192 * Now that we've gotten the copy out of the
2193 * way, let's try to collapse the top object.
2194 * But we have to play ugly games with
2195 * paging_in_progress to do that...
2196 */
2197 vm_object_paging_end(object);
2198 vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
2199 vm_object_paging_begin(object);
2200 } else {
2201 *protection &= (~VM_PROT_WRITE);
2202 }
2203 }
2204 /*
2205 * Now check whether the page needs to be pushed into the
2206 * copy object. The use of asymmetric copy on write for
2207 * shared temporary objects means that we may do two copies to
2208 * satisfy the fault; one above to get the page from a
2209 * shadowed object, and one here to push it into the copy.
2210 */
2211 try_failed_count = 0;
2212
2213 while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2214 vm_object_offset_t copy_offset;
2215 vm_page_t copy_m;
2216
2217 #if TRACEFAULTPAGE
2218 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2219 #endif
2220 /*
2221 * If the page is being written, but hasn't been
2222 * copied to the copy-object, we have to copy it there.
2223 */
2224 if ((fault_type & VM_PROT_WRITE) == 0) {
2225 *protection &= ~VM_PROT_WRITE;
2226 break;
2227 }
2228
2229 /*
2230 * If the page was guaranteed to be resident,
2231 * we must have already performed the copy.
2232 */
2233 if (must_be_resident) {
2234 break;
2235 }
2236
2237 /*
2238 * Try to get the lock on the copy_object.
2239 */
2240 if (!vm_object_lock_try(copy_object)) {
2241 vm_object_unlock(object);
2242 try_failed_count++;
2243
2244 mutex_pause(try_failed_count); /* wait a bit */
2245 vm_object_lock(object);
2246
2247 continue;
2248 }
2249 try_failed_count = 0;
2250
2251 /*
2252 * Make another reference to the copy-object,
2253 * to keep it from disappearing during the
2254 * copy.
2255 */
2256 vm_object_reference_locked(copy_object);
2257
2258 /*
2259 * Does the page exist in the copy?
2260 */
2261 copy_offset = first_offset - copy_object->vo_shadow_offset;
2262 copy_offset = vm_object_trunc_page(copy_offset);
2263
2264 if (copy_object->vo_size <= copy_offset) {
2265 /*
2266 * Copy object doesn't cover this page -- do nothing.
2267 */
2268 ;
2269 } else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2270 /*
2271 * Page currently exists in the copy object
2272 */
2273 if (copy_m->vmp_busy) {
2274 /*
2275 * If the page is being brought
2276 * in, wait for it and then retry.
2277 */
2278 RELEASE_PAGE(m);
2279
2280 /*
2281 * take an extra ref so object won't die
2282 */
2283 vm_object_reference_locked(copy_object);
2284 vm_object_unlock(copy_object);
2285 vm_fault_cleanup(object, first_m);
2286
2287 vm_object_lock(copy_object);
2288 assert(copy_object->ref_count > 0);
2289 vm_object_lock_assert_exclusive(copy_object);
2290 copy_object->ref_count--;
2291 assert(copy_object->ref_count > 0);
2292 copy_m = vm_page_lookup(copy_object, copy_offset);
2293
2294 if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2295 PAGE_ASSERT_WAIT(copy_m, interruptible);
2296
2297 vm_object_unlock(copy_object);
2298 wait_result = thread_block(THREAD_CONTINUE_NULL);
2299 vm_object_deallocate(copy_object);
2300
2301 goto backoff;
2302 } else {
2303 vm_object_unlock(copy_object);
2304 vm_object_deallocate(copy_object);
2305 thread_interrupt_level(interruptible_state);
2306
2307 return VM_FAULT_RETRY;
2308 }
2309 }
2310 } else if (!PAGED_OUT(copy_object, copy_offset)) {
2311 /*
2312 * If PAGED_OUT is TRUE, then the page used to exist
2313 * in the copy-object, and has already been paged out.
2314 * We don't need to repeat this. If PAGED_OUT is
2315 * FALSE, then either we don't know (!pager_created,
2316 * for example) or it hasn't been paged out.
2317 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2318 * We must copy the page to the copy object.
2319 *
2320 * Allocate a page for the copy
2321 */
2322 copy_m = vm_page_alloc(copy_object, copy_offset);
2323
2324 if (copy_m == VM_PAGE_NULL) {
2325 RELEASE_PAGE(m);
2326
2327 vm_object_lock_assert_exclusive(copy_object);
2328 copy_object->ref_count--;
2329 assert(copy_object->ref_count > 0);
2330
2331 vm_object_unlock(copy_object);
2332 vm_fault_cleanup(object, first_m);
2333 thread_interrupt_level(interruptible_state);
2334
2335 return VM_FAULT_MEMORY_SHORTAGE;
2336 }
2337 /*
2338 * Must copy page into copy-object.
2339 */
2340 vm_page_copy(m, copy_m);
2341
2342 /*
2343 * If the old page was in use by any users
2344 * of the copy-object, it must be removed
2345 * from all pmaps. (We can't know which
2346 * pmaps use it.)
2347 */
2348 if (m->vmp_pmapped) {
2349 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2350 }
2351
2352 if (m->vmp_clustered) {
2353 VM_PAGE_COUNT_AS_PAGEIN(m);
2354 VM_PAGE_CONSUME_CLUSTERED(m);
2355 }
2356 /*
2357 * If there's a pager, then immediately
2358 * page out this page, using the "initialize"
2359 * option. Else, we use the copy.
2360 */
2361 if ((!copy_object->pager_ready)
2362 || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2363 ) {
2364 vm_page_lockspin_queues();
2365 assert(!m->vmp_cleaning);
2366 vm_page_activate(copy_m);
2367 vm_page_unlock_queues();
2368
2369 SET_PAGE_DIRTY(copy_m, TRUE);
2370 PAGE_WAKEUP_DONE(copy_m);
2371 } else {
2372 assert(copy_m->vmp_busy == TRUE);
2373 assert(!m->vmp_cleaning);
2374
2375 /*
2376 * dirty is protected by the object lock
2377 */
2378 SET_PAGE_DIRTY(copy_m, TRUE);
2379
2380 /*
2381 * The page is already ready for pageout:
2382 * not on pageout queues and busy.
2383 * Unlock everything except the
2384 * copy_object itself.
2385 */
2386 vm_object_unlock(object);
2387
2388 /*
2389 * Write the page to the copy-object,
2390 * flushing it from the kernel.
2391 */
2392 vm_pageout_initialize_page(copy_m);
2393
2394 /*
2395 * Since the pageout may have
2396 * temporarily dropped the
2397 * copy_object's lock, we
2398 * check whether we'll have
2399 * to deallocate the hard way.
2400 */
2401 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2402 vm_object_unlock(copy_object);
2403 vm_object_deallocate(copy_object);
2404 vm_object_lock(object);
2405
2406 continue;
2407 }
2408 /*
2409 * Pick back up the old object's
2410 * lock. [It is safe to do so,
2411 * since it must be deeper in the
2412 * object tree.]
2413 */
2414 vm_object_lock(object);
2415 }
2416
2417 /*
2418 * Because we're pushing a page upward
2419 * in the object tree, we must restart
2420 * any faults that are waiting here.
2421 * [Note that this is an expansion of
2422 * PAGE_WAKEUP that uses the THREAD_RESTART
2423 * wait result]. Can't turn off the page's
2424 * busy bit because we're not done with it.
2425 */
2426 if (m->vmp_wanted) {
2427 m->vmp_wanted = FALSE;
2428 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2429 }
2430 }
2431 /*
2432 * The reference count on copy_object must be
2433 * at least 2: one for our extra reference,
2434 * and at least one from the outside world
2435 * (we checked that when we last locked
2436 * copy_object).
2437 */
2438 vm_object_lock_assert_exclusive(copy_object);
2439 copy_object->ref_count--;
2440 assert(copy_object->ref_count > 0);
2441
2442 vm_object_unlock(copy_object);
2443
2444 break;
2445 }
2446
2447 done:
2448 *result_page = m;
2449 *top_page = first_m;
2450
2451 if (m != VM_PAGE_NULL) {
2452 assert(VM_PAGE_OBJECT(m) == object);
2453
2454 retval = VM_FAULT_SUCCESS;
2455
2456 if (my_fault == DBG_PAGEIN_FAULT) {
2457 VM_PAGE_COUNT_AS_PAGEIN(m);
2458
2459 if (object->internal) {
2460 my_fault = DBG_PAGEIND_FAULT;
2461 } else {
2462 my_fault = DBG_PAGEINV_FAULT;
2463 }
2464
2465 /*
2466 * evaluate access pattern and update state
2467 * vm_fault_deactivate_behind depends on the
2468 * state being up to date
2469 */
2470 vm_fault_is_sequential(object, offset, fault_info->behavior);
2471 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2472 } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2473 /*
2474 * we weren't called from vm_fault, so handle the
2475 * accounting here for hits in the cache
2476 */
2477 if (m->vmp_clustered) {
2478 VM_PAGE_COUNT_AS_PAGEIN(m);
2479 VM_PAGE_CONSUME_CLUSTERED(m);
2480 }
2481 vm_fault_is_sequential(object, offset, fault_info->behavior);
2482 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2483 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2484 VM_STAT_DECOMPRESSIONS();
2485 }
2486 if (type_of_fault) {
2487 *type_of_fault = my_fault;
2488 }
2489 } else {
2490 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUCCESS_NO_PAGE), 0 /* arg */);
2491 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2492 assert(first_m == VM_PAGE_NULL);
2493 assert(object == first_object);
2494 }
2495
2496 thread_interrupt_level(interruptible_state);
2497
2498 #if TRACEFAULTPAGE
2499 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
2500 #endif
2501 return retval;
2502
2503 backoff:
2504 thread_interrupt_level(interruptible_state);
2505
2506 if (wait_result == THREAD_INTERRUPTED) {
2507 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
2508 return VM_FAULT_INTERRUPTED;
2509 }
2510 return VM_FAULT_RETRY;
2511
2512 #undef RELEASE_PAGE
2513 }
2514
#if MACH_ASSERT && (XNU_PLATFORM_WatchOS || __x86_64__)
#define PANIC_ON_CS_KILLED_DEFAULT true
#else
#define PANIC_ON_CS_KILLED_DEFAULT false
#endif
/*
 * Boot-arg tunable "panic_on_cs_killed": when true, a code-signing
 * violation that kills a process also panics the kernel (used below when
 * the tainted page belongs to the shared cache).  Defaults on only for
 * MACH_ASSERT kernels on watchOS / x86_64.
 */
static TUNABLE(bool, panic_on_cs_killed, "panic_on_cs_killed",
    PANIC_ON_CS_KILLED_DEFAULT);

/* BSD-layer helpers used to identify the faulting process in CS logging. */
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
/* Counters: tainted pages rejected vs. accepted at pmap-enter time. */
unsigned long cs_enter_tainted_rejected = 0;
unsigned long cs_enter_tainted_accepted = 0;
2527
2528 /*
2529 * CODE SIGNING:
2530 * When soft faulting a page, we have to validate the page if:
2531 * 1. the page is being mapped in user space
2532 * 2. the page hasn't already been found to be "tainted"
2533 * 3. the page belongs to a code-signed object
2534 * 4. the page has not been validated yet or has been mapped for write.
2535 */
2536 static bool
vm_fault_cs_need_validation(pmap_t pmap,vm_page_t page,vm_object_t page_obj,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset)2537 vm_fault_cs_need_validation(
2538 pmap_t pmap,
2539 vm_page_t page,
2540 vm_object_t page_obj,
2541 vm_map_size_t fault_page_size,
2542 vm_map_offset_t fault_phys_offset)
2543 {
2544 if (pmap == kernel_pmap) {
2545 /* 1 - not user space */
2546 return false;
2547 }
2548 if (!page_obj->code_signed) {
2549 /* 3 - page does not belong to a code-signed object */
2550 return false;
2551 }
2552 if (fault_page_size == PAGE_SIZE) {
2553 /* looking at the whole page */
2554 assertf(fault_phys_offset == 0,
2555 "fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
2556 (uint64_t)fault_page_size,
2557 (uint64_t)fault_phys_offset);
2558 if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
2559 /* 2 - page is all tainted */
2560 return false;
2561 }
2562 if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
2563 !page->vmp_wpmapped) {
2564 /* 4 - already fully validated and never mapped writable */
2565 return false;
2566 }
2567 } else {
2568 /* looking at a specific sub-page */
2569 if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
2570 /* 2 - sub-page was already marked as tainted */
2571 return false;
2572 }
2573 if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) &&
2574 !page->vmp_wpmapped) {
2575 /* 4 - already validated and never mapped writable */
2576 return false;
2577 }
2578 }
2579 /* page needs to be validated */
2580 return true;
2581 }
2582
2583
2584 static bool
vm_fault_cs_page_immutable(vm_page_t m,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,vm_prot_t prot __unused)2585 vm_fault_cs_page_immutable(
2586 vm_page_t m,
2587 vm_map_size_t fault_page_size,
2588 vm_map_offset_t fault_phys_offset,
2589 vm_prot_t prot __unused)
2590 {
2591 if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)
2592 /*&& ((prot) & VM_PROT_EXECUTE)*/) {
2593 return true;
2594 }
2595 return false;
2596 }
2597
2598 static bool
vm_fault_cs_page_nx(vm_page_t m,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset)2599 vm_fault_cs_page_nx(
2600 vm_page_t m,
2601 vm_map_size_t fault_page_size,
2602 vm_map_offset_t fault_phys_offset)
2603 {
2604 return VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2605 }
2606
/*
 * Check if the page being entered into the pmap violates code signing.
 *
 * Validates the page's code signature if needed (which may drop and retake
 * the object lock), then applies the CS enforcement policy checks.
 *
 * Returns KERN_CODESIGN_ERROR when the mapping must be rejected outright
 * (write to an immutable page in a switch-protected switched map, or an
 * executable mapping of an NX-marked page); otherwise KERN_SUCCESS, with
 * *cs_violation set to whether the caller must treat the page as tainted.
 */
static kern_return_t
vm_fault_cs_check_violation(
	bool cs_bypass,
	vm_object_t object,
	vm_page_t m,
	pmap_t pmap,
	vm_prot_t prot,
	vm_prot_t caller_prot,
	vm_map_size_t fault_page_size,
	vm_map_offset_t fault_phys_offset,
	vm_object_fault_info_t fault_info,
	bool map_is_switched,
	bool map_is_switch_protected,
	bool *cs_violation)
{
#if !PMAP_CS
#pragma unused(caller_prot)
#pragma unused(fault_info)
#endif /* !PMAP_CS */
	int cs_enforcement_enabled;
	/*
	 * Validate first (unless bypassed) so the VMP_CS_* state consulted
	 * below is up to date.
	 */
	if (!cs_bypass &&
	    vm_fault_cs_need_validation(pmap, m, object,
	    fault_page_size, fault_phys_offset)) {
		vm_object_lock_assert_exclusive(object);

		/* already validated once: count the re-validation */
		if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) {
			vm_cs_revalidates++;
		}

		/* VM map is locked, so 1 ref will remain on VM object -
		 * so no harm if vm_page_validate_cs drops the object lock */

		vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
	}

	/* If the map is switched, and is switch-protected, we must protect
	 * some pages from being write-faulted: immutable pages because by
	 * definition they may not be written, and executable pages because that
	 * would provide a way to inject unsigned code.
	 * If the page is immutable, we can simply return. However, we can't
	 * immediately determine whether a page is executable anywhere. But,
	 * we can disconnect it everywhere and remove the executable protection
	 * from the current map. We do that below right before we do the
	 * PMAP_ENTER.
	 */
	if (pmap == kernel_pmap) {
		/* kernel fault: cs_enforcement does not apply */
		cs_enforcement_enabled = 0;
	} else {
		cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap);
	}

	/* hard rejection 1: write to an immutable page in a protected switched map */
	if (cs_enforcement_enabled && map_is_switched &&
	    map_is_switch_protected &&
	    vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
	    (prot & VM_PROT_WRITE)) {
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_IMMUTABLE_PAGE_WRITE), 0 /* arg */);
		return KERN_CODESIGN_ERROR;
	}

	/* hard rejection 2: executable mapping of a page CS marks as NX */
	if (cs_enforcement_enabled &&
	    vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) &&
	    (prot & VM_PROT_EXECUTE)) {
		if (cs_debug) {
			printf("page marked to be NX, not letting it be mapped EXEC\n");
		}
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_NX_PAGE_EXEC_MAPPING), 0 /* arg */);
		return KERN_CODESIGN_ERROR;
	}

	/* A page could be tainted, or pose a risk of being tainted later.
	 * Check whether the receiving process wants it, and make it feel
	 * the consequences (that happens in cs_invalid_page()).
	 * For CS Enforcement, two other conditions will
	 * cause that page to be tainted as well:
	 * - pmapping an unsigned page executable - this means unsigned code;
	 * - writeable mapping of a validated page - the content of that page
	 *   can be changed without the kernel noticing, therefore unsigned
	 *   code can be created
	 */
	/* else-if chain: checks are ordered by priority, do not reorder */
	if (cs_bypass) {
		/* code-signing is bypassed */
		*cs_violation = FALSE;
	} else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
		/* tainted page */
		*cs_violation = TRUE;
	} else if (!cs_enforcement_enabled) {
		/* no further code-signing enforcement */
		*cs_violation = FALSE;
	} else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
	    ((prot & VM_PROT_WRITE) ||
	    m->vmp_wpmapped)) {
		/*
		 * The page should be immutable, but is in danger of being
		 * modified.
		 * This is the case where we want policy from the code
		 * directory - is the page immutable or not? For now we have
		 * to assume that code pages will be immutable, data pages not.
		 * We'll assume a page is a code page if it has a code directory
		 * and we fault for execution.
		 * That is good enough since if we faulted the code page for
		 * writing in another map before, it is wpmapped; if we fault
		 * it for writing in this map later it will also be faulted for
		 * executing at the same time; and if we fault for writing in
		 * another map later, we will disconnect it from this pmap so
		 * we'll notice the change.
		 */
		*cs_violation = TRUE;
	} else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
	    (prot & VM_PROT_EXECUTE)
	    ) {
		/* executing an unvalidated page: unsigned code */
		*cs_violation = TRUE;
	} else {
		*cs_violation = FALSE;
	}
	return KERN_SUCCESS;
}
2727
2728 /*
2729 * Handles a code signing violation by either rejecting the page or forcing a disconnect.
2730 * @param must_disconnect This value will be set to true if the caller must disconnect
2731 * this page.
2732 * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2733 */
2734 static kern_return_t
vm_fault_cs_handle_violation(vm_object_t object,vm_page_t m,pmap_t pmap,vm_prot_t prot,vm_map_offset_t vaddr,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,bool map_is_switched,bool map_is_switch_protected,bool * must_disconnect)2735 vm_fault_cs_handle_violation(
2736 vm_object_t object,
2737 vm_page_t m,
2738 pmap_t pmap,
2739 vm_prot_t prot,
2740 vm_map_offset_t vaddr,
2741 vm_map_size_t fault_page_size,
2742 vm_map_offset_t fault_phys_offset,
2743 bool map_is_switched,
2744 bool map_is_switch_protected,
2745 bool *must_disconnect)
2746 {
2747 #if !MACH_ASSERT
2748 #pragma unused(pmap)
2749 #pragma unused(map_is_switch_protected)
2750 #endif /* !MACH_ASSERT */
2751 /*
2752 * We will have a tainted page. Have to handle the special case
2753 * of a switched map now. If the map is not switched, standard
2754 * procedure applies - call cs_invalid_page().
2755 * If the map is switched, the real owner is invalid already.
2756 * There is no point in invalidating the switching process since
2757 * it will not be executing from the map. So we don't call
2758 * cs_invalid_page() in that case.
2759 */
2760 boolean_t reject_page, cs_killed;
2761 kern_return_t kr;
2762 if (map_is_switched) {
2763 assert(pmap == vm_map_pmap(current_thread()->map));
2764 assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2765 reject_page = FALSE;
2766 } else {
2767 if (cs_debug > 5) {
2768 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2769 object->code_signed ? "yes" : "no",
2770 VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2771 VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2772 m->vmp_wpmapped ? "yes" : "no",
2773 (int)prot);
2774 }
2775 reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2776 }
2777
2778 if (reject_page) {
2779 /* reject the invalid page: abort the page fault */
2780 int pid;
2781 const char *procname;
2782 task_t task;
2783 vm_object_t file_object, shadow;
2784 vm_object_offset_t file_offset;
2785 char *pathname, *filename;
2786 vm_size_t pathname_len, filename_len;
2787 boolean_t truncated_path;
2788 #define __PATH_MAX 1024
2789 struct timespec mtime, cs_mtime;
2790 int shadow_depth;
2791 os_reason_t codesigning_exit_reason = OS_REASON_NULL;
2792
2793 kr = KERN_CODESIGN_ERROR;
2794 cs_enter_tainted_rejected++;
2795
2796 /* get process name and pid */
2797 procname = "?";
2798 task = current_task();
2799 pid = proc_selfpid();
2800 if (get_bsdtask_info(task) != NULL) {
2801 procname = proc_name_address(get_bsdtask_info(task));
2802 }
2803
2804 /* get file's VM object */
2805 file_object = object;
2806 file_offset = m->vmp_offset;
2807 for (shadow = file_object->shadow,
2808 shadow_depth = 0;
2809 shadow != VM_OBJECT_NULL;
2810 shadow = file_object->shadow,
2811 shadow_depth++) {
2812 vm_object_lock_shared(shadow);
2813 if (file_object != object) {
2814 vm_object_unlock(file_object);
2815 }
2816 file_offset += file_object->vo_shadow_offset;
2817 file_object = shadow;
2818 }
2819
2820 mtime.tv_sec = 0;
2821 mtime.tv_nsec = 0;
2822 cs_mtime.tv_sec = 0;
2823 cs_mtime.tv_nsec = 0;
2824
2825 /* get file's pathname and/or filename */
2826 pathname = NULL;
2827 filename = NULL;
2828 pathname_len = 0;
2829 filename_len = 0;
2830 truncated_path = FALSE;
2831 /* no pager -> no file -> no pathname, use "<nil>" in that case */
2832 if (file_object->pager != NULL) {
2833 pathname = kalloc_data(__PATH_MAX * 2, Z_WAITOK);
2834 if (pathname) {
2835 pathname[0] = '\0';
2836 pathname_len = __PATH_MAX;
2837 filename = pathname + pathname_len;
2838 filename_len = __PATH_MAX;
2839
2840 if (vnode_pager_get_object_name(file_object->pager,
2841 pathname,
2842 pathname_len,
2843 filename,
2844 filename_len,
2845 &truncated_path) == KERN_SUCCESS) {
2846 /* safety first... */
2847 pathname[__PATH_MAX - 1] = '\0';
2848 filename[__PATH_MAX - 1] = '\0';
2849
2850 vnode_pager_get_object_mtime(file_object->pager,
2851 &mtime,
2852 &cs_mtime);
2853 } else {
2854 kfree_data(pathname, __PATH_MAX * 2);
2855 pathname = NULL;
2856 filename = NULL;
2857 pathname_len = 0;
2858 filename_len = 0;
2859 truncated_path = FALSE;
2860 }
2861 }
2862 }
2863 printf("CODE SIGNING: process %d[%s]: "
2864 "rejecting invalid page at address 0x%llx "
2865 "from offset 0x%llx in file \"%s%s%s\" "
2866 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2867 "(signed:%d validated:%d tainted:%d nx:%d "
2868 "wpmapped:%d dirty:%d depth:%d)\n",
2869 pid, procname, (addr64_t) vaddr,
2870 file_offset,
2871 (pathname ? pathname : "<nil>"),
2872 (truncated_path ? "/.../" : ""),
2873 (truncated_path ? filename : ""),
2874 cs_mtime.tv_sec, cs_mtime.tv_nsec,
2875 ((cs_mtime.tv_sec == mtime.tv_sec &&
2876 cs_mtime.tv_nsec == mtime.tv_nsec)
2877 ? "=="
2878 : "!="),
2879 mtime.tv_sec, mtime.tv_nsec,
2880 object->code_signed,
2881 VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
2882 VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
2883 VMP_CS_NX(m, fault_page_size, fault_phys_offset),
2884 m->vmp_wpmapped,
2885 m->vmp_dirty,
2886 shadow_depth);
2887
2888 /*
2889 * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2890 * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2891 * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2892 * will deal with the segmentation fault.
2893 */
2894 if (cs_killed) {
2895 KDBG(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
2896 pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2897
2898 codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2899 if (codesigning_exit_reason == NULL) {
2900 printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2901 } else {
2902 mach_vm_address_t data_addr = 0;
2903 struct codesigning_exit_reason_info *ceri = NULL;
2904 uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
2905
2906 if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
2907 printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2908 } else {
2909 if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
2910 EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
2911 ceri = (struct codesigning_exit_reason_info *)data_addr;
2912 static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2913
2914 ceri->ceri_virt_addr = vaddr;
2915 ceri->ceri_file_offset = file_offset;
2916 if (pathname) {
2917 strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
2918 } else {
2919 ceri->ceri_pathname[0] = '\0';
2920 }
2921 if (filename) {
2922 strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
2923 } else {
2924 ceri->ceri_filename[0] = '\0';
2925 }
2926 ceri->ceri_path_truncated = (truncated_path ? 1 : 0);
2927 ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2928 ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2929 ceri->ceri_page_modtime_secs = mtime.tv_sec;
2930 ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2931 ceri->ceri_object_codesigned = (object->code_signed);
2932 ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset);
2933 ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset);
2934 ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2935 ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
2936 ceri->ceri_page_slid = 0;
2937 ceri->ceri_page_dirty = (m->vmp_dirty);
2938 ceri->ceri_page_shadow_depth = shadow_depth;
2939 } else {
2940 #if DEBUG || DEVELOPMENT
2941 panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2942 #else
2943 printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2944 #endif /* DEBUG || DEVELOPMENT */
2945 /* Free the buffer */
2946 os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
2947 }
2948 }
2949 }
2950
2951 set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
2952 }
2953 if (panic_on_cs_killed &&
2954 object->object_is_shared_cache) {
2955 char *tainted_contents;
2956 vm_map_offset_t src_vaddr;
2957 src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
2958 tainted_contents = kalloc_data(PAGE_SIZE, Z_WAITOK);
2959 bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
2960 printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
2961 panic("CODE SIGNING: process %d[%s]: "
2962 "rejecting invalid page (phys#0x%x) at address 0x%llx "
2963 "from offset 0x%llx in file \"%s%s%s\" "
2964 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2965 "(signed:%d validated:%d tainted:%d nx:%d"
2966 "wpmapped:%d dirty:%d depth:%d)\n",
2967 pid, procname,
2968 VM_PAGE_GET_PHYS_PAGE(m),
2969 (addr64_t) vaddr,
2970 file_offset,
2971 (pathname ? pathname : "<nil>"),
2972 (truncated_path ? "/.../" : ""),
2973 (truncated_path ? filename : ""),
2974 cs_mtime.tv_sec, cs_mtime.tv_nsec,
2975 ((cs_mtime.tv_sec == mtime.tv_sec &&
2976 cs_mtime.tv_nsec == mtime.tv_nsec)
2977 ? "=="
2978 : "!="),
2979 mtime.tv_sec, mtime.tv_nsec,
2980 object->code_signed,
2981 VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
2982 VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
2983 VMP_CS_NX(m, fault_page_size, fault_phys_offset),
2984 m->vmp_wpmapped,
2985 m->vmp_dirty,
2986 shadow_depth);
2987 }
2988
2989 if (file_object != object) {
2990 vm_object_unlock(file_object);
2991 }
2992 if (pathname_len != 0) {
2993 kfree_data(pathname, __PATH_MAX * 2);
2994 pathname = NULL;
2995 filename = NULL;
2996 }
2997 } else {
2998 /* proceed with the invalid page */
2999 kr = KERN_SUCCESS;
3000 if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
3001 !object->code_signed) {
3002 /*
3003 * This page has not been (fully) validated but
3004 * does not belong to a code-signed object
3005 * so it should not be forcefully considered
3006 * as tainted.
3007 * We're just concerned about it here because
3008 * we've been asked to "execute" it but that
3009 * does not mean that it should cause other
3010 * accesses to fail.
3011 * This happens when a debugger sets a
3012 * breakpoint and we then execute code in
3013 * that page. Marking the page as "tainted"
3014 * would cause any inspection tool ("leaks",
3015 * "vmmap", "CrashReporter", ...) to get killed
3016 * due to code-signing violation on that page,
3017 * even though they're just reading it and not
3018 * executing from it.
3019 */
3020 } else {
3021 /*
3022 * Page might have been tainted before or not;
3023 * now it definitively is. If the page wasn't
3024 * tainted, we must disconnect it from all
3025 * pmaps later, to force existing mappings
3026 * through that code path for re-consideration
3027 * of the validity of that page.
3028 */
3029 if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
3030 *must_disconnect = TRUE;
3031 VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE);
3032 }
3033 }
3034 cs_enter_tainted_accepted++;
3035 }
3036 if (kr != KERN_SUCCESS) {
3037 if (cs_debug) {
3038 printf("CODESIGNING: vm_fault_enter(0x%llx): "
3039 "*** INVALID PAGE ***\n",
3040 (long long)vaddr);
3041 }
3042 #if !SECURE_KERNEL
3043 if (cs_enforcement_panic) {
3044 panic("CODESIGNING: panicking on invalid page");
3045 }
3046 #endif
3047 }
3048 return kr;
3049 }
3050
3051 /*
3052 * Check that the code signature is valid for the given page being inserted into
3053 * the pmap.
3054 *
3055 * @param must_disconnect This value will be set to true if the caller must disconnect
3056 * this page.
3057 * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
3058 */
3059 static kern_return_t
vm_fault_validate_cs(bool cs_bypass,vm_object_t object,vm_page_t m,pmap_t pmap,vm_map_offset_t vaddr,vm_prot_t prot,vm_prot_t caller_prot,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,vm_object_fault_info_t fault_info,bool * must_disconnect)3060 vm_fault_validate_cs(
3061 bool cs_bypass,
3062 vm_object_t object,
3063 vm_page_t m,
3064 pmap_t pmap,
3065 vm_map_offset_t vaddr,
3066 vm_prot_t prot,
3067 vm_prot_t caller_prot,
3068 vm_map_size_t fault_page_size,
3069 vm_map_offset_t fault_phys_offset,
3070 vm_object_fault_info_t fault_info,
3071 bool *must_disconnect)
3072 {
3073 bool map_is_switched, map_is_switch_protected, cs_violation;
3074 kern_return_t kr;
3075 /* Validate code signature if necessary. */
3076 map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
3077 (pmap == vm_map_pmap(current_thread()->map)));
3078 map_is_switch_protected = current_thread()->map->switch_protect;
3079 kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap,
3080 prot, caller_prot, fault_page_size, fault_phys_offset, fault_info,
3081 map_is_switched, map_is_switch_protected, &cs_violation);
3082 if (kr != KERN_SUCCESS) {
3083 return kr;
3084 }
3085 if (cs_violation) {
3086 kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr,
3087 fault_page_size, fault_phys_offset,
3088 map_is_switched, map_is_switch_protected, must_disconnect);
3089 }
3090 return kr;
3091 }
3092
3093 /*
3094 * Enqueue the page on the appropriate paging queue.
3095 */
static void
vm_fault_enqueue_page(
	vm_object_t object,
	vm_page_t m,
	bool wired,
	bool change_wiring,
	vm_tag_t wire_tag,
	bool no_cache,
	int *type_of_fault,
	kern_return_t kr)
{
	/*
	 * Place "m" on the paging queue appropriate to the outcome of the
	 * fault ("kr") and the kind of mapping being established:
	 *  - compressor-owned pages are left untouched,
	 *  - wiring pseudo-faults wire/unwire the page,
	 *  - failed faults deactivate the page,
	 *  - otherwise the page goes to a per-CPU local queue (COW /
	 *    zero-fill), the speculative queue (no_cache) or the active
	 *    queue.
	 * The caller holds the lock on "object"; the global page-queues
	 * lock must NOT be held on entry and is not held on return.
	 */
	assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
	boolean_t page_queues_locked = FALSE;
	/* remember whether the page had ever been entered in any pmap */
	boolean_t previously_pmapped = m->vmp_pmapped;
	/*
	 * Lazily take/drop the global page-queues lock: several paths
	 * below never need it, so it is only acquired on first use and
	 * released once at the bottom of the function.
	 */
#define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()	\
	MACRO_BEGIN					\
	if (! page_queues_locked) {			\
		page_queues_locked = TRUE;		\
		vm_page_lockspin_queues();		\
	}						\
	MACRO_END
#define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()	\
	MACRO_BEGIN					\
	if (page_queues_locked) {			\
		page_queues_locked = FALSE;		\
		vm_page_unlock_queues();		\
	}						\
	MACRO_END

	vm_page_update_special_state(m);
	if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
		/*
		 * Compressor pages are neither wired
		 * nor pageable and should never change.
		 */
		assert(object == compressor_object);
	} else if (change_wiring) {
		__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();

		if (wired) {
			if (kr == KERN_SUCCESS) {
				/* only wire the page if the fault actually succeeded */
				vm_page_wire(m, wire_tag, TRUE);
			}
		} else {
			vm_page_unwire(m, TRUE);
		}
		/* we keep the page queues lock, if we need it later */
	} else {
		if (object->internal == TRUE) {
			/*
			 * don't allow anonymous pages on
			 * the speculative queues
			 */
			no_cache = FALSE;
		}
		if (kr != KERN_SUCCESS) {
			/* the fault failed: make the page easy to reclaim */
			__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
			vm_page_deactivate(m);
			/* we keep the page queues lock, if we need it later */
		} else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
		    (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
		    (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
		    ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
		    !VM_PAGE_WIRED(m)) {
			if (vm_page_local_q &&
			    (*type_of_fault == DBG_COW_FAULT ||
			    *type_of_fault == DBG_ZERO_FILL_FAULT)) {
				struct vpl *lq;
				uint32_t lid;

				assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);

				__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
				vm_object_lock_assert_exclusive(object);

				/*
				 * we got a local queue to stuff this
				 * new page on...
				 * its safe to manipulate local and
				 * local_id at this point since we're
				 * behind an exclusive object lock and
				 * the page is not on any global queue.
				 *
				 * we'll use the current cpu number to
				 * select the queue note that we don't
				 * need to disable preemption... we're
				 * going to be behind the local queue's
				 * lock to do the real work
				 */
				lid = cpu_number();

				lq = zpercpu_get_cpu(vm_page_local_q, lid);

				VPL_LOCK(&lq->vpl_lock);

				vm_page_check_pageable_safe(m);
				vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
				m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
				m->vmp_local_id = lid;
				lq->vpl_count++;

				if (object->internal) {
					lq->vpl_internal_count++;
				} else {
					lq->vpl_external_count++;
				}

				VPL_UNLOCK(&lq->vpl_lock);

				if (lq->vpl_count > vm_page_local_q_soft_limit) {
					/*
					 * we're beyond the soft limit
					 * for the local queue
					 * vm_page_reactivate_local will
					 * 'try' to take the global page
					 * queue lock... if it can't
					 * that's ok... we'll let the
					 * queue continue to grow up
					 * to the hard limit... at that
					 * point we'll wait for the
					 * lock... once we've got the
					 * lock, we'll transfer all of
					 * the pages from the local
					 * queue to the global active
					 * queue
					 */
					vm_page_reactivate_local(lid, FALSE, FALSE);
				}
			} else {
				__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();

				/*
				 * test again now that we hold the
				 * page queue lock
				 */
				if (!VM_PAGE_WIRED(m)) {
					if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
						/* page was pulled off the cleaned queue by this fault */
						vm_page_queues_remove(m, FALSE);

						VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
						VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
					}

					if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
					    no_cache) {
						/*
						 * If this is a no_cache mapping
						 * and the page has never been
						 * mapped before or was
						 * previously a no_cache page,
						 * then we want to leave pages
						 * in the speculative state so
						 * that they can be readily
						 * recycled if free memory runs
						 * low. Otherwise the page is
						 * activated as normal.
						 */

						if (no_cache &&
						    (!previously_pmapped ||
						    m->vmp_no_cache)) {
							m->vmp_no_cache = TRUE;

							if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
								vm_page_speculate(m, FALSE);
							}
						} else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
							vm_page_activate(m);
						}
					}
				}
				/* we keep the page queues lock, if we need it later */
			}
		}
	}
	/* we're done with the page queues lock, if we ever took it */
	__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
}
3274
3275 /*
3276 * Sets the pmmpped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
3277 * @return true if the page needs to be sync'ed via pmap_sync-page_data_physo
3278 * before being inserted into the pmap.
3279 */
static bool
vm_fault_enter_set_mapped(
	vm_object_t object,
	vm_page_t m,
	vm_prot_t prot,
	vm_prot_t fault_type)
{
	/*
	 * Record that "m" is being entered into a pmap: set vmp_pmapped,
	 * and, for executable mappings, vmp_xpmapped; for write faults,
	 * set vmp_wpmapped and charge the logical write to the task.
	 * Returns true when the caller must sync the page's data cache
	 * (via pmap_sync_page_data_phys) before inserting it executable.
	 */
	bool page_needs_sync = false;
	/*
	 * NOTE: we may only hold the vm_object lock SHARED
	 * at this point, so we need the phys_page lock to
	 * properly serialize updating the pmapped and
	 * xpmapped bits
	 */
	if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
		/* possibly the first executable mapping of this page */
		ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);

		pmap_lock_phys_page(phys_page);
		m->vmp_pmapped = TRUE;

		if (!m->vmp_xpmapped) {
			/* re-checked under the phys-page lock: we won the race */
			m->vmp_xpmapped = TRUE;

			pmap_unlock_phys_page(phys_page);

			if (!object->internal) {
				OSAddAtomic(1, &vm_page_xpmapped_external_count);
			}

#if defined(__arm64__)
			/* arm64: always sync on the first executable mapping */
			page_needs_sync = true;
#else
			if (object->internal &&
			    object->pager != NULL) {
				/*
				 * This page could have been
				 * uncompressed by the
				 * compressor pager and its
				 * contents might be only in
				 * the data cache.
				 * Since it's being mapped for
				 * "execute" for the first time,
				 * make sure the icache is in
				 * sync.
				 */
				assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
				page_needs_sync = true;
			}
#endif
		} else {
			/* lost the race: another thread already set xpmapped */
			pmap_unlock_phys_page(phys_page);
		}
	} else {
		if (m->vmp_pmapped == FALSE) {
			ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);

			pmap_lock_phys_page(phys_page);
			m->vmp_pmapped = TRUE;
			pmap_unlock_phys_page(phys_page);
		}
	}

	if (fault_type & VM_PROT_WRITE) {
		if (m->vmp_wpmapped == FALSE) {
			/* updating wpmapped requires the exclusive object lock */
			vm_object_lock_assert_exclusive(object);
			if (!object->internal && object->pager) {
				/* charge this write against the task's logical-write accounting */
				task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
			}
			m->vmp_wpmapped = TRUE;
		}
	}
	return page_needs_sync;
}
3353
3354 /*
3355 * Try to enter the given page into the pmap.
3356 * Will retry without execute permission iff PMAP_CS is enabled and we encounter
3357 * a codesigning failure on a non-execute fault.
3358 */
3359 static kern_return_t
vm_fault_attempt_pmap_enter(pmap_t pmap,vm_map_offset_t vaddr,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,vm_page_t m,vm_prot_t * prot,vm_prot_t caller_prot,vm_prot_t fault_type,bool wired,int pmap_options)3360 vm_fault_attempt_pmap_enter(
3361 pmap_t pmap,
3362 vm_map_offset_t vaddr,
3363 vm_map_size_t fault_page_size,
3364 vm_map_offset_t fault_phys_offset,
3365 vm_page_t m,
3366 vm_prot_t *prot,
3367 vm_prot_t caller_prot,
3368 vm_prot_t fault_type,
3369 bool wired,
3370 int pmap_options)
3371 {
3372 #if !PMAP_CS
3373 #pragma unused(caller_prot)
3374 #endif /* !PMAP_CS */
3375 kern_return_t kr;
3376 if (fault_page_size != PAGE_SIZE) {
3377 DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type);
3378 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
3379 fault_phys_offset < PAGE_SIZE),
3380 "0x%llx\n", (uint64_t)fault_phys_offset);
3381 } else {
3382 assertf(fault_phys_offset == 0,
3383 "0x%llx\n", (uint64_t)fault_phys_offset);
3384 }
3385
3386 PMAP_ENTER_OPTIONS(pmap, vaddr,
3387 fault_phys_offset,
3388 m, *prot, fault_type, 0,
3389 wired,
3390 pmap_options,
3391 kr);
3392 return kr;
3393 }
3394
3395 /*
3396 * Enter the given page into the pmap.
3397 * The map must be locked shared.
3398 * The vm object must NOT be locked.
3399 *
3400 * @param need_retry if not null, avoid making a (potentially) blocking call into
3401 * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3402 */
3403 static kern_return_t
vm_fault_pmap_enter(pmap_t pmap,vm_map_offset_t vaddr,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,vm_page_t m,vm_prot_t * prot,vm_prot_t caller_prot,vm_prot_t fault_type,bool wired,int pmap_options,boolean_t * need_retry)3404 vm_fault_pmap_enter(
3405 pmap_t pmap,
3406 vm_map_offset_t vaddr,
3407 vm_map_size_t fault_page_size,
3408 vm_map_offset_t fault_phys_offset,
3409 vm_page_t m,
3410 vm_prot_t *prot,
3411 vm_prot_t caller_prot,
3412 vm_prot_t fault_type,
3413 bool wired,
3414 int pmap_options,
3415 boolean_t *need_retry)
3416 {
3417 kern_return_t kr;
3418 if (need_retry != NULL) {
3419 /*
3420 * Although we don't hold a lock on this object, we hold a lock
3421 * on the top object in the chain. To prevent a deadlock, we
3422 * can't allow the pmap layer to block.
3423 */
3424 pmap_options |= PMAP_OPTIONS_NOWAIT;
3425 }
3426 kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3427 fault_page_size, fault_phys_offset,
3428 m, prot, caller_prot, fault_type, wired, pmap_options);
3429 if (kr == KERN_RESOURCE_SHORTAGE) {
3430 if (need_retry) {
3431 /*
3432 * There's nothing we can do here since we hold the
3433 * lock on the top object in the chain. The caller
3434 * will need to deal with this by dropping that lock and retrying.
3435 */
3436 *need_retry = TRUE;
3437 vm_pmap_enter_retried++;
3438 }
3439 }
3440 return kr;
3441 }
3442
3443 /*
3444 * Enter the given page into the pmap.
3445 * The vm map must be locked shared.
3446 * The vm object must be locked exclusive, unless this is a soft fault.
3447 * For a soft fault, the object must be locked shared or exclusive.
3448 *
3449 * @param need_retry if not null, avoid making a (potentially) blocking call into
3450 * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3451 */
static kern_return_t
vm_fault_pmap_enter_with_object_lock(
	vm_object_t object,
	pmap_t pmap,
	vm_map_offset_t vaddr,
	vm_map_size_t fault_page_size,
	vm_map_offset_t fault_phys_offset,
	vm_page_t m,
	vm_prot_t *prot,
	vm_prot_t caller_prot,
	vm_prot_t fault_type,
	bool wired,
	int pmap_options,
	boolean_t *need_retry)
{
	kern_return_t kr;
	/*
	 * Prevent a deadlock by not
	 * holding the object lock if we need to wait for a page in
	 * pmap_enter() - <rdar://problem/7138958>
	 *
	 * Strategy: first try a non-blocking PMAP_ENTER while keeping
	 * the object locked; only if that fails for lack of resources
	 * do we either punt to the caller (need_retry) or mark the page
	 * busy, drop the object lock and retry with blocking allowed.
	 */
	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
	    fault_page_size, fault_phys_offset,
	    m, prot, caller_prot, fault_type, wired, pmap_options | PMAP_OPTIONS_NOWAIT);
#if __x86_64__
	if (kr == KERN_INVALID_ARGUMENT &&
	    pmap == PMAP_NULL &&
	    wired) {
		/*
		 * Wiring a page in a pmap-less VM map:
		 * VMware's "vmmon" kernel extension does this
		 * to grab pages.
		 * Let it proceed even though the PMAP_ENTER() failed.
		 */
		kr = KERN_SUCCESS;
	}
#endif /* __x86_64__ */

	if (kr == KERN_RESOURCE_SHORTAGE) {
		if (need_retry) {
			/*
			 * this will be non-null in the case where we hold the lock
			 * on the top-object in this chain... we can't just drop
			 * the lock on the object we're inserting the page into
			 * and recall the PMAP_ENTER since we can still cause
			 * a deadlock if one of the critical paths tries to
			 * acquire the lock on the top-object and we're blocked
			 * in PMAP_ENTER waiting for memory... our only recourse
			 * is to deal with it at a higher level where we can
			 * drop both locks.
			 */
			*need_retry = TRUE;
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PMAP_ENTER_RESOURCE_SHORTAGE), 0 /* arg */);
			vm_pmap_enter_retried++;
			goto done;
		}
		/*
		 * The nonblocking version of pmap_enter did not succeed.
		 * and we don't need to drop other locks and retry
		 * at the level above us, so
		 * use the blocking version instead. Requires marking
		 * the page busy and unlocking the object
		 */
		boolean_t was_busy = m->vmp_busy;

		/* dropping/re-taking the object lock requires it held exclusive */
		vm_object_lock_assert_exclusive(object);

		/* mark the page busy so it can't be recycled while unlocked */
		m->vmp_busy = TRUE;
		vm_object_unlock(object);

		/* blocking retry: the pmap layer may now wait for memory */
		PMAP_ENTER_OPTIONS(pmap, vaddr,
		    fault_phys_offset,
		    m, *prot, fault_type,
		    0, wired,
		    pmap_options, kr);

		assert(VM_PAGE_OBJECT(m) == object);

		/* Take the object lock again. */
		vm_object_lock(object);

		/* If the page was busy, someone else will wake it up.
		 * Otherwise, we have to do it now. */
		assert(m->vmp_busy);
		if (!was_busy) {
			PAGE_WAKEUP_DONE(m);
		}
		vm_pmap_enter_blocked++;
	}

done:
	return kr;
}
3545
3546 /*
3547 * Prepare to enter a page into the pmap by checking CS, protection bits,
3548 * and setting mapped bits on the page_t.
3549 * Does not modify the page's paging queue.
3550 *
3551 * page queue lock must NOT be held
3552 * m->vmp_object must be locked
3553 *
3554 * NOTE: m->vmp_object could be locked "shared" only if we are called
3555 * from vm_fault() as part of a soft fault.
3556 */
static kern_return_t
vm_fault_enter_prepare(
	vm_page_t m,
	pmap_t pmap,
	vm_map_offset_t vaddr,
	vm_prot_t *prot,
	vm_prot_t caller_prot,
	vm_map_size_t fault_page_size,
	vm_map_offset_t fault_phys_offset,
	boolean_t change_wiring,
	vm_prot_t fault_type,
	vm_object_fault_info_t fault_info,
	int *type_of_fault,
	bool *page_needs_data_sync)
{
	/*
	 * Pre-pmap-enter work for "m": possibly downgrade *prot,
	 * account for clustered pageins, validate the code signature,
	 * and set the mapped/xpmapped/wpmapped bits.
	 * On KERN_SUCCESS, *page_needs_data_sync tells the caller
	 * whether to sync the page's data cache before mapping it.
	 */
	kern_return_t kr;
	bool is_tainted = false;
	vm_object_t object;
	boolean_t cs_bypass = fault_info->cs_bypass;

	object = VM_PAGE_OBJECT(m);

	vm_object_lock_assert_held(object);

#if KASAN
	/* keep KASan's shadow in sync for kernel-space mappings */
	if (pmap == kernel_pmap) {
		kasan_notify_address(vaddr, PAGE_SIZE);
	}
#endif

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);

	if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
		vm_object_lock_assert_exclusive(object);
	} else if ((fault_type & VM_PROT_WRITE) == 0 &&
	    !change_wiring &&
	    (!m->vmp_wpmapped
#if VM_OBJECT_ACCESS_TRACKING
	    || object->access_tracking
#endif /* VM_OBJECT_ACCESS_TRACKING */
	    )) {
		/*
		 * This is not a "write" fault, so we
		 * might not have taken the object lock
		 * exclusively and we might not be able
		 * to update the "wpmapped" bit in
		 * vm_fault_enter().
		 * Let's just grant read access to
		 * the page for now and we'll
		 * soft-fault again if we need write
		 * access later...
		 */

		/* This had better not be a JIT page. */
		if (!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
			*prot &= ~VM_PROT_WRITE;
		} else {
			/* prot-policy pages can't be downgraded; only legal with cs_bypass */
			assert(cs_bypass);
		}
	}
	if (m->vmp_pmapped == FALSE) {
		if (m->vmp_clustered) {
			if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
				/*
				 * found it in the cache, but this
				 * is the first fault-in of the page (m->vmp_pmapped == FALSE)
				 * so it must have come in as part of
				 * a cluster... account 1 pagein against it
				 */
				if (object->internal) {
					*type_of_fault = DBG_PAGEIND_FAULT;
				} else {
					*type_of_fault = DBG_PAGEINV_FAULT;
				}

				VM_PAGE_COUNT_AS_PAGEIN(m);
			}
			VM_PAGE_CONSUME_CLUSTERED(m);
		}
	}

	if (*type_of_fault != DBG_COW_FAULT) {
		DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);

		if (pmap == kernel_pmap) {
			DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
		}
	}

	/* code-signing validation: may reduce *prot, or fail the whole fault */
	kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr,
	    *prot, caller_prot, fault_page_size, fault_phys_offset,
	    fault_info, &is_tainted);
	if (kr == KERN_SUCCESS) {
		/*
		 * We either have a good page, or a tainted page that has been accepted by the process.
		 * In both cases the page will be entered into the pmap.
		 */
		*page_needs_data_sync = vm_fault_enter_set_mapped(object, m, *prot, fault_type);
		if ((fault_type & VM_PROT_WRITE) && is_tainted) {
			/*
			 * This page is tainted but we're inserting it anyways.
			 * Since it's writeable, we need to disconnect it from other pmaps
			 * now so those processes can take note.
			 */

			/*
			 * We can only get here
			 * because of the CSE logic
			 */
			assert(pmap_get_vm_map_cs_enforced(pmap));
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
			/*
			 * If we are faulting for a write, we can clear
			 * the execute bit - that will ensure the page is
			 * checked again before being executable, which
			 * protects against a map switch.
			 * This only happens the first time the page
			 * gets tainted, so we won't get stuck here
			 * to make an already writeable page executable.
			 */
			if (!cs_bypass) {
				assert(!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot));
				*prot &= ~VM_PROT_EXECUTE;
			}
		}
		assert(VM_PAGE_OBJECT(m) == object);

#if VM_OBJECT_ACCESS_TRACKING
		/* record this access for objects with access tracking enabled */
		if (object->access_tracking) {
			DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
			if (fault_type & VM_PROT_WRITE) {
				object->access_tracking_writes++;
				vm_object_access_tracking_writes++;
			} else {
				object->access_tracking_reads++;
				vm_object_access_tracking_reads++;
			}
		}
#endif /* VM_OBJECT_ACCESS_TRACKING */
	}

	return kr;
}
3700
3701 /*
3702 * page queue lock must NOT be held
3703 * m->vmp_object must be locked
3704 *
3705 * NOTE: m->vmp_object could be locked "shared" only if we are called
3706 * from vm_fault() as part of a soft fault. If so, we must be
3707 * careful not to modify the VM object in any way that is not
3708 * legal under a shared lock...
3709 */
3710 kern_return_t
vm_fault_enter(vm_page_t m,pmap_t pmap,vm_map_offset_t vaddr,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,vm_prot_t prot,vm_prot_t caller_prot,boolean_t wired,boolean_t change_wiring,vm_tag_t wire_tag,vm_object_fault_info_t fault_info,boolean_t * need_retry,int * type_of_fault)3711 vm_fault_enter(
3712 vm_page_t m,
3713 pmap_t pmap,
3714 vm_map_offset_t vaddr,
3715 vm_map_size_t fault_page_size,
3716 vm_map_offset_t fault_phys_offset,
3717 vm_prot_t prot,
3718 vm_prot_t caller_prot,
3719 boolean_t wired,
3720 boolean_t change_wiring,
3721 vm_tag_t wire_tag,
3722 vm_object_fault_info_t fault_info,
3723 boolean_t *need_retry,
3724 int *type_of_fault)
3725 {
3726 kern_return_t kr;
3727 vm_object_t object;
3728 bool page_needs_data_sync;
3729 vm_prot_t fault_type;
3730 int pmap_options = fault_info->pmap_options;
3731
3732 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
3733 assert(m->vmp_fictitious);
3734 return KERN_SUCCESS;
3735 }
3736
3737 fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
3738
3739 assertf(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL, "m=%p", m);
3740 kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot,
3741 fault_page_size, fault_phys_offset, change_wiring, fault_type,
3742 fault_info, type_of_fault, &page_needs_data_sync);
3743 object = VM_PAGE_OBJECT(m);
3744
3745 vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr);
3746
3747 if (kr == KERN_SUCCESS) {
3748 if (page_needs_data_sync) {
3749 pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
3750 }
3751
3752 kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
3753 fault_page_size, fault_phys_offset, m,
3754 &prot, caller_prot, fault_type, wired, pmap_options, need_retry);
3755 }
3756
3757 return kr;
3758 }
3759
3760 void
vm_pre_fault(vm_map_offset_t vaddr,vm_prot_t prot)3761 vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
3762 {
3763 if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
3764 vm_fault(current_map(), /* map */
3765 vaddr, /* vaddr */
3766 prot, /* fault_type */
3767 FALSE, /* change_wiring */
3768 VM_KERN_MEMORY_NONE, /* tag - not wiring */
3769 THREAD_UNINT, /* interruptible */
3770 NULL, /* caller_pmap */
3771 0 /* caller_pmap_addr */);
3772 }
3773 }
3774
3775
3776 /*
3777 * Routine: vm_fault
3778 * Purpose:
3779 * Handle page faults, including pseudo-faults
3780 * used to change the wiring status of pages.
3781 * Returns:
3782 * Explicit continuations have been removed.
3783 * Implementation:
3784 * vm_fault and vm_fault_page save mucho state
3785 * in the moral equivalent of a closure. The state
3786 * structure is allocated when first entering vm_fault
3787 * and deallocated when leaving vm_fault.
3788 */
3789
3790 extern uint64_t get_current_unique_pid(void);
3791
3792 unsigned long vm_fault_collapse_total = 0;
3793 unsigned long vm_fault_collapse_skipped = 0;
3794
3795
3796 kern_return_t
vm_fault_external(vm_map_t map,vm_map_offset_t vaddr,vm_prot_t fault_type,boolean_t change_wiring,int interruptible,pmap_t caller_pmap,vm_map_offset_t caller_pmap_addr)3797 vm_fault_external(
3798 vm_map_t map,
3799 vm_map_offset_t vaddr,
3800 vm_prot_t fault_type,
3801 boolean_t change_wiring,
3802 int interruptible,
3803 pmap_t caller_pmap,
3804 vm_map_offset_t caller_pmap_addr)
3805 {
3806 return vm_fault_internal(map, vaddr, fault_type, change_wiring,
3807 change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
3808 interruptible, caller_pmap, caller_pmap_addr,
3809 NULL);
3810 }
3811
3812 kern_return_t
vm_fault(vm_map_t map,vm_map_offset_t vaddr,vm_prot_t fault_type,boolean_t change_wiring,vm_tag_t wire_tag,int interruptible,pmap_t caller_pmap,vm_map_offset_t caller_pmap_addr)3813 vm_fault(
3814 vm_map_t map,
3815 vm_map_offset_t vaddr,
3816 vm_prot_t fault_type,
3817 boolean_t change_wiring,
3818 vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3819 int interruptible,
3820 pmap_t caller_pmap,
3821 vm_map_offset_t caller_pmap_addr)
3822 {
3823 return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
3824 interruptible, caller_pmap, caller_pmap_addr,
3825 NULL);
3826 }
3827
3828 static boolean_t
current_proc_is_privileged(void)3829 current_proc_is_privileged(void)
3830 {
3831 return csproc_get_platform_binary(current_proc());
3832 }
3833
3834 uint64_t vm_copied_on_read = 0;
3835
3836 /*
3837 * Cleanup after a vm_fault_enter.
3838 * At this point, the fault should either have failed (kr != KERN_SUCCESS)
3839 * or the page should be in the pmap and on the correct paging queue.
3840 *
3841 * Precondition:
3842 * map must be locked shared.
3843 * m_object must be locked.
3844 * If top_object != VM_OBJECT_NULL, it must be locked.
3845 * real_map must be locked.
3846 *
3847 * Postcondition:
3848 * map will be unlocked
3849 * m_object will be unlocked
3850 * top_object will be unlocked
3851 * If real_map != map, it will be unlocked
3852 */
static void
vm_fault_complete(
	vm_map_t map,
	vm_map_t real_map,
	vm_object_t object,
	vm_object_t m_object,
	vm_page_t m,
	vm_map_offset_t offset,
	vm_map_offset_t trace_real_vaddr,
	vm_object_fault_info_t fault_info,
	vm_prot_t caller_prot,
#if CONFIG_DTRACE
	vm_map_offset_t real_vaddr,
#else
	__unused vm_map_offset_t real_vaddr,
#endif /* CONFIG_DTRACE */
	int type_of_fault,
	boolean_t need_retry,
	kern_return_t kr,
	ppnum_t *physpage_p,
	vm_prot_t prot,
	vm_object_t top_object,
	boolean_t need_collapse,
	vm_map_offset_t cur_offset,
	vm_prot_t fault_type,
	vm_object_t *written_on_object,
	memory_object_t *written_on_pager,
	vm_object_offset_t *written_on_offset)
{
	int event_code = 0;
	/* verify the locking preconditions documented above */
	vm_map_lock_assert_shared(map);
	vm_object_lock_assert_held(m_object);
	if (top_object != VM_OBJECT_NULL) {
		vm_object_lock_assert_held(top_object);
	}
	vm_map_lock_assert_held(real_map);

	/*
	 * Classify the fault by the page's backing store for the
	 * working-set tracepoint: anonymous/internal memory, the
	 * shared cache, or some other external (file-backed) object.
	 */
	if (m_object->internal) {
		event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
	} else if (m_object->object_is_shared_cache) {
		event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
	} else {
		event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
	}
	KDBG_RELEASE(event_code | DBG_FUNC_NONE, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid());
	if (need_retry == FALSE) {
		/* fault resolved on the first pass: record it as a "fast" fault */
		KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid());
	}
	DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
	if (kr == KERN_SUCCESS &&
	    physpage_p != NULL) {
		/* for vm_map_wire_and_extract() */
		*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
		if (prot & VM_PROT_WRITE) {
			/* caller will write through this physical page: mark it dirty now */
			vm_object_lock_assert_exclusive(m_object);
			m->vmp_dirty = TRUE;
		}
	}

	if (top_object != VM_OBJECT_NULL) {
		/*
		 * It's safe to drop the top object
		 * now that we've done our
		 * vm_fault_enter(). Any other fault
		 * in progress for that virtual
		 * address will either find our page
		 * and translation or put in a new page
		 * and translation.
		 */
		vm_object_unlock(top_object);
		top_object = VM_OBJECT_NULL;
	}

	if (need_collapse == TRUE) {
		/* a copy-on-write push made the shadow chain collapsible */
		vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
	}

	if (need_retry == FALSE &&
	    (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
		/*
		 * evaluate access pattern and update state
		 * vm_fault_deactivate_behind depends on the
		 * state being up to date
		 */
		vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior);

		vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior);
	}
	/*
	 * That's it, clean up and return.
	 */
	if (m->vmp_busy) {
		/* wake up anyone waiting on this page and clear its busy bit */
		vm_object_lock_assert_exclusive(m_object);
		PAGE_WAKEUP_DONE(m);
	}

	if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
		/*
		 * Write fault on a file-backed page: take a paging
		 * reference and report the object/pager/offset back to
		 * the caller (via the written_on_* out-parameters) so it
		 * can notify the pager after all locks are dropped.
		 */
		vm_object_paging_begin(m_object);

		assert(*written_on_object == VM_OBJECT_NULL);
		*written_on_object = m_object;
		*written_on_pager = m_object->pager;
		*written_on_offset = m_object->paging_offset + m->vmp_offset;
	}
	/* NOTE(review): m_object appears to be released here via "object" —
	 * presumably object == m_object on this path; confirm with callers. */
	vm_object_unlock(object);

	vm_map_unlock_read(map);
	if (real_map != map) {
		vm_map_unlock(real_map);
	}
}
3964
3965 static inline int
vm_fault_type_for_tracing(boolean_t need_copy_on_read,int type_of_fault)3966 vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
3967 {
3968 if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
3969 return DBG_COR_FAULT;
3970 }
3971 return type_of_fault;
3972 }
3973
/*
 * Counters tracking the "resilient media" fault recovery path in
 * vm_fault_internal(): how many recoveries were initiated, retried,
 * allowed to proceed, released, or aborted at each bail-out point.
 */
uint64_t vm_fault_resilient_media_initiate = 0;
uint64_t vm_fault_resilient_media_retry = 0;
uint64_t vm_fault_resilient_media_proceed = 0;
uint64_t vm_fault_resilient_media_release = 0;
uint64_t vm_fault_resilient_media_abort1 = 0;
uint64_t vm_fault_resilient_media_abort2 = 0;

#if MACH_ASSERT
/*
 * Error-injection knobs for exercising the resilient-media path
 * (debug kernels only): a non-zero *_rate value N makes the matching
 * check in vm_fault_internal() inject a simulated failure on every
 * Nth opportunity (the paired counter counts opportunities); 0
 * disables injection.
 */
int vm_fault_resilient_media_inject_error1_rate = 0;
int vm_fault_resilient_media_inject_error1 = 0;
int vm_fault_resilient_media_inject_error2_rate = 0;
int vm_fault_resilient_media_inject_error2 = 0;
int vm_fault_resilient_media_inject_error3_rate = 0;
int vm_fault_resilient_media_inject_error3 = 0;
#endif /* MACH_ASSERT */
3989
3990 kern_return_t
vm_fault_internal(vm_map_t map,vm_map_offset_t vaddr,vm_prot_t caller_prot,boolean_t change_wiring,vm_tag_t wire_tag,int interruptible,pmap_t caller_pmap,vm_map_offset_t caller_pmap_addr,ppnum_t * physpage_p)3991 vm_fault_internal(
3992 vm_map_t map,
3993 vm_map_offset_t vaddr,
3994 vm_prot_t caller_prot,
3995 boolean_t change_wiring,
3996 vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3997 int interruptible,
3998 pmap_t caller_pmap,
3999 vm_map_offset_t caller_pmap_addr,
4000 ppnum_t *physpage_p)
4001 {
4002 vm_map_version_t version; /* Map version for verificiation */
4003 boolean_t wired; /* Should mapping be wired down? */
4004 vm_object_t object; /* Top-level object */
4005 vm_object_offset_t offset; /* Top-level offset */
4006 vm_prot_t prot; /* Protection for mapping */
4007 vm_object_t old_copy_object; /* Saved copy object */
4008 vm_page_t result_page; /* Result of vm_fault_page */
4009 vm_page_t top_page; /* Placeholder page */
4010 kern_return_t kr;
4011
4012 vm_page_t m; /* Fast access to result_page */
4013 kern_return_t error_code;
4014 vm_object_t cur_object;
4015 vm_object_t m_object = NULL;
4016 vm_object_offset_t cur_offset;
4017 vm_page_t cur_m;
4018 vm_object_t new_object;
4019 int type_of_fault;
4020 pmap_t pmap;
4021 wait_interrupt_t interruptible_state;
4022 vm_map_t real_map = map;
4023 vm_map_t original_map = map;
4024 bool object_locks_dropped = FALSE;
4025 vm_prot_t fault_type;
4026 vm_prot_t original_fault_type;
4027 struct vm_object_fault_info fault_info = {};
4028 bool need_collapse = FALSE;
4029 boolean_t need_retry = FALSE;
4030 boolean_t *need_retry_ptr = NULL;
4031 uint8_t object_lock_type = 0;
4032 uint8_t cur_object_lock_type;
4033 vm_object_t top_object = VM_OBJECT_NULL;
4034 vm_object_t written_on_object = VM_OBJECT_NULL;
4035 memory_object_t written_on_pager = NULL;
4036 vm_object_offset_t written_on_offset = 0;
4037 int throttle_delay;
4038 int compressed_count_delta;
4039 uint8_t grab_options;
4040 bool need_copy;
4041 bool need_copy_on_read;
4042 vm_map_offset_t trace_vaddr;
4043 vm_map_offset_t trace_real_vaddr;
4044 vm_map_size_t fault_page_size;
4045 vm_map_size_t fault_page_mask;
4046 int fault_page_shift;
4047 vm_map_offset_t fault_phys_offset;
4048 vm_map_offset_t real_vaddr;
4049 bool resilient_media_retry = false;
4050 bool resilient_media_ref_transfer = false;
4051 vm_object_t resilient_media_object = VM_OBJECT_NULL;
4052 vm_object_offset_t resilient_media_offset = (vm_object_offset_t)-1;
4053 bool page_needs_data_sync = false;
4054 /*
4055 * Was the VM object contended when vm_map_lookup_and_lock_object locked it?
4056 * If so, the zero fill path will drop the lock
4057 * NB: Ideally we would always drop the lock rather than rely on
4058 * this heuristic, but vm_object_unlock currently takes > 30 cycles.
4059 */
4060 bool object_is_contended = false;
4061
4062 real_vaddr = vaddr;
4063 trace_real_vaddr = vaddr;
4064
4065 /*
4066 * Some (kernel) submaps are marked with "should never fault".
4067 *
4068 * We do this for two reasons:
4069 * - PGZ which is inside the zone map range can't go down the normal
4070 * lookup path (vm_map_lookup_entry() would panic).
4071 *
4072 * - we want for guard pages to not have to use fictitious pages at all
4073 * to prevent from ZFOD pages to be made.
4074 *
4075 * We also want capture the fault address easily so that the zone
4076 * allocator might present an enhanced panic log.
4077 */
4078 if (map->never_faults || (pgz_owned(vaddr) && map->pmap == kernel_pmap)) {
4079 assert(map->pmap == kernel_pmap);
4080 panic_fault_address = vaddr;
4081 return KERN_INVALID_ADDRESS;
4082 }
4083
4084 if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
4085 fault_phys_offset = (vm_map_offset_t)-1;
4086 fault_page_size = VM_MAP_PAGE_SIZE(original_map);
4087 fault_page_mask = VM_MAP_PAGE_MASK(original_map);
4088 fault_page_shift = VM_MAP_PAGE_SHIFT(original_map);
4089 if (fault_page_size < PAGE_SIZE) {
4090 DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
4091 vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
4092 }
4093 } else {
4094 fault_phys_offset = 0;
4095 fault_page_size = PAGE_SIZE;
4096 fault_page_mask = PAGE_MASK;
4097 fault_page_shift = PAGE_SHIFT;
4098 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
4099 }
4100
4101 if (map == kernel_map) {
4102 trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
4103 trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
4104 } else {
4105 trace_vaddr = vaddr;
4106 }
4107
4108 KDBG_RELEASE(
4109 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
4110 ((uint64_t)trace_vaddr >> 32),
4111 trace_vaddr,
4112 (map == kernel_map));
4113
4114 if (get_preemption_level() != 0) {
4115 KDBG_RELEASE(
4116 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4117 ((uint64_t)trace_vaddr >> 32),
4118 trace_vaddr,
4119 KERN_FAILURE);
4120
4121 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NONZERO_PREEMPTION_LEVEL), 0 /* arg */);
4122 return KERN_FAILURE;
4123 }
4124
4125 thread_t cthread = current_thread();
4126 bool rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
4127 uint64_t fstart = 0;
4128
4129 if (rtfault) {
4130 fstart = mach_continuous_time();
4131 }
4132
4133 interruptible_state = thread_interrupt_level(interruptible);
4134
4135 fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
4136
4137 counter_inc(&vm_statistics_faults);
4138 counter_inc(¤t_task()->faults);
4139 original_fault_type = fault_type;
4140
4141 need_copy = FALSE;
4142 if (fault_type & VM_PROT_WRITE) {
4143 need_copy = TRUE;
4144 }
4145
4146 if (need_copy || change_wiring) {
4147 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4148 } else {
4149 object_lock_type = OBJECT_LOCK_SHARED;
4150 }
4151
4152 cur_object_lock_type = OBJECT_LOCK_SHARED;
4153
4154 if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
4155 if (compressor_map) {
4156 if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
4157 panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
4158 }
4159 }
4160 }
4161 RetryFault:
4162 assert(written_on_object == VM_OBJECT_NULL);
4163
4164 /*
4165 * assume we will hit a page in the cache
4166 * otherwise, explicitly override with
4167 * the real fault type once we determine it
4168 */
4169 type_of_fault = DBG_CACHE_HIT_FAULT;
4170
4171 /*
4172 * Find the backing store object and offset into
4173 * it to begin the search.
4174 */
4175 fault_type = original_fault_type;
4176 map = original_map;
4177 vm_map_lock_read(map);
4178
4179 if (resilient_media_retry) {
4180 /*
4181 * If we have to insert a fake zero-filled page to hide
4182 * a media failure to provide the real page, we need to
4183 * resolve any pending copy-on-write on this mapping.
4184 * VM_PROT_COPY tells vm_map_lookup_and_lock_object() to deal
4185 * with that even if this is not a "write" fault.
4186 */
4187 need_copy = TRUE;
4188 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4189 vm_fault_resilient_media_retry++;
4190 }
4191
4192 kr = vm_map_lookup_and_lock_object(&map, vaddr,
4193 (fault_type | (need_copy ? VM_PROT_COPY : 0)),
4194 object_lock_type, &version,
4195 &object, &offset, &prot, &wired,
4196 &fault_info,
4197 &real_map,
4198 &object_is_contended);
4199
4200 if (kr != KERN_SUCCESS) {
4201 vm_map_unlock_read(map);
4202 /*
4203 * This can be seen in a crash report if indeed the
4204 * thread is crashing due to an invalid access in a non-existent
4205 * range.
4206 * Turning this OFF for now because it is noisy and not always fatal
4207 * eg prefaulting.
4208 *
4209 * if (kr == KERN_INVALID_ADDRESS) {
4210 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0);
4211 * }
4212 */
4213 goto done;
4214 }
4215
4216
4217 pmap = real_map->pmap;
4218 fault_info.interruptible = interruptible;
4219 fault_info.stealth = FALSE;
4220 fault_info.io_sync = FALSE;
4221 fault_info.mark_zf_absent = FALSE;
4222 fault_info.batch_pmap_op = FALSE;
4223
4224 if (resilient_media_retry) {
4225 /*
4226 * We're retrying this fault after having detected a media
4227 * failure from a "resilient_media" mapping.
4228 * Check that the mapping is still pointing at the object
4229 * that just failed to provide a page.
4230 */
4231 assert(resilient_media_object != VM_OBJECT_NULL);
4232 assert(resilient_media_offset != (vm_object_offset_t)-1);
4233 if ((object != VM_OBJECT_NULL &&
4234 object == resilient_media_object &&
4235 offset == resilient_media_offset &&
4236 fault_info.resilient_media)
4237 #if MACH_ASSERT
4238 && (vm_fault_resilient_media_inject_error1_rate == 0 ||
4239 (++vm_fault_resilient_media_inject_error1 % vm_fault_resilient_media_inject_error1_rate) != 0)
4240 #endif /* MACH_ASSERT */
4241 ) {
4242 /*
4243 * This mapping still points at the same object
4244 * and is still "resilient_media": proceed in
4245 * "recovery-from-media-failure" mode, where we'll
4246 * insert a zero-filled page in the top object.
4247 */
4248 // printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
4249 vm_fault_resilient_media_proceed++;
4250 } else {
4251 /* not recovering: reset state and retry fault */
4252 // printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
4253 vm_object_unlock(object);
4254 if (real_map != map) {
4255 vm_map_unlock(real_map);
4256 }
4257 vm_map_unlock_read(map);
4258 /* release our extra reference on failed object */
4259 // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
4260 vm_object_lock_assert_notheld(resilient_media_object);
4261 vm_object_deallocate(resilient_media_object);
4262 resilient_media_object = VM_OBJECT_NULL;
4263 resilient_media_offset = (vm_object_offset_t)-1;
4264 resilient_media_retry = false;
4265 vm_fault_resilient_media_abort1++;
4266 goto RetryFault;
4267 }
4268 } else {
4269 assert(resilient_media_object == VM_OBJECT_NULL);
4270 resilient_media_offset = (vm_object_offset_t)-1;
4271 }
4272
4273 /*
4274 * If the page is wired, we must fault for the current protection
4275 * value, to avoid further faults.
4276 */
4277 if (wired) {
4278 fault_type = prot | VM_PROT_WRITE;
4279 }
4280 if (wired || need_copy) {
4281 /*
4282 * since we're treating this fault as a 'write'
4283 * we must hold the top object lock exclusively
4284 */
4285 if (object_lock_type == OBJECT_LOCK_SHARED) {
4286 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4287
4288 if (vm_object_lock_upgrade(object) == FALSE) {
4289 /*
4290 * couldn't upgrade, so explictly
4291 * take the lock exclusively
4292 */
4293 vm_object_lock(object);
4294 }
4295 }
4296 }
4297
4298 #if VM_FAULT_CLASSIFY
4299 /*
4300 * Temporary data gathering code
4301 */
4302 vm_fault_classify(object, offset, fault_type);
4303 #endif
4304 /*
4305 * Fast fault code. The basic idea is to do as much as
4306 * possible while holding the map lock and object locks.
4307 * Busy pages are not used until the object lock has to
4308 * be dropped to do something (copy, zero fill, pmap enter).
4309 * Similarly, paging references aren't acquired until that
4310 * point, and object references aren't used.
4311 *
4312 * If we can figure out what to do
4313 * (zero fill, copy on write, pmap enter) while holding
4314 * the locks, then it gets done. Otherwise, we give up,
4315 * and use the original fault path (which doesn't hold
4316 * the map lock, and relies on busy pages).
4317 * The give up cases include:
4318 * - Have to talk to pager.
4319 * - Page is busy, absent or in error.
4320 * - Pager has locked out desired access.
4321 * - Fault needs to be restarted.
4322 * - Have to push page into copy object.
4323 *
4324 * The code is an infinite loop that moves one level down
4325 * the shadow chain each time. cur_object and cur_offset
4326 * refer to the current object being examined. object and offset
4327 * are the original object from the map. The loop is at the
4328 * top level if and only if object and cur_object are the same.
4329 *
4330 * Invariants: Map lock is held throughout. Lock is held on
4331 * original object and cur_object (if different) when
4332 * continuing or exiting loop.
4333 *
4334 */
4335
4336 #if defined(__arm64__)
4337 /*
4338 * Fail if reading an execute-only page in a
4339 * pmap that enforces execute-only protection.
4340 */
4341 if (fault_type == VM_PROT_READ &&
4342 (prot & VM_PROT_EXECUTE) &&
4343 !(prot & VM_PROT_READ) &&
4344 pmap_enforces_execute_only(pmap)) {
4345 vm_object_unlock(object);
4346 vm_map_unlock_read(map);
4347 if (real_map != map) {
4348 vm_map_unlock(real_map);
4349 }
4350 kr = KERN_PROTECTION_FAILURE;
4351 goto done;
4352 }
4353 #endif
4354
4355 fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
4356
4357 /*
4358 * If this page is to be inserted in a copy delay object
4359 * for writing, and if the object has a copy, then the
4360 * copy delay strategy is implemented in the slow fault page.
4361 */
4362 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
4363 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
4364 goto handle_copy_delay;
4365 }
4366
4367 cur_object = object;
4368 cur_offset = offset;
4369
4370 grab_options = 0;
4371 #if CONFIG_SECLUDED_MEMORY
4372 if (object->can_grab_secluded) {
4373 grab_options |= VM_PAGE_GRAB_SECLUDED;
4374 }
4375 #endif /* CONFIG_SECLUDED_MEMORY */
4376
4377 while (TRUE) {
4378 if (!cur_object->pager_created &&
4379 cur_object->phys_contiguous) { /* superpage */
4380 break;
4381 }
4382
4383 if (cur_object->blocked_access) {
4384 /*
4385 * Access to this VM object has been blocked.
4386 * Let the slow path handle it.
4387 */
4388 break;
4389 }
4390
4391 m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset));
4392 m_object = NULL;
4393
4394 if (m != VM_PAGE_NULL) {
4395 m_object = cur_object;
4396
4397 if (m->vmp_busy) {
4398 wait_result_t result;
4399
4400 /*
4401 * in order to do the PAGE_ASSERT_WAIT, we must
4402 * have object that 'm' belongs to locked exclusively
4403 */
4404 if (object != cur_object) {
4405 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4406 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4407
4408 if (vm_object_lock_upgrade(cur_object) == FALSE) {
4409 /*
4410 * couldn't upgrade so go do a full retry
4411 * immediately since we can no longer be
4412 * certain about cur_object (since we
4413 * don't hold a reference on it)...
4414 * first drop the top object lock
4415 */
4416 vm_object_unlock(object);
4417
4418 vm_map_unlock_read(map);
4419 if (real_map != map) {
4420 vm_map_unlock(real_map);
4421 }
4422
4423 goto RetryFault;
4424 }
4425 }
4426 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4427 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4428
4429 if (vm_object_lock_upgrade(object) == FALSE) {
4430 /*
4431 * couldn't upgrade, so explictly take the lock
4432 * exclusively and go relookup the page since we
4433 * will have dropped the object lock and
4434 * a different thread could have inserted
4435 * a page at this offset
4436 * no need for a full retry since we're
4437 * at the top level of the object chain
4438 */
4439 vm_object_lock(object);
4440
4441 continue;
4442 }
4443 }
4444 if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
4445 /*
4446 * m->vmp_busy == TRUE and the object is locked exclusively
4447 * if m->pageout_queue == TRUE after we acquire the
4448 * queues lock, we are guaranteed that it is stable on
4449 * the pageout queue and therefore reclaimable
4450 *
4451 * NOTE: this is only true for the internal pageout queue
4452 * in the compressor world
4453 */
4454 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
4455
4456 vm_page_lock_queues();
4457
4458 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
4459 vm_pageout_throttle_up(m);
4460 vm_page_unlock_queues();
4461
4462 PAGE_WAKEUP_DONE(m);
4463 goto reclaimed_from_pageout;
4464 }
4465 vm_page_unlock_queues();
4466 }
4467 if (object != cur_object) {
4468 vm_object_unlock(object);
4469 }
4470
4471 vm_map_unlock_read(map);
4472 if (real_map != map) {
4473 vm_map_unlock(real_map);
4474 }
4475
4476 result = PAGE_ASSERT_WAIT(m, interruptible);
4477
4478 vm_object_unlock(cur_object);
4479
4480 if (result == THREAD_WAITING) {
4481 result = thread_block(THREAD_CONTINUE_NULL);
4482 }
4483 if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
4484 goto RetryFault;
4485 }
4486
4487 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), 0 /* arg */);
4488 kr = KERN_ABORTED;
4489 goto done;
4490 }
4491 reclaimed_from_pageout:
4492 if (m->vmp_laundry) {
4493 if (object != cur_object) {
4494 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4495 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4496
4497 vm_object_unlock(object);
4498 vm_object_unlock(cur_object);
4499
4500 vm_map_unlock_read(map);
4501 if (real_map != map) {
4502 vm_map_unlock(real_map);
4503 }
4504
4505 goto RetryFault;
4506 }
4507 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4508 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4509
4510 if (vm_object_lock_upgrade(object) == FALSE) {
4511 /*
4512 * couldn't upgrade, so explictly take the lock
4513 * exclusively and go relookup the page since we
4514 * will have dropped the object lock and
4515 * a different thread could have inserted
4516 * a page at this offset
4517 * no need for a full retry since we're
4518 * at the top level of the object chain
4519 */
4520 vm_object_lock(object);
4521
4522 continue;
4523 }
4524 }
4525 vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
4526 vm_pageout_steal_laundry(m, FALSE);
4527 }
4528
4529
4530 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
4531 /*
4532 * Guard page: let the slow path deal with it
4533 */
4534 break;
4535 }
4536 if (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
4537 /*
4538 * Unusual case... let the slow path deal with it
4539 */
4540 break;
4541 }
4542 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
4543 if (object != cur_object) {
4544 vm_object_unlock(object);
4545 }
4546 vm_map_unlock_read(map);
4547 if (real_map != map) {
4548 vm_map_unlock(real_map);
4549 }
4550 vm_object_unlock(cur_object);
4551 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
4552 kr = KERN_MEMORY_ERROR;
4553 goto done;
4554 }
4555 assert(m_object == VM_PAGE_OBJECT(m));
4556
4557 if (vm_fault_cs_need_validation(map->pmap, m, m_object,
4558 PAGE_SIZE, 0) ||
4559 (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
4560 upgrade_lock_and_retry:
4561 /*
4562 * We might need to validate this page
4563 * against its code signature, so we
4564 * want to hold the VM object exclusively.
4565 */
4566 if (object != cur_object) {
4567 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4568 vm_object_unlock(object);
4569 vm_object_unlock(cur_object);
4570
4571 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4572
4573 vm_map_unlock_read(map);
4574 if (real_map != map) {
4575 vm_map_unlock(real_map);
4576 }
4577
4578 goto RetryFault;
4579 }
4580 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4581 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4582
4583 if (vm_object_lock_upgrade(object) == FALSE) {
4584 /*
4585 * couldn't upgrade, so explictly take the lock
4586 * exclusively and go relookup the page since we
4587 * will have dropped the object lock and
4588 * a different thread could have inserted
4589 * a page at this offset
4590 * no need for a full retry since we're
4591 * at the top level of the object chain
4592 */
4593 vm_object_lock(object);
4594
4595 continue;
4596 }
4597 }
4598 }
4599 /*
4600 * Two cases of map in faults:
4601 * - At top level w/o copy object.
4602 * - Read fault anywhere.
4603 * --> must disallow write.
4604 */
4605
4606 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
4607 goto FastPmapEnter;
4608 }
4609
4610 if (!need_copy &&
4611 !fault_info.no_copy_on_read &&
4612 cur_object != object &&
4613 !cur_object->internal &&
4614 !cur_object->pager_trusted &&
4615 vm_protect_privileged_from_untrusted &&
4616 !cur_object->code_signed &&
4617 current_proc_is_privileged()) {
4618 /*
4619 * We're faulting on a page in "object" and
4620 * went down the shadow chain to "cur_object"
4621 * to find out that "cur_object"'s pager
4622 * is not "trusted", i.e. we can not trust it
4623 * to always return the same contents.
4624 * Since the target is a "privileged" process,
4625 * let's treat this as a copy-on-read fault, as
4626 * if it was a copy-on-write fault.
4627 * Once "object" gets a copy of this page, it
4628 * won't have to rely on "cur_object" to
4629 * provide the contents again.
4630 *
4631 * This is done by setting "need_copy" and
4632 * retrying the fault from the top with the
4633 * appropriate locking.
4634 *
4635 * Special case: if the mapping is executable
4636 * and the untrusted object is code-signed and
4637 * the process is "cs_enforced", we do not
4638 * copy-on-read because that would break
4639 * code-signing enforcement expectations (an
4640 * executable page must belong to a code-signed
4641 * object) and we can rely on code-signing
4642 * to re-validate the page if it gets evicted
4643 * and paged back in.
4644 */
4645 // printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4646 vm_copied_on_read++;
4647 need_copy = TRUE;
4648
4649 vm_object_unlock(object);
4650 vm_object_unlock(cur_object);
4651 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4652 vm_map_unlock_read(map);
4653 if (real_map != map) {
4654 vm_map_unlock(real_map);
4655 }
4656 goto RetryFault;
4657 }
4658
4659 if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
4660 if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
4661 prot &= ~VM_PROT_WRITE;
4662 } else {
4663 /*
4664 * For a protection that the pmap cares
4665 * about, we must hand over the full
4666 * set of protections (so that the pmap
4667 * layer can apply any desired policy).
4668 * This means that cs_bypass must be
4669 * set, as this can force us to pass
4670 * RWX.
4671 */
4672 assert(fault_info.cs_bypass);
4673 }
4674
4675 if (object != cur_object) {
4676 /*
4677 * We still need to hold the top object
4678 * lock here to prevent a race between
4679 * a read fault (taking only "shared"
4680 * locks) and a write fault (taking
4681 * an "exclusive" lock on the top
4682 * object.
4683 * Otherwise, as soon as we release the
4684 * top lock, the write fault could
4685 * proceed and actually complete before
4686 * the read fault, and the copied page's
4687 * translation could then be overwritten
4688 * by the read fault's translation for
4689 * the original page.
4690 *
4691 * Let's just record what the top object
4692 * is and we'll release it later.
4693 */
4694 top_object = object;
4695
4696 /*
4697 * switch to the object that has the new page
4698 */
4699 object = cur_object;
4700 object_lock_type = cur_object_lock_type;
4701 }
4702 FastPmapEnter:
4703 assert(m_object == VM_PAGE_OBJECT(m));
4704
4705 /*
4706 * prepare for the pmap_enter...
4707 * object and map are both locked
4708 * m contains valid data
4709 * object == m->vmp_object
4710 * cur_object == NULL or it's been unlocked
4711 * no paging references on either object or cur_object
4712 */
4713 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
4714 need_retry_ptr = &need_retry;
4715 } else {
4716 need_retry_ptr = NULL;
4717 }
4718
4719 if (fault_page_size < PAGE_SIZE) {
4720 DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
4721 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
4722 fault_phys_offset < PAGE_SIZE),
4723 "0x%llx\n", (uint64_t)fault_phys_offset);
4724 } else {
4725 assertf(fault_phys_offset == 0,
4726 "0x%llx\n", (uint64_t)fault_phys_offset);
4727 }
4728
4729 if (__improbable(rtfault &&
4730 !m->vmp_realtime &&
4731 vm_pageout_protect_realtime)) {
4732 vm_page_lock_queues();
4733 if (!m->vmp_realtime) {
4734 m->vmp_realtime = true;
4735 vm_page_realtime_count++;
4736 }
4737 vm_page_unlock_queues();
4738 }
4739 assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p object=%p", m, m_object, object);
4740 assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
4741 if (caller_pmap) {
4742 kr = vm_fault_enter(m,
4743 caller_pmap,
4744 caller_pmap_addr,
4745 fault_page_size,
4746 fault_phys_offset,
4747 prot,
4748 caller_prot,
4749 wired,
4750 change_wiring,
4751 wire_tag,
4752 &fault_info,
4753 need_retry_ptr,
4754 &type_of_fault);
4755 } else {
4756 kr = vm_fault_enter(m,
4757 pmap,
4758 vaddr,
4759 fault_page_size,
4760 fault_phys_offset,
4761 prot,
4762 caller_prot,
4763 wired,
4764 change_wiring,
4765 wire_tag,
4766 &fault_info,
4767 need_retry_ptr,
4768 &type_of_fault);
4769 }
4770
4771 vm_fault_complete(
4772 map,
4773 real_map,
4774 object,
4775 m_object,
4776 m,
4777 offset,
4778 trace_real_vaddr,
4779 &fault_info,
4780 caller_prot,
4781 real_vaddr,
4782 vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
4783 need_retry,
4784 kr,
4785 physpage_p,
4786 prot,
4787 top_object,
4788 need_collapse,
4789 cur_offset,
4790 fault_type,
4791 &written_on_object,
4792 &written_on_pager,
4793 &written_on_offset);
4794 top_object = VM_OBJECT_NULL;
4795 if (need_retry == TRUE) {
4796 /*
4797 * vm_fault_enter couldn't complete the PMAP_ENTER...
4798 * at this point we don't hold any locks so it's safe
4799 * to ask the pmap layer to expand the page table to
4800 * accommodate this mapping... once expanded, we'll
4801 * re-drive the fault which should result in vm_fault_enter
4802 * being able to successfully enter the mapping this time around
4803 */
4804 (void)pmap_enter_options(
4805 pmap, vaddr, 0, 0, 0, 0, 0,
4806 PMAP_OPTIONS_NOENTER, NULL);
4807
4808 need_retry = FALSE;
4809 goto RetryFault;
4810 }
4811 goto done;
4812 }
4813 /*
4814 * COPY ON WRITE FAULT
4815 */
4816 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
4817
4818 /*
4819 * If objects match, then
4820 * object->copy must not be NULL (else control
4821 * would be in previous code block), and we
4822 * have a potential push into the copy object
4823 * with which we can't cope with here.
4824 */
4825 if (cur_object == object) {
4826 /*
4827 * must take the slow path to
4828 * deal with the copy push
4829 */
4830 break;
4831 }
4832
4833 /*
4834 * This is now a shadow based copy on write
4835 * fault -- it requires a copy up the shadow
4836 * chain.
4837 */
4838 assert(m_object == VM_PAGE_OBJECT(m));
4839
4840 if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
4841 vm_fault_cs_need_validation(NULL, m, m_object,
4842 PAGE_SIZE, 0)) {
4843 goto upgrade_lock_and_retry;
4844 }
4845
4846 #if MACH_ASSERT
4847 if (resilient_media_retry &&
4848 vm_fault_resilient_media_inject_error2_rate != 0 &&
4849 (++vm_fault_resilient_media_inject_error2 % vm_fault_resilient_media_inject_error2_rate) == 0) {
4850 /* inject an error */
4851 cur_m = m;
4852 m = VM_PAGE_NULL;
4853 m_object = VM_OBJECT_NULL;
4854 break;
4855 }
4856 #endif /* MACH_ASSERT */
4857 /*
4858 * Allocate a page in the original top level
4859 * object. Give up if allocate fails. Also
4860 * need to remember current page, as it's the
4861 * source of the copy.
4862 *
4863 * at this point we hold locks on both
4864 * object and cur_object... no need to take
4865 * paging refs or mark pages BUSY since
4866 * we don't drop either object lock until
4867 * the page has been copied and inserted
4868 */
4869 cur_m = m;
4870 m = vm_page_grab_options(grab_options);
4871 m_object = NULL;
4872
4873 if (m == VM_PAGE_NULL) {
4874 /*
4875 * no free page currently available...
4876 * must take the slow path
4877 */
4878 break;
4879 }
4880
4881 /*
4882 * Now do the copy. Mark the source page busy...
4883 *
4884 * NOTE: This code holds the map lock across
4885 * the page copy.
4886 */
4887 vm_page_copy(cur_m, m);
4888 vm_page_insert(m, object, vm_object_trunc_page(offset));
4889 if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
4890 DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4891 }
4892 m_object = object;
4893 SET_PAGE_DIRTY(m, FALSE);
4894
4895 /*
4896 * Now cope with the source page and object
4897 */
4898 if (object->ref_count > 1 && cur_m->vmp_pmapped) {
4899 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4900 } else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
4901 /*
4902 * We've copied the full 16K page but we're
4903 * about to call vm_fault_enter() only for
4904 * the 4K chunk we're faulting on. The other
4905 * three 4K chunks in that page could still
4906 * be pmapped in this pmap.
4907 * Since the VM object layer thinks that the
4908 * entire page has been dealt with and the
4909 * original page might no longer be needed,
4910 * it might collapse/bypass the original VM
4911 * object and free its pages, which would be
4912 * bad (and would trigger pmap_verify_free()
4913 * assertions) if the other 4K chunks are still
4914 * pmapped.
4915 */
4916 /*
4917 * XXX FBDP TODO4K: to be revisited
4918 * Technically, we need to pmap_disconnect()
4919 * only the target pmap's mappings for the 4K
4920 * chunks of this 16K VM page. If other pmaps
4921 * have PTEs on these chunks, that means that
4922 * the associated VM map must have a reference
4923 * on the VM object, so no need to worry about
4924 * those.
4925 * pmap_protect() for each 4K chunk would be
4926 * better but we'd have to check which chunks
4927 * are actually mapped before and after this
4928 * one.
4929 * A full-blown pmap_disconnect() is easier
4930 * for now but not efficient.
4931 */
4932 DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
4933 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4934 }
4935
4936 if (cur_m->vmp_clustered) {
4937 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4938 VM_PAGE_CONSUME_CLUSTERED(cur_m);
4939 vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
4940 }
4941 need_collapse = TRUE;
4942
4943 if (!cur_object->internal &&
4944 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4945 /*
4946 * The object from which we've just
4947 * copied a page is most probably backed
4948 * by a vnode. We don't want to waste too
4949 * much time trying to collapse the VM objects
4950 * and create a bottleneck when several tasks
4951 * map the same file.
4952 */
4953 if (cur_object->copy == object) {
4954 /*
4955 * Shared mapping or no COW yet.
4956 * We can never collapse a copy
4957 * object into its backing object.
4958 */
4959 need_collapse = FALSE;
4960 } else if (cur_object->copy == object->shadow &&
4961 object->shadow->resident_page_count == 0) {
4962 /*
4963 * Shared mapping after a COW occurred.
4964 */
4965 need_collapse = FALSE;
4966 }
4967 }
4968 vm_object_unlock(cur_object);
4969
4970 if (need_collapse == FALSE) {
4971 vm_fault_collapse_skipped++;
4972 }
4973 vm_fault_collapse_total++;
4974
4975 type_of_fault = DBG_COW_FAULT;
4976 counter_inc(&vm_statistics_cow_faults);
4977 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4978 counter_inc(&current_task()->cow_faults);
4979
4980 goto FastPmapEnter;
4981 } else {
4982 /*
4983 * No page at cur_object, cur_offset... m == NULL
4984 */
4985 if (cur_object->pager_created) {
4986 vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4987
4988 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4989 int my_fault_type;
4990 uint8_t c_flags = C_DONT_BLOCK;
4991 bool insert_cur_object = FALSE;
4992
4993 /*
4994 * May have to talk to a pager...
4995 * if so, take the slow path by
4996 * doing a 'break' from the while (TRUE) loop
4997 *
4998 * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4999 * if the compressor is active and the page exists there
5000 */
5001 if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
5002 break;
5003 }
5004
5005 if (map == kernel_map || real_map == kernel_map) {
5006 /*
5007 * can't call into the compressor with the kernel_map
5008 * lock held, since the compressor may try to operate
5009 * on the kernel map in order to return an empty c_segment
5010 */
5011 break;
5012 }
5013 if (object != cur_object) {
5014 if (fault_type & VM_PROT_WRITE) {
5015 c_flags |= C_KEEP;
5016 } else {
5017 insert_cur_object = TRUE;
5018 }
5019 }
5020 if (insert_cur_object == TRUE) {
5021 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5022 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5023
5024 if (vm_object_lock_upgrade(cur_object) == FALSE) {
5025 /*
5026 * couldn't upgrade so go do a full retry
5027 * immediately since we can no longer be
5028 * certain about cur_object (since we
5029 * don't hold a reference on it)...
5030 * first drop the top object lock
5031 */
5032 vm_object_unlock(object);
5033
5034 vm_map_unlock_read(map);
5035 if (real_map != map) {
5036 vm_map_unlock(real_map);
5037 }
5038
5039 goto RetryFault;
5040 }
5041 }
5042 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
5043 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5044
5045 if (object != cur_object) {
5046 /*
5047 * we can't go for the upgrade on the top
5048 * lock since the upgrade may block waiting
5049 * for readers to drain... since we hold
5050 * cur_object locked at this point, waiting
5051 * for the readers to drain would represent
5052 * a lock order inversion since the lock order
5053 * for objects is the reference order in the
5054 * shadow chain
5055 */
5056 vm_object_unlock(object);
5057 vm_object_unlock(cur_object);
5058
5059 vm_map_unlock_read(map);
5060 if (real_map != map) {
5061 vm_map_unlock(real_map);
5062 }
5063
5064 goto RetryFault;
5065 }
5066 if (vm_object_lock_upgrade(object) == FALSE) {
5067 /*
5068 * couldn't upgrade, so explicitly take the lock
5069 * exclusively and go relookup the page since we
5070 * will have dropped the object lock and
5071 * a different thread could have inserted
5072 * a page at this offset
5073 * no need for a full retry since we're
5074 * at the top level of the object chain
5075 */
5076 vm_object_lock(object);
5077
5078 continue;
5079 }
5080 }
5081 m = vm_page_grab_options(grab_options);
5082 m_object = NULL;
5083
5084 if (m == VM_PAGE_NULL) {
5085 /*
5086 * no free page currently available...
5087 * must take the slow path
5088 */
5089 break;
5090 }
5091
5092 /*
5093 * The object is and remains locked
5094 * so no need to take a
5095 * "paging_in_progress" reference.
5096 */
5097 bool shared_lock;
5098 if ((object == cur_object &&
5099 object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
5100 (object != cur_object &&
5101 cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
5102 shared_lock = FALSE;
5103 } else {
5104 shared_lock = TRUE;
5105 }
5106
5107 kr = vm_compressor_pager_get(
5108 cur_object->pager,
5109 (vm_object_trunc_page(cur_offset)
5110 + cur_object->paging_offset),
5111 VM_PAGE_GET_PHYS_PAGE(m),
5112 &my_fault_type,
5113 c_flags,
5114 &compressed_count_delta);
5115
5116 vm_compressor_pager_count(
5117 cur_object->pager,
5118 compressed_count_delta,
5119 shared_lock,
5120 cur_object);
5121
5122 if (kr != KERN_SUCCESS) {
5123 vm_page_release(m, FALSE);
5124 m = VM_PAGE_NULL;
5125 }
5126 /*
5127 * If vm_compressor_pager_get() returns
5128 * KERN_MEMORY_FAILURE, then the
5129 * compressed data is permanently lost,
5130 * so return this error immediately.
5131 */
5132 if (kr == KERN_MEMORY_FAILURE) {
5133 if (object != cur_object) {
5134 vm_object_unlock(cur_object);
5135 }
5136 vm_object_unlock(object);
5137 vm_map_unlock_read(map);
5138 if (real_map != map) {
5139 vm_map_unlock(real_map);
5140 }
5141
5142 goto done;
5143 } else if (kr != KERN_SUCCESS) {
5144 break;
5145 }
5146 m->vmp_dirty = TRUE;
5147
5148 /*
5149 * If the object is purgeable, its
5150 * owner's purgeable ledgers will be
5151 * updated in vm_page_insert() but the
5152 * page was also accounted for in a
5153 * "compressed purgeable" ledger, so
5154 * update that now.
5155 */
5156 if (object != cur_object &&
5157 !insert_cur_object) {
5158 /*
5159 * We're not going to insert
5160 * the decompressed page into
5161 * the object it came from.
5162 *
5163 * We're dealing with a
5164 * copy-on-write fault on
5165 * "object".
5166 * We're going to decompress
5167 * the page directly into the
5168 * target "object" while
5169 * keeping the compressed
5170 * page for "cur_object", so
5171 * no ledger update in that
5172 * case.
5173 */
5174 } else if (((cur_object->purgable ==
5175 VM_PURGABLE_DENY) &&
5176 (!cur_object->vo_ledger_tag)) ||
5177 (cur_object->vo_owner ==
5178 NULL)) {
5179 /*
5180 * "cur_object" is not purgeable
5181 * and is not ledger-tagged, or
5182 * there's no owner for it,
5183 * so no owner's ledgers to
5184 * update.
5185 */
5186 } else {
5187 /*
5188 * One less compressed
5189 * purgeable/tagged page for
5190 * cur_object's owner.
5191 */
5192 vm_object_owner_compressed_update(
5193 cur_object,
5194 -1);
5195 }
5196
5197 if (insert_cur_object) {
5198 vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset));
5199 m_object = cur_object;
5200 } else {
5201 vm_page_insert(m, object, vm_object_trunc_page(offset));
5202 m_object = object;
5203 }
5204
5205 if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
5206 /*
5207 * If the page is not cacheable,
5208 * we can't let its contents
5209 * linger in the data cache
5210 * after the decompression.
5211 */
5212 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
5213 }
5214
5215 type_of_fault = my_fault_type;
5216
5217 VM_STAT_DECOMPRESSIONS();
5218
5219 if (cur_object != object) {
5220 if (insert_cur_object) {
5221 top_object = object;
5222 /*
5223 * switch to the object that has the new page
5224 */
5225 object = cur_object;
5226 object_lock_type = cur_object_lock_type;
5227 } else {
5228 vm_object_unlock(cur_object);
5229 cur_object = object;
5230 }
5231 }
5232 goto FastPmapEnter;
5233 }
5234 /*
5235 * existence map present and indicates
5236 * that the pager doesn't have this page
5237 */
5238 }
5239 if (cur_object->shadow == VM_OBJECT_NULL ||
5240 resilient_media_retry) {
5241 /*
5242 * Zero fill fault. Page gets
5243 * inserted into the original object.
5244 */
5245 if (cur_object->shadow_severed ||
5246 VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
5247 cur_object == compressor_object ||
5248 cur_object == kernel_object) {
5249 if (object != cur_object) {
5250 vm_object_unlock(cur_object);
5251 }
5252 vm_object_unlock(object);
5253
5254 vm_map_unlock_read(map);
5255 if (real_map != map) {
5256 vm_map_unlock(real_map);
5257 }
5258 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object)) {
5259 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
5260 }
5261
5262 if (cur_object->shadow_severed) {
5263 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), 0 /* arg */);
5264 }
5265
5266 kr = KERN_MEMORY_ERROR;
5267 goto done;
5268 }
5269 if (cur_object != object) {
5270 vm_object_unlock(cur_object);
5271
5272 cur_object = object;
5273 }
5274 if (object_lock_type == OBJECT_LOCK_SHARED) {
5275 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5276
5277 if (vm_object_lock_upgrade(object) == FALSE) {
5278 /*
5279 * couldn't upgrade so do a full retry on the fault
5280 * since we dropped the object lock which
5281 * could allow another thread to insert
5282 * a page at this offset
5283 */
5284 vm_map_unlock_read(map);
5285 if (real_map != map) {
5286 vm_map_unlock(real_map);
5287 }
5288
5289 goto RetryFault;
5290 }
5291 }
5292 if (!object->internal) {
5293 panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
5294 }
5295 #if MACH_ASSERT
5296 if (resilient_media_retry &&
5297 vm_fault_resilient_media_inject_error3_rate != 0 &&
5298 (++vm_fault_resilient_media_inject_error3 % vm_fault_resilient_media_inject_error3_rate) == 0) {
5299 /* inject an error */
5300 m_object = NULL;
5301 break;
5302 }
5303 #endif /* MACH_ASSERT */
5304 m = vm_page_alloc(object, vm_object_trunc_page(offset));
5305 m_object = NULL;
5306
5307 if (m == VM_PAGE_NULL) {
5308 /*
5309 * no free page currently available...
5310 * must take the slow path
5311 */
5312 break;
5313 }
5314 m_object = object;
5315
5316 if ((prot & VM_PROT_WRITE) &&
5317 !(fault_type & VM_PROT_WRITE) &&
5318 object->copy != VM_OBJECT_NULL) {
5319 /*
5320 * This is not a write fault and
5321 * we might have a copy-on-write
5322 * obligation to honor (copy object or
5323 * "needs_copy" map entry), so do not
5324 * give write access yet.
5325 * We'll need to catch the first write
5326 * to resolve the copy-on-write by
5327 * pushing this page to a copy object
5328 * or making a shadow object.
5329 */
5330 if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
5331 prot &= ~VM_PROT_WRITE;
5332 } else {
5333 assert(fault_info.cs_bypass);
5334 }
5335 }
5336
5337 /*
5338 * Zeroing the page and entering into it into the pmap
5339 * represents a significant amount of the zero fill fault handler's work.
5340 *
5341 * To improve fault scalability, we'll drop the object lock, if it appears contended,
5342 * now that we've inserted the page into the vm object.
5343 * Before dropping the lock, we need to check protection bits and set the
5344 * mapped bits on the page. Then we can mark the page busy, drop the lock,
5345 * zero it, and do the pmap enter. We'll need to reacquire the lock
5346 * to clear the busy bit and wake up any waiters.
5347 */
5348 vm_fault_cs_clear(m);
5349 m->vmp_pmapped = TRUE;
5350 if (map->no_zero_fill) {
5351 type_of_fault = DBG_NZF_PAGE_FAULT;
5352 } else {
5353 type_of_fault = DBG_ZERO_FILL_FAULT;
5354 }
5355 {
5356 pmap_t destination_pmap;
5357 vm_map_offset_t destination_pmap_vaddr;
5358 vm_prot_t enter_fault_type;
5359 if (caller_pmap) {
5360 destination_pmap = caller_pmap;
5361 destination_pmap_vaddr = caller_pmap_addr;
5362 } else {
5363 destination_pmap = pmap;
5364 destination_pmap_vaddr = vaddr;
5365 }
5366 if (change_wiring) {
5367 enter_fault_type = VM_PROT_NONE;
5368 } else {
5369 enter_fault_type = caller_prot;
5370 }
5371 assertf(VM_PAGE_OBJECT(m) == object, "m=%p object=%p", m, object);
5372 kr = vm_fault_enter_prepare(m,
5373 destination_pmap,
5374 destination_pmap_vaddr,
5375 &prot,
5376 caller_prot,
5377 fault_page_size,
5378 fault_phys_offset,
5379 change_wiring,
5380 enter_fault_type,
5381 &fault_info,
5382 &type_of_fault,
5383 &page_needs_data_sync);
5384 if (kr != KERN_SUCCESS) {
5385 goto zero_fill_cleanup;
5386 }
5387
5388 if (object_is_contended) {
5389 /*
5390 * At this point the page is in the vm object, but not on a paging queue.
5391 * Since it's accessible to another thread but its contents are invalid
5392 * (it hasn't been zeroed) mark it busy before dropping the object lock.
5393 */
5394 m->vmp_busy = TRUE;
5395 vm_object_unlock(object);
5396 }
5397 if (type_of_fault == DBG_ZERO_FILL_FAULT) {
5398 /*
5399 * Now zero fill page...
5400 * the page is probably going to
5401 * be written soon, so don't bother
5402 * to clear the modified bit
5403 *
5404 * NOTE: This code holds the map
5405 * lock across the zero fill.
5406 */
5407 vm_page_zero_fill(m);
5408 counter_inc(&vm_statistics_zero_fill_count);
5409 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
5410 }
5411 if (page_needs_data_sync) {
5412 pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
5413 }
5414
5415 if (top_object != VM_OBJECT_NULL) {
5416 need_retry_ptr = &need_retry;
5417 } else {
5418 need_retry_ptr = NULL;
5419 }
5420 if (object_is_contended) {
5421 kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr,
5422 fault_page_size, fault_phys_offset,
5423 m, &prot, caller_prot, enter_fault_type, wired,
5424 fault_info.pmap_options, need_retry_ptr);
5425 vm_object_lock(object);
5426 } else {
5427 kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr,
5428 fault_page_size, fault_phys_offset,
5429 m, &prot, caller_prot, enter_fault_type, wired,
5430 fault_info.pmap_options, need_retry_ptr);
5431 }
5432 }
5433 zero_fill_cleanup:
5434 if (!VM_DYNAMIC_PAGING_ENABLED() &&
5435 (object->purgable == VM_PURGABLE_DENY ||
5436 object->purgable == VM_PURGABLE_NONVOLATILE ||
5437 object->purgable == VM_PURGABLE_VOLATILE)) {
5438 vm_page_lockspin_queues();
5439 if (!VM_DYNAMIC_PAGING_ENABLED()) {
5440 vm_fault_enqueue_throttled_locked(m);
5441 }
5442 vm_page_unlock_queues();
5443 }
5444 vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info.no_cache, &type_of_fault, kr);
5445
5446 if (__improbable(rtfault &&
5447 !m->vmp_realtime &&
5448 vm_pageout_protect_realtime)) {
5449 vm_page_lock_queues();
5450 if (!m->vmp_realtime) {
5451 m->vmp_realtime = true;
5452 vm_page_realtime_count++;
5453 }
5454 vm_page_unlock_queues();
5455 }
5456 vm_fault_complete(
5457 map,
5458 real_map,
5459 object,
5460 m_object,
5461 m,
5462 offset,
5463 trace_real_vaddr,
5464 &fault_info,
5465 caller_prot,
5466 real_vaddr,
5467 type_of_fault,
5468 need_retry,
5469 kr,
5470 physpage_p,
5471 prot,
5472 top_object,
5473 need_collapse,
5474 cur_offset,
5475 fault_type,
5476 &written_on_object,
5477 &written_on_pager,
5478 &written_on_offset);
5479 top_object = VM_OBJECT_NULL;
5480 if (need_retry == TRUE) {
5481 /*
5482 * vm_fault_enter couldn't complete the PMAP_ENTER...
5483 * at this point we don't hold any locks so it's safe
5484 * to ask the pmap layer to expand the page table to
5485 * accommodate this mapping... once expanded, we'll
5486 * re-drive the fault which should result in vm_fault_enter
5487 * being able to successfully enter the mapping this time around
5488 */
5489 (void)pmap_enter_options(
5490 pmap, vaddr, 0, 0, 0, 0, 0,
5491 PMAP_OPTIONS_NOENTER, NULL);
5492
5493 need_retry = FALSE;
5494 goto RetryFault;
5495 }
5496 goto done;
5497 }
5498 /*
5499 * On to the next level in the shadow chain
5500 */
5501 cur_offset += cur_object->vo_shadow_offset;
5502 new_object = cur_object->shadow;
5503 fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset);
5504
5505 /*
5506 * take the new_object's lock with the indicated state
5507 */
5508 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5509 vm_object_lock_shared(new_object);
5510 } else {
5511 vm_object_lock(new_object);
5512 }
5513
5514 if (cur_object != object) {
5515 vm_object_unlock(cur_object);
5516 }
5517
5518 cur_object = new_object;
5519
5520 continue;
5521 }
5522 }
5523 /*
5524 * Cleanup from fast fault failure. Drop any object
5525 * lock other than original and drop map lock.
5526 */
5527 if (object != cur_object) {
5528 vm_object_unlock(cur_object);
5529 }
5530
5531 /*
5532 * must own the object lock exclusively at this point
5533 */
5534 if (object_lock_type == OBJECT_LOCK_SHARED) {
5535 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5536
5537 if (vm_object_lock_upgrade(object) == FALSE) {
5538 /*
5539 * couldn't upgrade, so explicitly
5540 * take the lock exclusively
5541 * no need to retry the fault at this
5542 * point since "vm_fault_page" will
5543 * completely re-evaluate the state
5544 */
5545 vm_object_lock(object);
5546 }
5547 }
5548
5549 handle_copy_delay:
5550 vm_map_unlock_read(map);
5551 if (real_map != map) {
5552 vm_map_unlock(real_map);
5553 }
5554
5555 if (__improbable(object == compressor_object ||
5556 object == kernel_object)) {
5557 /*
5558 * These objects are explicitly managed and populated by the
5559 * kernel. The virtual ranges backed by these objects should
5560 * either have wired pages or "holes" that are not supposed to
5561 * be accessed at all until they get explicitly populated.
5562 * We should never have to resolve a fault on a mapping backed
5563 * by one of these VM objects and providing a zero-filled page
5564 * would be wrong here, so let's fail the fault and let the
5565 * caller crash or recover.
5566 */
5567 vm_object_unlock(object);
5568 kr = KERN_MEMORY_ERROR;
5569 goto done;
5570 }
5571
5572 resilient_media_ref_transfer = false;
5573 if (resilient_media_retry) {
5574 /*
5575 * We could get here if we failed to get a free page
5576 * to zero-fill and had to take the slow path again.
5577 * Reset our "recovery-from-failed-media" state.
5578 */
5579 assert(resilient_media_object != VM_OBJECT_NULL);
5580 assert(resilient_media_offset != (vm_object_offset_t)-1);
5581 /* release our extra reference on failed object */
5582 // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
5583 if (object == resilient_media_object) {
5584 /*
5585 * We're holding "object"'s lock, so we can't release
5586 * our extra reference at this point.
5587 * We need an extra reference on "object" anyway
5588 * (see below), so let's just transfer this reference.
5589 */
5590 resilient_media_ref_transfer = true;
5591 } else {
5592 vm_object_lock_assert_notheld(resilient_media_object);
5593 vm_object_deallocate(resilient_media_object);
5594 }
5595 resilient_media_object = VM_OBJECT_NULL;
5596 resilient_media_offset = (vm_object_offset_t)-1;
5597 resilient_media_retry = false;
5598 vm_fault_resilient_media_abort2++;
5599 }
5600
5601 /*
5602 * Make a reference to this object to
5603 * prevent its disposal while we are messing with
5604 * it. Once we have the reference, the map is free
5605 * to be diddled. Since objects reference their
5606 * shadows (and copies), they will stay around as well.
5607 */
5608 if (resilient_media_ref_transfer) {
5609 /* we already have an extra reference on this object */
5610 resilient_media_ref_transfer = false;
5611 } else {
5612 vm_object_reference_locked(object);
5613 }
5614 vm_object_paging_begin(object);
5615
5616 set_thread_pagein_error(cthread, 0);
5617 error_code = 0;
5618
5619 result_page = VM_PAGE_NULL;
5620 kr = vm_fault_page(object, offset, fault_type,
5621 (change_wiring && !wired),
5622 FALSE, /* page not looked up */
5623 &prot, &result_page, &top_page,
5624 &type_of_fault,
5625 &error_code, map->no_zero_fill,
5626 &fault_info);
5627
5628 /*
5629 * if kr != VM_FAULT_SUCCESS, then the paging reference
5630 * has been dropped and the object unlocked... the ref_count
5631 * is still held
5632 *
5633 * if kr == VM_FAULT_SUCCESS, then the paging reference
5634 * is still held along with the ref_count on the original object
5635 *
5636 * the object is returned locked with a paging reference
5637 *
5638 * if top_page != NULL, then it's BUSY and the
5639 * object it belongs to has a paging reference
5640 * but is returned unlocked
5641 */
5642 if (kr != VM_FAULT_SUCCESS &&
5643 kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
5644 if (kr == VM_FAULT_MEMORY_ERROR &&
5645 fault_info.resilient_media) {
5646 assertf(object->internal, "object %p", object);
5647 /*
5648 * This fault failed but the mapping was
5649 * "media resilient", so we'll retry the fault in
5650 * recovery mode to get a zero-filled page in the
5651 * top object.
5652 * Keep the reference on the failing object so
5653 * that we can check that the mapping is still
5654 * pointing to it when we retry the fault.
5655 */
5656 // printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
5657 assert(!resilient_media_retry); /* no double retry */
5658 assert(resilient_media_object == VM_OBJECT_NULL);
5659 assert(resilient_media_offset == (vm_object_offset_t)-1);
5660 resilient_media_retry = true;
5661 resilient_media_object = object;
5662 resilient_media_offset = offset;
5663 // printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_media_offset);
5664 vm_fault_resilient_media_initiate++;
5665 goto RetryFault;
5666 } else {
5667 /*
5668 * we didn't succeed, lose the object reference
5669 * immediately.
5670 */
5671 vm_object_deallocate(object);
5672 object = VM_OBJECT_NULL; /* no longer valid */
5673 }
5674
5675 /*
5676 * See why we failed, and take corrective action.
5677 */
5678 switch (kr) {
5679 case VM_FAULT_MEMORY_SHORTAGE:
5680 if (vm_page_wait((change_wiring) ?
5681 THREAD_UNINT :
5682 THREAD_ABORTSAFE)) {
5683 goto RetryFault;
5684 }
5685 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_MEMORY_SHORTAGE), 0 /* arg */);
5686 OS_FALLTHROUGH;
5687 case VM_FAULT_INTERRUPTED:
5688 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
5689 kr = KERN_ABORTED;
5690 goto done;
5691 case VM_FAULT_RETRY:
5692 goto RetryFault;
5693 case VM_FAULT_MEMORY_ERROR:
5694 if (error_code) {
5695 kr = error_code;
5696 } else {
5697 kr = KERN_MEMORY_ERROR;
5698 }
5699 goto done;
5700 default:
5701 panic("vm_fault: unexpected error 0x%x from "
5702 "vm_fault_page()\n", kr);
5703 }
5704 }
5705 m = result_page;
5706 m_object = NULL;
5707
5708 if (m != VM_PAGE_NULL) {
5709 m_object = VM_PAGE_OBJECT(m);
5710 assert((change_wiring && !wired) ?
5711 (top_page == VM_PAGE_NULL) :
5712 ((top_page == VM_PAGE_NULL) == (m_object == object)));
5713 }
5714
5715 /*
5716 * What to do with the resulting page from vm_fault_page
5717 * if it doesn't get entered into the physical map:
5718 */
5719 #define RELEASE_PAGE(m) \
5720 MACRO_BEGIN \
5721 PAGE_WAKEUP_DONE(m); /* clear the page's busy state and wake any waiters */ \
5722 if ( !VM_PAGE_PAGEABLE(m)) { /* cheap unlocked pre-check before taking the queues lock */ \
5723 vm_page_lockspin_queues(); \
5724 if ( !VM_PAGE_PAGEABLE(m)) /* re-check under the queues lock: state may have changed */ \
5725 vm_page_activate(m); /* put the page on a pageable queue so it isn't stranded */ \
5726 vm_page_unlock_queues(); \
5727 } \
5728 MACRO_END
5729
5730
5731 object_locks_dropped = FALSE;
5732 /*
5733 * We must verify that the maps have not changed
5734 * since our last lookup. vm_map_verify() needs the
5735 * map lock (shared) but we are holding object locks.
5736 * So we do a try_lock() first and, if that fails, we
5737 * drop the object locks and go in for the map lock again.
5738 */
5739 if (!vm_map_try_lock_read(original_map)) {
5740 if (m != VM_PAGE_NULL) {
5741 old_copy_object = m_object->copy;
5742 vm_object_unlock(m_object);
5743 } else {
5744 old_copy_object = VM_OBJECT_NULL;
5745 vm_object_unlock(object);
5746 }
5747
5748 object_locks_dropped = TRUE;
5749
5750 vm_map_lock_read(original_map);
5751 }
5752
5753 if ((map != original_map) || !vm_map_verify(map, &version)) {
5754 if (object_locks_dropped == FALSE) {
5755 if (m != VM_PAGE_NULL) {
5756 old_copy_object = m_object->copy;
5757 vm_object_unlock(m_object);
5758 } else {
5759 old_copy_object = VM_OBJECT_NULL;
5760 vm_object_unlock(object);
5761 }
5762
5763 object_locks_dropped = TRUE;
5764 }
5765
5766 /*
5767 * no object locks are held at this point
5768 */
5769 vm_object_t retry_object;
5770 vm_object_offset_t retry_offset;
5771 vm_prot_t retry_prot;
5772
5773 /*
5774 * To avoid trying to write_lock the map while another
5775 * thread has it read_locked (in vm_map_pageable), we
5776 * do not try for write permission. If the page is
5777 * still writable, we will get write permission. If it
5778 * is not, or has been marked needs_copy, we enter the
5779 * mapping without write permission, and will merely
5780 * take another fault.
5781 */
5782 map = original_map;
5783
5784 kr = vm_map_lookup_and_lock_object(&map, vaddr,
5785 fault_type & ~VM_PROT_WRITE,
5786 OBJECT_LOCK_EXCLUSIVE, &version,
5787 &retry_object, &retry_offset, &retry_prot,
5788 &wired,
5789 &fault_info,
5790 &real_map,
5791 NULL);
5792 pmap = real_map->pmap;
5793
5794 if (kr != KERN_SUCCESS) {
5795 vm_map_unlock_read(map);
5796
5797 if (m != VM_PAGE_NULL) {
5798 assert(VM_PAGE_OBJECT(m) == m_object);
5799
5800 /*
5801 * retake the lock so that
5802 * we can drop the paging reference
5803 * in vm_fault_cleanup and do the
5804 * PAGE_WAKEUP_DONE in RELEASE_PAGE
5805 */
5806 vm_object_lock(m_object);
5807
5808 RELEASE_PAGE(m);
5809
5810 vm_fault_cleanup(m_object, top_page);
5811 } else {
5812 /*
5813 * retake the lock so that
5814 * we can drop the paging reference
5815 * in vm_fault_cleanup
5816 */
5817 vm_object_lock(object);
5818
5819 vm_fault_cleanup(object, top_page);
5820 }
5821 vm_object_deallocate(object);
5822
5823 if (kr == KERN_INVALID_ADDRESS) {
5824 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0 /* arg */);
5825 }
5826 goto done;
5827 }
5828 vm_object_unlock(retry_object);
5829
5830 if ((retry_object != object) || (retry_offset != offset)) {
5831 vm_map_unlock_read(map);
5832 if (real_map != map) {
5833 vm_map_unlock(real_map);
5834 }
5835
5836 if (m != VM_PAGE_NULL) {
5837 assert(VM_PAGE_OBJECT(m) == m_object);
5838
5839 /*
5840 * retake the lock so that
5841 * we can drop the paging reference
5842 * in vm_fault_cleanup and do the
5843 * PAGE_WAKEUP_DONE in RELEASE_PAGE
5844 */
5845 vm_object_lock(m_object);
5846
5847 RELEASE_PAGE(m);
5848
5849 vm_fault_cleanup(m_object, top_page);
5850 } else {
5851 /*
5852 * retake the lock so that
5853 * we can drop the paging reference
5854 * in vm_fault_cleanup
5855 */
5856 vm_object_lock(object);
5857
5858 vm_fault_cleanup(object, top_page);
5859 }
5860 vm_object_deallocate(object);
5861
5862 goto RetryFault;
5863 }
5864 /*
5865 * Check whether the protection has changed or the object
5866 * has been copied while we left the map unlocked.
5867 */
5868 if (pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) {
5869 /* If the pmap layer cares, pass the full set. */
5870 prot = retry_prot;
5871 } else {
5872 prot &= retry_prot;
5873 }
5874 }
5875
5876 if (object_locks_dropped == TRUE) {
5877 if (m != VM_PAGE_NULL) {
5878 assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
5879 assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
5880 vm_object_lock(m_object);
5881
5882 if (m_object->copy != old_copy_object) {
5883 /*
5884 * The copy object changed while the top-level object
5885 * was unlocked, so take away write permission.
5886 */
5887 assert(!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot));
5888 prot &= ~VM_PROT_WRITE;
5889 }
5890 } else {
5891 vm_object_lock(object);
5892 }
5893
5894 object_locks_dropped = FALSE;
5895 }
5896
5897 if (!need_copy &&
5898 !fault_info.no_copy_on_read &&
5899 m != VM_PAGE_NULL &&
5900 VM_PAGE_OBJECT(m) != object &&
5901 !VM_PAGE_OBJECT(m)->pager_trusted &&
5902 vm_protect_privileged_from_untrusted &&
5903 !VM_PAGE_OBJECT(m)->code_signed &&
5904 current_proc_is_privileged()) {
5905 /*
5906 * We found the page we want in an "untrusted" VM object
5907 * down the shadow chain. Since the target is "privileged"
5908 * we want to perform a copy-on-read of that page, so that the
5909 * mapped object gets a stable copy and does not have to
5910 * rely on the "untrusted" object to provide the same
5911 * contents if the page gets reclaimed and has to be paged
5912 * in again later on.
5913 *
5914 * Special case: if the mapping is executable and the untrusted
5915 * object is code-signed and the process is "cs_enforced", we
5916 * do not copy-on-read because that would break code-signing
5917 * enforcement expectations (an executable page must belong
5918 * to a code-signed object) and we can rely on code-signing
5919 * to re-validate the page if it gets evicted and paged back in.
5920 */
5921 // printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5922 vm_copied_on_read++;
5923 need_copy_on_read = TRUE;
5924 need_copy = TRUE;
5925 } else {
5926 need_copy_on_read = FALSE;
5927 }
5928
5929 /*
5930 * If we want to wire down this page, but no longer have
5931 * adequate permissions, we must start all over.
5932 * If we decided to copy-on-read, we must also start all over.
5933 */
5934 if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
5935 need_copy_on_read) {
5936 vm_map_unlock_read(map);
5937 if (real_map != map) {
5938 vm_map_unlock(real_map);
5939 }
5940
5941 if (m != VM_PAGE_NULL) {
5942 assert(VM_PAGE_OBJECT(m) == m_object);
5943
5944 RELEASE_PAGE(m);
5945
5946 vm_fault_cleanup(m_object, top_page);
5947 } else {
5948 vm_fault_cleanup(object, top_page);
5949 }
5950
5951 vm_object_deallocate(object);
5952
5953 goto RetryFault;
5954 }
5955 if (m != VM_PAGE_NULL) {
5956 /*
5957 * Put this page into the physical map.
5958 * We had to do the unlock above because pmap_enter
5959 * may cause other faults. The page may be on
5960 * the pageout queues. If the pageout daemon comes
5961 * across the page, it will remove it from the queues.
5962 */
5963 if (fault_page_size < PAGE_SIZE) {
5964 DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
5965 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
5966 fault_phys_offset < PAGE_SIZE),
5967 "0x%llx\n", (uint64_t)fault_phys_offset);
5968 } else {
5969 assertf(fault_phys_offset == 0,
5970 "0x%llx\n", (uint64_t)fault_phys_offset);
5971 }
5972 assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
5973 assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
5974 if (caller_pmap) {
5975 kr = vm_fault_enter(m,
5976 caller_pmap,
5977 caller_pmap_addr,
5978 fault_page_size,
5979 fault_phys_offset,
5980 prot,
5981 caller_prot,
5982 wired,
5983 change_wiring,
5984 wire_tag,
5985 &fault_info,
5986 NULL,
5987 &type_of_fault);
5988 } else {
5989 kr = vm_fault_enter(m,
5990 pmap,
5991 vaddr,
5992 fault_page_size,
5993 fault_phys_offset,
5994 prot,
5995 caller_prot,
5996 wired,
5997 change_wiring,
5998 wire_tag,
5999 &fault_info,
6000 NULL,
6001 &type_of_fault);
6002 }
6003 assert(VM_PAGE_OBJECT(m) == m_object);
6004
6005 {
6006 int event_code = 0;
6007
6008 if (m_object->internal) {
6009 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
6010 } else if (m_object->object_is_shared_cache) {
6011 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
6012 } else {
6013 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
6014 }
6015
6016 KDBG_RELEASE(event_code | DBG_FUNC_NONE, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid());
6017 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid());
6018
6019 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
6020 }
6021 if (kr != KERN_SUCCESS) {
6022 /* abort this page fault */
6023 vm_map_unlock_read(map);
6024 if (real_map != map) {
6025 vm_map_unlock(real_map);
6026 }
6027 PAGE_WAKEUP_DONE(m);
6028 vm_fault_cleanup(m_object, top_page);
6029 vm_object_deallocate(object);
6030 goto done;
6031 }
6032 if (physpage_p != NULL) {
6033 /* for vm_map_wire_and_extract() */
6034 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6035 if (prot & VM_PROT_WRITE) {
6036 vm_object_lock_assert_exclusive(m_object);
6037 m->vmp_dirty = TRUE;
6038 }
6039 }
6040 } else {
6041 vm_map_entry_t entry;
6042 vm_map_offset_t laddr;
6043 vm_map_offset_t ldelta, hdelta;
6044
6045 /*
6046 * do a pmap block mapping from the physical address
6047 * in the object
6048 */
6049
6050 if (real_map != map) {
6051 vm_map_unlock(real_map);
6052 }
6053
6054 if (original_map != map) {
6055 vm_map_unlock_read(map);
6056 vm_map_lock_read(original_map);
6057 map = original_map;
6058 }
6059 real_map = map;
6060
6061 laddr = vaddr;
6062 hdelta = ldelta = (vm_map_offset_t)0xFFFFFFFFFFFFF000ULL;
6063
6064 while (vm_map_lookup_entry(map, laddr, &entry)) {
6065 if (ldelta > (laddr - entry->vme_start)) {
6066 ldelta = laddr - entry->vme_start;
6067 }
6068 if (hdelta > (entry->vme_end - laddr)) {
6069 hdelta = entry->vme_end - laddr;
6070 }
6071 if (entry->is_sub_map) {
6072 laddr = ((laddr - entry->vme_start)
6073 + VME_OFFSET(entry));
6074 vm_map_lock_read(VME_SUBMAP(entry));
6075
6076 if (map != real_map) {
6077 vm_map_unlock_read(map);
6078 }
6079 if (entry->use_pmap) {
6080 vm_map_unlock_read(real_map);
6081 real_map = VME_SUBMAP(entry);
6082 }
6083 map = VME_SUBMAP(entry);
6084 } else {
6085 break;
6086 }
6087 }
6088
6089 if (vm_map_lookup_entry(map, laddr, &entry) &&
6090 (!entry->is_sub_map) &&
6091 (object != VM_OBJECT_NULL) &&
6092 (VME_OBJECT(entry) == object)) {
6093 uint16_t superpage;
6094
6095 if (!object->pager_created &&
6096 object->phys_contiguous &&
6097 VME_OFFSET(entry) == 0 &&
6098 (entry->vme_end - entry->vme_start == object->vo_size) &&
6099 VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
6100 superpage = VM_MEM_SUPERPAGE;
6101 } else {
6102 superpage = 0;
6103 }
6104
6105 if (superpage && physpage_p) {
6106 /* for vm_map_wire_and_extract() */
6107 *physpage_p = (ppnum_t)
6108 ((((vm_map_offset_t)
6109 object->vo_shadow_offset)
6110 + VME_OFFSET(entry)
6111 + (laddr - entry->vme_start))
6112 >> PAGE_SHIFT);
6113 }
6114
6115 if (caller_pmap) {
6116 /*
6117 * Set up a block mapped area
6118 */
6119 assert((uint32_t)((ldelta + hdelta) >> fault_page_shift) == ((ldelta + hdelta) >> fault_page_shift));
6120 kr = pmap_map_block_addr(caller_pmap,
6121 (addr64_t)(caller_pmap_addr - ldelta),
6122 (pmap_paddr_t)(((vm_map_offset_t) (object->vo_shadow_offset)) +
6123 VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta),
6124 (uint32_t)((ldelta + hdelta) >> fault_page_shift), prot,
6125 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
6126
6127 if (kr != KERN_SUCCESS) {
6128 goto cleanup;
6129 }
6130 } else {
6131 /*
6132 * Set up a block mapped area
6133 */
6134 assert((uint32_t)((ldelta + hdelta) >> fault_page_shift) == ((ldelta + hdelta) >> fault_page_shift));
6135 kr = pmap_map_block_addr(real_map->pmap,
6136 (addr64_t)(vaddr - ldelta),
6137 (pmap_paddr_t)(((vm_map_offset_t)(object->vo_shadow_offset)) +
6138 VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta),
6139 (uint32_t)((ldelta + hdelta) >> fault_page_shift), prot,
6140 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
6141
6142 if (kr != KERN_SUCCESS) {
6143 goto cleanup;
6144 }
6145 }
6146 }
6147 }
6148
6149 /*
6150 * Success
6151 */
6152 kr = KERN_SUCCESS;
6153
6154 /*
6155 * TODO: could most of the done cases just use cleanup?
6156 */
6157 cleanup:
6158 /*
6159 * Unlock everything, and return
6160 */
6161 vm_map_unlock_read(map);
6162 if (real_map != map) {
6163 vm_map_unlock(real_map);
6164 }
6165
6166 if (m != VM_PAGE_NULL) {
6167 if (__improbable(rtfault &&
6168 !m->vmp_realtime &&
6169 vm_pageout_protect_realtime)) {
6170 vm_page_lock_queues();
6171 if (!m->vmp_realtime) {
6172 m->vmp_realtime = true;
6173 vm_page_realtime_count++;
6174 }
6175 vm_page_unlock_queues();
6176 }
6177 assert(VM_PAGE_OBJECT(m) == m_object);
6178
6179 if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
6180 vm_object_paging_begin(m_object);
6181
6182 assert(written_on_object == VM_OBJECT_NULL);
6183 written_on_object = m_object;
6184 written_on_pager = m_object->pager;
6185 written_on_offset = m_object->paging_offset + m->vmp_offset;
6186 }
6187 PAGE_WAKEUP_DONE(m);
6188
6189 vm_fault_cleanup(m_object, top_page);
6190 } else {
6191 vm_fault_cleanup(object, top_page);
6192 }
6193
6194 vm_object_deallocate(object);
6195
6196 #undef RELEASE_PAGE
6197
6198 done:
6199 thread_interrupt_level(interruptible_state);
6200
6201 if (resilient_media_object != VM_OBJECT_NULL) {
6202 assert(resilient_media_retry);
6203 assert(resilient_media_offset != (vm_object_offset_t)-1);
6204 /* release extra reference on failed object */
6205 // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
6206 vm_object_lock_assert_notheld(resilient_media_object);
6207 vm_object_deallocate(resilient_media_object);
6208 resilient_media_object = VM_OBJECT_NULL;
6209 resilient_media_offset = (vm_object_offset_t)-1;
6210 resilient_media_retry = false;
6211 vm_fault_resilient_media_release++;
6212 }
6213 assert(!resilient_media_retry);
6214
6215 /*
6216 * Only I/O throttle on faults which cause a pagein/swapin.
6217 */
6218 if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
6219 throttle_lowpri_io(1);
6220 } else {
6221 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
6222 if ((throttle_delay = vm_page_throttled(TRUE))) {
6223 if (vm_debug_events) {
6224 if (type_of_fault == DBG_COMPRESSOR_FAULT) {
6225 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6226 } else if (type_of_fault == DBG_COW_FAULT) {
6227 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6228 } else {
6229 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6230 }
6231 }
6232 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
6233 }
6234 }
6235 }
6236
6237 if (written_on_object) {
6238 vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
6239
6240 vm_object_lock(written_on_object);
6241 vm_object_paging_end(written_on_object);
6242 vm_object_unlock(written_on_object);
6243
6244 written_on_object = VM_OBJECT_NULL;
6245 }
6246
6247 if (rtfault) {
6248 vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
6249 }
6250
6251 KDBG_RELEASE(
6252 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
6253 ((uint64_t)trace_vaddr >> 32),
6254 trace_vaddr,
6255 kr,
6256 vm_fault_type_for_tracing(need_copy_on_read, type_of_fault));
6257
6258 if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) {
6259 DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr);
6260 }
6261
6262 return kr;
6263 }
6264
6265 /*
6266 * vm_fault_wire:
6267 *
6268 * Wire down a range of virtual addresses in a map.
6269 */
6270 kern_return_t
vm_fault_wire(vm_map_t map,vm_map_entry_t entry,vm_prot_t prot,vm_tag_t wire_tag,pmap_t pmap,vm_map_offset_t pmap_addr,ppnum_t * physpage_p)6271 vm_fault_wire(
6272 vm_map_t map,
6273 vm_map_entry_t entry,
6274 vm_prot_t prot,
6275 vm_tag_t wire_tag,
6276 pmap_t pmap,
6277 vm_map_offset_t pmap_addr,
6278 ppnum_t *physpage_p)
6279 {
6280 vm_map_offset_t va;
6281 vm_map_offset_t end_addr = entry->vme_end;
6282 kern_return_t rc;
6283 vm_map_size_t effective_page_size;
6284
6285 assert(entry->in_transition);
6286
6287 if (!entry->is_sub_map &&
6288 VME_OBJECT(entry) != VM_OBJECT_NULL &&
6289 VME_OBJECT(entry)->phys_contiguous) {
6290 return KERN_SUCCESS;
6291 }
6292
6293 /*
6294 * Inform the physical mapping system that the
6295 * range of addresses may not fault, so that
6296 * page tables and such can be locked down as well.
6297 */
6298
6299 pmap_pageable(pmap, pmap_addr,
6300 pmap_addr + (end_addr - entry->vme_start), FALSE);
6301
6302 /*
6303 * We simulate a fault to get the page and enter it
6304 * in the physical map.
6305 */
6306
6307 effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6308 for (va = entry->vme_start;
6309 va < end_addr;
6310 va += effective_page_size) {
6311 rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
6312 pmap_addr + (va - entry->vme_start),
6313 physpage_p);
6314 if (rc != KERN_SUCCESS) {
6315 rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
6316 ((pmap == kernel_pmap)
6317 ? THREAD_UNINT
6318 : THREAD_ABORTSAFE),
6319 pmap,
6320 (pmap_addr +
6321 (va - entry->vme_start)),
6322 physpage_p);
6323 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
6324 }
6325
6326 if (rc != KERN_SUCCESS) {
6327 struct vm_map_entry tmp_entry = *entry;
6328
6329 /* unwire wired pages */
6330 tmp_entry.vme_end = va;
6331 vm_fault_unwire(map,
6332 &tmp_entry, FALSE, pmap, pmap_addr);
6333
6334 return rc;
6335 }
6336 }
6337 return KERN_SUCCESS;
6338 }
6339
6340 /*
6341 * vm_fault_unwire:
6342 *
6343 * Unwire a range of virtual addresses in a map.
6344 */
void
vm_fault_unwire(
	vm_map_t map,
	vm_map_entry_t entry,
	boolean_t deallocate,	/* TRUE: free each page instead of just unwiring it */
	pmap_t pmap,
	vm_map_offset_t pmap_addr)
{
	vm_map_offset_t va;
	vm_map_offset_t end_addr = entry->vme_end;
	vm_object_t object;
	struct vm_object_fault_info fault_info = {};
	unsigned int unwired_pages;
	vm_map_size_t effective_page_size;

	object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);

	/*
	 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
	 * do anything since such memory is wired by default. So we don't have
	 * anything to undo here.
	 */

	if (object != VM_OBJECT_NULL && object->phys_contiguous) {
		return;
	}

	/* set up the fault parameters passed to vm_fault_page() below */
	fault_info.interruptible = THREAD_UNINT;
	fault_info.behavior = entry->behavior;
	fault_info.user_tag = VME_ALIAS(entry);
	if (entry->iokit_acct ||
	    (!entry->is_sub_map && !entry->use_pmap)) {
		fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
	}
	fault_info.lo_offset = VME_OFFSET(entry);
	fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
	fault_info.no_cache = entry->no_cache;
	/*
	 * NOTE(review): stealth presumably tells vm_fault_page() not to
	 * disturb page-queue/usage state while looking pages up — confirm
	 * against vm_fault_page()'s handling of fault_info->stealth.
	 */
	fault_info.stealth = TRUE;

	unwired_pages = 0;

	/*
	 * Since the pages are wired down, we must be able to
	 * get their mappings from the physical map system.
	 */

	effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
	for (va = entry->vme_start;
	    va < end_addr;
	    va += effective_page_size) {
		if (object == VM_OBJECT_NULL) {
			/* submap entry: no object to look pages up in */
			if (pmap) {
				pmap_change_wiring(pmap,
				    pmap_addr + (va - entry->vme_start), FALSE);
			}
			(void) vm_fault(map, va, VM_PROT_NONE,
			    TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
		} else {
			vm_prot_t prot;
			vm_page_t result_page;
			vm_page_t top_page;
			vm_object_t result_object;
			vm_fault_return_t result;

			/* cap cluster size at maximum UPL size */
			upl_size_t cluster_size;
			if (os_sub_overflow(end_addr, va, &cluster_size)) {
				cluster_size = 0 - (upl_size_t)PAGE_SIZE;
			}
			fault_info.cluster_size = cluster_size;

			/* look up the wired page; retry until it settles */
			do {
				prot = VM_PROT_NONE;

				vm_object_lock(object);
				vm_object_paging_begin(object);
				result_page = VM_PAGE_NULL;
				result = vm_fault_page(
					object,
					(VME_OFFSET(entry) +
					(va - entry->vme_start)),
					VM_PROT_NONE, TRUE,
					FALSE, /* page not looked up */
					&prot, &result_page, &top_page,
					(int *)0,
					NULL, map->no_zero_fill,
					&fault_info);
			} while (result == VM_FAULT_RETRY);

			/*
			 * If this was a mapping to a file on a device that has been forcibly
			 * unmounted, then we won't get a page back from vm_fault_page(). Just
			 * move on to the next one in case the remaining pages are mapped from
			 * different objects. During a forced unmount, the object is terminated
			 * so the alive flag will be false if this happens. A forced unmount
			 * will occur when an external disk is unplugged before the user does
			 * an eject, so we don't want to panic in that situation.
			 */

			if (result == VM_FAULT_MEMORY_ERROR) {
				if (!object->alive) {
					continue;
				}
				if (!object->internal && object->pager == NULL) {
					continue;
				}
			}

			if (result == VM_FAULT_MEMORY_ERROR &&
			    object == kernel_object) {
				/*
				 * This must have been allocated with
				 * KMA_KOBJECT and KMA_VAONLY and there's
				 * no physical page at this offset.
				 * We're done (no page to free).
				 */
				assert(deallocate);
				continue;
			}

			if (result != VM_FAULT_SUCCESS) {
				panic("vm_fault_unwire: failure");
			}

			result_object = VM_PAGE_OBJECT(result_page);

			if (deallocate) {
				/* caller wants the backing page freed outright */
				assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
				    vm_page_fictitious_addr);
				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
				if (VM_PAGE_WIRED(result_page)) {
					unwired_pages++;
				}
				VM_PAGE_FREE(result_page);
			} else {
				/* guard pages are never entered in a pmap */
				if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) {
					pmap_change_wiring(pmap,
					    pmap_addr + (va - entry->vme_start), FALSE);
				}


				if (VM_PAGE_WIRED(result_page)) {
					vm_page_lockspin_queues();
					vm_page_unwire(result_page, TRUE);
					vm_page_unlock_queues();
					unwired_pages++;
				}
				if (entry->zero_wired_pages) {
					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
					entry->zero_wired_pages = FALSE;
				}

				PAGE_WAKEUP_DONE(result_page);
			}
			vm_fault_cleanup(result_object, top_page);
		}
	}

	/*
	 * Inform the physical mapping system that the range
	 * of addresses may fault, so that page tables and
	 * such may be unwired themselves.
	 */

	pmap_pageable(pmap, pmap_addr,
	    pmap_addr + (end_addr - entry->vme_start), TRUE);

	if (kernel_object == object) {
		/*
		 * Would like to make user_tag in vm_object_fault_info
		 * vm_tag_t (unsigned short) but user_tag derives its value from
		 * VME_ALIAS(entry) at a few places and VME_ALIAS, in turn, casts
		 * to an _unsigned int_ which is used by non-fault_info paths throughout the
		 * code at many places.
		 *
		 * So, for now, an explicit truncation to unsigned short (vm_tag_t).
		 */
		assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag,
		    "VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK));
		vm_tag_update_size((vm_tag_t) fault_info.user_tag, -ptoa_64(unwired_pages));
	}
}
6527
6528 /*
6529 * vm_fault_wire_fast:
6530 *
6531 * Handle common case of a wire down page fault at the given address.
6532 * If successful, the page is inserted into the associated physical map.
6533 * The map entry is passed in to avoid the overhead of a map lookup.
6534 *
6535 * NOTE: the given address should be truncated to the
6536 * proper page address.
6537 *
6538 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
6539 * a standard error specifying why the fault is fatal is returned.
6540 *
6541 * The map in question must be referenced, and remains so.
6542 * Caller has a read lock on the map.
6543 *
6544 * This is a stripped version of vm_fault() for wiring pages. Anything
6545 * other than the common case will return KERN_FAILURE, and the caller
6546 * is expected to call vm_fault().
6547 */
static kern_return_t
vm_fault_wire_fast(
	__unused vm_map_t map, /* NOTE(review): marked __unused but referenced below via VM_MAP_PAGE_SIZE(map) — the annotation looks stale; confirm */
	vm_map_offset_t va,
	__unused vm_prot_t caller_prot,
	vm_tag_t wire_tag,
	vm_map_entry_t entry,
	pmap_t pmap,
	vm_map_offset_t pmap_addr,
	ppnum_t *physpage_p)
{
	vm_object_t object;
	vm_object_offset_t offset;
	vm_page_t m;
	vm_prot_t prot;
	thread_t thread = current_thread();
	int type_of_fault;
	kern_return_t kr;
	vm_map_size_t fault_page_size;
	vm_map_offset_t fault_phys_offset;
	struct vm_object_fault_info fault_info = {};

	counter_inc(&vm_statistics_faults);

	if (thread != THREAD_NULL) {
		counter_inc(&get_threadtask(thread)->faults);
	}

	/*
	 * Recovery actions
	 */

/* undo the wire performed below and wake any waiters on the page */
#undef RELEASE_PAGE
#define RELEASE_PAGE(m) { \
	PAGE_WAKEUP_DONE(m); \
	vm_page_lockspin_queues(); \
	vm_page_unwire(m, TRUE); \
	vm_page_unlock_queues(); \
}


/* drop the paging reference and the object lock taken below */
#undef UNLOCK_THINGS
#define UNLOCK_THINGS { \
	vm_object_paging_end(object); \
	vm_object_unlock(object); \
}

#undef UNLOCK_AND_DEALLOCATE
#define UNLOCK_AND_DEALLOCATE { \
	UNLOCK_THINGS; \
	vm_object_deallocate(object); \
}
/*
 * Give up and have caller do things the hard way.
 */

#define GIVE_UP { \
	UNLOCK_AND_DEALLOCATE; \
	return(KERN_FAILURE); \
}


	/*
	 * If this entry is not directly to a vm_object, bail out.
	 */
	if (entry->is_sub_map) {
		assert(physpage_p == NULL);
		return KERN_FAILURE;
	}

	/*
	 * Find the backing store object and offset into it.
	 */

	object = VME_OBJECT(entry);
	offset = (va - entry->vme_start) + VME_OFFSET(entry);
	prot = entry->protection;

	/*
	 * Make a reference to this object to prevent its
	 * disposal while we are messing with it.
	 */

	vm_object_lock(object);
	vm_object_reference_locked(object);
	vm_object_paging_begin(object);

	/*
	 * INVARIANTS (through entire routine):
	 *
	 * 1) At all times, we must either have the object
	 *	lock or a busy page in some object to prevent
	 *	some other thread from trying to bring in
	 *	the same page.
	 *
	 * 2) Once we have a busy page, we must remove it from
	 *	the pageout queues, so that the pageout daemon
	 *	will not grab it away.
	 *
	 */

	/*
	 * Look for page in top-level object. If it's not there or
	 * there's something going on, give up.
	 */
	m = vm_page_lookup(object, vm_object_trunc_page(offset));
	if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
	    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
		GIVE_UP;
	}
	if (m->vmp_fictitious &&
	    VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
		/*
		 * Guard pages are fictitious pages and are never
		 * entered into a pmap, so let's say it's been wired...
		 */
		kr = KERN_SUCCESS;
		goto done;
	}

	/*
	 * Wire the page down now. All bail outs beyond this
	 * point must unwire the page.
	 */

	vm_page_lockspin_queues();
	vm_page_wire(m, wire_tag, TRUE);
	vm_page_unlock_queues();

	/*
	 * Mark page busy for other threads.
	 */
	assert(!m->vmp_busy);
	m->vmp_busy = TRUE;
	assert(!m->vmp_absent);

	/*
	 * Give up if the page is being written and there's a copy object
	 * (copy-on-write would be needed; let the slow path handle it).
	 */
	if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
		RELEASE_PAGE(m);
		GIVE_UP;
	}

	fault_info.user_tag = VME_ALIAS(entry);
	fault_info.pmap_options = 0;
	if (entry->iokit_acct ||
	    (!entry->is_sub_map && !entry->use_pmap)) {
		fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
	}

	/* sub-PAGE_SIZE maps: offset of the mapped 4K chunk within the page */
	fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
	fault_phys_offset = offset - vm_object_trunc_page(offset);

	/*
	 * Put this page into the physical map.
	 */
	type_of_fault = DBG_CACHE_HIT_FAULT;
	assertf(VM_PAGE_OBJECT(m) == object, "m=%p object=%p", m, object);
	assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
	kr = vm_fault_enter(m,
	    pmap,
	    pmap_addr,
	    fault_page_size,
	    fault_phys_offset,
	    prot,
	    prot,
	    TRUE, /* wired */
	    FALSE, /* change_wiring */
	    wire_tag,
	    &fault_info,
	    NULL,
	    &type_of_fault);
	if (kr != KERN_SUCCESS) {
		RELEASE_PAGE(m);
		GIVE_UP;
	}

done:
	/*
	 * Unlock everything, and return
	 */

	if (physpage_p) {
		/* for vm_map_wire_and_extract() */
		if (kr == KERN_SUCCESS) {
			assert(object == VM_PAGE_OBJECT(m));
			*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
			if (prot & VM_PROT_WRITE) {
				vm_object_lock_assert_exclusive(object);
				m->vmp_dirty = TRUE;
			}
		} else {
			*physpage_p = 0;
		}
	}

	PAGE_WAKEUP_DONE(m);
	UNLOCK_AND_DEALLOCATE;

	return kr;
}
6750
6751 /*
6752 * Routine: vm_fault_copy_cleanup
6753 * Purpose:
6754 * Release a page used by vm_fault_copy.
6755 */
6756
6757 static void
vm_fault_copy_cleanup(vm_page_t page,vm_page_t top_page)6758 vm_fault_copy_cleanup(
6759 vm_page_t page,
6760 vm_page_t top_page)
6761 {
6762 vm_object_t object = VM_PAGE_OBJECT(page);
6763
6764 vm_object_lock(object);
6765 PAGE_WAKEUP_DONE(page);
6766 if (!VM_PAGE_PAGEABLE(page)) {
6767 vm_page_lockspin_queues();
6768 if (!VM_PAGE_PAGEABLE(page)) {
6769 vm_page_activate(page);
6770 }
6771 vm_page_unlock_queues();
6772 }
6773 vm_fault_cleanup(object, top_page);
6774 }
6775
6776 static void
vm_fault_copy_dst_cleanup(vm_page_t page)6777 vm_fault_copy_dst_cleanup(
6778 vm_page_t page)
6779 {
6780 vm_object_t object;
6781
6782 if (page != VM_PAGE_NULL) {
6783 object = VM_PAGE_OBJECT(page);
6784 vm_object_lock(object);
6785 vm_page_lockspin_queues();
6786 vm_page_unwire(page, TRUE);
6787 vm_page_unlock_queues();
6788 vm_object_paging_end(object);
6789 vm_object_unlock(object);
6790 }
6791 }
6792
6793 /*
6794 * Routine: vm_fault_copy
6795 *
6796 * Purpose:
6797 * Copy pages from one virtual memory object to another --
6798 * neither the source nor destination pages need be resident.
6799 *
6800 * Before actually copying a page, the version associated with
 * the destination address map will be verified.
6802 *
6803 * In/out conditions:
6804 * The caller must hold a reference, but not a lock, to
6805 * each of the source and destination objects and to the
6806 * destination map.
6807 *
6808 * Results:
6809 * Returns KERN_SUCCESS if no errors were encountered in
6810 * reading or writing the data. Returns KERN_INTERRUPTED if
6811 * the operation was interrupted (only possible if the
6812 * "interruptible" argument is asserted). Other return values
6813 * indicate a permanent error in copying the data.
6814 *
6815 * The actual amount of data copied will be returned in the
6816 * "copy_size" argument. In the event that the destination map
6817 * verification failed, this amount may be less than the amount
6818 * requested.
6819 */
6820 kern_return_t
vm_fault_copy(vm_object_t src_object,vm_object_offset_t src_offset,vm_map_size_t * copy_size,vm_object_t dst_object,vm_object_offset_t dst_offset,vm_map_t dst_map,vm_map_version_t * dst_version,int interruptible)6821 vm_fault_copy(
6822 vm_object_t src_object,
6823 vm_object_offset_t src_offset,
6824 vm_map_size_t *copy_size, /* INOUT */
6825 vm_object_t dst_object,
6826 vm_object_offset_t dst_offset,
6827 vm_map_t dst_map,
6828 vm_map_version_t *dst_version,
6829 int interruptible)
6830 {
6831 vm_page_t result_page;
6832
6833 vm_page_t src_page;
6834 vm_page_t src_top_page;
6835 vm_prot_t src_prot;
6836
6837 vm_page_t dst_page;
6838 vm_page_t dst_top_page;
6839 vm_prot_t dst_prot;
6840
6841 vm_map_size_t amount_left;
6842 vm_object_t old_copy_object;
6843 vm_object_t result_page_object = NULL;
6844 kern_return_t error = 0;
6845 vm_fault_return_t result;
6846
6847 vm_map_size_t part_size;
6848 struct vm_object_fault_info fault_info_src = {};
6849 struct vm_object_fault_info fault_info_dst = {};
6850
6851 /*
6852 * In order not to confuse the clustered pageins, align
6853 * the different offsets on a page boundary.
6854 */
6855
6856 #define RETURN(x) \
6857 MACRO_BEGIN \
6858 *copy_size -= amount_left; \
6859 MACRO_RETURN(x); \
6860 MACRO_END
6861
6862 amount_left = *copy_size;
6863
6864 fault_info_src.interruptible = interruptible;
6865 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
6866 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
6867 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
6868 fault_info_src.stealth = TRUE;
6869
6870 fault_info_dst.interruptible = interruptible;
6871 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
6872 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
6873 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
6874 fault_info_dst.stealth = TRUE;
6875
6876 do { /* while (amount_left > 0) */
6877 /*
6878 * There may be a deadlock if both source and destination
6879 * pages are the same. To avoid this deadlock, the copy must
6880 * start by getting the destination page in order to apply
6881 * COW semantics if any.
6882 */
6883
6884 RetryDestinationFault:;
6885
6886 dst_prot = VM_PROT_WRITE | VM_PROT_READ;
6887
6888 vm_object_lock(dst_object);
6889 vm_object_paging_begin(dst_object);
6890
6891 /* cap cluster size at maximum UPL size */
6892 upl_size_t cluster_size;
6893 if (os_convert_overflow(amount_left, &cluster_size)) {
6894 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6895 }
6896 fault_info_dst.cluster_size = cluster_size;
6897
6898 dst_page = VM_PAGE_NULL;
6899 result = vm_fault_page(dst_object,
6900 vm_object_trunc_page(dst_offset),
6901 VM_PROT_WRITE | VM_PROT_READ,
6902 FALSE,
6903 FALSE, /* page not looked up */
6904 &dst_prot, &dst_page, &dst_top_page,
6905 (int *)0,
6906 &error,
6907 dst_map->no_zero_fill,
6908 &fault_info_dst);
6909 switch (result) {
6910 case VM_FAULT_SUCCESS:
6911 break;
6912 case VM_FAULT_RETRY:
6913 goto RetryDestinationFault;
6914 case VM_FAULT_MEMORY_SHORTAGE:
6915 if (vm_page_wait(interruptible)) {
6916 goto RetryDestinationFault;
6917 }
6918 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_COPY_MEMORY_SHORTAGE), 0 /* arg */);
6919 OS_FALLTHROUGH;
6920 case VM_FAULT_INTERRUPTED:
6921 RETURN(MACH_SEND_INTERRUPTED);
6922 case VM_FAULT_SUCCESS_NO_VM_PAGE:
6923 /* success but no VM page: fail the copy */
6924 vm_object_paging_end(dst_object);
6925 vm_object_unlock(dst_object);
6926 OS_FALLTHROUGH;
6927 case VM_FAULT_MEMORY_ERROR:
6928 if (error) {
6929 return error;
6930 } else {
6931 return KERN_MEMORY_ERROR;
6932 }
6933 default:
6934 panic("vm_fault_copy: unexpected error 0x%x from "
6935 "vm_fault_page()\n", result);
6936 }
6937 assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
6938
6939 assert(dst_object == VM_PAGE_OBJECT(dst_page));
6940 old_copy_object = dst_object->copy;
6941
6942 /*
6943 * There exists the possiblity that the source and
6944 * destination page are the same. But we can't
6945 * easily determine that now. If they are the
6946 * same, the call to vm_fault_page() for the
6947 * destination page will deadlock. To prevent this we
6948 * wire the page so we can drop busy without having
6949 * the page daemon steal the page. We clean up the
6950 * top page but keep the paging reference on the object
6951 * holding the dest page so it doesn't go away.
6952 */
6953
6954 vm_page_lockspin_queues();
6955 vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
6956 vm_page_unlock_queues();
6957 PAGE_WAKEUP_DONE(dst_page);
6958 vm_object_unlock(dst_object);
6959
6960 if (dst_top_page != VM_PAGE_NULL) {
6961 vm_object_lock(dst_object);
6962 VM_PAGE_FREE(dst_top_page);
6963 vm_object_paging_end(dst_object);
6964 vm_object_unlock(dst_object);
6965 }
6966
6967 RetrySourceFault:;
6968
6969 if (src_object == VM_OBJECT_NULL) {
6970 /*
6971 * No source object. We will just
6972 * zero-fill the page in dst_object.
6973 */
6974 src_page = VM_PAGE_NULL;
6975 result_page = VM_PAGE_NULL;
6976 } else {
6977 vm_object_lock(src_object);
6978 src_page = vm_page_lookup(src_object,
6979 vm_object_trunc_page(src_offset));
6980 if (src_page == dst_page) {
6981 src_prot = dst_prot;
6982 result_page = VM_PAGE_NULL;
6983 } else {
6984 src_prot = VM_PROT_READ;
6985 vm_object_paging_begin(src_object);
6986
6987 /* cap cluster size at maximum UPL size */
6988 if (os_convert_overflow(amount_left, &cluster_size)) {
6989 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6990 }
6991 fault_info_src.cluster_size = cluster_size;
6992
6993 result_page = VM_PAGE_NULL;
6994 result = vm_fault_page(
6995 src_object,
6996 vm_object_trunc_page(src_offset),
6997 VM_PROT_READ, FALSE,
6998 FALSE, /* page not looked up */
6999 &src_prot,
7000 &result_page, &src_top_page,
7001 (int *)0, &error, FALSE,
7002 &fault_info_src);
7003
7004 switch (result) {
7005 case VM_FAULT_SUCCESS:
7006 break;
7007 case VM_FAULT_RETRY:
7008 goto RetrySourceFault;
7009 case VM_FAULT_MEMORY_SHORTAGE:
7010 if (vm_page_wait(interruptible)) {
7011 goto RetrySourceFault;
7012 }
7013 OS_FALLTHROUGH;
7014 case VM_FAULT_INTERRUPTED:
7015 vm_fault_copy_dst_cleanup(dst_page);
7016 RETURN(MACH_SEND_INTERRUPTED);
7017 case VM_FAULT_SUCCESS_NO_VM_PAGE:
7018 /* success but no VM page: fail */
7019 vm_object_paging_end(src_object);
7020 vm_object_unlock(src_object);
7021 OS_FALLTHROUGH;
7022 case VM_FAULT_MEMORY_ERROR:
7023 vm_fault_copy_dst_cleanup(dst_page);
7024 if (error) {
7025 return error;
7026 } else {
7027 return KERN_MEMORY_ERROR;
7028 }
7029 default:
7030 panic("vm_fault_copy(2): unexpected "
7031 "error 0x%x from "
7032 "vm_fault_page()\n", result);
7033 }
7034
7035 result_page_object = VM_PAGE_OBJECT(result_page);
7036 assert((src_top_page == VM_PAGE_NULL) ==
7037 (result_page_object == src_object));
7038 }
7039 assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
7040 vm_object_unlock(result_page_object);
7041 }
7042
7043 vm_map_lock_read(dst_map);
7044
7045 if (!vm_map_verify(dst_map, dst_version)) {
7046 vm_map_unlock_read(dst_map);
7047 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7048 vm_fault_copy_cleanup(result_page, src_top_page);
7049 }
7050 vm_fault_copy_dst_cleanup(dst_page);
7051 break;
7052 }
7053 assert(dst_object == VM_PAGE_OBJECT(dst_page));
7054
7055 vm_object_lock(dst_object);
7056
7057 if (dst_object->copy != old_copy_object) {
7058 vm_object_unlock(dst_object);
7059 vm_map_unlock_read(dst_map);
7060 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7061 vm_fault_copy_cleanup(result_page, src_top_page);
7062 }
7063 vm_fault_copy_dst_cleanup(dst_page);
7064 break;
7065 }
7066 vm_object_unlock(dst_object);
7067
7068 /*
7069 * Copy the page, and note that it is dirty
7070 * immediately.
7071 */
7072
7073 if (!page_aligned(src_offset) ||
7074 !page_aligned(dst_offset) ||
7075 !page_aligned(amount_left)) {
7076 vm_object_offset_t src_po,
7077 dst_po;
7078
7079 src_po = src_offset - vm_object_trunc_page(src_offset);
7080 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
7081
7082 if (dst_po > src_po) {
7083 part_size = PAGE_SIZE - dst_po;
7084 } else {
7085 part_size = PAGE_SIZE - src_po;
7086 }
7087 if (part_size > (amount_left)) {
7088 part_size = amount_left;
7089 }
7090
7091 if (result_page == VM_PAGE_NULL) {
7092 assert((vm_offset_t) dst_po == dst_po);
7093 assert((vm_size_t) part_size == part_size);
7094 vm_page_part_zero_fill(dst_page,
7095 (vm_offset_t) dst_po,
7096 (vm_size_t) part_size);
7097 } else {
7098 assert((vm_offset_t) src_po == src_po);
7099 assert((vm_offset_t) dst_po == dst_po);
7100 assert((vm_size_t) part_size == part_size);
7101 vm_page_part_copy(result_page,
7102 (vm_offset_t) src_po,
7103 dst_page,
7104 (vm_offset_t) dst_po,
7105 (vm_size_t)part_size);
7106 if (!dst_page->vmp_dirty) {
7107 vm_object_lock(dst_object);
7108 SET_PAGE_DIRTY(dst_page, TRUE);
7109 vm_object_unlock(dst_object);
7110 }
7111 }
7112 } else {
7113 part_size = PAGE_SIZE;
7114
7115 if (result_page == VM_PAGE_NULL) {
7116 vm_page_zero_fill(dst_page);
7117 } else {
7118 vm_object_lock(result_page_object);
7119 vm_page_copy(result_page, dst_page);
7120 vm_object_unlock(result_page_object);
7121
7122 if (!dst_page->vmp_dirty) {
7123 vm_object_lock(dst_object);
7124 SET_PAGE_DIRTY(dst_page, TRUE);
7125 vm_object_unlock(dst_object);
7126 }
7127 }
7128 }
7129
7130 /*
7131 * Unlock everything, and return
7132 */
7133
7134 vm_map_unlock_read(dst_map);
7135
7136 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7137 vm_fault_copy_cleanup(result_page, src_top_page);
7138 }
7139 vm_fault_copy_dst_cleanup(dst_page);
7140
7141 amount_left -= part_size;
7142 src_offset += part_size;
7143 dst_offset += part_size;
7144 } while (amount_left > 0);
7145
7146 RETURN(KERN_SUCCESS);
7147 #undef RETURN
7148
7149 /*NOTREACHED*/
7150 }
7151
7152 #if VM_FAULT_CLASSIFY
7153 /*
7154 * Temporary statistics gathering support.
7155 */
7156
7157 /*
7158 * Statistics arrays:
7159 */
7160 #define VM_FAULT_TYPES_MAX 5
7161 #define VM_FAULT_LEVEL_MAX 8
7162
7163 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
7164
7165 #define VM_FAULT_TYPE_ZERO_FILL 0
7166 #define VM_FAULT_TYPE_MAP_IN 1
7167 #define VM_FAULT_TYPE_PAGER 2
7168 #define VM_FAULT_TYPE_COPY 3
7169 #define VM_FAULT_TYPE_OTHER 4
7170
7171
7172 void
vm_fault_classify(vm_object_t object,vm_object_offset_t offset,vm_prot_t fault_type)7173 vm_fault_classify(vm_object_t object,
7174 vm_object_offset_t offset,
7175 vm_prot_t fault_type)
7176 {
7177 int type, level = 0;
7178 vm_page_t m;
7179
7180 while (TRUE) {
7181 m = vm_page_lookup(object, offset);
7182 if (m != VM_PAGE_NULL) {
7183 if (m->vmp_busy || VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent) {
7184 type = VM_FAULT_TYPE_OTHER;
7185 break;
7186 }
7187 if (((fault_type & VM_PROT_WRITE) == 0) ||
7188 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
7189 type = VM_FAULT_TYPE_MAP_IN;
7190 break;
7191 }
7192 type = VM_FAULT_TYPE_COPY;
7193 break;
7194 } else {
7195 if (object->pager_created) {
7196 type = VM_FAULT_TYPE_PAGER;
7197 break;
7198 }
7199 if (object->shadow == VM_OBJECT_NULL) {
7200 type = VM_FAULT_TYPE_ZERO_FILL;
7201 break;
7202 }
7203
7204 offset += object->vo_shadow_offset;
7205 object = object->shadow;
7206 level++;
7207 continue;
7208 }
7209 }
7210
7211 if (level > VM_FAULT_LEVEL_MAX) {
7212 level = VM_FAULT_LEVEL_MAX;
7213 }
7214
7215 vm_fault_stats[type][level] += 1;
7216
7217 return;
7218 }
7219
7220 /* cleanup routine to call from debugger */
7221
7222 void
vm_fault_classify_init(void)7223 vm_fault_classify_init(void)
7224 {
7225 int type, level;
7226
7227 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
7228 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
7229 vm_fault_stats[type][level] = 0;
7230 }
7231 }
7232
7233 return;
7234 }
7235 #endif /* VM_FAULT_CLASSIFY */
7236
/*
 * Resolve a virtual address in 'map' to a physical address without taking
 * any locks or blocking, for use from the kernel debugger (KDP) context
 * only.  Returns the physical address backing 'cur_target_addr', or 0 if
 * the translation cannot be performed safely (a needed lock is held
 * exclusively, paging is in flight, the page is in a transient state, ...).
 * May decompress a compressor-backed page into a dedicated scratch page.
 */
vm_offset_t
kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
{
	vm_map_entry_t entry;
	vm_object_t object;
	vm_offset_t object_offset;
	vm_page_t m;
	int compressor_external_state, compressed_count_delta;
	int compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
	int my_fault_type = VM_PROT_READ;
	kern_return_t kr;
	int effective_page_mask, effective_page_size;

	/* honor the map's page size when it is smaller than the kernel's */
	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(map);
		effective_page_size = VM_MAP_PAGE_SIZE(map);
	} else {
		effective_page_mask = PAGE_MASK;
		effective_page_size = PAGE_SIZE;
	}

	if (not_in_kdp) {
		panic("kdp_lightweight_fault called from outside of debugger context");
	}

	assert(map != VM_MAP_NULL);

	/* caller must hand us a page-aligned address */
	assert((cur_target_addr & effective_page_mask) == 0);
	if ((cur_target_addr & effective_page_mask) != 0) {
		return 0;
	}

	/*
	 * We cannot take locks here: if anyone already holds the map lock
	 * exclusively, give up rather than read inconsistent state.
	 */
	if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
		return 0;
	}

	if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
		return 0;
	}

	/* submaps are not traversed in debugger context */
	if (entry->is_sub_map) {
		return 0;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		return 0;
	}

	object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);

	/* walk the shadow chain looking for the page */
	while (TRUE) {
		/* same rule as for the map lock: bail if held exclusive */
		if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
			return 0;
		}

		/* in-flight paging activity could leave the object inconsistent */
		if (object->pager_created && (object->paging_in_progress ||
		    object->activity_in_progress)) {
			return 0;
		}

		m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset));

		if (m != VM_PAGE_NULL) {
			/* only default-cacheable memory is safe to touch here */
			if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
				return 0;
			}

			/* any transient page state makes the frame unusable */
			if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_cleaning ||
			    m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
				return 0;
			}

			assert(!m->vmp_private);
			if (m->vmp_private) {
				return 0;
			}

			assert(!m->vmp_fictitious);
			if (m->vmp_fictitious) {
				return 0;
			}

			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
			if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
				return 0;
			}

			return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
		}

		compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;

		if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
			if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
				/*
				 * Decompress into the dedicated KDP scratch
				 * page; C_DONT_BLOCK keeps this non-blocking.
				 */
				kr = vm_compressor_pager_get(object->pager,
				    vm_object_trunc_page(object_offset + object->paging_offset),
				    kdp_compressor_decompressed_page_ppnum, &my_fault_type,
				    compressor_flags, &compressed_count_delta);
				if (kr == KERN_SUCCESS) {
					return kdp_compressor_decompressed_page_paddr;
				} else {
					return 0;
				}
			}
		}

		/* not resident anywhere in this object: follow the shadow */
		if (object->shadow == VM_OBJECT_NULL) {
			return 0;
		}

		object_offset += object->vo_shadow_offset;
		object = object->shadow;
	}
}
7352
7353 /*
7354 * vm_page_validate_cs_fast():
7355 * Performs a few quick checks to determine if the page's code signature
7356 * really needs to be fully validated. It could:
7357 * 1. have been modified (i.e. automatically tainted),
7358 * 2. have already been validated,
7359 * 3. have already been found to be tainted,
7360 * 4. no longer have a backing store.
7361 * Returns FALSE if the page needs to be fully validated.
7362 */
static boolean_t
vm_page_validate_cs_fast(
	vm_page_t page,
	vm_map_size_t fault_page_size,
	vm_map_offset_t fault_phys_offset)
{
	vm_object_t object;

	object = VM_PAGE_OBJECT(page);
	vm_object_lock_assert_held(object);

	if (page->vmp_wpmapped &&
	    !VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
		/*
		 * This page was mapped for "write" access sometime in the
		 * past and could still be modifiable in the future.
		 * Consider it tainted.
		 * [ If the page was already found to be "tainted", no
		 * need to re-validate. ]
		 */
		vm_object_lock_assert_exclusive(object);
		VMP_CS_SET_VALIDATED(page, fault_page_size, fault_phys_offset, TRUE);
		VMP_CS_SET_TAINTED(page, fault_page_size, fault_phys_offset, TRUE);
		if (cs_debug) {
			printf("CODESIGNING: %s: "
			    "page %p obj %p off 0x%llx "
			    "was modified\n",
			    __FUNCTION__,
			    page, object, page->vmp_offset);
		}
		vm_cs_validated_dirtied++;
	}

	/* already decided (validated or tainted): nothing left to do */
	if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) ||
	    VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
		return TRUE;
	}
	vm_object_lock_assert_exclusive(object);

#if CHECK_CS_VALIDATION_BITMAP
	/*
	 * A previously-validated page may be recorded in the vnode's
	 * validation bitmap; if so we can skip re-validation entirely.
	 */
	kern_return_t kr;

	kr = vnode_pager_cs_check_validation_bitmap(
		object->pager,
		page->vmp_offset + object->paging_offset,
		CS_BITMAP_CHECK);
	if (kr == KERN_SUCCESS) {
		page->vmp_cs_validated = VMP_CS_ALL_TRUE;
		page->vmp_cs_tainted = VMP_CS_ALL_FALSE;
		vm_cs_bitmap_validated++;
		return TRUE;
	}
#endif /* CHECK_CS_VALIDATION_BITMAP */

	if (!object->alive || object->terminating || object->pager == NULL) {
		/*
		 * The object is terminating and we don't have its pager
		 * so we can't validate the data...
		 */
		return TRUE;
	}

	/* we need to really validate this page */
	vm_object_lock_assert_exclusive(object);
	return FALSE;
}
7429
7430 void
vm_page_validate_cs_mapped_slow(vm_page_t page,const void * kaddr)7431 vm_page_validate_cs_mapped_slow(
7432 vm_page_t page,
7433 const void *kaddr)
7434 {
7435 vm_object_t object;
7436 memory_object_offset_t mo_offset;
7437 memory_object_t pager;
7438 struct vnode *vnode;
7439 int validated, tainted, nx;
7440
7441 assert(page->vmp_busy);
7442 object = VM_PAGE_OBJECT(page);
7443 vm_object_lock_assert_exclusive(object);
7444
7445 vm_cs_validates++;
7446
7447 /*
7448 * Since we get here to validate a page that was brought in by
7449 * the pager, we know that this pager is all setup and ready
7450 * by now.
7451 */
7452 assert(object->code_signed);
7453 assert(!object->internal);
7454 assert(object->pager != NULL);
7455 assert(object->pager_ready);
7456
7457 pager = object->pager;
7458 assert(object->paging_in_progress);
7459 vnode = vnode_pager_lookup_vnode(pager);
7460 mo_offset = page->vmp_offset + object->paging_offset;
7461
7462 /* verify the SHA1 hash for this page */
7463 validated = 0;
7464 tainted = 0;
7465 nx = 0;
7466 cs_validate_page(vnode,
7467 pager,
7468 mo_offset,
7469 (const void *)((const char *)kaddr),
7470 &validated,
7471 &tainted,
7472 &nx);
7473
7474 page->vmp_cs_validated |= validated;
7475 page->vmp_cs_tainted |= tainted;
7476 page->vmp_cs_nx |= nx;
7477
7478 #if CHECK_CS_VALIDATION_BITMAP
7479 if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
7480 page->vmp_cs_tainted == VMP_CS_ALL_FALSE) {
7481 vnode_pager_cs_check_validation_bitmap(object->pager,
7482 mo_offset,
7483 CS_BITMAP_SET);
7484 }
7485 #endif /* CHECK_CS_VALIDATION_BITMAP */
7486 }
7487
7488 void
vm_page_validate_cs_mapped(vm_page_t page,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,const void * kaddr)7489 vm_page_validate_cs_mapped(
7490 vm_page_t page,
7491 vm_map_size_t fault_page_size,
7492 vm_map_offset_t fault_phys_offset,
7493 const void *kaddr)
7494 {
7495 if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7496 vm_page_validate_cs_mapped_slow(page, kaddr);
7497 }
7498 }
7499
/*
 * Map 'page' into the kernel address space (if it isn't already mapped)
 * and run the full code-signing validation on it.  The object must be
 * locked exclusive on entry and remains locked throughout.
 */
static void
vm_page_map_and_validate_cs(
	vm_object_t object,
	vm_page_t page)
{
	vm_object_offset_t offset;
	vm_map_offset_t koffset;
	vm_map_size_t ksize;
	vm_offset_t kaddr;
	kern_return_t kr;
	boolean_t busy_page;
	boolean_t need_unmap;

	vm_object_lock_assert_exclusive(object);

	assert(object->code_signed);
	offset = page->vmp_offset;

	busy_page = page->vmp_busy;
	if (!busy_page) {
		/* keep page busy while we map (and unlock) the VM object */
		page->vmp_busy = TRUE;
	}

	/*
	 * Take a paging reference on the VM object
	 * to protect it from collapse or bypass,
	 * and keep it from disappearing too.
	 */
	vm_object_paging_begin(object);

	/* map the page in the kernel address space */
	ksize = PAGE_SIZE_64;
	koffset = 0;
	need_unmap = FALSE;
	kr = vm_paging_map_object(page,
	    object,
	    offset,
	    VM_PROT_READ,
	    FALSE, /* can't unlock object ! */
	    &ksize,
	    &koffset,
	    &need_unmap);
	if (kr != KERN_SUCCESS) {
		panic("%s: could not map page: 0x%x", __FUNCTION__, kr);
	}
	kaddr = CAST_DOWN(vm_offset_t, koffset);

	/* validate the mapped page */
	vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);

	assert(page->vmp_busy);
	assert(object == VM_PAGE_OBJECT(page));
	vm_object_lock_assert_exclusive(object);

	if (!busy_page) {
		/* undo the vmp_busy we set above and wake any waiters */
		PAGE_WAKEUP_DONE(page);
	}
	if (need_unmap) {
		/* unmap the map from the kernel address space */
		vm_paging_unmap_object(object, koffset, koffset + ksize);
		koffset = 0;
		ksize = 0;
		kaddr = 0;
	}
	/* release the paging reference taken above */
	vm_object_paging_end(object);
}
7567
7568 void
vm_page_validate_cs(vm_page_t page,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset)7569 vm_page_validate_cs(
7570 vm_page_t page,
7571 vm_map_size_t fault_page_size,
7572 vm_map_offset_t fault_phys_offset)
7573 {
7574 vm_object_t object;
7575
7576 object = VM_PAGE_OBJECT(page);
7577 vm_object_lock_assert_held(object);
7578
7579 if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7580 return;
7581 }
7582 vm_page_map_and_validate_cs(object, page);
7583 }
7584
/*
 * Validate the code signature of a sub-range of a busy, kernel-mapped
 * page: bytes [chunk_offset, chunk_offset + chunk_size) at 'kaddr'.
 * On return, *validated_p reports whether the range verified and
 * *tainted_p carries any taint flags from cs_validate_range().
 */
void
vm_page_validate_cs_mapped_chunk(
	vm_page_t page,
	const void *kaddr,
	vm_offset_t chunk_offset,
	vm_size_t chunk_size,
	boolean_t *validated_p,
	unsigned *tainted_p)
{
	vm_object_t object;
	vm_object_offset_t offset, offset_in_page;
	memory_object_t pager;
	struct vnode *vnode;
	boolean_t validated;
	unsigned tainted;

	/* default to "not validated, not tainted" on every early return */
	*validated_p = FALSE;
	*tainted_p = 0;

	assert(page->vmp_busy);
	object = VM_PAGE_OBJECT(page);
	vm_object_lock_assert_exclusive(object);

	assert(object->code_signed);
	offset = page->vmp_offset;

	if (!object->alive || object->terminating || object->pager == NULL) {
		/*
		 * The object is terminating and we don't have its pager
		 * so we can't validate the data...
		 */
		return;
	}
	/*
	 * Since we get here to validate a page that was brought in by
	 * the pager, we know that this pager is all setup and ready
	 * by now.
	 */
	assert(!object->internal);
	assert(object->pager != NULL);
	assert(object->pager_ready);

	pager = object->pager;
	assert(object->paging_in_progress);
	vnode = vnode_pager_lookup_vnode(pager);

	/* verify the signature for this chunk */
	offset_in_page = chunk_offset;
	assert(offset_in_page < PAGE_SIZE);

	tainted = 0;
	validated = cs_validate_range(vnode,
	    pager,
	    (object->paging_offset +
	    offset +
	    offset_in_page),
	    (const void *)((const char *)kaddr
	    + offset_in_page),
	    chunk_size,
	    &tainted);
	if (validated) {
		*validated_p = TRUE;
	}
	if (tainted) {
		*tainted_p = tainted;
	}
}
7652
/* Take the spinlock protecting the real-time fault record ring buffer. */
static void
vm_rtfrecord_lock(void)
{
	lck_spin_lock(&vm_rtfr_slock);
}
7658
/* Release the spinlock protecting the real-time fault record ring buffer. */
static void
vm_rtfrecord_unlock(void)
{
	lck_spin_unlock(&vm_rtfr_slock);
}
7664
/*
 * Size in bytes of a buffer large enough to hold every record in the
 * real-time fault ring (vmrtf_num_records entries).
 */
unsigned int
vmrtfaultinfo_bufsz(void)
{
	return vmrtf_num_records * sizeof(vm_rtfault_record_t);
}
7670
7671 #include <kern/backtrace.h>
7672
7673 __attribute__((noinline))
7674 static void
vm_record_rtfault(thread_t cthread,uint64_t fstart,vm_map_offset_t fault_vaddr,int type_of_fault)7675 vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
7676 {
7677 uint64_t fend = mach_continuous_time();
7678
7679 uint64_t cfpc = 0;
7680 uint64_t ctid = cthread->thread_id;
7681 uint64_t cupid = get_current_unique_pid();
7682
7683 uintptr_t bpc = 0;
7684 errno_t btr = 0;
7685
7686 /*
7687 * Capture a single-frame backtrace. This extracts just the program
7688 * counter at the point of the fault, and should not use copyin to get
7689 * Rosetta save state.
7690 */
7691 struct backtrace_control ctl = {
7692 .btc_user_thread = cthread,
7693 .btc_user_copy = backtrace_user_copy_error,
7694 };
7695 unsigned int bfrs = backtrace_user(&bpc, 1U, &ctl, NULL);
7696 if ((btr == 0) && (bfrs > 0)) {
7697 cfpc = bpc;
7698 }
7699
7700 assert((fstart != 0) && fend >= fstart);
7701 vm_rtfrecord_lock();
7702 assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
7703
7704 vmrtfrs.vmrtf_total++;
7705 vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
7706
7707 cvmr->rtfabstime = fstart;
7708 cvmr->rtfduration = fend - fstart;
7709 cvmr->rtfaddr = fault_vaddr;
7710 cvmr->rtfpc = cfpc;
7711 cvmr->rtftype = type_of_fault;
7712 cvmr->rtfupid = cupid;
7713 cvmr->rtftid = ctid;
7714
7715 if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
7716 vmrtfrs.vmrtfr_curi = 0;
7717 }
7718
7719 vm_rtfrecord_unlock();
7720 }
7721
/*
 * Copy raw fault records whose rtfupid matches 'cupid' out of the global
 * ring into the caller-supplied buffer 'vrecords' ('vrecordsz' bytes).
 * On DEVELOPMENT/DEBUG kernels a root caller receives every record
 * regardless of pid.  *vmrtfrv is set to the number of records copied;
 * the return value is non-zero if the buffer filled before the whole
 * ring was scanned.
 */
int
vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void *vrecords, unsigned long *vmrtfrv)
{
	vm_rtfault_record_t *cvmrd = vrecords;
	size_t residue = vrecordsz;
	size_t numextracted = 0;
	boolean_t early_exit = FALSE;

	vm_rtfrecord_lock();

	for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
		/* stop once the output buffer can't hold another record */
		if (residue < sizeof(vm_rtfault_record_t)) {
			early_exit = TRUE;
			break;
		}

		if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
#if DEVELOPMENT || DEBUG
			/* root sees all records on development/debug kernels */
			if (isroot == FALSE) {
				continue;
			}
#else
			continue;
#endif /* DEVDEBUG */
		}

		*cvmrd = vmrtfrs.vm_rtf_records[vmfi];
		cvmrd++;
		residue -= sizeof(vm_rtfault_record_t);
		numextracted++;
	}

	vm_rtfrecord_unlock();

	*vmrtfrv = numextracted;
	return early_exit;
}
7759
/*
 * Only allow one diagnosis to be in flight at a time, to avoid
 * creating too much additional memory usage.
 */
static volatile uint_t vmtc_diagnosing;
/*
 * Total count of text page corruption diagnoses; presumably bumped by
 * the diagnosis path — the increment is not visible in this chunk.
 */
unsigned int vmtc_total = 0;

/*
 * Type used to update telemetry for the diagnosis counts.
 */
CA_EVENT(vmtc_telemetry,
    CA_INT, vmtc_num_byte,              /* number of corrupt bytes found */
    CA_BOOL, vmtc_undiagnosed,          /* undiagnosed because more than 1 at a time */
    CA_BOOL, vmtc_not_eligible,         /* the page didn't qualify */
    CA_BOOL, vmtc_copyin_fail,          /* unable to copy in the page */
    CA_BOOL, vmtc_not_found,            /* no corruption found even though CS failed */
    CA_BOOL, vmtc_one_bit_flip,         /* single bit flip */
    CA_BOOL, vmtc_testing);             /* caused on purpose by testing */

#if DEVELOPMENT || DEBUG
/*
 * Buffers used to compare before/after page contents.
 * Stashed to aid when debugging crashes.
 */
static size_t vmtc_last_buffer_size = 0;
static uint64_t *vmtc_last_before_buffer = NULL;
static uint64_t *vmtc_last_after_buffer = NULL;

/*
 * Needed to record corruptions due to testing.
 */
static uintptr_t corruption_test_va = 0;
#endif /* DEVELOPMENT || DEBUG */
7793
7794 /*
7795 * Stash a copy of data from a possibly corrupt page.
7796 */
static uint64_t *
vmtc_get_page_data(
	vm_map_offset_t code_addr,
	vm_page_t page)
{
	uint64_t *buffer = NULL;
	addr64_t buffer_paddr;
	addr64_t page_paddr;
	extern void bcopy_phys(addr64_t from, addr64_t to, vm_size_t bytes);
	/* copy only the map's page size worth, capped at a kernel page */
	uint_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);

	/*
	 * Need an aligned buffer to do a physical copy.
	 * Caller is responsible for kmem_free()ing the returned buffer.
	 */
	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)&buffer,
	    size, size - 1, KMA_KOBJECT, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) {
		return NULL;
	}
	buffer_paddr = kvtophys((vm_offset_t)buffer);
	page_paddr = ptoa(VM_PAGE_GET_PHYS_PAGE(page));

	/* adjust the page start address if we need only 4K of a 16K page */
	if (size < PAGE_SIZE) {
		uint_t subpage_start = ((code_addr & (PAGE_SIZE - 1)) & ~(size - 1));
		page_paddr += subpage_start;
	}

	/* physical copy avoids relying on any virtual mapping of the page */
	bcopy_phys(page_paddr, buffer_paddr, size);
	return buffer;
}
7827
7828 /*
7829 * Set things up so we can diagnose a potential text page corruption.
7830 */
7831 static uint64_t *
vmtc_text_page_diagnose_setup(vm_map_offset_t code_addr,vm_page_t page,CA_EVENT_TYPE (vmtc_telemetry)* event)7832 vmtc_text_page_diagnose_setup(
7833 vm_map_offset_t code_addr,
7834 vm_page_t page,
7835 CA_EVENT_TYPE(vmtc_telemetry) *event)
7836 {
7837 uint64_t *buffer = NULL;
7838
7839 /*
7840 * If another is being diagnosed, skip this one.
7841 */
7842 if (!OSCompareAndSwap(0, 1, &vmtc_diagnosing)) {
7843 event->vmtc_undiagnosed = true;
7844 return NULL;
7845 }
7846
7847 /*
7848 * Get the contents of the corrupt page.
7849 */
7850 buffer = vmtc_get_page_data(code_addr, page);
7851 if (buffer == NULL) {
7852 event->vmtc_copyin_fail = true;
7853 if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
7854 panic("Bad compare and swap in setup!");
7855 }
7856 return NULL;
7857 }
7858 return buffer;
7859 }
7860
7861 /*
7862 * Diagnose the text page by comparing its contents with
7863 * the one we've previously saved.
7864 */
static void
vmtc_text_page_diagnose(
	vm_map_offset_t code_addr,
	uint64_t *old_code_buffer,
	CA_EVENT_TYPE(vmtc_telemetry) *event)
{
	uint64_t *new_code_buffer;
	size_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
	uint_t count = (uint_t)size / sizeof(uint64_t);
	uint_t diff_count = 0;
	bool bit_flip = false;
	uint_t b;
	uint64_t *new;
	uint64_t *old;

	new_code_buffer = kalloc_data(size, Z_WAITOK);
	assert(new_code_buffer != NULL);
	/* re-read the current contents of the page from user space */
	if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), new_code_buffer, size) != 0) {
		/* copyin error, so undo things */
		event->vmtc_copyin_fail = true;
		goto done;
	}

	/* compare the fresh copy against the stashed one, word by word */
	new = new_code_buffer;
	old = old_code_buffer;
	for (; count-- > 0; ++new, ++old) {
		if (*new == *old) {
			continue;
		}

		/*
		 * On first diff, check for a single bit flip
		 */
		if (diff_count == 0) {
			uint64_t x = (*new ^ *old);
			assert(x != 0);
			/* power-of-two XOR means exactly one differing bit */
			if ((x & (x - 1)) == 0) {
				bit_flip = true;
				++diff_count;
				continue;
			}
		}

		/*
		 * count up the number of different bytes.
		 */
		for (b = 0; b < sizeof(uint64_t); ++b) {
			char *n = (char *)new;
			char *o = (char *)old;
			if (n[b] != o[b]) {
				++diff_count;
			}
		}
	}

	/* a single-bit-flip claim only stands if nothing else differed */
	if (diff_count > 1) {
		bit_flip = false;
	}

	if (diff_count == 0) {
		event->vmtc_not_found = true;
	} else {
		event->vmtc_num_byte = diff_count;
	}
	if (bit_flip) {
		event->vmtc_one_bit_flip = true;
	}

done:
	/*
	 * Free up the code copy buffers, but save the last
	 * set on development / debug kernels in case they
	 * can provide evidence for debugging memory stomps.
	 */
#if DEVELOPMENT || DEBUG
	if (vmtc_last_before_buffer != NULL) {
		kmem_free(kernel_map, (vm_offset_t)vmtc_last_before_buffer, vmtc_last_buffer_size);
	}
	if (vmtc_last_after_buffer != NULL) {
		kfree_data(vmtc_last_after_buffer, vmtc_last_buffer_size);
	}
	vmtc_last_before_buffer = old_code_buffer;
	vmtc_last_after_buffer = new_code_buffer;
	vmtc_last_buffer_size = size;
#else /* DEVELOPMENT || DEBUG */
	/* note: old_code_buffer came from kernel_memory_allocate() */
	kfree_data(new_code_buffer, size);
	kmem_free(kernel_map, (vm_offset_t)old_code_buffer, size);
#endif /* DEVELOPMENT || DEBUG */

	/*
	 * We're finished, so clear the diagnosing flag.
	 */
	if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
		panic("Bad compare and swap in diagnose!");
	}
}
7961
7962 /*
7963 * For the given map, virt address, find the object, offset, and page.
7964 * This has to lookup the map entry, verify protections, walk any shadow chains.
7965 * If found, returns with the object locked.
7966 */
static kern_return_t
vmtc_revalidate_lookup(
	vm_map_t map,
	vm_map_offset_t vaddr,
	vm_object_t *ret_object,
	vm_object_offset_t *ret_offset,
	vm_page_t *ret_page,
	vm_prot_t *ret_prot)
{
	vm_object_t object;
	vm_object_offset_t offset;
	vm_page_t page;
	kern_return_t kr = KERN_SUCCESS;
	uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
	vm_map_version_t version;
	boolean_t wired;
	struct vm_object_fault_info fault_info = {};
	vm_map_t real_map = NULL;
	vm_prot_t prot;
	vm_object_t shadow;

	/*
	 * Find the object/offset for the given location/map.
	 * Note this returns with the object locked.
	 */
restart:
	vm_map_lock_read(map);
	object = VM_OBJECT_NULL; /* in case we come around the restart path */
	kr = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
	    object_lock_type, &version, &object, &offset, &prot, &wired,
	    &fault_info, &real_map, NULL);
	vm_map_unlock_read(map);
	if (real_map != NULL && real_map != map) {
		vm_map_unlock(real_map);
	}

	/*
	 * If there's no page here, fail.
	 */
	if (kr != KERN_SUCCESS || object == NULL) {
		kr = KERN_FAILURE;
		goto done;
	}

	/*
	 * Chase down any shadow chains to find the actual page.
	 */
	for (;;) {
		/*
		 * See if the page is on the current object.
		 */
		page = vm_page_lookup(object, vm_object_trunc_page(offset));
		if (page != NULL) {
			/* restart the lookup */
			if (page->vmp_restart) {
				vm_object_unlock(object);
				goto restart;
			}

			/*
			 * If this page is busy, we need to wait for it.
			 * The sleep drops/retakes state, so redo the whole
			 * lookup from the top afterwards.
			 */
			if (page->vmp_busy) {
				PAGE_SLEEP(object, page, TRUE);
				vm_object_unlock(object);
				goto restart;
			}
			break;
		}

		/*
		 * If the object doesn't have the page and
		 * has no shadow, then we can quit.
		 */
		shadow = object->shadow;
		if (shadow == NULL) {
			kr = KERN_FAILURE;
			goto done;
		}

		/*
		 * Move to the next object.  Lock the shadow before
		 * dropping the current object's lock so the chain
		 * can't shift underneath us.
		 */
		offset += object->vo_shadow_offset;
		vm_object_lock(shadow);
		vm_object_unlock(object);
		object = shadow;
		shadow = VM_OBJECT_NULL;
	}
	/* success: hand back the locked object and its page */
	*ret_object = object;
	*ret_offset = vm_object_trunc_page(offset);
	*ret_page = page;
	*ret_prot = prot;

done:
	/* on failure, don't leak the object lock */
	if (kr != KERN_SUCCESS && object != NULL) {
		vm_object_unlock(object);
	}
	return kr;
}
8067
8068 /*
8069 * Check if a page is wired, needs extra locking.
8070 */
8071 static bool
is_page_wired(vm_page_t page)8072 is_page_wired(vm_page_t page)
8073 {
8074 bool result;
8075 vm_page_lock_queues();
8076 result = VM_PAGE_WIRED(page);
8077 vm_page_unlock_queues();
8078 return result;
8079 }
8080
8081 /*
8082 * A fatal process error has occurred in the given task.
8083 * Recheck the code signing of the text page at the given
8084 * address to check for a text page corruption.
8085 *
8086 * Returns KERN_FAILURE if a page was found to be corrupt
8087 * by failing to match its code signature. KERN_SUCCESS
8088 * means the page is either valid or we don't have the
8089 * information to say it's corrupt.
8090 */
8091 kern_return_t
revalidate_text_page(task_t task,vm_map_offset_t code_addr)8092 revalidate_text_page(task_t task, vm_map_offset_t code_addr)
8093 {
8094 kern_return_t kr;
8095 vm_map_t map;
8096 vm_object_t object = NULL;
8097 vm_object_offset_t offset;
8098 vm_page_t page = NULL;
8099 struct vnode *vnode;
8100 uint64_t *diagnose_buffer = NULL;
8101 CA_EVENT_TYPE(vmtc_telemetry) * event = NULL;
8102 ca_event_t ca_event = NULL;
8103 vm_prot_t prot;
8104
8105 map = task->map;
8106 if (task->map == NULL) {
8107 return KERN_SUCCESS;
8108 }
8109
8110 kr = vmtc_revalidate_lookup(map, code_addr, &object, &offset, &page, &prot);
8111 if (kr != KERN_SUCCESS) {
8112 goto done;
8113 }
8114
8115 /*
8116 * The page must be executable.
8117 */
8118 if (!(prot & VM_PROT_EXECUTE)) {
8119 goto done;
8120 }
8121
8122 /*
8123 * The object needs to have a pager.
8124 */
8125 if (object->pager == NULL) {
8126 goto done;
8127 }
8128
8129 /*
8130 * Needs to be a vnode backed page to have a signature.
8131 */
8132 vnode = vnode_pager_lookup_vnode(object->pager);
8133 if (vnode == NULL) {
8134 goto done;
8135 }
8136
8137 /*
8138 * Object checks to see if we should proceed.
8139 */
8140 if (!object->code_signed || /* no code signature to check */
8141 object->internal || /* internal objects aren't signed */
8142 object->terminating || /* the object and its pages are already going away */
8143 !object->pager_ready) { /* this should happen, but check shouldn't hurt */
8144 goto done;
8145 }
8146
8147
8148 /*
8149 * Check the code signature of the page in question.
8150 */
8151 vm_page_map_and_validate_cs(object, page);
8152
8153 /*
8154 * At this point:
8155 * vmp_cs_validated |= validated (set if a code signature exists)
8156 * vmp_cs_tainted |= tainted (set if code signature violation)
8157 * vmp_cs_nx |= nx; ??
8158 *
8159 * if vmp_pmapped then have to pmap_disconnect..
8160 * other flags to check on object or page?
8161 */
8162 if (page->vmp_cs_tainted != VMP_CS_ALL_FALSE) {
8163 #if DEBUG || DEVELOPMENT
8164 /*
8165 * On development builds, a boot-arg can be used to cause
8166 * a panic, instead of a quiet repair.
8167 */
8168 if (vmtc_panic_instead) {
8169 panic("Text page corruption detected: vm_page_t 0x%llx", (long long)(uintptr_t)page);
8170 }
8171 #endif /* DEBUG || DEVELOPMENT */
8172
8173 /*
8174 * We're going to invalidate this page. Grab a copy of it for comparison.
8175 */
8176 ca_event = CA_EVENT_ALLOCATE(vmtc_telemetry);
8177 event = ca_event->data;
8178 diagnose_buffer = vmtc_text_page_diagnose_setup(code_addr, page, event);
8179
8180 /*
8181 * Invalidate, i.e. toss, the corrupted page.
8182 */
8183 if (!page->vmp_cleaning &&
8184 !page->vmp_laundry &&
8185 !page->vmp_fictitious &&
8186 !page->vmp_precious &&
8187 !page->vmp_absent &&
8188 !VMP_ERROR_GET(page) &&
8189 !page->vmp_dirty &&
8190 !is_page_wired(page)) {
8191 if (page->vmp_pmapped) {
8192 int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
8193 if (refmod & VM_MEM_MODIFIED) {
8194 SET_PAGE_DIRTY(page, FALSE);
8195 }
8196 if (refmod & VM_MEM_REFERENCED) {
8197 page->vmp_reference = TRUE;
8198 }
8199 }
8200 /* If the page seems intentionally modified, don't trash it. */
8201 if (!page->vmp_dirty) {
8202 VM_PAGE_FREE(page);
8203 } else {
8204 event->vmtc_not_eligible = true;
8205 }
8206 } else {
8207 event->vmtc_not_eligible = true;
8208 }
8209 vm_object_unlock(object);
8210 object = VM_OBJECT_NULL;
8211
8212 /*
8213 * Now try to diagnose the type of failure by faulting
8214 * in a new copy and diff'ing it with what we saved.
8215 */
8216 if (diagnose_buffer != NULL) {
8217 vmtc_text_page_diagnose(code_addr, diagnose_buffer, event);
8218 }
8219 #if DEBUG || DEVELOPMENT
8220 if (corruption_test_va != 0) {
8221 corruption_test_va = 0;
8222 event->vmtc_testing = true;
8223 }
8224 #endif /* DEBUG || DEVELOPMENT */
8225 ktriage_record(thread_tid(current_thread()),
8226 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_TEXT_CORRUPTION),
8227 0 /* arg */);
8228 CA_EVENT_SEND(ca_event);
8229 printf("Text page corruption detected for pid %d\n", proc_selfpid());
8230 ++vmtc_total;
8231 return KERN_FAILURE; /* failure means we definitely found a corrupt page */
8232 }
8233 done:
8234 if (object != NULL) {
8235 vm_object_unlock(object);
8236 }
8237 return KERN_SUCCESS;
8238 }
8239
8240 #if DEBUG || DEVELOPMENT
8241 /*
8242 * For implementing unit tests - ask the pmap to corrupt a text page.
8243 * We have to find the page, to get the physical address, then invoke
8244 * the pmap.
8245 */
8246 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
8247
8248 kern_return_t
vm_corrupt_text_addr(uintptr_t va)8249 vm_corrupt_text_addr(uintptr_t va)
8250 {
8251 task_t task = current_task();
8252 vm_map_t map;
8253 kern_return_t kr = KERN_SUCCESS;
8254 vm_object_t object = VM_OBJECT_NULL;
8255 vm_object_offset_t offset;
8256 vm_page_t page = NULL;
8257 pmap_paddr_t pa;
8258 vm_prot_t prot;
8259
8260 map = task->map;
8261 if (task->map == NULL) {
8262 printf("corrupt_text_addr: no map\n");
8263 return KERN_FAILURE;
8264 }
8265
8266 kr = vmtc_revalidate_lookup(map, (vm_map_offset_t)va, &object, &offset, &page, &prot);
8267 if (kr != KERN_SUCCESS) {
8268 printf("corrupt_text_addr: page lookup failed\n");
8269 return kr;
8270 }
8271 if (!(prot & VM_PROT_EXECUTE)) {
8272 printf("corrupt_text_addr: page not executable\n");
8273 return KERN_FAILURE;
8274 }
8275
8276 /* get the physical address to use */
8277 pa = ptoa(VM_PAGE_GET_PHYS_PAGE(page)) + (va - vm_object_trunc_page(va));
8278
8279 /*
8280 * Check we have something we can work with.
8281 * Due to racing with pageout as we enter the sysctl,
8282 * it's theoretically possible to have the page disappear, just
8283 * before the lookup.
8284 *
8285 * That's highly likely to happen often. I've filed a radar 72857482
8286 * to bubble up the error here to the sysctl result and have the
8287 * test not FAIL in that case.
8288 */
8289 if (page->vmp_busy) {
8290 printf("corrupt_text_addr: vmp_busy\n");
8291 kr = KERN_FAILURE;
8292 }
8293 if (page->vmp_cleaning) {
8294 printf("corrupt_text_addr: vmp_cleaning\n");
8295 kr = KERN_FAILURE;
8296 }
8297 if (page->vmp_laundry) {
8298 printf("corrupt_text_addr: vmp_cleaning\n");
8299 kr = KERN_FAILURE;
8300 }
8301 if (page->vmp_fictitious) {
8302 printf("corrupt_text_addr: vmp_fictitious\n");
8303 kr = KERN_FAILURE;
8304 }
8305 if (page->vmp_precious) {
8306 printf("corrupt_text_addr: vmp_precious\n");
8307 kr = KERN_FAILURE;
8308 }
8309 if (page->vmp_absent) {
8310 printf("corrupt_text_addr: vmp_absent\n");
8311 kr = KERN_FAILURE;
8312 }
8313 if (VMP_ERROR_GET(page)) {
8314 printf("corrupt_text_addr: vmp_error\n");
8315 kr = KERN_FAILURE;
8316 }
8317 if (page->vmp_dirty) {
8318 printf("corrupt_text_addr: vmp_dirty\n");
8319 kr = KERN_FAILURE;
8320 }
8321 if (is_page_wired(page)) {
8322 printf("corrupt_text_addr: wired\n");
8323 kr = KERN_FAILURE;
8324 }
8325 if (!page->vmp_pmapped) {
8326 printf("corrupt_text_addr: !vmp_pmapped\n");
8327 kr = KERN_FAILURE;
8328 }
8329
8330 if (kr == KERN_SUCCESS) {
8331 printf("corrupt_text_addr: using physaddr 0x%llx\n", (long long)pa);
8332 kr = pmap_test_text_corruption(pa);
8333 if (kr != KERN_SUCCESS) {
8334 printf("corrupt_text_addr: pmap error %d\n", kr);
8335 } else {
8336 corruption_test_va = va;
8337 }
8338 } else {
8339 printf("corrupt_text_addr: object %p\n", object);
8340 printf("corrupt_text_addr: offset 0x%llx\n", (uint64_t)offset);
8341 printf("corrupt_text_addr: va 0x%llx\n", (uint64_t)va);
8342 printf("corrupt_text_addr: vm_object_trunc_page(va) 0x%llx\n", (uint64_t)vm_object_trunc_page(va));
8343 printf("corrupt_text_addr: vm_page_t %p\n", page);
8344 printf("corrupt_text_addr: ptoa(PHYS_PAGE) 0x%llx\n", (uint64_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page)));
8345 printf("corrupt_text_addr: using physaddr 0x%llx\n", (uint64_t)pa);
8346 }
8347
8348 if (object != VM_OBJECT_NULL) {
8349 vm_object_unlock(object);
8350 }
8351 return kr;
8352 }
8353
8354 #endif /* DEBUG || DEVELOPMENT */
8355