xref: /xnu-8792.61.2/osfmk/vm/vm_fault.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm_fault.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *
62  *	Page fault handling module.
63  */
64 
65 #include <libkern/OSAtomic.h>
66 
67 #include <mach/mach_types.h>
68 #include <mach/kern_return.h>
69 #include <mach/message.h>       /* for error codes */
70 #include <mach/vm_param.h>
71 #include <mach/vm_behavior.h>
72 #include <mach/memory_object.h>
73 /* For memory_object_data_{request,unlock} */
74 #include <mach/sdt.h>
75 
76 #include <kern/kern_types.h>
77 #include <kern/host_statistics.h>
78 #include <kern/counter.h>
79 #include <kern/task.h>
80 #include <kern/thread.h>
81 #include <kern/sched_prim.h>
82 #include <kern/host.h>
83 #include <kern/mach_param.h>
84 #include <kern/macro_help.h>
85 #include <kern/zalloc_internal.h>
86 #include <kern/misc_protos.h>
87 #include <kern/policy_internal.h>
88 
89 #include <vm/vm_compressor.h>
90 #include <vm/vm_compressor_pager.h>
91 #include <vm/vm_fault.h>
92 #include <vm/vm_map.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_page.h>
95 #include <vm/vm_kern.h>
96 #include <vm/pmap.h>
97 #include <vm/vm_pageout.h>
98 #include <vm/vm_protos.h>
99 #include <vm/vm_external.h>
100 #include <vm/memory_object.h>
101 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
102 #include <vm/vm_shared_region.h>
103 
104 #include <sys/codesign.h>
105 #include <sys/reason.h>
106 #include <sys/signalvar.h>
107 
108 #include <sys/kdebug_triage.h>
109 
110 #include <san/kasan.h>
111 #include <libkern/coreanalytics/coreanalytics.h>
112 
113 #define VM_FAULT_CLASSIFY       0
114 
115 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
116 
117 int vm_protect_privileged_from_untrusted = 1;
118 
119 unsigned int    vm_object_pagein_throttle = 16;
120 
121 /*
122  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
123  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
124  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
125  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
126  * keep the UI active so that the user has a chance to kill the offending task before the system
127  * completely hangs.
128  *
129  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
130  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
131  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
132  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
133  */
134 
135 extern void throttle_lowpri_io(int);
136 
137 extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
138 
139 uint64_t vm_hard_throttle_threshold;
140 
141 #if DEBUG || DEVELOPMENT
142 static bool vmtc_panic_instead = false;
143 int panic_object_not_alive = 1;
144 #endif /* DEBUG || DEVELOPMENT */
145 
146 OS_ALWAYS_INLINE
147 boolean_t
NEED_TO_HARD_THROTTLE_THIS_TASK(void)148 NEED_TO_HARD_THROTTLE_THIS_TASK(void)
149 {
150 	return vm_wants_task_throttled(current_task()) ||
151 	       ((vm_page_free_count < vm_page_throttle_limit ||
152 	       HARD_THROTTLE_LIMIT_REACHED()) &&
153 	       proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
154 }
155 
156 #define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
157 #define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */
158 
159 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
160 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
161 
162 
/*
 * Account one compressor decompression: bump the global VM statistic
 * counter and the current thread's per-thread decompression count.
 */
#define VM_STAT_DECOMPRESSIONS()        \
MACRO_BEGIN                             \
	counter_inc(&vm_statistics_decompressions); \
	current_thread()->decompressions++; \
MACRO_END
168 
169 boolean_t current_thread_aborted(void);
170 
171 /* Forward declarations of internal routines. */
172 static kern_return_t vm_fault_wire_fast(
173 	vm_map_t        map,
174 	vm_map_offset_t va,
175 	vm_prot_t       prot,
176 	vm_tag_t        wire_tag,
177 	vm_map_entry_t  entry,
178 	pmap_t          pmap,
179 	vm_map_offset_t pmap_addr,
180 	ppnum_t         *physpage_p);
181 
182 static kern_return_t vm_fault_internal(
183 	vm_map_t        map,
184 	vm_map_offset_t vaddr,
185 	vm_prot_t       caller_prot,
186 	boolean_t       change_wiring,
187 	vm_tag_t        wire_tag,
188 	int             interruptible,
189 	pmap_t          pmap,
190 	vm_map_offset_t pmap_addr,
191 	ppnum_t         *physpage_p);
192 
193 static void vm_fault_copy_cleanup(
194 	vm_page_t       page,
195 	vm_page_t       top_page);
196 
197 static void vm_fault_copy_dst_cleanup(
198 	vm_page_t       page);
199 
200 #if     VM_FAULT_CLASSIFY
201 extern void vm_fault_classify(vm_object_t       object,
202     vm_object_offset_t    offset,
203     vm_prot_t             fault_type);
204 
205 extern void vm_fault_classify_init(void);
206 #endif
207 
208 unsigned long vm_pmap_enter_blocked = 0;
209 unsigned long vm_pmap_enter_retried = 0;
210 
211 unsigned long vm_cs_validates = 0;
212 unsigned long vm_cs_revalidates = 0;
213 unsigned long vm_cs_query_modified = 0;
214 unsigned long vm_cs_validated_dirtied = 0;
215 unsigned long vm_cs_bitmap_validated = 0;
216 
217 void vm_pre_fault(vm_map_offset_t, vm_prot_t);
218 
219 extern char *kdp_compressor_decompressed_page;
220 extern addr64_t kdp_compressor_decompressed_page_paddr;
221 extern ppnum_t  kdp_compressor_decompressed_page_ppnum;
222 
/* Bookkeeping for the real-time fault record buffer (see vm_record_rtfault). */
struct vmrtfr {
	int vmrtfr_maxi;                     /* highest valid index (vmrtf_num_records - 1, set at init) */
	int vmrtfr_curi;                     /* current index; presumably the next slot to fill -- maintained by the recording path, not visible here */
	int64_t vmrtf_total;                 /* running total; maintained by the recording path, not visible here */
	vm_rtfault_record_t *vm_rtf_records; /* permanently allocated record array */
} vmrtfrs;
229 #define VMRTF_DEFAULT_BUFSIZE (4096)
230 #define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
231 TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT);
232 
233 static void vm_rtfrecord_lock(void);
234 static void vm_rtfrecord_unlock(void);
235 static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
236 
237 extern lck_grp_t vm_page_lck_grp_bucket;
238 extern lck_attr_t vm_page_lck_attr;
239 LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
240 
241 #if DEVELOPMENT || DEBUG
242 extern int madvise_free_debug;
243 #endif /* DEVELOPMENT || DEBUG */
244 
245 extern int vm_pageout_protect_realtime;
246 
247 #if CONFIG_FREEZE
248 #endif /* CONFIG_FREEZE */
249 
250 /*
251  *	Routine:	vm_fault_init
252  *	Purpose:
253  *		Initialize our private data structures.
254  */
__startup_func
void
vm_fault_init(void)
{
	int i, vm_compressor_temp;
	boolean_t need_default_val = TRUE;
	/*
	 * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
	 * computed as a percentage of available memory, and the percentage used is scaled inversely with
	 * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
	 * and reduce the value down to 10% for very large memory configurations.  This helps give us a
	 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
	 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
	 */

	vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;

	/*
	 * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
	 */

	if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
		/*
		 * The boot-arg is only honored if it names exactly one valid
		 * pager mode, i.e. has a single bit set within the mode range.
		 */
		for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
			if (((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
				need_default_val = FALSE;
				vm_compressor_mode = vm_compressor_temp;
				break;
			}
		}
		if (need_default_val) {
			printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
		}
	}
#if CONFIG_FREEZE
	if (need_default_val) {
		/* diagnostics environment forces the in-core (no swap) compressor */
		if (osenvironment_is_diagnostics()) {
			printf("osenvironment == \"diagnostics\". Setting \"vm_compressor_mode\" to in-core compressor only\n");
			vm_compressor_mode = VM_PAGER_COMPRESSOR_NO_SWAP;
			need_default_val = false;
		}
	}
#endif /* CONFIG_FREEZE */
	if (need_default_val) {
		/* If no boot arg or incorrect boot arg, try device tree. */
		PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
	}
	printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
	vm_config_init();

	PE_parse_boot_argn("vm_protect_privileged_from_untrusted",
	    &vm_protect_privileged_from_untrusted,
	    sizeof(vm_protect_privileged_from_untrusted));

#if DEBUG || DEVELOPMENT
	(void)PE_parse_boot_argn("text_corruption_panic", &vmtc_panic_instead, sizeof(vmtc_panic_instead));

	if (kern_feature_override(KF_MADVISE_FREE_DEBUG_OVRD)) {
		madvise_free_debug = 0;
	}

	PE_parse_boot_argn("panic_object_not_alive", &panic_object_not_alive, sizeof(panic_object_not_alive));
#endif /* DEBUG || DEVELOPMENT */
}
318 
/*
 * Allocate the buffer of real-time fault records (tunable via the
 * "vm_rtfault_records" boot-arg; clamped to at least one entry).
 * The allocation is permanent -- it is never freed.
 */
__startup_func
static void
vm_rtfault_record_init(void)
{
	size_t size;

	vmrtf_num_records = MAX(vmrtf_num_records, 1);
	size = vmrtf_num_records * sizeof(vm_rtfault_record_t);
	vmrtfrs.vm_rtf_records = zalloc_permanent_tag(size,
	    ZALIGN(vm_rtfault_record_t), VM_KERN_MEMORY_DIAG);
	vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
}
STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init);
332 
333 /*
334  *	Routine:	vm_fault_cleanup
335  *	Purpose:
336  *		Clean up the result of vm_fault_page.
337  *	Results:
338  *		The paging reference for "object" is released.
339  *		"object" is unlocked.
340  *		If "top_page" is not null,  "top_page" is
341  *		freed and the paging reference for the object
342  *		containing it is released.
343  *
344  *	In/out conditions:
345  *		"object" must be locked.
346  */
void
vm_fault_cleanup(
	vm_object_t     object,
	vm_page_t       top_page)
{
	/* release the paging reference and lock donated by vm_fault_page() */
	vm_object_paging_end(object);
	vm_object_unlock(object);

	if (top_page != VM_PAGE_NULL) {
		/*
		 * Free the busy placeholder page that vm_fault_page() left in
		 * the original (top) object, and drop that object's paging
		 * reference as well.
		 */
		object = VM_PAGE_OBJECT(top_page);

		vm_object_lock(object);
		VM_PAGE_FREE(top_page);
		vm_object_paging_end(object);
		vm_object_unlock(object);
	}
}
364 
365 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
366 
367 
368 boolean_t       vm_page_deactivate_behind = TRUE;
369 /*
370  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
371  */
372 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
373 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
374                                                                 /* we use it to size an array on the stack */
375 
376 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
377 
378 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
379 
380 /*
381  * vm_page_is_sequential
382  *
383  * Determine if sequential access is in progress
384  * in accordance with the behavior specified.
385  * Update state to indicate current access pattern.
386  *
387  * object must have at least the shared lock held
388  */
static
void
vm_fault_is_sequential(
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_behavior_t           behavior)
{
	vm_object_offset_t      last_alloc;
	int                     sequential;
	int                     orig_sequential;

	/*
	 * Snapshot the object's access-pattern state.  "sequential" is a
	 * signed run length in bytes: positive for forward runs, negative
	 * for reverse runs, zero for random access.
	 */
	last_alloc = object->last_alloc;
	sequential = object->sequential;
	orig_sequential = sequential;

	offset = vm_object_trunc_page(offset);
	if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) {
		/* re-faulting in the same page: no change in behavior */
		return;
	}

	switch (behavior) {
	case VM_BEHAVIOR_RANDOM:
		/*
		 * reset indicator of sequential behavior
		 */
		sequential = 0;
		break;

	case VM_BEHAVIOR_SEQUENTIAL:
		if (offset && last_alloc == offset - PAGE_SIZE_64) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential < MAX_SEQUENTIAL_RUN) {
				sequential += PAGE_SIZE;
			}
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;

	case VM_BEHAVIOR_RSEQNTL:
		if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential > -MAX_SEQUENTIAL_RUN) {
				sequential -= PAGE_SIZE;
			}
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;

	case VM_BEHAVIOR_DEFAULT:
	default:
		/*
		 * No explicit hint: infer direction from the relationship of
		 * this fault's offset to the previous one, resetting first if
		 * the direction flipped.
		 */
		if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential < 0) {
				sequential = 0;
			}
			if (sequential < MAX_SEQUENTIAL_RUN) {
				sequential += PAGE_SIZE;
			}
		} else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential > 0) {
				sequential = 0;
			}
			if (sequential > -MAX_SEQUENTIAL_RUN) {
				sequential -= PAGE_SIZE;
			}
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;
	}
	if (sequential != orig_sequential) {
		if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
			/*
			 * if someone else has already updated object->sequential
			 * don't bother trying to update it or object->last_alloc
			 */
			return;
		}
	}
	/*
	 * I'd like to do this with a OSCompareAndSwap64, but that
	 * doesn't exist for PPC...  however, it shouldn't matter
	 * that much... last_alloc is maintained so that we can determine
	 * if a sequential access pattern is taking place... if only
	 * one thread is banging on this object, no problem with the unprotected
	 * update... if 2 or more threads are banging away, we run the risk of
	 * someone seeing a mangled update... however, in the face of multiple
	 * accesses, no sequential access pattern can develop anyway, so we
	 * haven't lost any real info.
	 */
	object->last_alloc = offset;
}
502 
503 #if DEVELOPMENT || DEBUG
504 uint64_t vm_page_deactivate_behind_count = 0;
505 #endif /* DEVELOPMENT || DEBUG */
506 
507 /*
508  * vm_page_deactivate_behind
509  *
510  * Determine if sequential access is in progress
511  * in accordance with the behavior specified.  If
512  * so, compute a potential page to deactivate and
513  * deactivate it.
514  *
515  * object must be locked.
516  *
517  * return TRUE if we actually deactivate a page
518  */
static
boolean_t
vm_fault_deactivate_behind(
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_behavior_t           behavior)
{
	int             n;
	int             pages_in_run = 0;
	int             max_pages_in_run = 0;
	int             sequential_run;
	int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
	vm_object_offset_t      run_offset = 0;   /* distance from "offset" to the first page to deactivate */
	vm_object_offset_t      pg_offset = 0;    /* stride between successive pages in the run */
	vm_page_t       m;
	vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];

	pages_in_run = 0;
#if TRACEFAULTPAGE
	dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
#endif
	if (object == kernel_object || vm_page_deactivate_behind == FALSE || (vm_object_trunc_page(offset) != offset)) {
		/*
		 * Do not deactivate pages from the kernel object: they
		 * are not intended to become pageable.
		 * or we've disabled the deactivate behind mechanism
		 * or we are dealing with an offset that is not aligned to
		 * the system's PAGE_SIZE because in that case we will
		 * handle the deactivation on the aligned offset and, thus,
		 * the full PAGE_SIZE page once. This helps us avoid the redundant
		 * deactivates and the extra faults.
		 */
		return FALSE;
	}
	/*
	 * object->sequential is signed: negative means a reverse-sequential
	 * run.  Normalize to a positive run length and remember the direction.
	 */
	if ((sequential_run = object->sequential)) {
		if (sequential_run < 0) {
			sequential_behavior = VM_BEHAVIOR_RSEQNTL;
			sequential_run = 0 - sequential_run;
		} else {
			sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
		}
	}
	switch (behavior) {
	case VM_BEHAVIOR_RANDOM:
		break;
	case VM_BEHAVIOR_SEQUENTIAL:
		/* explicit forward hint: deactivate just the page behind us */
		if (sequential_run >= (int)PAGE_SIZE) {
			run_offset = 0 - PAGE_SIZE_64;
			max_pages_in_run = 1;
		}
		break;
	case VM_BEHAVIOR_RSEQNTL:
		/* explicit reverse hint: deactivate just the page ahead of us */
		if (sequential_run >= (int)PAGE_SIZE) {
			run_offset = PAGE_SIZE_64;
			max_pages_in_run = 1;
		}
		break;
	case VM_BEHAVIOR_DEFAULT:
	default:
	{       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;

		/*
		 * determine if the run of sequential access has been
		 * long enough on an object with default access behavior
		 * to consider it for deactivation
		 */
		if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
			/*
			 * the comparisons between offset and behind are done
			 * in this kind of odd fashion in order to prevent wrap around
			 * at the end points
			 */
			if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
				if (offset >= behind) {
					run_offset = 0 - behind;
					pg_offset = PAGE_SIZE_64;
					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
				}
			} else {
				if (offset < -behind) {
					run_offset = behind;
					pg_offset = 0 - PAGE_SIZE_64;
					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
				}
			}
		}
		break;}
	}
	for (n = 0; n < max_pages_in_run; n++) {
		m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));

		/* only collect resident pages that are safe to deactivate */
		if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
			page_run[pages_in_run++] = m;

			/*
			 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
			 *
			 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
			 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
			 * new reference happens. If no further references happen on the page after that remote TLB flushes
			 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
			 * by pageout_scan, which is just fine since the last reference would have happened quite far
			 * in the past (TLB caches don't hang around for very long), and of course could just as easily
			 * have happened before we did the deactivate_behind.
			 */
			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
		}
	}
	if (pages_in_run) {
		/* take the page-queues lock once for the whole run */
		vm_page_lockspin_queues();

		for (n = 0; n < pages_in_run; n++) {
			m = page_run[n];

			vm_page_deactivate_internal(m, FALSE);

#if DEVELOPMENT || DEBUG
			vm_page_deactivate_behind_count++;
#endif /* DEVELOPMENT || DEBUG */

#if TRACEFAULTPAGE
			dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
#endif
		}
		vm_page_unlock_queues();

		return TRUE;
	}
	return FALSE;
}
649 
650 
651 #if (DEVELOPMENT || DEBUG)
652 uint32_t        vm_page_creation_throttled_hard = 0;
653 uint32_t        vm_page_creation_throttled_soft = 0;
654 uint64_t        vm_page_creation_throttle_avoided = 0;
655 #endif /* DEVELOPMENT || DEBUG */
656 
/*
 * Decide whether the current thread's page creation should be throttled.
 *
 * Returns 0 (no throttle), SOFT_THROTTLE_DELAY, or HARD_THROTTLE_DELAY --
 * the delay in microseconds the caller should wait before retrying.
 *
 * "page_kept": whether the page that triggered this check was kept
 * (a previously-throttled thread is given a pass when page_kept is FALSE).
 */
static int
vm_page_throttled(boolean_t page_kept)
{
	clock_sec_t     elapsed_sec;
	clock_sec_t     tv_sec;
	clock_usec_t    tv_usec;
	task_t          curtask = current_task_early();

	thread_t thread = current_thread();

	/* VM-privileged threads are never throttled */
	if (thread->options & TH_OPT_VMPRIV) {
		return 0;
	}

	/* don't throttle a task that is already terminating */
	if (curtask && !curtask->active) {
		return 0;
	}

	if (thread->t_page_creation_throttled) {
		thread->t_page_creation_throttled = 0;

		if (page_kept == FALSE) {
			goto no_throttle;
		}
	}
	if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
#if (DEVELOPMENT || DEBUG)
		thread->t_page_creation_throttled_hard++;
		OSAddAtomic(1, &vm_page_creation_throttled_hard);
#endif /* DEVELOPMENT || DEBUG */
		return HARD_THROTTLE_DELAY;
	}

	/*
	 * Consider soft throttling only when pages are scarce (or the swapper
	 * needs relief) AND this thread has created more pages than one full
	 * throttle period allows.
	 */
	if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
	    thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
		if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
			/* nobody is waiting for free pages: no need to throttle */
#if (DEVELOPMENT || DEBUG)
			OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
#endif
			goto no_throttle;
		}
		clock_get_system_microtime(&tv_sec, &tv_usec);

		elapsed_sec = tv_sec - thread->t_page_creation_time;

		/* throttle when the observed creation rate exceeds the allowed rate */
		if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
		    (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
			if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
				/*
				 * we'll reset our stats to give a well behaved app
				 * that was unlucky enough to accumulate a bunch of pages
				 * over a long period of time a chance to get out of
				 * the throttled state... we reset the counter and timestamp
				 * so that if it stays under the rate limit for the next second
				 * it will be back in our good graces... if it exceeds it, it
				 * will remain in the throttled state
				 */
				thread->t_page_creation_time = tv_sec;
				thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
			}
			VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);

			thread->t_page_creation_throttled = 1;

			if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
#if (DEVELOPMENT || DEBUG)
				thread->t_page_creation_throttled_hard++;
				OSAddAtomic(1, &vm_page_creation_throttled_hard);
#endif /* DEVELOPMENT || DEBUG */
				return HARD_THROTTLE_DELAY;
			} else {
#if (DEVELOPMENT || DEBUG)
				thread->t_page_creation_throttled_soft++;
				OSAddAtomic(1, &vm_page_creation_throttled_soft);
#endif /* DEVELOPMENT || DEBUG */
				return SOFT_THROTTLE_DELAY;
			}
		}
		/* under the rate limit: restart the measurement window */
		thread->t_page_creation_time = tv_sec;
		thread->t_page_creation_count = 0;
	}
no_throttle:
	thread->t_page_creation_count++;

	return 0;
}
743 
744 extern boolean_t vm_pageout_running;
745 static __attribute__((noinline, not_tail_called)) void
__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(int throttle_delay)746 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(
747 	int throttle_delay)
748 {
749 	/* make sure vm_pageout_scan() gets to work while we're throttled */
750 	if (!vm_pageout_running) {
751 		thread_wakeup((event_t)&vm_page_free_wanted);
752 	}
753 	delay(throttle_delay);
754 }
755 
756 
757 /*
758  * check for various conditions that would
759  * prevent us from creating a ZF page...
760  * cleanup is based on being called from vm_fault_page
761  *
762  * object must be locked
763  * object == m->vmp_object
764  */
static vm_fault_return_t
vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
{
	int throttle_delay;

	if (object->shadow_severed ||
	    VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
		/*
		 * Either:
		 * 1. the shadow chain was severed,
		 * 2. the purgeable object is volatile or empty and is marked
		 *    to fault on access while volatile.
		 * Just have to return an error at this point
		 */
		if (m != VM_PAGE_NULL) {
			VM_PAGE_FREE(m);
		}
		/* note: vm_fault_cleanup() unlocks "object" and drops its paging ref */
		vm_fault_cleanup(object, first_m);

		thread_interrupt_level(interruptible_state);

		if (VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
		}

		if (object->shadow_severed) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), 0 /* arg */);
		}
		return VM_FAULT_MEMORY_ERROR;
	}
	if (page_throttle == TRUE) {
		if ((throttle_delay = vm_page_throttled(FALSE))) {
			/*
			 * we're throttling zero-fills...
			 * treat this as if we couldn't grab a page
			 */
			if (m != VM_PAGE_NULL) {
				VM_PAGE_FREE(m);
			}
			vm_fault_cleanup(object, first_m);

			VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);

			/* sleep for the throttle delay, waking the pageout daemon first */
			__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);

			if (current_thread_aborted()) {
				thread_interrupt_level(interruptible_state);
				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
				return VM_FAULT_INTERRUPTED;
			}
			thread_interrupt_level(interruptible_state);

			return VM_FAULT_MEMORY_SHORTAGE;
		}
	}
	return VM_FAULT_SUCCESS;
}
822 
823 /*
824  * Clear the code signing bits on the given page_t
825  */
static void
vm_fault_cs_clear(vm_page_t m)
{
	/* reset all per-page code-signing state: not validated, not tainted, not nx */
	m->vmp_cs_validated = VMP_CS_ALL_FALSE;
	m->vmp_cs_tainted = VMP_CS_ALL_FALSE;
	m->vmp_cs_nx = VMP_CS_ALL_FALSE;
}
833 
834 /*
835  * Enqueues the given page on the throttled queue.
836  * The caller must hold the vm_page_queue_lock and it will be held on return.
837  */
static void
vm_fault_enqueue_throttled_locked(vm_page_t m)
{
	/* caller must hold the page-queues lock; wired pages never go on paging queues */
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	assert(!VM_PAGE_WIRED(m));

	/*
	 * can't be on the pageout queue since we don't
	 * have a pager to try and clean to
	 */
	vm_page_queues_remove(m, TRUE);
	vm_page_check_pageable_safe(m);
	vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
	m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
	vm_page_throttled_count++;
}
854 
855 /*
856  * do the work to zero fill a page and
857  * inject it into the correct paging queue
858  *
859  * m->vmp_object must be locked
860  * page queue lock must NOT be held
861  */
/*
 * Zero-fill "m" (unless no_zero_fill) and, when dynamic paging is
 * disabled, move it onto the throttled queue.  Returns the fault type
 * for tracing: DBG_ZERO_FILL_FAULT or DBG_NZF_PAGE_FAULT.
 */
static int
vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
{
	int my_fault = DBG_ZERO_FILL_FAULT;
	vm_object_t     object;

	object = VM_PAGE_OBJECT(m);

	/*
	 * This is a zero-fill page fault...
	 *
	 * Checking the page lock is a waste of
	 * time;  this page was absent, so
	 * it can't be page locked by a pager.
	 *
	 * we also consider it undefined
	 * with respect to instruction
	 * execution.  i.e. it is the responsibility
	 * of higher layers to call for an instruction
	 * sync after changing the contents and before
	 * sending a program into this area.  We
	 * choose this approach for performance
	 */
	vm_fault_cs_clear(m);
	m->vmp_pmapped = TRUE;

	if (no_zero_fill == TRUE) {
		my_fault = DBG_NZF_PAGE_FAULT;

		/* absent+busy page: nothing more to do here */
		if (m->vmp_absent && m->vmp_busy) {
			return my_fault;
		}
	} else {
		vm_page_zero_fill(m);

		counter_inc(&vm_statistics_zero_fill_count);
		DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
	}
	assert(!m->vmp_laundry);
	assert(object != kernel_object);
	//assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
	if (!VM_DYNAMIC_PAGING_ENABLED() &&
	    (object->purgable == VM_PURGABLE_DENY ||
	    object->purgable == VM_PURGABLE_NONVOLATILE ||
	    object->purgable == VM_PURGABLE_VOLATILE)) {
		vm_page_lockspin_queues();
		/* re-check under the queues lock before enqueueing */
		if (!VM_DYNAMIC_PAGING_ENABLED()) {
			vm_fault_enqueue_throttled_locked(m);
		}
		vm_page_unlock_queues();
	}
	return my_fault;
}
915 
916 
917 /*
918  *	Routine:	vm_fault_page
919  *	Purpose:
920  *		Find the resident page for the virtual memory
921  *		specified by the given virtual memory object
922  *		and offset.
923  *	Additional arguments:
924  *		The required permissions for the page is given
925  *		in "fault_type".  Desired permissions are included
926  *		in "protection".
927  *		fault_info is passed along to determine pagein cluster
928  *		limits... it contains the expected reference pattern,
929  *		cluster size if available, etc...
930  *
931  *		If the desired page is known to be resident (for
932  *		example, because it was previously wired down), asserting
933  *		the "unwiring" parameter will speed the search.
934  *
935  *		If the operation can be interrupted (by thread_abort
936  *		or thread_terminate), then the "interruptible"
937  *		parameter should be asserted.
938  *
939  *	Results:
940  *		The page containing the proper data is returned
941  *		in "result_page".
942  *
943  *	In/out conditions:
944  *		The source object must be locked and referenced,
945  *		and must donate one paging reference.  The reference
946  *		is not affected.  The paging reference and lock are
947  *		consumed.
948  *
949  *		If the call succeeds, the object in which "result_page"
950  *		resides is left locked and holding a paging reference.
951  *		If this is not the original object, a busy page in the
952  *		original object is returned in "top_page", to prevent other
953  *		callers from pursuing this same data, along with a paging
954  *		reference for the original object.  The "top_page" should
955  *		be destroyed when this guarantee is no longer required.
956  *		The "result_page" is also left busy.  It is not removed
957  *		from the pageout queues.
958  *	Special Case:
959  *		A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
960  *		fault succeeded but there's no VM page (i.e. the VM object
 *		does not actually hold VM pages, but device memory or
962  *		large pages).  The object is still locked and we still hold a
963  *		paging_in_progress reference.
964  */
965 unsigned int vm_fault_page_blocked_access = 0;
966 unsigned int vm_fault_page_forced_retry = 0;
967 
968 vm_fault_return_t
vm_fault_page(vm_object_t first_object,vm_object_offset_t first_offset,vm_prot_t fault_type,boolean_t must_be_resident,boolean_t caller_lookup,vm_prot_t * protection,vm_page_t * result_page,vm_page_t * top_page,int * type_of_fault,kern_return_t * error_code,boolean_t no_zero_fill,vm_object_fault_info_t fault_info)969 vm_fault_page(
970 	/* Arguments: */
971 	vm_object_t     first_object,   /* Object to begin search */
972 	vm_object_offset_t first_offset,        /* Offset into object */
973 	vm_prot_t       fault_type,     /* What access is requested */
974 	boolean_t       must_be_resident,/* Must page be resident? */
975 	boolean_t       caller_lookup,  /* caller looked up page */
976 	/* Modifies in place: */
977 	vm_prot_t       *protection,    /* Protection for mapping */
978 	vm_page_t       *result_page,   /* Page found, if successful */
979 	/* Returns: */
980 	vm_page_t       *top_page,      /* Page in top object, if
981                                          * not result_page.  */
982 	int             *type_of_fault, /* if non-null, fill in with type of fault
983                                          * COW, zero-fill, etc... returned in trace point */
984 	/* More arguments: */
985 	kern_return_t   *error_code,    /* code if page is in error */
986 	boolean_t       no_zero_fill,   /* don't zero fill absent pages */
987 	vm_object_fault_info_t fault_info)
988 {
989 	vm_page_t               m;
990 	vm_object_t             object;
991 	vm_object_offset_t      offset;
992 	vm_page_t               first_m;
993 	vm_object_t             next_object;
994 	vm_object_t             copy_object;
995 	boolean_t               look_for_page;
996 	boolean_t               force_fault_retry = FALSE;
997 	vm_prot_t               access_required = fault_type;
998 	vm_prot_t               wants_copy_flag;
999 	kern_return_t           wait_result;
1000 	wait_interrupt_t        interruptible_state;
1001 	boolean_t               data_already_requested = FALSE;
1002 	vm_behavior_t           orig_behavior;
1003 	vm_size_t               orig_cluster_size;
1004 	vm_fault_return_t       error;
1005 	int                     my_fault;
1006 	uint32_t                try_failed_count;
1007 	int                     interruptible; /* how may fault be interrupted? */
1008 	int                     external_state = VM_EXTERNAL_STATE_UNKNOWN;
1009 	memory_object_t         pager;
1010 	vm_fault_return_t       retval;
1011 	int                     grab_options;
1012 	bool                    clear_absent_on_error = false;
1013 
1014 /*
1015  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
1016  * marked as paged out in the compressor pager or the pager doesn't exist.
1017  * Note also that if the pager for an internal object
1018  * has not been created, the pager is not invoked regardless of the value
1019  * of MUST_ASK_PAGER().
1020  *
1021  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
1022  * is marked as paged out in the compressor pager.
1023  * PAGED_OUT() is used to determine if a page has already been pushed
1024  * into a copy object in order to avoid a redundant page out operation.
1025  */
1026 #define MUST_ASK_PAGER(o, f, s)                                 \
1027 	((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
1028 
1029 #define PAGED_OUT(o, f) \
1030 	(VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
1031 
1032 /*
1033  *	Recovery actions
1034  */
1035 #define RELEASE_PAGE(m)                                 \
1036 	MACRO_BEGIN                                     \
1037 	PAGE_WAKEUP_DONE(m);                            \
1038 	if ( !VM_PAGE_PAGEABLE(m)) {                    \
1039 	        vm_page_lockspin_queues();              \
1040 	        if (clear_absent_on_error && m->vmp_absent) {\
1041 	                vm_page_zero_fill(m);           \
1042 	                counter_inc(&vm_statistics_zero_fill_count);\
1043 	                DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);\
1044 	                m->vmp_absent = false;          \
1045 	        }                                       \
1046 	        if ( !VM_PAGE_PAGEABLE(m)) {            \
1047 	                if (VM_CONFIG_COMPRESSOR_IS_ACTIVE)     \
1048 	                        vm_page_deactivate(m);          \
1049 	                else                                    \
1050 	                        vm_page_activate(m);            \
1051 	        }                                               \
1052 	        vm_page_unlock_queues();                        \
1053 	}                                                       \
1054 	clear_absent_on_error = false;                  \
1055 	MACRO_END
1056 
1057 #if TRACEFAULTPAGE
1058 	dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
1059 #endif
1060 
1061 	interruptible = fault_info->interruptible;
1062 	interruptible_state = thread_interrupt_level(interruptible);
1063 
1064 	/*
1065 	 *	INVARIANTS (through entire routine):
1066 	 *
1067 	 *	1)	At all times, we must either have the object
1068 	 *		lock or a busy page in some object to prevent
1069 	 *		some other thread from trying to bring in
1070 	 *		the same page.
1071 	 *
1072 	 *		Note that we cannot hold any locks during the
1073 	 *		pager access or when waiting for memory, so
1074 	 *		we use a busy page then.
1075 	 *
1076 	 *	2)	To prevent another thread from racing us down the
1077 	 *		shadow chain and entering a new page in the top
1078 	 *		object before we do, we must keep a busy page in
1079 	 *		the top object while following the shadow chain.
1080 	 *
1081 	 *	3)	We must increment paging_in_progress on any object
1082 	 *		for which we have a busy page before dropping
1083 	 *		the object lock
1084 	 *
1085 	 *	4)	We leave busy pages on the pageout queues.
1086 	 *		If the pageout daemon comes across a busy page,
1087 	 *		it will remove the page from the pageout queues.
1088 	 */
1089 
1090 	object = first_object;
1091 	offset = first_offset;
1092 	first_m = VM_PAGE_NULL;
1093 	access_required = fault_type;
1094 
1095 	/*
1096 	 * default type of fault
1097 	 */
1098 	my_fault = DBG_CACHE_HIT_FAULT;
1099 	thread_pri_floor_t token;
1100 	bool    drop_floor = false;
1101 
1102 	while (TRUE) {
1103 #if TRACEFAULTPAGE
1104 		dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1105 #endif
1106 
1107 		grab_options = 0;
1108 #if CONFIG_SECLUDED_MEMORY
1109 		if (object->can_grab_secluded) {
1110 			grab_options |= VM_PAGE_GRAB_SECLUDED;
1111 		}
1112 #endif /* CONFIG_SECLUDED_MEMORY */
1113 
1114 		if (!object->alive) {
1115 			/*
1116 			 * object is no longer valid
1117 			 * clean up and return error
1118 			 */
1119 #if DEVELOPMENT || DEBUG
1120 			printf("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->copy, object->shadow, object->alive, object->terminating, object->named, object->ref_count, object->shadow_severed);
1121 			if (panic_object_not_alive) {
1122 				panic("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->copy, object->shadow, object->alive, object->terminating, object->named, object->ref_count, object->shadow_severed);
1123 			}
1124 #endif /* DEVELOPMENT || DEBUG */
1125 			vm_fault_cleanup(object, first_m);
1126 			thread_interrupt_level(interruptible_state);
1127 
1128 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_NOT_ALIVE), 0 /* arg */);
1129 			return VM_FAULT_MEMORY_ERROR;
1130 		}
1131 
1132 		if (!object->pager_created && object->phys_contiguous) {
1133 			/*
1134 			 * A physically-contiguous object without a pager:
1135 			 * must be a "large page" object.  We do not deal
1136 			 * with VM pages for this object.
1137 			 */
1138 			caller_lookup = FALSE;
1139 			m = VM_PAGE_NULL;
1140 			goto phys_contig_object;
1141 		}
1142 
1143 		if (object->blocked_access) {
1144 			/*
1145 			 * Access to this VM object has been blocked.
1146 			 * Replace our "paging_in_progress" reference with
1147 			 * a "activity_in_progress" reference and wait for
1148 			 * access to be unblocked.
1149 			 */
1150 			caller_lookup = FALSE; /* no longer valid after sleep */
1151 			vm_object_activity_begin(object);
1152 			vm_object_paging_end(object);
1153 			while (object->blocked_access) {
1154 				vm_object_sleep(object,
1155 				    VM_OBJECT_EVENT_UNBLOCKED,
1156 				    THREAD_UNINT);
1157 			}
1158 			vm_fault_page_blocked_access++;
1159 			vm_object_paging_begin(object);
1160 			vm_object_activity_end(object);
1161 		}
1162 
1163 		/*
1164 		 * See whether the page at 'offset' is resident
1165 		 */
1166 		if (caller_lookup == TRUE) {
1167 			/*
1168 			 * The caller has already looked up the page
1169 			 * and gave us the result in "result_page".
1170 			 * We can use this for the first lookup but
1171 			 * it loses its validity as soon as we unlock
1172 			 * the object.
1173 			 */
1174 			m = *result_page;
1175 			caller_lookup = FALSE; /* no longer valid after that */
1176 		} else {
1177 			m = vm_page_lookup(object, vm_object_trunc_page(offset));
1178 		}
1179 #if TRACEFAULTPAGE
1180 		dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1181 #endif
1182 		if (m != VM_PAGE_NULL) {
1183 			if (m->vmp_busy) {
1184 				/*
1185 				 * The page is being brought in,
1186 				 * wait for it and then retry.
1187 				 */
1188 #if TRACEFAULTPAGE
1189 				dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1190 #endif
1191 				wait_result = PAGE_SLEEP(object, m, interruptible);
1192 
1193 				if (wait_result != THREAD_AWAKENED) {
1194 					vm_fault_cleanup(object, first_m);
1195 					thread_interrupt_level(interruptible_state);
1196 
1197 					if (wait_result == THREAD_RESTART) {
1198 						return VM_FAULT_RETRY;
1199 					} else {
1200 						ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), 0 /* arg */);
1201 						return VM_FAULT_INTERRUPTED;
1202 					}
1203 				}
1204 				continue;
1205 			}
1206 			if (m->vmp_laundry) {
1207 				m->vmp_free_when_done = FALSE;
1208 
1209 				if (!m->vmp_cleaning) {
1210 					vm_pageout_steal_laundry(m, FALSE);
1211 				}
1212 			}
1213 			vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
1214 			if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1215 				/*
1216 				 * Guard page: off limits !
1217 				 */
1218 				if (fault_type == VM_PROT_NONE) {
1219 					/*
1220 					 * The fault is not requesting any
1221 					 * access to the guard page, so it must
1222 					 * be just to wire or unwire it.
1223 					 * Let's pretend it succeeded...
1224 					 */
1225 					m->vmp_busy = TRUE;
1226 					*result_page = m;
1227 					assert(first_m == VM_PAGE_NULL);
1228 					*top_page = first_m;
1229 					if (type_of_fault) {
1230 						*type_of_fault = DBG_GUARD_FAULT;
1231 					}
1232 					thread_interrupt_level(interruptible_state);
1233 					return VM_FAULT_SUCCESS;
1234 				} else {
1235 					/*
1236 					 * The fault requests access to the
1237 					 * guard page: let's deny that !
1238 					 */
1239 					vm_fault_cleanup(object, first_m);
1240 					thread_interrupt_level(interruptible_state);
1241 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_GUARDPAGE_FAULT), 0 /* arg */);
1242 					return VM_FAULT_MEMORY_ERROR;
1243 				}
1244 			}
1245 
1246 
1247 			if (VMP_ERROR_GET(m)) {
1248 				/*
1249 				 * The page is in error, give up now.
1250 				 */
1251 #if TRACEFAULTPAGE
1252 				dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1253 #endif
1254 				if (error_code) {
1255 					*error_code = KERN_MEMORY_ERROR;
1256 				}
1257 				VM_PAGE_FREE(m);
1258 
1259 				vm_fault_cleanup(object, first_m);
1260 				thread_interrupt_level(interruptible_state);
1261 
1262 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_ERROR), 0 /* arg */);
1263 				return VM_FAULT_MEMORY_ERROR;
1264 			}
1265 			if (m->vmp_restart) {
1266 				/*
1267 				 * The pager wants us to restart
1268 				 * at the top of the chain,
1269 				 * typically because it has moved the
1270 				 * page to another pager, then do so.
1271 				 */
1272 #if TRACEFAULTPAGE
1273 				dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1274 #endif
1275 				VM_PAGE_FREE(m);
1276 
1277 				vm_fault_cleanup(object, first_m);
1278 				thread_interrupt_level(interruptible_state);
1279 
1280 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_RESTART), 0 /* arg */);
1281 				return VM_FAULT_RETRY;
1282 			}
1283 			if (m->vmp_absent) {
1284 				/*
1285 				 * The page isn't busy, but is absent,
1286 				 * therefore it's deemed "unavailable".
1287 				 *
1288 				 * Remove the non-existent page (unless it's
1289 				 * in the top object) and move on down to the
1290 				 * next object (if there is one).
1291 				 */
1292 #if TRACEFAULTPAGE
1293 				dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1294 #endif
1295 				next_object = object->shadow;
1296 
1297 				if (next_object == VM_OBJECT_NULL) {
1298 					/*
1299 					 * Absent page at bottom of shadow
1300 					 * chain; zero fill the page we left
1301 					 * busy in the first object, and free
1302 					 * the absent page.
1303 					 */
1304 					assert(!must_be_resident);
1305 
1306 					/*
1307 					 * check for any conditions that prevent
1308 					 * us from creating a new zero-fill page
1309 					 * vm_fault_check will do all of the
1310 					 * fault cleanup in the case of an error condition
1311 					 * including resetting the thread_interrupt_level
1312 					 */
1313 					error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1314 
1315 					if (error != VM_FAULT_SUCCESS) {
1316 						return error;
1317 					}
1318 
1319 					if (object != first_object) {
1320 						/*
1321 						 * free the absent page we just found
1322 						 */
1323 						VM_PAGE_FREE(m);
1324 
1325 						/*
1326 						 * drop reference and lock on current object
1327 						 */
1328 						vm_object_paging_end(object);
1329 						vm_object_unlock(object);
1330 
1331 						/*
1332 						 * grab the original page we
1333 						 * 'soldered' in place and
1334 						 * retake lock on 'first_object'
1335 						 */
1336 						m = first_m;
1337 						first_m = VM_PAGE_NULL;
1338 
1339 						object = first_object;
1340 						offset = first_offset;
1341 
1342 						vm_object_lock(object);
1343 					} else {
1344 						/*
1345 						 * we're going to use the absent page we just found
1346 						 * so convert it to a 'busy' page
1347 						 */
1348 						m->vmp_absent = FALSE;
1349 						m->vmp_busy = TRUE;
1350 					}
1351 					if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1352 						m->vmp_absent = TRUE;
1353 						clear_absent_on_error = true;
1354 					}
1355 					/*
1356 					 * zero-fill the page and put it on
1357 					 * the correct paging queue
1358 					 */
1359 					my_fault = vm_fault_zero_page(m, no_zero_fill);
1360 
1361 					break;
1362 				} else {
1363 					if (must_be_resident) {
1364 						vm_object_paging_end(object);
1365 					} else if (object != first_object) {
1366 						vm_object_paging_end(object);
1367 						VM_PAGE_FREE(m);
1368 					} else {
1369 						first_m = m;
1370 						m->vmp_absent = FALSE;
1371 						m->vmp_busy = TRUE;
1372 
1373 						vm_page_lockspin_queues();
1374 						vm_page_queues_remove(m, FALSE);
1375 						vm_page_unlock_queues();
1376 					}
1377 
1378 					offset += object->vo_shadow_offset;
1379 					fault_info->lo_offset += object->vo_shadow_offset;
1380 					fault_info->hi_offset += object->vo_shadow_offset;
1381 					access_required = VM_PROT_READ;
1382 
1383 					vm_object_lock(next_object);
1384 					vm_object_unlock(object);
1385 					object = next_object;
1386 					vm_object_paging_begin(object);
1387 
1388 					/*
1389 					 * reset to default type of fault
1390 					 */
1391 					my_fault = DBG_CACHE_HIT_FAULT;
1392 
1393 					continue;
1394 				}
1395 			}
1396 			if ((m->vmp_cleaning)
1397 			    && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1398 			    && (fault_type & VM_PROT_WRITE)) {
1399 				/*
1400 				 * This is a copy-on-write fault that will
1401 				 * cause us to revoke access to this page, but
1402 				 * this page is in the process of being cleaned
1403 				 * in a clustered pageout. We must wait until
1404 				 * the cleaning operation completes before
1405 				 * revoking access to the original page,
1406 				 * otherwise we might attempt to remove a
1407 				 * wired mapping.
1408 				 */
1409 #if TRACEFAULTPAGE
1410 				dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1411 #endif
1412 				/*
1413 				 * take an extra ref so that object won't die
1414 				 */
1415 				vm_object_reference_locked(object);
1416 
1417 				vm_fault_cleanup(object, first_m);
1418 
1419 				vm_object_lock(object);
1420 				assert(object->ref_count > 0);
1421 
1422 				m = vm_page_lookup(object, vm_object_trunc_page(offset));
1423 
1424 				if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1425 					PAGE_ASSERT_WAIT(m, interruptible);
1426 
1427 					vm_object_unlock(object);
1428 					wait_result = thread_block(THREAD_CONTINUE_NULL);
1429 					vm_object_deallocate(object);
1430 
1431 					goto backoff;
1432 				} else {
1433 					vm_object_unlock(object);
1434 
1435 					vm_object_deallocate(object);
1436 					thread_interrupt_level(interruptible_state);
1437 
1438 					return VM_FAULT_RETRY;
1439 				}
1440 			}
1441 			if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1442 			    !(fault_info != NULL && fault_info->stealth)) {
1443 				/*
1444 				 * If we were passed a non-NULL pointer for
1445 				 * "type_of_fault", than we came from
1446 				 * vm_fault... we'll let it deal with
1447 				 * this condition, since it
1448 				 * needs to see m->vmp_speculative to correctly
1449 				 * account the pageins, otherwise...
1450 				 * take it off the speculative queue, we'll
1451 				 * let the caller of vm_fault_page deal
1452 				 * with getting it onto the correct queue
1453 				 *
1454 				 * If the caller specified in fault_info that
1455 				 * it wants a "stealth" fault, we also leave
1456 				 * the page in the speculative queue.
1457 				 */
1458 				vm_page_lockspin_queues();
1459 				if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
1460 					vm_page_queues_remove(m, FALSE);
1461 				}
1462 				vm_page_unlock_queues();
1463 			}
1464 			assert(object == VM_PAGE_OBJECT(m));
1465 
1466 			if (object->code_signed) {
1467 				/*
1468 				 * CODE SIGNING:
1469 				 * We just paged in a page from a signed
1470 				 * memory object but we don't need to
1471 				 * validate it now.  We'll validate it if
1472 				 * when it gets mapped into a user address
1473 				 * space for the first time or when the page
1474 				 * gets copied to another object as a result
1475 				 * of a copy-on-write.
1476 				 */
1477 			}
1478 
1479 			/*
1480 			 * We mark the page busy and leave it on
1481 			 * the pageout queues.  If the pageout
1482 			 * deamon comes across it, then it will
1483 			 * remove the page from the queue, but not the object
1484 			 */
1485 #if TRACEFAULTPAGE
1486 			dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1487 #endif
1488 			assert(!m->vmp_busy);
1489 			assert(!m->vmp_absent);
1490 
1491 			m->vmp_busy = TRUE;
1492 			break;
1493 		}
1494 
1495 		/*
1496 		 * we get here when there is no page present in the object at
1497 		 * the offset we're interested in... we'll allocate a page
1498 		 * at this point if the pager associated with
1499 		 * this object can provide the data or we're the top object...
1500 		 * object is locked;  m == NULL
1501 		 */
1502 
1503 		if (must_be_resident) {
1504 			if (fault_type == VM_PROT_NONE &&
1505 			    object == kernel_object) {
1506 				/*
1507 				 * We've been called from vm_fault_unwire()
1508 				 * while removing a map entry that was allocated
1509 				 * with KMA_KOBJECT and KMA_VAONLY.  This page
1510 				 * is not present and there's nothing more to
1511 				 * do here (nothing to unwire).
1512 				 */
1513 				vm_fault_cleanup(object, first_m);
1514 				thread_interrupt_level(interruptible_state);
1515 
1516 				return VM_FAULT_MEMORY_ERROR;
1517 			}
1518 
1519 			goto dont_look_for_page;
1520 		}
1521 
1522 		/* Don't expect to fault pages into the kernel object. */
1523 		assert(object != kernel_object);
1524 
1525 		look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE));
1526 
1527 #if TRACEFAULTPAGE
1528 		dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1529 #endif
1530 		if (!look_for_page && object == first_object && !object->phys_contiguous) {
1531 			/*
1532 			 * Allocate a new page for this object/offset pair as a placeholder
1533 			 */
1534 			m = vm_page_grab_options(grab_options);
1535 #if TRACEFAULTPAGE
1536 			dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1537 #endif
1538 			if (m == VM_PAGE_NULL) {
1539 				vm_fault_cleanup(object, first_m);
1540 				thread_interrupt_level(interruptible_state);
1541 
1542 				return VM_FAULT_MEMORY_SHORTAGE;
1543 			}
1544 
1545 			if (fault_info && fault_info->batch_pmap_op == TRUE) {
1546 				vm_page_insert_internal(m, object,
1547 				    vm_object_trunc_page(offset),
1548 				    VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1549 			} else {
1550 				vm_page_insert(m, object, vm_object_trunc_page(offset));
1551 			}
1552 		}
1553 		if (look_for_page) {
1554 			kern_return_t   rc;
1555 			int             my_fault_type;
1556 
1557 			/*
1558 			 *	If the memory manager is not ready, we
1559 			 *	cannot make requests.
1560 			 */
1561 			if (!object->pager_ready) {
1562 #if TRACEFAULTPAGE
1563 				dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1564 #endif
1565 				if (m != VM_PAGE_NULL) {
1566 					VM_PAGE_FREE(m);
1567 				}
1568 
1569 				/*
1570 				 * take an extra ref so object won't die
1571 				 */
1572 				vm_object_reference_locked(object);
1573 				vm_fault_cleanup(object, first_m);
1574 
1575 				vm_object_lock(object);
1576 				assert(object->ref_count > 0);
1577 
1578 				if (!object->pager_ready) {
1579 					wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1580 
1581 					vm_object_unlock(object);
1582 					if (wait_result == THREAD_WAITING) {
1583 						wait_result = thread_block(THREAD_CONTINUE_NULL);
1584 					}
1585 					vm_object_deallocate(object);
1586 
1587 					goto backoff;
1588 				} else {
1589 					vm_object_unlock(object);
1590 					vm_object_deallocate(object);
1591 					thread_interrupt_level(interruptible_state);
1592 
1593 					return VM_FAULT_RETRY;
1594 				}
1595 			}
1596 			if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1597 				/*
1598 				 * If there are too many outstanding page
1599 				 * requests pending on this external object, we
1600 				 * wait for them to be resolved now.
1601 				 */
1602 #if TRACEFAULTPAGE
1603 				dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1604 #endif
1605 				if (m != VM_PAGE_NULL) {
1606 					VM_PAGE_FREE(m);
1607 				}
1608 				/*
1609 				 * take an extra ref so object won't die
1610 				 */
1611 				vm_object_reference_locked(object);
1612 
1613 				vm_fault_cleanup(object, first_m);
1614 
1615 				vm_object_lock(object);
1616 				assert(object->ref_count > 0);
1617 
1618 				if (object->paging_in_progress >= vm_object_pagein_throttle) {
1619 					vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1620 
1621 					vm_object_unlock(object);
1622 					wait_result = thread_block(THREAD_CONTINUE_NULL);
1623 					vm_object_deallocate(object);
1624 
1625 					goto backoff;
1626 				} else {
1627 					vm_object_unlock(object);
1628 					vm_object_deallocate(object);
1629 					thread_interrupt_level(interruptible_state);
1630 
1631 					return VM_FAULT_RETRY;
1632 				}
1633 			}
1634 			if (object->internal) {
1635 				int compressed_count_delta;
1636 
1637 				assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1638 
1639 				if (m == VM_PAGE_NULL) {
1640 					/*
1641 					 * Allocate a new page for this object/offset pair as a placeholder
1642 					 */
1643 					m = vm_page_grab_options(grab_options);
1644 #if TRACEFAULTPAGE
1645 					dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1646 #endif
1647 					if (m == VM_PAGE_NULL) {
1648 						vm_fault_cleanup(object, first_m);
1649 						thread_interrupt_level(interruptible_state);
1650 
1651 						return VM_FAULT_MEMORY_SHORTAGE;
1652 					}
1653 
1654 					m->vmp_absent = TRUE;
1655 					if (fault_info && fault_info->batch_pmap_op == TRUE) {
1656 						vm_page_insert_internal(m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1657 					} else {
1658 						vm_page_insert(m, object, vm_object_trunc_page(offset));
1659 					}
1660 				}
1661 				assert(m->vmp_busy);
1662 
1663 				m->vmp_absent = TRUE;
1664 				pager = object->pager;
1665 
1666 				assert(object->paging_in_progress > 0);
1667 				vm_object_unlock(object);
1668 
1669 				rc = vm_compressor_pager_get(
1670 					pager,
1671 					offset + object->paging_offset,
1672 					VM_PAGE_GET_PHYS_PAGE(m),
1673 					&my_fault_type,
1674 					0,
1675 					&compressed_count_delta);
1676 
1677 				if (type_of_fault == NULL) {
1678 					int     throttle_delay;
1679 
1680 					/*
1681 					 * we weren't called from vm_fault, so we
1682 					 * need to apply page creation throttling
1683 					 * do it before we re-acquire any locks
1684 					 */
1685 					if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1686 						if ((throttle_delay = vm_page_throttled(TRUE))) {
1687 							VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1688 							__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
1689 						}
1690 					}
1691 				}
1692 				vm_object_lock(object);
1693 				assert(object->paging_in_progress > 0);
1694 
1695 				vm_compressor_pager_count(
1696 					pager,
1697 					compressed_count_delta,
1698 					FALSE, /* shared_lock */
1699 					object);
1700 
1701 				switch (rc) {
1702 				case KERN_SUCCESS:
1703 					m->vmp_absent = FALSE;
1704 					m->vmp_dirty = TRUE;
1705 					if ((object->wimg_bits &
1706 					    VM_WIMG_MASK) !=
1707 					    VM_WIMG_USE_DEFAULT) {
1708 						/*
1709 						 * If the page is not cacheable,
1710 						 * we can't let its contents
1711 						 * linger in the data cache
1712 						 * after the decompression.
1713 						 */
1714 						pmap_sync_page_attributes_phys(
1715 							VM_PAGE_GET_PHYS_PAGE(m));
1716 					} else {
1717 						m->vmp_written_by_kernel = TRUE;
1718 					}
1719 
1720 					/*
1721 					 * If the object is purgeable, its
1722 					 * owner's purgeable ledgers have been
1723 					 * updated in vm_page_insert() but the
1724 					 * page was also accounted for in a
1725 					 * "compressed purgeable" ledger, so
1726 					 * update that now.
1727 					 */
1728 					if (((object->purgable !=
1729 					    VM_PURGABLE_DENY) ||
1730 					    object->vo_ledger_tag) &&
1731 					    (object->vo_owner !=
1732 					    NULL)) {
1733 						/*
1734 						 * One less compressed
1735 						 * purgeable/tagged page.
1736 						 */
1737 						vm_object_owner_compressed_update(
1738 							object,
1739 							-1);
1740 					}
1741 
1742 					break;
1743 				case KERN_MEMORY_FAILURE:
1744 					m->vmp_unusual = TRUE;
1745 					m->vmp_error = TRUE;
1746 					m->vmp_absent = FALSE;
1747 					break;
1748 				case KERN_MEMORY_ERROR:
1749 					assert(m->vmp_absent);
1750 					break;
1751 				default:
1752 					panic("vm_fault_page(): unexpected "
1753 					    "error %d from "
1754 					    "vm_compressor_pager_get()\n",
1755 					    rc);
1756 				}
1757 				PAGE_WAKEUP_DONE(m);
1758 
1759 				rc = KERN_SUCCESS;
1760 				goto data_requested;
1761 			}
1762 			my_fault_type = DBG_PAGEIN_FAULT;
1763 
1764 			if (m != VM_PAGE_NULL) {
1765 				VM_PAGE_FREE(m);
1766 				m = VM_PAGE_NULL;
1767 			}
1768 
1769 #if TRACEFAULTPAGE
1770 			dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1771 #endif
1772 
1773 			/*
1774 			 * It's possible someone called vm_object_destroy while we weren't
1775 			 * holding the object lock.  If that has happened, then bail out
1776 			 * here.
1777 			 */
1778 
1779 			pager = object->pager;
1780 
1781 			if (pager == MEMORY_OBJECT_NULL) {
1782 				vm_fault_cleanup(object, first_m);
1783 				thread_interrupt_level(interruptible_state);
1784 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_NO_PAGER), 0 /* arg */);
1785 				return VM_FAULT_MEMORY_ERROR;
1786 			}
1787 
1788 			/*
1789 			 * We have an absent page in place for the faulting offset,
1790 			 * so we can release the object lock.
1791 			 */
1792 
1793 			if (object->object_is_shared_cache) {
1794 				token = thread_priority_floor_start();
1795 				/*
1796 				 * A non-native shared cache object might
1797 				 * be getting set up in parallel with this
1798 				 * fault and so we can't assume that this
1799 				 * check will be valid after we drop the
1800 				 * object lock below.
1801 				 */
1802 				drop_floor = true;
1803 			}
1804 
1805 			vm_object_unlock(object);
1806 
1807 			/*
1808 			 * If this object uses a copy_call strategy,
1809 			 * and we are interested in a copy of this object
1810 			 * (having gotten here only by following a
1811 			 * shadow chain), then tell the memory manager
1812 			 * via a flag added to the desired_access
1813 			 * parameter, so that it can detect a race
1814 			 * between our walking down the shadow chain
1815 			 * and its pushing pages up into a copy of
1816 			 * the object that it manages.
1817 			 */
1818 			if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1819 				wants_copy_flag = VM_PROT_WANTS_COPY;
1820 			} else {
1821 				wants_copy_flag = VM_PROT_NONE;
1822 			}
1823 
1824 			if (object->copy == first_object) {
1825 				/*
1826 				 * if we issue the memory_object_data_request in
1827 				 * this state, we are subject to a deadlock with
1828 				 * the underlying filesystem if it is trying to
1829 				 * shrink the file resulting in a push of pages
1830 				 * into the copy object...  that push will stall
1831 				 * on the placeholder page, and if the pushing thread
1832 				 * is holding a lock that is required on the pagein
1833 				 * path (such as a truncate lock), we'll deadlock...
1834 				 * to avoid this potential deadlock, we throw away
1835 				 * our placeholder page before calling memory_object_data_request
1836 				 * and force this thread to retry the vm_fault_page after
1837 				 * we have issued the I/O.  the second time through this path
1838 				 * we will find the page already in the cache (presumably still
1839 				 * busy waiting for the I/O to complete) and then complete
1840 				 * the fault w/o having to go through memory_object_data_request again
1841 				 */
1842 				assert(first_m != VM_PAGE_NULL);
1843 				assert(VM_PAGE_OBJECT(first_m) == first_object);
1844 
1845 				vm_object_lock(first_object);
1846 				VM_PAGE_FREE(first_m);
1847 				vm_object_paging_end(first_object);
1848 				vm_object_unlock(first_object);
1849 
1850 				first_m = VM_PAGE_NULL;
1851 				force_fault_retry = TRUE;
1852 
1853 				vm_fault_page_forced_retry++;
1854 			}
1855 
1856 			if (data_already_requested == TRUE) {
1857 				orig_behavior = fault_info->behavior;
1858 				orig_cluster_size = fault_info->cluster_size;
1859 
1860 				fault_info->behavior = VM_BEHAVIOR_RANDOM;
1861 				fault_info->cluster_size = PAGE_SIZE;
1862 			}
1863 			/*
1864 			 * Call the memory manager to retrieve the data.
1865 			 */
1866 			rc = memory_object_data_request(
1867 				pager,
1868 				vm_object_trunc_page(offset) + object->paging_offset,
1869 				PAGE_SIZE,
1870 				access_required | wants_copy_flag,
1871 				(memory_object_fault_info_t)fault_info);
1872 
1873 			if (data_already_requested == TRUE) {
1874 				fault_info->behavior = orig_behavior;
1875 				fault_info->cluster_size = orig_cluster_size;
1876 			} else {
1877 				data_already_requested = TRUE;
1878 			}
1879 
1880 			DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1881 #if TRACEFAULTPAGE
1882 			dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1883 #endif
1884 			vm_object_lock(object);
1885 
1886 			if (drop_floor && object->object_is_shared_cache) {
1887 				thread_priority_floor_end(&token);
1888 				drop_floor = false;
1889 			}
1890 
1891 data_requested:
1892 			if (rc != KERN_SUCCESS) {
1893 				vm_fault_cleanup(object, first_m);
1894 				thread_interrupt_level(interruptible_state);
1895 
1896 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NO_DATA), 0 /* arg */);
1897 
1898 				return (rc == MACH_SEND_INTERRUPTED) ?
1899 				       VM_FAULT_INTERRUPTED :
1900 				       VM_FAULT_MEMORY_ERROR;
1901 			} else {
1902 				clock_sec_t     tv_sec;
1903 				clock_usec_t    tv_usec;
1904 
1905 				if (my_fault_type == DBG_PAGEIN_FAULT) {
1906 					clock_get_system_microtime(&tv_sec, &tv_usec);
1907 					current_thread()->t_page_creation_time = tv_sec;
1908 					current_thread()->t_page_creation_count = 0;
1909 				}
1910 			}
1911 			if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1912 				vm_fault_cleanup(object, first_m);
1913 				thread_interrupt_level(interruptible_state);
1914 
1915 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
1916 				return VM_FAULT_INTERRUPTED;
1917 			}
1918 			if (force_fault_retry == TRUE) {
1919 				vm_fault_cleanup(object, first_m);
1920 				thread_interrupt_level(interruptible_state);
1921 
1922 				return VM_FAULT_RETRY;
1923 			}
1924 			if (m == VM_PAGE_NULL && object->phys_contiguous) {
1925 				/*
1926 				 * No page here means that the object we
1927 				 * initially looked up was "physically
1928 				 * contiguous" (i.e. device memory).  However,
1929 				 * with Virtual VRAM, the object might not
1930 				 * be backed by that device memory anymore,
1931 				 * so we're done here only if the object is
1932 				 * still "phys_contiguous".
1933 				 * Otherwise, if the object is no longer
1934 				 * "phys_contiguous", we need to retry the
1935 				 * page fault against the object's new backing
1936 				 * store (different memory object).
1937 				 */
1938 phys_contig_object:
1939 				goto done;
1940 			}
1941 			/*
1942 			 * potentially a pagein fault
1943 			 * if we make it through the state checks
1944 			 * above, than we'll count it as such
1945 			 */
1946 			my_fault = my_fault_type;
1947 
1948 			/*
1949 			 * Retry with same object/offset, since new data may
1950 			 * be in a different page (i.e., m is meaningless at
1951 			 * this point).
1952 			 */
1953 			continue;
1954 		}
1955 dont_look_for_page:
1956 		/*
1957 		 * We get here if the object has no pager, or an existence map
1958 		 * exists and indicates the page isn't present on the pager
1959 		 * or we're unwiring a page.  If a pager exists, but there
1960 		 * is no existence map, then the m->vmp_absent case above handles
1961 		 * the ZF case when the pager can't provide the page
1962 		 */
1963 #if TRACEFAULTPAGE
1964 		dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1965 #endif
1966 		if (object == first_object) {
1967 			first_m = m;
1968 		} else {
1969 			assert(m == VM_PAGE_NULL);
1970 		}
1971 
1972 		next_object = object->shadow;
1973 
1974 		if (next_object == VM_OBJECT_NULL) {
1975 			/*
1976 			 * we've hit the bottom of the shadown chain,
1977 			 * fill the page in the top object with zeros.
1978 			 */
1979 			assert(!must_be_resident);
1980 
1981 			if (object != first_object) {
1982 				vm_object_paging_end(object);
1983 				vm_object_unlock(object);
1984 
1985 				object = first_object;
1986 				offset = first_offset;
1987 				vm_object_lock(object);
1988 			}
1989 			m = first_m;
1990 			assert(VM_PAGE_OBJECT(m) == object);
1991 			first_m = VM_PAGE_NULL;
1992 
1993 			/*
1994 			 * check for any conditions that prevent
1995 			 * us from creating a new zero-fill page
1996 			 * vm_fault_check will do all of the
1997 			 * fault cleanup in the case of an error condition
1998 			 * including resetting the thread_interrupt_level
1999 			 */
2000 			error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
2001 
2002 			if (error != VM_FAULT_SUCCESS) {
2003 				return error;
2004 			}
2005 
2006 			if (m == VM_PAGE_NULL) {
2007 				m = vm_page_grab_options(grab_options);
2008 
2009 				if (m == VM_PAGE_NULL) {
2010 					vm_fault_cleanup(object, VM_PAGE_NULL);
2011 					thread_interrupt_level(interruptible_state);
2012 
2013 					return VM_FAULT_MEMORY_SHORTAGE;
2014 				}
2015 				vm_page_insert(m, object, vm_object_trunc_page(offset));
2016 			}
2017 			if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
2018 				m->vmp_absent = TRUE;
2019 				clear_absent_on_error = true;
2020 			}
2021 
2022 			my_fault = vm_fault_zero_page(m, no_zero_fill);
2023 
2024 			break;
2025 		} else {
2026 			/*
2027 			 * Move on to the next object.  Lock the next
2028 			 * object before unlocking the current one.
2029 			 */
2030 			if ((object != first_object) || must_be_resident) {
2031 				vm_object_paging_end(object);
2032 			}
2033 
2034 			offset += object->vo_shadow_offset;
2035 			fault_info->lo_offset += object->vo_shadow_offset;
2036 			fault_info->hi_offset += object->vo_shadow_offset;
2037 			access_required = VM_PROT_READ;
2038 
2039 			vm_object_lock(next_object);
2040 			vm_object_unlock(object);
2041 
2042 			object = next_object;
2043 			vm_object_paging_begin(object);
2044 		}
2045 	}
2046 
2047 	/*
2048 	 *	PAGE HAS BEEN FOUND.
2049 	 *
2050 	 *	This page (m) is:
2051 	 *		busy, so that we can play with it;
2052 	 *		not absent, so that nobody else will fill it;
2053 	 *		possibly eligible for pageout;
2054 	 *
2055 	 *	The top-level page (first_m) is:
2056 	 *		VM_PAGE_NULL if the page was found in the
2057 	 *		 top-level object;
2058 	 *		busy, not absent, and ineligible for pageout.
2059 	 *
2060 	 *	The current object (object) is locked.  A paging
2061 	 *	reference is held for the current and top-level
2062 	 *	objects.
2063 	 */
2064 
2065 #if TRACEFAULTPAGE
2066 	dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
2067 #endif
2068 #if     EXTRA_ASSERTIONS
2069 	assert(m->vmp_busy && !m->vmp_absent);
2070 	assert((first_m == VM_PAGE_NULL) ||
2071 	    (first_m->vmp_busy && !first_m->vmp_absent &&
2072 	    !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
2073 #endif  /* EXTRA_ASSERTIONS */
2074 
2075 	/*
2076 	 * If the page is being written, but isn't
2077 	 * already owned by the top-level object,
2078 	 * we have to copy it into a new page owned
2079 	 * by the top-level object.
2080 	 */
2081 	if (object != first_object) {
2082 #if TRACEFAULTPAGE
2083 		dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2084 #endif
2085 		if (fault_type & VM_PROT_WRITE) {
2086 			vm_page_t copy_m;
2087 
2088 			/*
2089 			 * We only really need to copy if we
2090 			 * want to write it.
2091 			 */
2092 			assert(!must_be_resident);
2093 
2094 			/*
2095 			 * If we try to collapse first_object at this
2096 			 * point, we may deadlock when we try to get
2097 			 * the lock on an intermediate object (since we
2098 			 * have the bottom object locked).  We can't
2099 			 * unlock the bottom object, because the page
2100 			 * we found may move (by collapse) if we do.
2101 			 *
2102 			 * Instead, we first copy the page.  Then, when
2103 			 * we have no more use for the bottom object,
2104 			 * we unlock it and try to collapse.
2105 			 *
2106 			 * Note that we copy the page even if we didn't
2107 			 * need to... that's the breaks.
2108 			 */
2109 
2110 			/*
2111 			 * Allocate a page for the copy
2112 			 */
2113 			copy_m = vm_page_grab_options(grab_options);
2114 
2115 			if (copy_m == VM_PAGE_NULL) {
2116 				RELEASE_PAGE(m);
2117 
2118 				vm_fault_cleanup(object, first_m);
2119 				thread_interrupt_level(interruptible_state);
2120 
2121 				return VM_FAULT_MEMORY_SHORTAGE;
2122 			}
2123 
2124 			vm_page_copy(m, copy_m);
2125 
2126 			/*
2127 			 * If another map is truly sharing this
2128 			 * page with us, we have to flush all
2129 			 * uses of the original page, since we
2130 			 * can't distinguish those which want the
2131 			 * original from those which need the
2132 			 * new copy.
2133 			 *
2134 			 * XXXO If we know that only one map has
2135 			 * access to this page, then we could
2136 			 * avoid the pmap_disconnect() call.
2137 			 */
2138 			if (m->vmp_pmapped) {
2139 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2140 			}
2141 
2142 			if (m->vmp_clustered) {
2143 				VM_PAGE_COUNT_AS_PAGEIN(m);
2144 				VM_PAGE_CONSUME_CLUSTERED(m);
2145 			}
2146 			assert(!m->vmp_cleaning);
2147 
2148 			/*
2149 			 * We no longer need the old page or object.
2150 			 */
2151 			RELEASE_PAGE(m);
2152 
2153 			/*
2154 			 * This check helps with marking the object as having a sequential pattern
2155 			 * Normally we'll miss doing this below because this fault is about COW to
2156 			 * the first_object i.e. bring page in from disk, push to object above but
2157 			 * don't update the file object's sequential pattern.
2158 			 */
2159 			if (object->internal == FALSE) {
2160 				vm_fault_is_sequential(object, offset, fault_info->behavior);
2161 			}
2162 
2163 			vm_object_paging_end(object);
2164 			vm_object_unlock(object);
2165 
2166 			my_fault = DBG_COW_FAULT;
2167 			counter_inc(&vm_statistics_cow_faults);
2168 			DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2169 			counter_inc(&current_task()->cow_faults);
2170 
2171 			object = first_object;
2172 			offset = first_offset;
2173 
2174 			vm_object_lock(object);
2175 			/*
2176 			 * get rid of the place holder
2177 			 * page that we soldered in earlier
2178 			 */
2179 			VM_PAGE_FREE(first_m);
2180 			first_m = VM_PAGE_NULL;
2181 
2182 			/*
2183 			 * and replace it with the
2184 			 * page we just copied into
2185 			 */
2186 			assert(copy_m->vmp_busy);
2187 			vm_page_insert(copy_m, object, vm_object_trunc_page(offset));
2188 			SET_PAGE_DIRTY(copy_m, TRUE);
2189 
2190 			m = copy_m;
2191 			/*
2192 			 * Now that we've gotten the copy out of the
2193 			 * way, let's try to collapse the top object.
2194 			 * But we have to play ugly games with
2195 			 * paging_in_progress to do that...
2196 			 */
2197 			vm_object_paging_end(object);
2198 			vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
2199 			vm_object_paging_begin(object);
2200 		} else {
2201 			*protection &= (~VM_PROT_WRITE);
2202 		}
2203 	}
2204 	/*
2205 	 * Now check whether the page needs to be pushed into the
2206 	 * copy object.  The use of asymmetric copy on write for
2207 	 * shared temporary objects means that we may do two copies to
2208 	 * satisfy the fault; one above to get the page from a
2209 	 * shadowed object, and one here to push it into the copy.
2210 	 */
2211 	try_failed_count = 0;
2212 
2213 	while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2214 		vm_object_offset_t      copy_offset;
2215 		vm_page_t               copy_m;
2216 
2217 #if TRACEFAULTPAGE
2218 		dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
2219 #endif
2220 		/*
2221 		 * If the page is being written, but hasn't been
2222 		 * copied to the copy-object, we have to copy it there.
2223 		 */
2224 		if ((fault_type & VM_PROT_WRITE) == 0) {
2225 			*protection &= ~VM_PROT_WRITE;
2226 			break;
2227 		}
2228 
2229 		/*
2230 		 * If the page was guaranteed to be resident,
2231 		 * we must have already performed the copy.
2232 		 */
2233 		if (must_be_resident) {
2234 			break;
2235 		}
2236 
2237 		/*
2238 		 * Try to get the lock on the copy_object.
2239 		 */
2240 		if (!vm_object_lock_try(copy_object)) {
2241 			vm_object_unlock(object);
2242 			try_failed_count++;
2243 
2244 			mutex_pause(try_failed_count);  /* wait a bit */
2245 			vm_object_lock(object);
2246 
2247 			continue;
2248 		}
2249 		try_failed_count = 0;
2250 
2251 		/*
2252 		 * Make another reference to the copy-object,
2253 		 * to keep it from disappearing during the
2254 		 * copy.
2255 		 */
2256 		vm_object_reference_locked(copy_object);
2257 
2258 		/*
2259 		 * Does the page exist in the copy?
2260 		 */
2261 		copy_offset = first_offset - copy_object->vo_shadow_offset;
2262 		copy_offset = vm_object_trunc_page(copy_offset);
2263 
2264 		if (copy_object->vo_size <= copy_offset) {
2265 			/*
2266 			 * Copy object doesn't cover this page -- do nothing.
2267 			 */
2268 			;
2269 		} else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2270 			/*
2271 			 * Page currently exists in the copy object
2272 			 */
2273 			if (copy_m->vmp_busy) {
2274 				/*
2275 				 * If the page is being brought
2276 				 * in, wait for it and then retry.
2277 				 */
2278 				RELEASE_PAGE(m);
2279 
2280 				/*
2281 				 * take an extra ref so object won't die
2282 				 */
2283 				vm_object_reference_locked(copy_object);
2284 				vm_object_unlock(copy_object);
2285 				vm_fault_cleanup(object, first_m);
2286 
2287 				vm_object_lock(copy_object);
2288 				assert(copy_object->ref_count > 0);
2289 				vm_object_lock_assert_exclusive(copy_object);
2290 				copy_object->ref_count--;
2291 				assert(copy_object->ref_count > 0);
2292 				copy_m = vm_page_lookup(copy_object, copy_offset);
2293 
2294 				if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2295 					PAGE_ASSERT_WAIT(copy_m, interruptible);
2296 
2297 					vm_object_unlock(copy_object);
2298 					wait_result = thread_block(THREAD_CONTINUE_NULL);
2299 					vm_object_deallocate(copy_object);
2300 
2301 					goto backoff;
2302 				} else {
2303 					vm_object_unlock(copy_object);
2304 					vm_object_deallocate(copy_object);
2305 					thread_interrupt_level(interruptible_state);
2306 
2307 					return VM_FAULT_RETRY;
2308 				}
2309 			}
2310 		} else if (!PAGED_OUT(copy_object, copy_offset)) {
2311 			/*
2312 			 * If PAGED_OUT is TRUE, then the page used to exist
2313 			 * in the copy-object, and has already been paged out.
2314 			 * We don't need to repeat this. If PAGED_OUT is
2315 			 * FALSE, then either we don't know (!pager_created,
2316 			 * for example) or it hasn't been paged out.
2317 			 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2318 			 * We must copy the page to the copy object.
2319 			 *
2320 			 * Allocate a page for the copy
2321 			 */
2322 			copy_m = vm_page_alloc(copy_object, copy_offset);
2323 
2324 			if (copy_m == VM_PAGE_NULL) {
2325 				RELEASE_PAGE(m);
2326 
2327 				vm_object_lock_assert_exclusive(copy_object);
2328 				copy_object->ref_count--;
2329 				assert(copy_object->ref_count > 0);
2330 
2331 				vm_object_unlock(copy_object);
2332 				vm_fault_cleanup(object, first_m);
2333 				thread_interrupt_level(interruptible_state);
2334 
2335 				return VM_FAULT_MEMORY_SHORTAGE;
2336 			}
2337 			/*
2338 			 * Must copy page into copy-object.
2339 			 */
2340 			vm_page_copy(m, copy_m);
2341 
2342 			/*
2343 			 * If the old page was in use by any users
2344 			 * of the copy-object, it must be removed
2345 			 * from all pmaps.  (We can't know which
2346 			 * pmaps use it.)
2347 			 */
2348 			if (m->vmp_pmapped) {
2349 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2350 			}
2351 
2352 			if (m->vmp_clustered) {
2353 				VM_PAGE_COUNT_AS_PAGEIN(m);
2354 				VM_PAGE_CONSUME_CLUSTERED(m);
2355 			}
2356 			/*
2357 			 * If there's a pager, then immediately
2358 			 * page out this page, using the "initialize"
2359 			 * option.  Else, we use the copy.
2360 			 */
2361 			if ((!copy_object->pager_ready)
2362 			    || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2363 			    ) {
2364 				vm_page_lockspin_queues();
2365 				assert(!m->vmp_cleaning);
2366 				vm_page_activate(copy_m);
2367 				vm_page_unlock_queues();
2368 
2369 				SET_PAGE_DIRTY(copy_m, TRUE);
2370 				PAGE_WAKEUP_DONE(copy_m);
2371 			} else {
2372 				assert(copy_m->vmp_busy == TRUE);
2373 				assert(!m->vmp_cleaning);
2374 
2375 				/*
2376 				 * dirty is protected by the object lock
2377 				 */
2378 				SET_PAGE_DIRTY(copy_m, TRUE);
2379 
2380 				/*
2381 				 * The page is already ready for pageout:
2382 				 * not on pageout queues and busy.
2383 				 * Unlock everything except the
2384 				 * copy_object itself.
2385 				 */
2386 				vm_object_unlock(object);
2387 
2388 				/*
2389 				 * Write the page to the copy-object,
2390 				 * flushing it from the kernel.
2391 				 */
2392 				vm_pageout_initialize_page(copy_m);
2393 
2394 				/*
2395 				 * Since the pageout may have
2396 				 * temporarily dropped the
2397 				 * copy_object's lock, we
2398 				 * check whether we'll have
2399 				 * to deallocate the hard way.
2400 				 */
2401 				if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2402 					vm_object_unlock(copy_object);
2403 					vm_object_deallocate(copy_object);
2404 					vm_object_lock(object);
2405 
2406 					continue;
2407 				}
2408 				/*
2409 				 * Pick back up the old object's
2410 				 * lock.  [It is safe to do so,
2411 				 * since it must be deeper in the
2412 				 * object tree.]
2413 				 */
2414 				vm_object_lock(object);
2415 			}
2416 
2417 			/*
2418 			 * Because we're pushing a page upward
2419 			 * in the object tree, we must restart
2420 			 * any faults that are waiting here.
2421 			 * [Note that this is an expansion of
2422 			 * PAGE_WAKEUP that uses the THREAD_RESTART
2423 			 * wait result].  Can't turn off the page's
2424 			 * busy bit because we're not done with it.
2425 			 */
2426 			if (m->vmp_wanted) {
2427 				m->vmp_wanted = FALSE;
2428 				thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2429 			}
2430 		}
2431 		/*
2432 		 * The reference count on copy_object must be
2433 		 * at least 2: one for our extra reference,
2434 		 * and at least one from the outside world
2435 		 * (we checked that when we last locked
2436 		 * copy_object).
2437 		 */
2438 		vm_object_lock_assert_exclusive(copy_object);
2439 		copy_object->ref_count--;
2440 		assert(copy_object->ref_count > 0);
2441 
2442 		vm_object_unlock(copy_object);
2443 
2444 		break;
2445 	}
2446 
2447 done:
2448 	*result_page = m;
2449 	*top_page = first_m;
2450 
2451 	if (m != VM_PAGE_NULL) {
2452 		assert(VM_PAGE_OBJECT(m) == object);
2453 
2454 		retval = VM_FAULT_SUCCESS;
2455 
2456 		if (my_fault == DBG_PAGEIN_FAULT) {
2457 			VM_PAGE_COUNT_AS_PAGEIN(m);
2458 
2459 			if (object->internal) {
2460 				my_fault = DBG_PAGEIND_FAULT;
2461 			} else {
2462 				my_fault = DBG_PAGEINV_FAULT;
2463 			}
2464 
2465 			/*
2466 			 * evaluate access pattern and update state
2467 			 * vm_fault_deactivate_behind depends on the
2468 			 * state being up to date
2469 			 */
2470 			vm_fault_is_sequential(object, offset, fault_info->behavior);
2471 			vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2472 		} else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2473 			/*
2474 			 * we weren't called from vm_fault, so handle the
2475 			 * accounting here for hits in the cache
2476 			 */
2477 			if (m->vmp_clustered) {
2478 				VM_PAGE_COUNT_AS_PAGEIN(m);
2479 				VM_PAGE_CONSUME_CLUSTERED(m);
2480 			}
2481 			vm_fault_is_sequential(object, offset, fault_info->behavior);
2482 			vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2483 		} else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2484 			VM_STAT_DECOMPRESSIONS();
2485 		}
2486 		if (type_of_fault) {
2487 			*type_of_fault = my_fault;
2488 		}
2489 	} else {
2490 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUCCESS_NO_PAGE), 0 /* arg */);
2491 		retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2492 		assert(first_m == VM_PAGE_NULL);
2493 		assert(object == first_object);
2494 	}
2495 
2496 	thread_interrupt_level(interruptible_state);
2497 
2498 #if TRACEFAULTPAGE
2499 	dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2500 #endif
2501 	return retval;
2502 
2503 backoff:
2504 	thread_interrupt_level(interruptible_state);
2505 
2506 	if (wait_result == THREAD_INTERRUPTED) {
2507 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
2508 		return VM_FAULT_INTERRUPTED;
2509 	}
2510 	return VM_FAULT_RETRY;
2511 
2512 #undef  RELEASE_PAGE
2513 }
2514 
#if MACH_ASSERT && (XNU_PLATFORM_WatchOS || __x86_64__)
#define PANIC_ON_CS_KILLED_DEFAULT true
#else
#define PANIC_ON_CS_KILLED_DEFAULT false
#endif
/*
 * When true, a code-signing kill panics the kernel instead of just
 * terminating the offending process.  Defaults to true only on
 * MACH_ASSERT builds for watchOS / x86_64; overridable at boot via
 * the TUNABLE named "panic_on_cs_killed".
 */
static TUNABLE(bool, panic_on_cs_killed, "panic_on_cs_killed",
    PANIC_ON_CS_KILLED_DEFAULT);

/* BSD-layer helpers for identifying the current process in log output. */
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
/* Counters: tainted pages rejected vs. accepted at pmap-enter time. */
unsigned long cs_enter_tainted_rejected = 0;
unsigned long cs_enter_tainted_accepted = 0;
2527 
2528 /*
2529  * CODE SIGNING:
2530  * When soft faulting a page, we have to validate the page if:
2531  * 1. the page is being mapped in user space
2532  * 2. the page hasn't already been found to be "tainted"
2533  * 3. the page belongs to a code-signed object
2534  * 4. the page has not been validated yet or has been mapped for write.
2535  */
2536 static bool
vm_fault_cs_need_validation(pmap_t pmap,vm_page_t page,vm_object_t page_obj,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset)2537 vm_fault_cs_need_validation(
2538 	pmap_t pmap,
2539 	vm_page_t page,
2540 	vm_object_t page_obj,
2541 	vm_map_size_t fault_page_size,
2542 	vm_map_offset_t fault_phys_offset)
2543 {
2544 	if (pmap == kernel_pmap) {
2545 		/* 1 - not user space */
2546 		return false;
2547 	}
2548 	if (!page_obj->code_signed) {
2549 		/* 3 - page does not belong to a code-signed object */
2550 		return false;
2551 	}
2552 	if (fault_page_size == PAGE_SIZE) {
2553 		/* looking at the whole page */
2554 		assertf(fault_phys_offset == 0,
2555 		    "fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
2556 		    (uint64_t)fault_page_size,
2557 		    (uint64_t)fault_phys_offset);
2558 		if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
2559 			/* 2 - page is all tainted */
2560 			return false;
2561 		}
2562 		if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
2563 		    !page->vmp_wpmapped) {
2564 			/* 4 - already fully validated and never mapped writable */
2565 			return false;
2566 		}
2567 	} else {
2568 		/* looking at a specific sub-page */
2569 		if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
2570 			/* 2 - sub-page was already marked as tainted */
2571 			return false;
2572 		}
2573 		if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) &&
2574 		    !page->vmp_wpmapped) {
2575 			/* 4 - already validated and never mapped writable */
2576 			return false;
2577 		}
2578 	}
2579 	/* page needs to be validated */
2580 	return true;
2581 }
2582 
2583 
2584 static bool
vm_fault_cs_page_immutable(vm_page_t m,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,vm_prot_t prot __unused)2585 vm_fault_cs_page_immutable(
2586 	vm_page_t m,
2587 	vm_map_size_t fault_page_size,
2588 	vm_map_offset_t fault_phys_offset,
2589 	vm_prot_t prot __unused)
2590 {
2591 	if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)
2592 	    /*&& ((prot) & VM_PROT_EXECUTE)*/) {
2593 		return true;
2594 	}
2595 	return false;
2596 }
2597 
2598 static bool
vm_fault_cs_page_nx(vm_page_t m,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset)2599 vm_fault_cs_page_nx(
2600 	vm_page_t m,
2601 	vm_map_size_t fault_page_size,
2602 	vm_map_offset_t fault_phys_offset)
2603 {
2604 	return VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2605 }
2606 
2607 /*
2608  * Check if the page being entered into the pmap violates code signing.
2609  */
2610 static kern_return_t
vm_fault_cs_check_violation(bool cs_bypass,vm_object_t object,vm_page_t m,pmap_t pmap,vm_prot_t prot,vm_prot_t caller_prot,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,vm_object_fault_info_t fault_info,bool map_is_switched,bool map_is_switch_protected,bool * cs_violation)2611 vm_fault_cs_check_violation(
2612 	bool cs_bypass,
2613 	vm_object_t object,
2614 	vm_page_t m,
2615 	pmap_t pmap,
2616 	vm_prot_t prot,
2617 	vm_prot_t caller_prot,
2618 	vm_map_size_t fault_page_size,
2619 	vm_map_offset_t fault_phys_offset,
2620 	vm_object_fault_info_t fault_info,
2621 	bool map_is_switched,
2622 	bool map_is_switch_protected,
2623 	bool *cs_violation)
2624 {
2625 #if !PMAP_CS
2626 #pragma unused(caller_prot)
2627 #pragma unused(fault_info)
2628 #endif /* !PMAP_CS */
2629 	int             cs_enforcement_enabled;
2630 	if (!cs_bypass &&
2631 	    vm_fault_cs_need_validation(pmap, m, object,
2632 	    fault_page_size, fault_phys_offset)) {
2633 		vm_object_lock_assert_exclusive(object);
2634 
2635 		if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) {
2636 			vm_cs_revalidates++;
2637 		}
2638 
2639 		/* VM map is locked, so 1 ref will remain on VM object -
2640 		 * so no harm if vm_page_validate_cs drops the object lock */
2641 
2642 		vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
2643 	}
2644 
2645 	/* If the map is switched, and is switch-protected, we must protect
2646 	 * some pages from being write-faulted: immutable pages because by
2647 	 * definition they may not be written, and executable pages because that
2648 	 * would provide a way to inject unsigned code.
2649 	 * If the page is immutable, we can simply return. However, we can't
2650 	 * immediately determine whether a page is executable anywhere. But,
2651 	 * we can disconnect it everywhere and remove the executable protection
2652 	 * from the current map. We do that below right before we do the
2653 	 * PMAP_ENTER.
2654 	 */
2655 	if (pmap == kernel_pmap) {
2656 		/* kernel fault: cs_enforcement does not apply */
2657 		cs_enforcement_enabled = 0;
2658 	} else {
2659 		cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap);
2660 	}
2661 
2662 	if (cs_enforcement_enabled && map_is_switched &&
2663 	    map_is_switch_protected &&
2664 	    vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2665 	    (prot & VM_PROT_WRITE)) {
2666 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_IMMUTABLE_PAGE_WRITE), 0 /* arg */);
2667 		return KERN_CODESIGN_ERROR;
2668 	}
2669 
2670 	if (cs_enforcement_enabled &&
2671 	    vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) &&
2672 	    (prot & VM_PROT_EXECUTE)) {
2673 		if (cs_debug) {
2674 			printf("page marked to be NX, not letting it be mapped EXEC\n");
2675 		}
2676 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_NX_PAGE_EXEC_MAPPING), 0 /* arg */);
2677 		return KERN_CODESIGN_ERROR;
2678 	}
2679 
2680 	/* A page could be tainted, or pose a risk of being tainted later.
2681 	 * Check whether the receiving process wants it, and make it feel
2682 	 * the consequences (that hapens in cs_invalid_page()).
2683 	 * For CS Enforcement, two other conditions will
2684 	 * cause that page to be tainted as well:
2685 	 * - pmapping an unsigned page executable - this means unsigned code;
2686 	 * - writeable mapping of a validated page - the content of that page
2687 	 *   can be changed without the kernel noticing, therefore unsigned
2688 	 *   code can be created
2689 	 */
2690 	if (cs_bypass) {
2691 		/* code-signing is bypassed */
2692 		*cs_violation = FALSE;
2693 	} else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
2694 		/* tainted page */
2695 		*cs_violation = TRUE;
2696 	} else if (!cs_enforcement_enabled) {
2697 		/* no further code-signing enforcement */
2698 		*cs_violation = FALSE;
2699 	} else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2700 	    ((prot & VM_PROT_WRITE) ||
2701 	    m->vmp_wpmapped)) {
2702 		/*
2703 		 * The page should be immutable, but is in danger of being
2704 		 * modified.
2705 		 * This is the case where we want policy from the code
2706 		 * directory - is the page immutable or not? For now we have
2707 		 * to assume that code pages will be immutable, data pages not.
2708 		 * We'll assume a page is a code page if it has a code directory
2709 		 * and we fault for execution.
2710 		 * That is good enough since if we faulted the code page for
2711 		 * writing in another map before, it is wpmapped; if we fault
2712 		 * it for writing in this map later it will also be faulted for
2713 		 * executing at the same time; and if we fault for writing in
2714 		 * another map later, we will disconnect it from this pmap so
2715 		 * we'll notice the change.
2716 		 */
2717 		*cs_violation = TRUE;
2718 	} else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2719 	    (prot & VM_PROT_EXECUTE)
2720 	    ) {
2721 		*cs_violation = TRUE;
2722 	} else {
2723 		*cs_violation = FALSE;
2724 	}
2725 	return KERN_SUCCESS;
2726 }
2727 
2728 /*
2729  * Handles a code signing violation by either rejecting the page or forcing a disconnect.
2730  * @param must_disconnect This value will be set to true if the caller must disconnect
2731  * this page.
2732  * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2733  */
2734 static kern_return_t
vm_fault_cs_handle_violation(vm_object_t object,vm_page_t m,pmap_t pmap,vm_prot_t prot,vm_map_offset_t vaddr,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,bool map_is_switched,bool map_is_switch_protected,bool * must_disconnect)2735 vm_fault_cs_handle_violation(
2736 	vm_object_t object,
2737 	vm_page_t m,
2738 	pmap_t pmap,
2739 	vm_prot_t prot,
2740 	vm_map_offset_t vaddr,
2741 	vm_map_size_t fault_page_size,
2742 	vm_map_offset_t fault_phys_offset,
2743 	bool map_is_switched,
2744 	bool map_is_switch_protected,
2745 	bool *must_disconnect)
2746 {
2747 #if !MACH_ASSERT
2748 #pragma unused(pmap)
2749 #pragma unused(map_is_switch_protected)
2750 #endif /* !MACH_ASSERT */
2751 	/*
2752 	 * We will have a tainted page. Have to handle the special case
2753 	 * of a switched map now. If the map is not switched, standard
2754 	 * procedure applies - call cs_invalid_page().
2755 	 * If the map is switched, the real owner is invalid already.
2756 	 * There is no point in invalidating the switching process since
2757 	 * it will not be executing from the map. So we don't call
2758 	 * cs_invalid_page() in that case.
2759 	 */
2760 	boolean_t reject_page, cs_killed;
2761 	kern_return_t kr;
2762 	if (map_is_switched) {
2763 		assert(pmap == vm_map_pmap(current_thread()->map));
2764 		assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2765 		reject_page = FALSE;
2766 	} else {
2767 		if (cs_debug > 5) {
2768 			printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2769 			    object->code_signed ? "yes" : "no",
2770 			    VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2771 			    VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2772 			    m->vmp_wpmapped ? "yes" : "no",
2773 			    (int)prot);
2774 		}
2775 		reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2776 	}
2777 
2778 	if (reject_page) {
2779 		/* reject the invalid page: abort the page fault */
2780 		int                     pid;
2781 		const char              *procname;
2782 		task_t                  task;
2783 		vm_object_t             file_object, shadow;
2784 		vm_object_offset_t      file_offset;
2785 		char                    *pathname, *filename;
2786 		vm_size_t               pathname_len, filename_len;
2787 		boolean_t               truncated_path;
2788 #define __PATH_MAX 1024
2789 		struct timespec         mtime, cs_mtime;
2790 		int                     shadow_depth;
2791 		os_reason_t             codesigning_exit_reason = OS_REASON_NULL;
2792 
2793 		kr = KERN_CODESIGN_ERROR;
2794 		cs_enter_tainted_rejected++;
2795 
2796 		/* get process name and pid */
2797 		procname = "?";
2798 		task = current_task();
2799 		pid = proc_selfpid();
2800 		if (get_bsdtask_info(task) != NULL) {
2801 			procname = proc_name_address(get_bsdtask_info(task));
2802 		}
2803 
2804 		/* get file's VM object */
2805 		file_object = object;
2806 		file_offset = m->vmp_offset;
2807 		for (shadow = file_object->shadow,
2808 		    shadow_depth = 0;
2809 		    shadow != VM_OBJECT_NULL;
2810 		    shadow = file_object->shadow,
2811 		    shadow_depth++) {
2812 			vm_object_lock_shared(shadow);
2813 			if (file_object != object) {
2814 				vm_object_unlock(file_object);
2815 			}
2816 			file_offset += file_object->vo_shadow_offset;
2817 			file_object = shadow;
2818 		}
2819 
2820 		mtime.tv_sec = 0;
2821 		mtime.tv_nsec = 0;
2822 		cs_mtime.tv_sec = 0;
2823 		cs_mtime.tv_nsec = 0;
2824 
2825 		/* get file's pathname and/or filename */
2826 		pathname = NULL;
2827 		filename = NULL;
2828 		pathname_len = 0;
2829 		filename_len = 0;
2830 		truncated_path = FALSE;
2831 		/* no pager -> no file -> no pathname, use "<nil>" in that case */
2832 		if (file_object->pager != NULL) {
2833 			pathname = kalloc_data(__PATH_MAX * 2, Z_WAITOK);
2834 			if (pathname) {
2835 				pathname[0] = '\0';
2836 				pathname_len = __PATH_MAX;
2837 				filename = pathname + pathname_len;
2838 				filename_len = __PATH_MAX;
2839 
2840 				if (vnode_pager_get_object_name(file_object->pager,
2841 				    pathname,
2842 				    pathname_len,
2843 				    filename,
2844 				    filename_len,
2845 				    &truncated_path) == KERN_SUCCESS) {
2846 					/* safety first... */
2847 					pathname[__PATH_MAX - 1] = '\0';
2848 					filename[__PATH_MAX - 1] = '\0';
2849 
2850 					vnode_pager_get_object_mtime(file_object->pager,
2851 					    &mtime,
2852 					    &cs_mtime);
2853 				} else {
2854 					kfree_data(pathname, __PATH_MAX * 2);
2855 					pathname = NULL;
2856 					filename = NULL;
2857 					pathname_len = 0;
2858 					filename_len = 0;
2859 					truncated_path = FALSE;
2860 				}
2861 			}
2862 		}
2863 		printf("CODE SIGNING: process %d[%s]: "
2864 		    "rejecting invalid page at address 0x%llx "
2865 		    "from offset 0x%llx in file \"%s%s%s\" "
2866 		    "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2867 		    "(signed:%d validated:%d tainted:%d nx:%d "
2868 		    "wpmapped:%d dirty:%d depth:%d)\n",
2869 		    pid, procname, (addr64_t) vaddr,
2870 		    file_offset,
2871 		    (pathname ? pathname : "<nil>"),
2872 		    (truncated_path ? "/.../" : ""),
2873 		    (truncated_path ? filename : ""),
2874 		    cs_mtime.tv_sec, cs_mtime.tv_nsec,
2875 		    ((cs_mtime.tv_sec == mtime.tv_sec &&
2876 		    cs_mtime.tv_nsec == mtime.tv_nsec)
2877 		    ? "=="
2878 		    : "!="),
2879 		    mtime.tv_sec, mtime.tv_nsec,
2880 		    object->code_signed,
2881 		    VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
2882 		    VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
2883 		    VMP_CS_NX(m, fault_page_size, fault_phys_offset),
2884 		    m->vmp_wpmapped,
2885 		    m->vmp_dirty,
2886 		    shadow_depth);
2887 
2888 		/*
2889 		 * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2890 		 * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2891 		 * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2892 		 * will deal with the segmentation fault.
2893 		 */
2894 		if (cs_killed) {
2895 			KDBG(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
2896 			    pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2897 
2898 			codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2899 			if (codesigning_exit_reason == NULL) {
2900 				printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2901 			} else {
2902 				mach_vm_address_t data_addr = 0;
2903 				struct codesigning_exit_reason_info *ceri = NULL;
2904 				uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
2905 
2906 				if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
2907 					printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2908 				} else {
2909 					if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
2910 					    EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
2911 						ceri = (struct codesigning_exit_reason_info *)data_addr;
2912 						static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2913 
2914 						ceri->ceri_virt_addr = vaddr;
2915 						ceri->ceri_file_offset = file_offset;
2916 						if (pathname) {
2917 							strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
2918 						} else {
2919 							ceri->ceri_pathname[0] = '\0';
2920 						}
2921 						if (filename) {
2922 							strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
2923 						} else {
2924 							ceri->ceri_filename[0] = '\0';
2925 						}
2926 						ceri->ceri_path_truncated = (truncated_path ? 1 : 0);
2927 						ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2928 						ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2929 						ceri->ceri_page_modtime_secs = mtime.tv_sec;
2930 						ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2931 						ceri->ceri_object_codesigned = (object->code_signed);
2932 						ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset);
2933 						ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset);
2934 						ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2935 						ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
2936 						ceri->ceri_page_slid = 0;
2937 						ceri->ceri_page_dirty = (m->vmp_dirty);
2938 						ceri->ceri_page_shadow_depth = shadow_depth;
2939 					} else {
2940 #if DEBUG || DEVELOPMENT
2941 						panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2942 #else
2943 						printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2944 #endif /* DEBUG || DEVELOPMENT */
2945 						/* Free the buffer */
2946 						os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
2947 					}
2948 				}
2949 			}
2950 
2951 			set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
2952 		}
2953 		if (panic_on_cs_killed &&
2954 		    object->object_is_shared_cache) {
2955 			char *tainted_contents;
2956 			vm_map_offset_t src_vaddr;
2957 			src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
2958 			tainted_contents = kalloc_data(PAGE_SIZE, Z_WAITOK);
2959 			bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
2960 			printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
2961 			panic("CODE SIGNING: process %d[%s]: "
2962 			    "rejecting invalid page (phys#0x%x) at address 0x%llx "
2963 			    "from offset 0x%llx in file \"%s%s%s\" "
2964 			    "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2965 			    "(signed:%d validated:%d tainted:%d nx:%d"
2966 			    "wpmapped:%d dirty:%d depth:%d)\n",
2967 			    pid, procname,
2968 			    VM_PAGE_GET_PHYS_PAGE(m),
2969 			    (addr64_t) vaddr,
2970 			    file_offset,
2971 			    (pathname ? pathname : "<nil>"),
2972 			    (truncated_path ? "/.../" : ""),
2973 			    (truncated_path ? filename : ""),
2974 			    cs_mtime.tv_sec, cs_mtime.tv_nsec,
2975 			    ((cs_mtime.tv_sec == mtime.tv_sec &&
2976 			    cs_mtime.tv_nsec == mtime.tv_nsec)
2977 			    ? "=="
2978 			    : "!="),
2979 			    mtime.tv_sec, mtime.tv_nsec,
2980 			    object->code_signed,
2981 			    VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
2982 			    VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
2983 			    VMP_CS_NX(m, fault_page_size, fault_phys_offset),
2984 			    m->vmp_wpmapped,
2985 			    m->vmp_dirty,
2986 			    shadow_depth);
2987 		}
2988 
2989 		if (file_object != object) {
2990 			vm_object_unlock(file_object);
2991 		}
2992 		if (pathname_len != 0) {
2993 			kfree_data(pathname, __PATH_MAX * 2);
2994 			pathname = NULL;
2995 			filename = NULL;
2996 		}
2997 	} else {
2998 		/* proceed with the invalid page */
2999 		kr = KERN_SUCCESS;
3000 		if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
3001 		    !object->code_signed) {
3002 			/*
3003 			 * This page has not been (fully) validated but
3004 			 * does not belong to a code-signed object
3005 			 * so it should not be forcefully considered
3006 			 * as tainted.
3007 			 * We're just concerned about it here because
3008 			 * we've been asked to "execute" it but that
3009 			 * does not mean that it should cause other
3010 			 * accesses to fail.
3011 			 * This happens when a debugger sets a
3012 			 * breakpoint and we then execute code in
3013 			 * that page.  Marking the page as "tainted"
3014 			 * would cause any inspection tool ("leaks",
3015 			 * "vmmap", "CrashReporter", ...) to get killed
3016 			 * due to code-signing violation on that page,
3017 			 * even though they're just reading it and not
3018 			 * executing from it.
3019 			 */
3020 		} else {
3021 			/*
3022 			 * Page might have been tainted before or not;
3023 			 * now it definitively is. If the page wasn't
3024 			 * tainted, we must disconnect it from all
3025 			 * pmaps later, to force existing mappings
3026 			 * through that code path for re-consideration
3027 			 * of the validity of that page.
3028 			 */
3029 			if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
3030 				*must_disconnect = TRUE;
3031 				VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE);
3032 			}
3033 		}
3034 		cs_enter_tainted_accepted++;
3035 	}
3036 	if (kr != KERN_SUCCESS) {
3037 		if (cs_debug) {
3038 			printf("CODESIGNING: vm_fault_enter(0x%llx): "
3039 			    "*** INVALID PAGE ***\n",
3040 			    (long long)vaddr);
3041 		}
3042 #if !SECURE_KERNEL
3043 		if (cs_enforcement_panic) {
3044 			panic("CODESIGNING: panicking on invalid page");
3045 		}
3046 #endif
3047 	}
3048 	return kr;
3049 }
3050 
3051 /*
3052  * Check that the code signature is valid for the given page being inserted into
3053  * the pmap.
3054  *
3055  * @param must_disconnect This value will be set to true if the caller must disconnect
3056  * this page.
3057  * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
3058  */
3059 static kern_return_t
vm_fault_validate_cs(bool cs_bypass,vm_object_t object,vm_page_t m,pmap_t pmap,vm_map_offset_t vaddr,vm_prot_t prot,vm_prot_t caller_prot,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,vm_object_fault_info_t fault_info,bool * must_disconnect)3060 vm_fault_validate_cs(
3061 	bool cs_bypass,
3062 	vm_object_t object,
3063 	vm_page_t m,
3064 	pmap_t pmap,
3065 	vm_map_offset_t vaddr,
3066 	vm_prot_t prot,
3067 	vm_prot_t caller_prot,
3068 	vm_map_size_t fault_page_size,
3069 	vm_map_offset_t fault_phys_offset,
3070 	vm_object_fault_info_t fault_info,
3071 	bool *must_disconnect)
3072 {
3073 	bool map_is_switched, map_is_switch_protected, cs_violation;
3074 	kern_return_t kr;
3075 	/* Validate code signature if necessary. */
3076 	map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
3077 	    (pmap == vm_map_pmap(current_thread()->map)));
3078 	map_is_switch_protected = current_thread()->map->switch_protect;
3079 	kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap,
3080 	    prot, caller_prot, fault_page_size, fault_phys_offset, fault_info,
3081 	    map_is_switched, map_is_switch_protected, &cs_violation);
3082 	if (kr != KERN_SUCCESS) {
3083 		return kr;
3084 	}
3085 	if (cs_violation) {
3086 		kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr,
3087 		    fault_page_size, fault_phys_offset,
3088 		    map_is_switched, map_is_switch_protected, must_disconnect);
3089 	}
3090 	return kr;
3091 }
3092 
3093 /*
3094  * Enqueue the page on the appropriate paging queue.
3095  */
static void
vm_fault_enqueue_page(
	vm_object_t object,
	vm_page_t m,
	bool wired,
	bool change_wiring,
	vm_tag_t wire_tag,
	bool no_cache,
	int *type_of_fault,
	kern_return_t kr)
{
	assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
	boolean_t       page_queues_locked = FALSE;
	boolean_t       previously_pmapped = m->vmp_pmapped;
	/*
	 * Take/drop the page-queues spin lock lazily: several of the
	 * branches below need it, others don't, and once taken it is
	 * kept until the single unlock at the end of the function.
	 */
#define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()   \
MACRO_BEGIN                                     \
	if (! page_queues_locked) {             \
	        page_queues_locked = TRUE;      \
	        vm_page_lockspin_queues();      \
	}                                       \
MACRO_END
#define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()     \
MACRO_BEGIN                                     \
	if (page_queues_locked) {               \
	        page_queues_locked = FALSE;     \
	        vm_page_unlock_queues();        \
	}                                       \
MACRO_END

	vm_page_update_special_state(m);
	if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
		/*
		 * Compressor pages are neither wired
		 * nor pageable and should never change.
		 */
		assert(object == compressor_object);
	} else if (change_wiring) {
		/* wiring/unwiring requires the page-queues lock */
		__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();

		if (wired) {
			/* only wire if the fault itself succeeded */
			if (kr == KERN_SUCCESS) {
				vm_page_wire(m, wire_tag, TRUE);
			}
		} else {
			vm_page_unwire(m, TRUE);
		}
		/* we keep the page queues lock, if we need it later */
	} else {
		if (object->internal == TRUE) {
			/*
			 * don't allow anonymous pages on
			 * the speculative queues
			 */
			no_cache = FALSE;
		}
		if (kr != KERN_SUCCESS) {
			/* failed fault: push the page toward reclaim */
			__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
			vm_page_deactivate(m);
			/* we keep the page queues lock, if we need it later */
		} else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
		    (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
		    (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
		    ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
		    !VM_PAGE_WIRED(m)) {
			/*
			 * Freshly created pages (COW / zero-fill) can go on a
			 * per-CPU local queue without the global queues lock.
			 */
			if (vm_page_local_q &&
			    (*type_of_fault == DBG_COW_FAULT ||
			    *type_of_fault == DBG_ZERO_FILL_FAULT)) {
				struct vpl      *lq;
				uint32_t        lid;

				assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);

				__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
				vm_object_lock_assert_exclusive(object);

				/*
				 * we got a local queue to stuff this
				 * new page on...
				 * its safe to manipulate local and
				 * local_id at this point since we're
				 * behind an exclusive object lock and
				 * the page is not on any global queue.
				 *
				 * we'll use the current cpu number to
				 * select the queue note that we don't
				 * need to disable preemption... we're
				 * going to be behind the local queue's
				 * lock to do the real work
				 */
				lid = cpu_number();

				lq = zpercpu_get_cpu(vm_page_local_q, lid);

				VPL_LOCK(&lq->vpl_lock);

				vm_page_check_pageable_safe(m);
				vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
				m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
				m->vmp_local_id = lid;
				lq->vpl_count++;

				if (object->internal) {
					lq->vpl_internal_count++;
				} else {
					lq->vpl_external_count++;
				}

				VPL_UNLOCK(&lq->vpl_lock);

				if (lq->vpl_count > vm_page_local_q_soft_limit) {
					/*
					 * we're beyond the soft limit
					 * for the local queue
					 * vm_page_reactivate_local will
					 * 'try' to take the global page
					 * queue lock... if it can't
					 * that's ok... we'll let the
					 * queue continue to grow up
					 * to the hard limit... at that
					 * point we'll wait for the
					 * lock... once we've got the
					 * lock, we'll transfer all of
					 * the pages from the local
					 * queue to the global active
					 * queue
					 */
					vm_page_reactivate_local(lid, FALSE, FALSE);
				}
			} else {
				__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();

				/*
				 * test again now that we hold the
				 * page queue lock
				 */
				if (!VM_PAGE_WIRED(m)) {
					if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
						/* re-fault on a cleaned page: pull it off that queue */
						vm_page_queues_remove(m, FALSE);

						VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
						VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
					}

					if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
					    no_cache) {
						/*
						 * If this is a no_cache mapping
						 * and the page has never been
						 * mapped before or was
						 * previously a no_cache page,
						 * then we want to leave pages
						 * in the speculative state so
						 * that they can be readily
						 * recycled if free memory runs
						 * low.  Otherwise the page is
						 * activated as normal.
						 */

						if (no_cache &&
						    (!previously_pmapped ||
						    m->vmp_no_cache)) {
							m->vmp_no_cache = TRUE;

							if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
								vm_page_speculate(m, FALSE);
							}
						} else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
							vm_page_activate(m);
						}
					}
				}
				/* we keep the page queues lock, if we need it later */
			}
		}
	}
	/* we're done with the page queues lock, if we ever took it */
	__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
}
3274 
3275 /*
3276  * Sets the pmapped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
3277  * @return true if the page needs to be sync'ed via pmap_sync_page_data_phys
3278  * before being inserted into the pmap.
3279  */
3280 static bool
vm_fault_enter_set_mapped(vm_object_t object,vm_page_t m,vm_prot_t prot,vm_prot_t fault_type)3281 vm_fault_enter_set_mapped(
3282 	vm_object_t object,
3283 	vm_page_t m,
3284 	vm_prot_t prot,
3285 	vm_prot_t fault_type)
3286 {
3287 	bool page_needs_sync = false;
3288 	/*
3289 	 * NOTE: we may only hold the vm_object lock SHARED
3290 	 * at this point, so we need the phys_page lock to
3291 	 * properly serialize updating the pmapped and
3292 	 * xpmapped bits
3293 	 */
3294 	if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3295 		ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3296 
3297 		pmap_lock_phys_page(phys_page);
3298 		m->vmp_pmapped = TRUE;
3299 
3300 		if (!m->vmp_xpmapped) {
3301 			m->vmp_xpmapped = TRUE;
3302 
3303 			pmap_unlock_phys_page(phys_page);
3304 
3305 			if (!object->internal) {
3306 				OSAddAtomic(1, &vm_page_xpmapped_external_count);
3307 			}
3308 
3309 #if defined(__arm64__)
3310 			page_needs_sync = true;
3311 #else
3312 			if (object->internal &&
3313 			    object->pager != NULL) {
3314 				/*
3315 				 * This page could have been
3316 				 * uncompressed by the
3317 				 * compressor pager and its
3318 				 * contents might be only in
3319 				 * the data cache.
3320 				 * Since it's being mapped for
3321 				 * "execute" for the fist time,
3322 				 * make sure the icache is in
3323 				 * sync.
3324 				 */
3325 				assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3326 				page_needs_sync = true;
3327 			}
3328 #endif
3329 		} else {
3330 			pmap_unlock_phys_page(phys_page);
3331 		}
3332 	} else {
3333 		if (m->vmp_pmapped == FALSE) {
3334 			ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3335 
3336 			pmap_lock_phys_page(phys_page);
3337 			m->vmp_pmapped = TRUE;
3338 			pmap_unlock_phys_page(phys_page);
3339 		}
3340 	}
3341 
3342 	if (fault_type & VM_PROT_WRITE) {
3343 		if (m->vmp_wpmapped == FALSE) {
3344 			vm_object_lock_assert_exclusive(object);
3345 			if (!object->internal && object->pager) {
3346 				task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3347 			}
3348 			m->vmp_wpmapped = TRUE;
3349 		}
3350 	}
3351 	return page_needs_sync;
3352 }
3353 
3354 /*
3355  * Try to enter the given page into the pmap.
3356  * Will retry without execute permission iff PMAP_CS is enabled and we encounter
3357  * a codesigning failure on a non-execute fault.
3358  */
3359 static kern_return_t
vm_fault_attempt_pmap_enter(pmap_t pmap,vm_map_offset_t vaddr,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,vm_page_t m,vm_prot_t * prot,vm_prot_t caller_prot,vm_prot_t fault_type,bool wired,int pmap_options)3360 vm_fault_attempt_pmap_enter(
3361 	pmap_t pmap,
3362 	vm_map_offset_t vaddr,
3363 	vm_map_size_t fault_page_size,
3364 	vm_map_offset_t fault_phys_offset,
3365 	vm_page_t m,
3366 	vm_prot_t *prot,
3367 	vm_prot_t caller_prot,
3368 	vm_prot_t fault_type,
3369 	bool wired,
3370 	int pmap_options)
3371 {
3372 #if !PMAP_CS
3373 #pragma unused(caller_prot)
3374 #endif /* !PMAP_CS */
3375 	kern_return_t kr;
3376 	if (fault_page_size != PAGE_SIZE) {
3377 		DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type);
3378 		assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
3379 		    fault_phys_offset < PAGE_SIZE),
3380 		    "0x%llx\n", (uint64_t)fault_phys_offset);
3381 	} else {
3382 		assertf(fault_phys_offset == 0,
3383 		    "0x%llx\n", (uint64_t)fault_phys_offset);
3384 	}
3385 
3386 	PMAP_ENTER_OPTIONS(pmap, vaddr,
3387 	    fault_phys_offset,
3388 	    m, *prot, fault_type, 0,
3389 	    wired,
3390 	    pmap_options,
3391 	    kr);
3392 	return kr;
3393 }
3394 
3395 /*
3396  * Enter the given page into the pmap.
3397  * The map must be locked shared.
3398  * The vm object must NOT be locked.
3399  *
3400  * @param need_retry if not null, avoid making a (potentially) blocking call into
3401  * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3402  */
3403 static kern_return_t
vm_fault_pmap_enter(pmap_t pmap,vm_map_offset_t vaddr,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,vm_page_t m,vm_prot_t * prot,vm_prot_t caller_prot,vm_prot_t fault_type,bool wired,int pmap_options,boolean_t * need_retry)3404 vm_fault_pmap_enter(
3405 	pmap_t pmap,
3406 	vm_map_offset_t vaddr,
3407 	vm_map_size_t fault_page_size,
3408 	vm_map_offset_t fault_phys_offset,
3409 	vm_page_t m,
3410 	vm_prot_t *prot,
3411 	vm_prot_t caller_prot,
3412 	vm_prot_t fault_type,
3413 	bool wired,
3414 	int pmap_options,
3415 	boolean_t *need_retry)
3416 {
3417 	kern_return_t kr;
3418 	if (need_retry != NULL) {
3419 		/*
3420 		 * Although we don't hold a lock on this object, we hold a lock
3421 		 * on the top object in the chain. To prevent a deadlock, we
3422 		 * can't allow the pmap layer to block.
3423 		 */
3424 		pmap_options |= PMAP_OPTIONS_NOWAIT;
3425 	}
3426 	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3427 	    fault_page_size, fault_phys_offset,
3428 	    m, prot, caller_prot, fault_type, wired, pmap_options);
3429 	if (kr == KERN_RESOURCE_SHORTAGE) {
3430 		if (need_retry) {
3431 			/*
3432 			 * There's nothing we can do here since we hold the
3433 			 * lock on the top object in the chain. The caller
3434 			 * will need to deal with this by dropping that lock and retrying.
3435 			 */
3436 			*need_retry = TRUE;
3437 			vm_pmap_enter_retried++;
3438 		}
3439 	}
3440 	return kr;
3441 }
3442 
3443 /*
3444  * Enter the given page into the pmap.
3445  * The vm map must be locked shared.
3446  * The vm object must be locked exclusive, unless this is a soft fault.
3447  * For a soft fault, the object must be locked shared or exclusive.
3448  *
3449  * @param need_retry if not null, avoid making a (potentially) blocking call into
3450  * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3451  */
static kern_return_t
vm_fault_pmap_enter_with_object_lock(
	vm_object_t object,
	pmap_t pmap,
	vm_map_offset_t vaddr,
	vm_map_size_t fault_page_size,
	vm_map_offset_t fault_phys_offset,
	vm_page_t m,
	vm_prot_t *prot,
	vm_prot_t caller_prot,
	vm_prot_t fault_type,
	bool wired,
	int pmap_options,
	boolean_t *need_retry)
{
	kern_return_t kr;
	/*
	 * Prevent a deadlock by not
	 * holding the object lock if we need to wait for a page in
	 * pmap_enter() - <rdar://problem/7138958>
	 */
	/* first attempt is always non-blocking */
	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
	    fault_page_size, fault_phys_offset,
	    m, prot, caller_prot, fault_type, wired, pmap_options | PMAP_OPTIONS_NOWAIT);
#if __x86_64__
	if (kr == KERN_INVALID_ARGUMENT &&
	    pmap == PMAP_NULL &&
	    wired) {
		/*
		 * Wiring a page in a pmap-less VM map:
		 * VMware's "vmmon" kernel extension does this
		 * to grab pages.
		 * Let it proceed even though the PMAP_ENTER() failed.
		 */
		kr = KERN_SUCCESS;
	}
#endif /* __x86_64__ */

	if (kr == KERN_RESOURCE_SHORTAGE) {
		if (need_retry) {
			/*
			 * this will be non-null in the case where we hold the lock
			 * on the top-object in this chain... we can't just drop
			 * the lock on the object we're inserting the page into
			 * and recall the PMAP_ENTER since we can still cause
			 * a deadlock if one of the critical paths tries to
			 * acquire the lock on the top-object and we're blocked
			 * in PMAP_ENTER waiting for memory... our only recourse
			 * is to deal with it at a higher level where we can
			 * drop both locks.
			 */
			*need_retry = TRUE;
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PMAP_ENTER_RESOURCE_SHORTAGE), 0 /* arg */);
			vm_pmap_enter_retried++;
			goto done;
		}
		/*
		 * The nonblocking version of pmap_enter did not succeed.
		 * and we don't need to drop other locks and retry
		 * at the level above us, so
		 * use the blocking version instead. Requires marking
		 * the page busy and unlocking the object
		 */
		boolean_t was_busy = m->vmp_busy;

		/* must hold exclusive to drop/retake the lock around PMAP_ENTER */
		vm_object_lock_assert_exclusive(object);

		/* busy keeps the page pinned while the object lock is dropped */
		m->vmp_busy = TRUE;
		vm_object_unlock(object);

		PMAP_ENTER_OPTIONS(pmap, vaddr,
		    fault_phys_offset,
		    m, *prot, fault_type,
		    0, wired,
		    pmap_options, kr);

		/* the page must not have moved to another object meanwhile */
		assert(VM_PAGE_OBJECT(m) == object);

		/* Take the object lock again. */
		vm_object_lock(object);

		/* If the page was busy, someone else will wake it up.
		 * Otherwise, we have to do it now. */
		assert(m->vmp_busy);
		if (!was_busy) {
			PAGE_WAKEUP_DONE(m);
		}
		vm_pmap_enter_blocked++;
	}

done:
	return kr;
}
3545 
3546 /*
3547  * Prepare to enter a page into the pmap by checking CS, protection bits,
3548  * and setting mapped bits on the page_t.
3549  * Does not modify the page's paging queue.
3550  *
3551  * page queue lock must NOT be held
3552  * m->vmp_object must be locked
3553  *
3554  * NOTE: m->vmp_object could be locked "shared" only if we are called
3555  * from vm_fault() as part of a soft fault.
3556  */
static kern_return_t
vm_fault_enter_prepare(
	vm_page_t m,                            /* page about to be entered; its object must be locked */
	pmap_t pmap,                            /* target physical map */
	vm_map_offset_t vaddr,                  /* virtual address being faulted */
	vm_prot_t *prot,                        /* in/out: may be downgraded (write/execute stripped) */
	vm_prot_t caller_prot,
	vm_map_size_t fault_page_size,
	vm_map_offset_t fault_phys_offset,
	boolean_t change_wiring,                /* TRUE for wiring pseudo-faults */
	vm_prot_t fault_type,
	vm_object_fault_info_t fault_info,
	int *type_of_fault,                     /* in/out: DBG_* fault classification for tracing */
	bool *page_needs_data_sync)             /* out: caller must pmap_sync_page_data_phys() */
{
	kern_return_t   kr;
	bool            is_tainted = false;
	vm_object_t     object;
	boolean_t       cs_bypass = fault_info->cs_bypass;

	object = VM_PAGE_OBJECT(m);

	vm_object_lock_assert_held(object);

#if KASAN
	/* let KASAN know this kernel address is now legitimately accessible */
	if (pmap == kernel_pmap) {
		kasan_notify_address(vaddr, PAGE_SIZE);
	}
#endif

	/* the page queues lock must not be held across this call */
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);

	if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
		/* zero-fill implies we populated the page: exclusive lock required */
		vm_object_lock_assert_exclusive(object);
	} else if ((fault_type & VM_PROT_WRITE) == 0 &&
	    !change_wiring &&
	    (!m->vmp_wpmapped
#if VM_OBJECT_ACCESS_TRACKING
	    || object->access_tracking
#endif /* VM_OBJECT_ACCESS_TRACKING */
	    )) {
		/*
		 * This is not a "write" fault, so we
		 * might not have taken the object lock
		 * exclusively and we might not be able
		 * to update the "wpmapped" bit in
		 * vm_fault_enter().
		 * Let's just grant read access to
		 * the page for now and we'll
		 * soft-fault again if we need write
		 * access later...
		 */

		/* This had better not be a JIT page. */
		if (!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
			*prot &= ~VM_PROT_WRITE;
		} else {
			assert(cs_bypass);
		}
	}
	if (m->vmp_pmapped == FALSE) {
		if (m->vmp_clustered) {
			if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
				/*
				 * found it in the cache, but this
				 * is the first fault-in of the page (m->vmp_pmapped == FALSE)
				 * so it must have come in as part of
				 * a cluster... account 1 pagein against it
				 */
				if (object->internal) {
					*type_of_fault = DBG_PAGEIND_FAULT;
				} else {
					*type_of_fault = DBG_PAGEINV_FAULT;
				}

				VM_PAGE_COUNT_AS_PAGEIN(m);
			}
			VM_PAGE_CONSUME_CLUSTERED(m);
		}
	}

	/* account this as an address-space fault (COW faults are counted elsewhere) */
	if (*type_of_fault != DBG_COW_FAULT) {
		DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);

		if (pmap == kernel_pmap) {
			DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
		}
	}

	/* code-signing validation; may mark the page tainted but still accepted */
	kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr,
	    *prot, caller_prot, fault_page_size, fault_phys_offset,
	    fault_info, &is_tainted);
	if (kr == KERN_SUCCESS) {
		/*
		 * We either have a good page, or a tainted page that has been accepted by the process.
		 * In both cases the page will be entered into the pmap.
		 */
		*page_needs_data_sync = vm_fault_enter_set_mapped(object, m, *prot, fault_type);
		if ((fault_type & VM_PROT_WRITE) && is_tainted) {
			/*
			 * This page is tainted but we're inserting it anyways.
			 * Since it's writeable, we need to disconnect it from other pmaps
			 * now so those processes can take note.
			 */

			/*
			 * We can only get here
			 * because of the CSE logic
			 */
			assert(pmap_get_vm_map_cs_enforced(pmap));
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
			/*
			 * If we are faulting for a write, we can clear
			 * the execute bit - that will ensure the page is
			 * checked again before being executable, which
			 * protects against a map switch.
			 * This only happens the first time the page
			 * gets tainted, so we won't get stuck here
			 * to make an already writeable page executable.
			 */
			if (!cs_bypass) {
				assert(!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot));
				*prot &= ~VM_PROT_EXECUTE;
			}
		}
		assert(VM_PAGE_OBJECT(m) == object);

#if VM_OBJECT_ACCESS_TRACKING
		/* bump per-object read/write access counters for tracking */
		if (object->access_tracking) {
			DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
			if (fault_type & VM_PROT_WRITE) {
				object->access_tracking_writes++;
				vm_object_access_tracking_writes++;
			} else {
				object->access_tracking_reads++;
				vm_object_access_tracking_reads++;
			}
		}
#endif /* VM_OBJECT_ACCESS_TRACKING */
	}

	return kr;
}
3700 
3701 /*
3702  * page queue lock must NOT be held
3703  * m->vmp_object must be locked
3704  *
3705  * NOTE: m->vmp_object could be locked "shared" only if we are called
3706  * from vm_fault() as part of a soft fault.  If so, we must be
3707  * careful not to modify the VM object in any way that is not
3708  * legal under a shared lock...
3709  */
kern_return_t
vm_fault_enter(
	vm_page_t m,
	pmap_t pmap,
	vm_map_offset_t vaddr,
	vm_map_size_t fault_page_size,
	vm_map_offset_t fault_phys_offset,
	vm_prot_t prot,
	vm_prot_t caller_prot,
	boolean_t wired,
	boolean_t change_wiring,
	vm_tag_t  wire_tag,
	vm_object_fault_info_t fault_info,
	boolean_t *need_retry,
	int *type_of_fault)
{
	kern_return_t   kr;
	vm_object_t     object;
	bool            page_needs_data_sync;
	vm_prot_t       fault_type;
	int             pmap_options = fault_info->pmap_options;

	/* guard pages are fictitious and are never entered into the pmap */
	if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
		assert(m->vmp_fictitious);
		return KERN_SUCCESS;
	}

	/* wiring pseudo-faults carry no access type */
	fault_type = change_wiring ? VM_PROT_NONE : caller_prot;

	assertf(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL, "m=%p", m);
	/* CS validation, prot adjustment, and "mapped" bookkeeping */
	kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot,
	    fault_page_size, fault_phys_offset, change_wiring, fault_type,
	    fault_info, type_of_fault, &page_needs_data_sync);
	object = VM_PAGE_OBJECT(m);

	/* place the page on the appropriate paging queue (even on failure) */
	vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr);

	if (kr == KERN_SUCCESS) {
		if (page_needs_data_sync) {
			pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
		}

		kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
		    fault_page_size, fault_phys_offset, m,
		    &prot, caller_prot, fault_type, wired, pmap_options, need_retry);
	}

	return kr;
}
3759 
3760 void
vm_pre_fault(vm_map_offset_t vaddr,vm_prot_t prot)3761 vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
3762 {
3763 	if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
3764 		vm_fault(current_map(),      /* map */
3765 		    vaddr,                   /* vaddr */
3766 		    prot,                    /* fault_type */
3767 		    FALSE,                   /* change_wiring */
3768 		    VM_KERN_MEMORY_NONE,     /* tag - not wiring */
3769 		    THREAD_UNINT,            /* interruptible */
3770 		    NULL,                    /* caller_pmap */
3771 		    0 /* caller_pmap_addr */);
3772 	}
3773 }
3774 
3775 
3776 /*
3777  *	Routine:	vm_fault
3778  *	Purpose:
3779  *		Handle page faults, including pseudo-faults
3780  *		used to change the wiring status of pages.
3781  *	Returns:
3782  *		Explicit continuations have been removed.
3783  *	Implementation:
3784  *		vm_fault and vm_fault_page save mucho state
3785  *		in the moral equivalent of a closure.  The state
3786  *		structure is allocated when first entering vm_fault
3787  *		and deallocated when leaving vm_fault.
3788  */
3789 
/* unique per-process id used for fault tracing (see vm_fault_complete()) */
extern uint64_t get_current_unique_pid(void);

/* NOTE(review): counters presumably tracking copy-object collapses during
 * faults — not incremented anywhere in this chunk; confirm against callers. */
unsigned long vm_fault_collapse_total = 0;
unsigned long vm_fault_collapse_skipped = 0;
3794 
3795 
3796 kern_return_t
vm_fault_external(vm_map_t map,vm_map_offset_t vaddr,vm_prot_t fault_type,boolean_t change_wiring,int interruptible,pmap_t caller_pmap,vm_map_offset_t caller_pmap_addr)3797 vm_fault_external(
3798 	vm_map_t        map,
3799 	vm_map_offset_t vaddr,
3800 	vm_prot_t       fault_type,
3801 	boolean_t       change_wiring,
3802 	int             interruptible,
3803 	pmap_t          caller_pmap,
3804 	vm_map_offset_t caller_pmap_addr)
3805 {
3806 	return vm_fault_internal(map, vaddr, fault_type, change_wiring,
3807 	           change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
3808 	           interruptible, caller_pmap, caller_pmap_addr,
3809 	           NULL);
3810 }
3811 
kern_return_t
vm_fault(
	vm_map_t        map,
	vm_map_offset_t vaddr,
	vm_prot_t       fault_type,
	boolean_t       change_wiring,
	vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
	int             interruptible,
	pmap_t          caller_pmap,
	vm_map_offset_t caller_pmap_addr)
{
	/*
	 * Thin wrapper around vm_fault_internal() that requests no
	 * physical-page out-parameter (physpage_p == NULL).
	 */
	return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
	           interruptible, caller_pmap, caller_pmap_addr,
	           NULL);
}
3827 
3828 static boolean_t
current_proc_is_privileged(void)3829 current_proc_is_privileged(void)
3830 {
3831 	return csproc_get_platform_binary(current_proc());
3832 }
3833 
/* NOTE(review): presumably counts pages copied by the copy-on-read path —
 * not incremented in this chunk; confirm where it is updated. */
uint64_t vm_copied_on_read = 0;
3835 
3836 /*
3837  * Cleanup after a vm_fault_enter.
3838  * At this point, the fault should either have failed (kr != KERN_SUCCESS)
3839  * or the page should be in the pmap and on the correct paging queue.
3840  *
3841  * Precondition:
3842  * map must be locked shared.
3843  * m_object must be locked.
3844  * If top_object != VM_OBJECT_NULL, it must be locked.
3845  * real_map must be locked.
3846  *
3847  * Postcondition:
3848  * map will be unlocked
3849  * m_object will be unlocked
3850  * top_object will be unlocked
3851  * If real_map != map, it will be unlocked
3852  */
static void
vm_fault_complete(
	vm_map_t map,
	vm_map_t real_map,
	vm_object_t object,
	vm_object_t m_object,
	vm_page_t m,
	vm_map_offset_t offset,
	vm_map_offset_t trace_real_vaddr,
	vm_object_fault_info_t fault_info,
	vm_prot_t caller_prot,
#if CONFIG_DTRACE
	vm_map_offset_t real_vaddr,
#else
	__unused vm_map_offset_t real_vaddr,
#endif /* CONFIG_DTRACE */
	int type_of_fault,
	boolean_t need_retry,
	kern_return_t kr,
	ppnum_t *physpage_p,
	vm_prot_t prot,
	vm_object_t top_object,
	boolean_t need_collapse,
	vm_map_offset_t cur_offset,
	vm_prot_t fault_type,
	vm_object_t *written_on_object,
	memory_object_t *written_on_pager,
	vm_object_offset_t *written_on_offset)
{
	int     event_code = 0;
	/* verify the locking preconditions documented above */
	vm_map_lock_assert_shared(map);
	vm_object_lock_assert_held(m_object);
	if (top_object != VM_OBJECT_NULL) {
		vm_object_lock_assert_held(top_object);
	}
	vm_map_lock_assert_held(real_map);

	/* classify the fault's backing object for kdebug tracing */
	if (m_object->internal) {
		event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
	} else if (m_object->object_is_shared_cache) {
		event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
	} else {
		event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
	}
	KDBG_RELEASE(event_code | DBG_FUNC_NONE, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid());
	if (need_retry == FALSE) {
		KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid());
	}
	DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
	if (kr == KERN_SUCCESS &&
	    physpage_p != NULL) {
		/* for vm_map_wire_and_extract() */
		*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
		if (prot & VM_PROT_WRITE) {
			vm_object_lock_assert_exclusive(m_object);
			m->vmp_dirty = TRUE;
		}
	}

	if (top_object != VM_OBJECT_NULL) {
		/*
		 * It's safe to drop the top object
		 * now that we've done our
		 * vm_fault_enter().  Any other fault
		 * in progress for that virtual
		 * address will either find our page
		 * and translation or put in a new page
		 * and translation.
		 */
		vm_object_unlock(top_object);
		top_object = VM_OBJECT_NULL;
	}

	/* deferred shadow-chain collapse requested by the fault path */
	if (need_collapse == TRUE) {
		vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
	}

	if (need_retry == FALSE &&
	    (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
		/*
		 * evaluate access pattern and update state
		 * vm_fault_deactivate_behind depends on the
		 * state being up to date
		 */
		vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior);

		vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior);
	}
	/*
	 * That's it, clean up and return.
	 */
	if (m->vmp_busy) {
		vm_object_lock_assert_exclusive(m_object);
		PAGE_WAKEUP_DONE(m);
	}

	/*
	 * Record where a write fault hit an external object; takes a paging
	 * reference on m_object and hands object/pager/offset back to the
	 * caller through the written_on_* out-parameters.
	 * NOTE(review): presumably consumed by the caller once all locks are
	 * dropped — confirm in vm_fault_internal().
	 */
	if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
		vm_object_paging_begin(m_object);

		assert(*written_on_object == VM_OBJECT_NULL);
		*written_on_object = m_object;
		*written_on_pager = m_object->pager;
		*written_on_offset = m_object->paging_offset + m->vmp_offset;
	}
	/* drop all remaining locks, per the postcondition above */
	vm_object_unlock(object);

	vm_map_unlock_read(map);
	if (real_map != map) {
		vm_map_unlock(real_map);
	}
}
3964 
3965 static inline int
vm_fault_type_for_tracing(boolean_t need_copy_on_read,int type_of_fault)3966 vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
3967 {
3968 	if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
3969 		return DBG_COR_FAULT;
3970 	}
3971 	return type_of_fault;
3972 }
3973 
/*
 * Statistics for the "resilient media" recovery path, where a media
 * failure reported by the pager is hidden by inserting a zero-filled
 * page (see the retry logic in vm_fault_internal()).
 */
uint64_t vm_fault_resilient_media_initiate = 0;
uint64_t vm_fault_resilient_media_retry = 0;
uint64_t vm_fault_resilient_media_proceed = 0;
uint64_t vm_fault_resilient_media_release = 0;
uint64_t vm_fault_resilient_media_abort1 = 0;
uint64_t vm_fault_resilient_media_abort2 = 0;

#if MACH_ASSERT
/*
 * Error-injection knobs for exercising the resilient-media retry path:
 * a non-zero *_rate fails roughly one attempt out of every "rate".
 * A rate of 0 disables injection.
 */
int vm_fault_resilient_media_inject_error1_rate = 0;
int vm_fault_resilient_media_inject_error1 = 0;
int vm_fault_resilient_media_inject_error2_rate = 0;
int vm_fault_resilient_media_inject_error2 = 0;
int vm_fault_resilient_media_inject_error3_rate = 0;
int vm_fault_resilient_media_inject_error3 = 0;
#endif /* MACH_ASSERT */
3989 
3990 kern_return_t
vm_fault_internal(vm_map_t map,vm_map_offset_t vaddr,vm_prot_t caller_prot,boolean_t change_wiring,vm_tag_t wire_tag,int interruptible,pmap_t caller_pmap,vm_map_offset_t caller_pmap_addr,ppnum_t * physpage_p)3991 vm_fault_internal(
3992 	vm_map_t        map,
3993 	vm_map_offset_t vaddr,
3994 	vm_prot_t       caller_prot,
3995 	boolean_t       change_wiring,
3996 	vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3997 	int             interruptible,
3998 	pmap_t          caller_pmap,
3999 	vm_map_offset_t caller_pmap_addr,
4000 	ppnum_t         *physpage_p)
4001 {
4002 	vm_map_version_t        version;        /* Map version for verificiation */
4003 	boolean_t               wired;          /* Should mapping be wired down? */
4004 	vm_object_t             object;         /* Top-level object */
4005 	vm_object_offset_t      offset;         /* Top-level offset */
4006 	vm_prot_t               prot;           /* Protection for mapping */
4007 	vm_object_t             old_copy_object; /* Saved copy object */
4008 	vm_page_t               result_page;    /* Result of vm_fault_page */
4009 	vm_page_t               top_page;       /* Placeholder page */
4010 	kern_return_t           kr;
4011 
4012 	vm_page_t               m;      /* Fast access to result_page */
4013 	kern_return_t           error_code;
4014 	vm_object_t             cur_object;
4015 	vm_object_t             m_object = NULL;
4016 	vm_object_offset_t      cur_offset;
4017 	vm_page_t               cur_m;
4018 	vm_object_t             new_object;
4019 	int                     type_of_fault;
4020 	pmap_t                  pmap;
4021 	wait_interrupt_t        interruptible_state;
4022 	vm_map_t                real_map = map;
4023 	vm_map_t                original_map = map;
4024 	bool                    object_locks_dropped = FALSE;
4025 	vm_prot_t               fault_type;
4026 	vm_prot_t               original_fault_type;
4027 	struct vm_object_fault_info fault_info = {};
4028 	bool                    need_collapse = FALSE;
4029 	boolean_t               need_retry = FALSE;
4030 	boolean_t               *need_retry_ptr = NULL;
4031 	uint8_t                 object_lock_type = 0;
4032 	uint8_t                 cur_object_lock_type;
4033 	vm_object_t             top_object = VM_OBJECT_NULL;
4034 	vm_object_t             written_on_object = VM_OBJECT_NULL;
4035 	memory_object_t         written_on_pager = NULL;
4036 	vm_object_offset_t      written_on_offset = 0;
4037 	int                     throttle_delay;
4038 	int                     compressed_count_delta;
4039 	uint8_t                 grab_options;
4040 	bool                    need_copy;
4041 	bool                    need_copy_on_read;
4042 	vm_map_offset_t         trace_vaddr;
4043 	vm_map_offset_t         trace_real_vaddr;
4044 	vm_map_size_t           fault_page_size;
4045 	vm_map_size_t           fault_page_mask;
4046 	int                     fault_page_shift;
4047 	vm_map_offset_t         fault_phys_offset;
4048 	vm_map_offset_t         real_vaddr;
4049 	bool                    resilient_media_retry = false;
4050 	bool                    resilient_media_ref_transfer = false;
4051 	vm_object_t             resilient_media_object = VM_OBJECT_NULL;
4052 	vm_object_offset_t      resilient_media_offset = (vm_object_offset_t)-1;
4053 	bool                    page_needs_data_sync = false;
4054 	/*
4055 	 * Was the VM object contended when vm_map_lookup_and_lock_object locked it?
4056 	 * If so, the zero fill path will drop the lock
4057 	 * NB: Ideally we would always drop the lock rather than rely on
4058 	 * this heuristic, but vm_object_unlock currently takes > 30 cycles.
4059 	 */
4060 	bool                    object_is_contended = false;
4061 
4062 	real_vaddr = vaddr;
4063 	trace_real_vaddr = vaddr;
4064 
4065 	/*
4066 	 * Some (kernel) submaps are marked with "should never fault".
4067 	 *
4068 	 * We do this for two reasons:
4069 	 * - PGZ which is inside the zone map range can't go down the normal
4070 	 *   lookup path (vm_map_lookup_entry() would panic).
4071 	 *
4072 	 * - we want for guard pages to not have to use fictitious pages at all
4073 	 *   to prevent from ZFOD pages to be made.
4074 	 *
4075 	 * We also want capture the fault address easily so that the zone
4076 	 * allocator might present an enhanced panic log.
4077 	 */
4078 	if (map->never_faults || (pgz_owned(vaddr) && map->pmap == kernel_pmap)) {
4079 		assert(map->pmap == kernel_pmap);
4080 		panic_fault_address = vaddr;
4081 		return KERN_INVALID_ADDRESS;
4082 	}
4083 
4084 	if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
4085 		fault_phys_offset = (vm_map_offset_t)-1;
4086 		fault_page_size = VM_MAP_PAGE_SIZE(original_map);
4087 		fault_page_mask = VM_MAP_PAGE_MASK(original_map);
4088 		fault_page_shift = VM_MAP_PAGE_SHIFT(original_map);
4089 		if (fault_page_size < PAGE_SIZE) {
4090 			DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
4091 			vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
4092 		}
4093 	} else {
4094 		fault_phys_offset = 0;
4095 		fault_page_size = PAGE_SIZE;
4096 		fault_page_mask = PAGE_MASK;
4097 		fault_page_shift = PAGE_SHIFT;
4098 		vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
4099 	}
4100 
4101 	if (map == kernel_map) {
4102 		trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
4103 		trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
4104 	} else {
4105 		trace_vaddr = vaddr;
4106 	}
4107 
4108 	KDBG_RELEASE(
4109 		(MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
4110 		((uint64_t)trace_vaddr >> 32),
4111 		trace_vaddr,
4112 		(map == kernel_map));
4113 
4114 	if (get_preemption_level() != 0) {
4115 		KDBG_RELEASE(
4116 			(MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4117 			((uint64_t)trace_vaddr >> 32),
4118 			trace_vaddr,
4119 			KERN_FAILURE);
4120 
4121 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NONZERO_PREEMPTION_LEVEL), 0 /* arg */);
4122 		return KERN_FAILURE;
4123 	}
4124 
4125 	thread_t cthread = current_thread();
4126 	bool      rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
4127 	uint64_t fstart = 0;
4128 
4129 	if (rtfault) {
4130 		fstart = mach_continuous_time();
4131 	}
4132 
4133 	interruptible_state = thread_interrupt_level(interruptible);
4134 
4135 	fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
4136 
4137 	counter_inc(&vm_statistics_faults);
4138 	counter_inc(&current_task()->faults);
4139 	original_fault_type = fault_type;
4140 
4141 	need_copy = FALSE;
4142 	if (fault_type & VM_PROT_WRITE) {
4143 		need_copy = TRUE;
4144 	}
4145 
4146 	if (need_copy || change_wiring) {
4147 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4148 	} else {
4149 		object_lock_type = OBJECT_LOCK_SHARED;
4150 	}
4151 
4152 	cur_object_lock_type = OBJECT_LOCK_SHARED;
4153 
4154 	if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
4155 		if (compressor_map) {
4156 			if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
4157 				panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
4158 			}
4159 		}
4160 	}
4161 RetryFault:
4162 	assert(written_on_object == VM_OBJECT_NULL);
4163 
4164 	/*
4165 	 * assume we will hit a page in the cache
4166 	 * otherwise, explicitly override with
4167 	 * the real fault type once we determine it
4168 	 */
4169 	type_of_fault = DBG_CACHE_HIT_FAULT;
4170 
4171 	/*
4172 	 *	Find the backing store object and offset into
4173 	 *	it to begin the search.
4174 	 */
4175 	fault_type = original_fault_type;
4176 	map = original_map;
4177 	vm_map_lock_read(map);
4178 
4179 	if (resilient_media_retry) {
4180 		/*
4181 		 * If we have to insert a fake zero-filled page to hide
4182 		 * a media failure to provide the real page, we need to
4183 		 * resolve any pending copy-on-write on this mapping.
4184 		 * VM_PROT_COPY tells vm_map_lookup_and_lock_object() to deal
4185 		 * with that even if this is not a "write" fault.
4186 		 */
4187 		need_copy = TRUE;
4188 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4189 		vm_fault_resilient_media_retry++;
4190 	}
4191 
4192 	kr = vm_map_lookup_and_lock_object(&map, vaddr,
4193 	    (fault_type | (need_copy ? VM_PROT_COPY : 0)),
4194 	    object_lock_type, &version,
4195 	    &object, &offset, &prot, &wired,
4196 	    &fault_info,
4197 	    &real_map,
4198 	    &object_is_contended);
4199 
4200 	if (kr != KERN_SUCCESS) {
4201 		vm_map_unlock_read(map);
4202 		/*
4203 		 * This can be seen in a crash report if indeed the
4204 		 * thread is crashing due to an invalid access in a non-existent
4205 		 * range.
4206 		 * Turning this OFF for now because it is noisy and not always fatal
4207 		 * eg prefaulting.
4208 		 *
4209 		 * if (kr == KERN_INVALID_ADDRESS) {
4210 		 *	ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0);
4211 		 * }
4212 		 */
4213 		goto done;
4214 	}
4215 
4216 
4217 	pmap = real_map->pmap;
4218 	fault_info.interruptible = interruptible;
4219 	fault_info.stealth = FALSE;
4220 	fault_info.io_sync = FALSE;
4221 	fault_info.mark_zf_absent = FALSE;
4222 	fault_info.batch_pmap_op = FALSE;
4223 
4224 	if (resilient_media_retry) {
4225 		/*
4226 		 * We're retrying this fault after having detected a media
4227 		 * failure from a "resilient_media" mapping.
4228 		 * Check that the mapping is still pointing at the object
4229 		 * that just failed to provide a page.
4230 		 */
4231 		assert(resilient_media_object != VM_OBJECT_NULL);
4232 		assert(resilient_media_offset != (vm_object_offset_t)-1);
4233 		if ((object != VM_OBJECT_NULL &&
4234 		    object == resilient_media_object &&
4235 		    offset == resilient_media_offset &&
4236 		    fault_info.resilient_media)
4237 #if MACH_ASSERT
4238 		    && (vm_fault_resilient_media_inject_error1_rate == 0 ||
4239 		    (++vm_fault_resilient_media_inject_error1 % vm_fault_resilient_media_inject_error1_rate) != 0)
4240 #endif /* MACH_ASSERT */
4241 		    ) {
4242 			/*
4243 			 * This mapping still points at the same object
4244 			 * and is still "resilient_media": proceed in
4245 			 * "recovery-from-media-failure" mode, where we'll
4246 			 * insert a zero-filled page in the top object.
4247 			 */
4248 //                     printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
4249 			vm_fault_resilient_media_proceed++;
4250 		} else {
4251 			/* not recovering: reset state and retry fault */
4252 //                     printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
4253 			vm_object_unlock(object);
4254 			if (real_map != map) {
4255 				vm_map_unlock(real_map);
4256 			}
4257 			vm_map_unlock_read(map);
4258 			/* release our extra reference on failed object */
4259 //                     printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
4260 			vm_object_lock_assert_notheld(resilient_media_object);
4261 			vm_object_deallocate(resilient_media_object);
4262 			resilient_media_object = VM_OBJECT_NULL;
4263 			resilient_media_offset = (vm_object_offset_t)-1;
4264 			resilient_media_retry = false;
4265 			vm_fault_resilient_media_abort1++;
4266 			goto RetryFault;
4267 		}
4268 	} else {
4269 		assert(resilient_media_object == VM_OBJECT_NULL);
4270 		resilient_media_offset = (vm_object_offset_t)-1;
4271 	}
4272 
4273 	/*
4274 	 * If the page is wired, we must fault for the current protection
4275 	 * value, to avoid further faults.
4276 	 */
4277 	if (wired) {
4278 		fault_type = prot | VM_PROT_WRITE;
4279 	}
4280 	if (wired || need_copy) {
4281 		/*
4282 		 * since we're treating this fault as a 'write'
4283 		 * we must hold the top object lock exclusively
4284 		 */
4285 		if (object_lock_type == OBJECT_LOCK_SHARED) {
4286 			object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4287 
4288 			if (vm_object_lock_upgrade(object) == FALSE) {
4289 				/*
4290 				 * couldn't upgrade, so explictly
4291 				 * take the lock exclusively
4292 				 */
4293 				vm_object_lock(object);
4294 			}
4295 		}
4296 	}
4297 
4298 #if     VM_FAULT_CLASSIFY
4299 	/*
4300 	 *	Temporary data gathering code
4301 	 */
4302 	vm_fault_classify(object, offset, fault_type);
4303 #endif
4304 	/*
4305 	 *	Fast fault code.  The basic idea is to do as much as
4306 	 *	possible while holding the map lock and object locks.
4307 	 *      Busy pages are not used until the object lock has to
4308 	 *	be dropped to do something (copy, zero fill, pmap enter).
4309 	 *	Similarly, paging references aren't acquired until that
4310 	 *	point, and object references aren't used.
4311 	 *
4312 	 *	If we can figure out what to do
4313 	 *	(zero fill, copy on write, pmap enter) while holding
4314 	 *	the locks, then it gets done.  Otherwise, we give up,
4315 	 *	and use the original fault path (which doesn't hold
4316 	 *	the map lock, and relies on busy pages).
4317 	 *	The give up cases include:
4318 	 *              - Have to talk to pager.
4319 	 *		- Page is busy, absent or in error.
4320 	 *		- Pager has locked out desired access.
4321 	 *		- Fault needs to be restarted.
4322 	 *		- Have to push page into copy object.
4323 	 *
4324 	 *	The code is an infinite loop that moves one level down
4325 	 *	the shadow chain each time.  cur_object and cur_offset
4326 	 *      refer to the current object being examined. object and offset
4327 	 *	are the original object from the map.  The loop is at the
4328 	 *	top level if and only if object and cur_object are the same.
4329 	 *
4330 	 *	Invariants:  Map lock is held throughout.  Lock is held on
4331 	 *		original object and cur_object (if different) when
4332 	 *		continuing or exiting loop.
4333 	 *
4334 	 */
4335 
4336 #if defined(__arm64__)
4337 	/*
4338 	 * Fail if reading an execute-only page in a
4339 	 * pmap that enforces execute-only protection.
4340 	 */
4341 	if (fault_type == VM_PROT_READ &&
4342 	    (prot & VM_PROT_EXECUTE) &&
4343 	    !(prot & VM_PROT_READ) &&
4344 	    pmap_enforces_execute_only(pmap)) {
4345 		vm_object_unlock(object);
4346 		vm_map_unlock_read(map);
4347 		if (real_map != map) {
4348 			vm_map_unlock(real_map);
4349 		}
4350 		kr = KERN_PROTECTION_FAILURE;
4351 		goto done;
4352 	}
4353 #endif
4354 
4355 	fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
4356 
4357 	/*
4358 	 * If this page is to be inserted in a copy delay object
4359 	 * for writing, and if the object has a copy, then the
4360 	 * copy delay strategy is implemented in the slow fault page.
4361 	 */
4362 	if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
4363 	    object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
4364 		goto handle_copy_delay;
4365 	}
4366 
4367 	cur_object = object;
4368 	cur_offset = offset;
4369 
4370 	grab_options = 0;
4371 #if CONFIG_SECLUDED_MEMORY
4372 	if (object->can_grab_secluded) {
4373 		grab_options |= VM_PAGE_GRAB_SECLUDED;
4374 	}
4375 #endif /* CONFIG_SECLUDED_MEMORY */
4376 
4377 	while (TRUE) {
4378 		if (!cur_object->pager_created &&
4379 		    cur_object->phys_contiguous) { /* superpage */
4380 			break;
4381 		}
4382 
4383 		if (cur_object->blocked_access) {
4384 			/*
4385 			 * Access to this VM object has been blocked.
4386 			 * Let the slow path handle it.
4387 			 */
4388 			break;
4389 		}
4390 
4391 		m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset));
4392 		m_object = NULL;
4393 
4394 		if (m != VM_PAGE_NULL) {
4395 			m_object = cur_object;
4396 
4397 			if (m->vmp_busy) {
4398 				wait_result_t   result;
4399 
4400 				/*
4401 				 * in order to do the PAGE_ASSERT_WAIT, we must
4402 				 * have object that 'm' belongs to locked exclusively
4403 				 */
4404 				if (object != cur_object) {
4405 					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4406 						cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4407 
4408 						if (vm_object_lock_upgrade(cur_object) == FALSE) {
4409 							/*
4410 							 * couldn't upgrade so go do a full retry
4411 							 * immediately since we can no longer be
4412 							 * certain about cur_object (since we
4413 							 * don't hold a reference on it)...
4414 							 * first drop the top object lock
4415 							 */
4416 							vm_object_unlock(object);
4417 
4418 							vm_map_unlock_read(map);
4419 							if (real_map != map) {
4420 								vm_map_unlock(real_map);
4421 							}
4422 
4423 							goto RetryFault;
4424 						}
4425 					}
4426 				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
4427 					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4428 
4429 					if (vm_object_lock_upgrade(object) == FALSE) {
4430 						/*
4431 						 * couldn't upgrade, so explicitly take the lock
4432 						 * exclusively and go relookup the page since we
4433 						 * will have dropped the object lock and
4434 						 * a different thread could have inserted
4435 						 * a page at this offset
4436 						 * no need for a full retry since we're
4437 						 * at the top level of the object chain
4438 						 */
4439 						vm_object_lock(object);
4440 
4441 						continue;
4442 					}
4443 				}
4444 				if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
4445 					/*
4446 					 * m->vmp_busy == TRUE and the object is locked exclusively
4447 					 * if m->pageout_queue == TRUE after we acquire the
4448 					 * queues lock, we are guaranteed that it is stable on
4449 					 * the pageout queue and therefore reclaimable
4450 					 *
4451 					 * NOTE: this is only true for the internal pageout queue
4452 					 * in the compressor world
4453 					 */
4454 					assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
4455 
4456 					vm_page_lock_queues();
4457 
4458 					if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
4459 						vm_pageout_throttle_up(m);
4460 						vm_page_unlock_queues();
4461 
4462 						PAGE_WAKEUP_DONE(m);
4463 						goto reclaimed_from_pageout;
4464 					}
4465 					vm_page_unlock_queues();
4466 				}
4467 				if (object != cur_object) {
4468 					vm_object_unlock(object);
4469 				}
4470 
4471 				vm_map_unlock_read(map);
4472 				if (real_map != map) {
4473 					vm_map_unlock(real_map);
4474 				}
4475 
4476 				result = PAGE_ASSERT_WAIT(m, interruptible);
4477 
4478 				vm_object_unlock(cur_object);
4479 
4480 				if (result == THREAD_WAITING) {
4481 					result = thread_block(THREAD_CONTINUE_NULL);
4482 				}
4483 				if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
4484 					goto RetryFault;
4485 				}
4486 
4487 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), 0 /* arg */);
4488 				kr = KERN_ABORTED;
4489 				goto done;
4490 			}
4491 reclaimed_from_pageout:
4492 			if (m->vmp_laundry) {
4493 				if (object != cur_object) {
4494 					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4495 						cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4496 
4497 						vm_object_unlock(object);
4498 						vm_object_unlock(cur_object);
4499 
4500 						vm_map_unlock_read(map);
4501 						if (real_map != map) {
4502 							vm_map_unlock(real_map);
4503 						}
4504 
4505 						goto RetryFault;
4506 					}
4507 				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
4508 					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4509 
4510 					if (vm_object_lock_upgrade(object) == FALSE) {
4511 						/*
4512 						 * couldn't upgrade, so explicitly take the lock
4513 						 * exclusively and go relookup the page since we
4514 						 * will have dropped the object lock and
4515 						 * a different thread could have inserted
4516 						 * a page at this offset
4517 						 * no need for a full retry since we're
4518 						 * at the top level of the object chain
4519 						 */
4520 						vm_object_lock(object);
4521 
4522 						continue;
4523 					}
4524 				}
4525 				vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
4526 				vm_pageout_steal_laundry(m, FALSE);
4527 			}
4528 
4529 
4530 			if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
4531 				/*
4532 				 * Guard page: let the slow path deal with it
4533 				 */
4534 				break;
4535 			}
4536 			if (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
4537 				/*
4538 				 * Unusual case... let the slow path deal with it
4539 				 */
4540 				break;
4541 			}
4542 			if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
4543 				if (object != cur_object) {
4544 					vm_object_unlock(object);
4545 				}
4546 				vm_map_unlock_read(map);
4547 				if (real_map != map) {
4548 					vm_map_unlock(real_map);
4549 				}
4550 				vm_object_unlock(cur_object);
4551 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
4552 				kr = KERN_MEMORY_ERROR;
4553 				goto done;
4554 			}
4555 			assert(m_object == VM_PAGE_OBJECT(m));
4556 
4557 			if (vm_fault_cs_need_validation(map->pmap, m, m_object,
4558 			    PAGE_SIZE, 0) ||
4559 			    (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
4560 upgrade_lock_and_retry:
4561 				/*
4562 				 * We might need to validate this page
4563 				 * against its code signature, so we
4564 				 * want to hold the VM object exclusively.
4565 				 */
4566 				if (object != cur_object) {
4567 					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4568 						vm_object_unlock(object);
4569 						vm_object_unlock(cur_object);
4570 
4571 						cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4572 
4573 						vm_map_unlock_read(map);
4574 						if (real_map != map) {
4575 							vm_map_unlock(real_map);
4576 						}
4577 
4578 						goto RetryFault;
4579 					}
4580 				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
4581 					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4582 
4583 					if (vm_object_lock_upgrade(object) == FALSE) {
4584 						/*
4585 						 * couldn't upgrade, so explicitly take the lock
4586 						 * exclusively and go relookup the page since we
4587 						 * will have dropped the object lock and
4588 						 * a different thread could have inserted
4589 						 * a page at this offset
4590 						 * no need for a full retry since we're
4591 						 * at the top level of the object chain
4592 						 */
4593 						vm_object_lock(object);
4594 
4595 						continue;
4596 					}
4597 				}
4598 			}
4599 			/*
4600 			 *	Two cases of map in faults:
4601 			 *	    - At top level w/o copy object.
4602 			 *	    - Read fault anywhere.
4603 			 *		--> must disallow write.
4604 			 */
4605 
4606 			if (object == cur_object && object->copy == VM_OBJECT_NULL) {
4607 				goto FastPmapEnter;
4608 			}
4609 
4610 			if (!need_copy &&
4611 			    !fault_info.no_copy_on_read &&
4612 			    cur_object != object &&
4613 			    !cur_object->internal &&
4614 			    !cur_object->pager_trusted &&
4615 			    vm_protect_privileged_from_untrusted &&
4616 			    !cur_object->code_signed &&
4617 			    current_proc_is_privileged()) {
4618 				/*
4619 				 * We're faulting on a page in "object" and
4620 				 * went down the shadow chain to "cur_object"
4621 				 * to find out that "cur_object"'s pager
4622 				 * is not "trusted", i.e. we can not trust it
4623 				 * to always return the same contents.
4624 				 * Since the target is a "privileged" process,
4625 				 * let's treat this as a copy-on-read fault, as
4626 				 * if it was a copy-on-write fault.
4627 				 * Once "object" gets a copy of this page, it
4628 				 * won't have to rely on "cur_object" to
4629 				 * provide the contents again.
4630 				 *
4631 				 * This is done by setting "need_copy" and
4632 				 * retrying the fault from the top with the
4633 				 * appropriate locking.
4634 				 *
4635 				 * Special case: if the mapping is executable
4636 				 * and the untrusted object is code-signed and
4637 				 * the process is "cs_enforced", we do not
4638 				 * copy-on-read because that would break
4639 				 * code-signing enforcement expectations (an
4640 				 * executable page must belong to a code-signed
4641 				 * object) and we can rely on code-signing
4642 				 * to re-validate the page if it gets evicted
4643 				 * and paged back in.
4644 				 */
4645 //				printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4646 				vm_copied_on_read++;
4647 				need_copy = TRUE;
4648 
4649 				vm_object_unlock(object);
4650 				vm_object_unlock(cur_object);
4651 				object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4652 				vm_map_unlock_read(map);
4653 				if (real_map != map) {
4654 					vm_map_unlock(real_map);
4655 				}
4656 				goto RetryFault;
4657 			}
4658 
4659 			if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
4660 				if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
4661 					prot &= ~VM_PROT_WRITE;
4662 				} else {
4663 					/*
4664 					 * For a protection that the pmap cares
4665 					 * about, we must hand over the full
4666 					 * set of protections (so that the pmap
4667 					 * layer can apply any desired policy).
4668 					 * This means that cs_bypass must be
4669 					 * set, as this can force us to pass
4670 					 * RWX.
4671 					 */
4672 					assert(fault_info.cs_bypass);
4673 				}
4674 
4675 				if (object != cur_object) {
4676 					/*
4677 					 * We still need to hold the top object
4678 					 * lock here to prevent a race between
4679 					 * a read fault (taking only "shared"
4680 					 * locks) and a write fault (taking
4681 					 * an "exclusive" lock on the top
4682 					 * object.
4683 					 * Otherwise, as soon as we release the
4684 					 * top lock, the write fault could
4685 					 * proceed and actually complete before
4686 					 * the read fault, and the copied page's
4687 					 * translation could then be overwritten
4688 					 * by the read fault's translation for
4689 					 * the original page.
4690 					 *
4691 					 * Let's just record what the top object
4692 					 * is and we'll release it later.
4693 					 */
4694 					top_object = object;
4695 
4696 					/*
4697 					 * switch to the object that has the new page
4698 					 */
4699 					object = cur_object;
4700 					object_lock_type = cur_object_lock_type;
4701 				}
4702 FastPmapEnter:
4703 				assert(m_object == VM_PAGE_OBJECT(m));
4704 
4705 				/*
4706 				 * prepare for the pmap_enter...
4707 				 * object and map are both locked
4708 				 * m contains valid data
4709 				 * object == m->vmp_object
4710 				 * cur_object == NULL or it's been unlocked
4711 				 * no paging references on either object or cur_object
4712 				 */
4713 				if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
4714 					need_retry_ptr = &need_retry;
4715 				} else {
4716 					need_retry_ptr = NULL;
4717 				}
4718 
4719 				if (fault_page_size < PAGE_SIZE) {
4720 					DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
4721 					assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
4722 					    fault_phys_offset < PAGE_SIZE),
4723 					    "0x%llx\n", (uint64_t)fault_phys_offset);
4724 				} else {
4725 					assertf(fault_phys_offset == 0,
4726 					    "0x%llx\n", (uint64_t)fault_phys_offset);
4727 				}
4728 
4729 				if (__improbable(rtfault &&
4730 				    !m->vmp_realtime &&
4731 				    vm_pageout_protect_realtime)) {
4732 					vm_page_lock_queues();
4733 					if (!m->vmp_realtime) {
4734 						m->vmp_realtime = true;
4735 						vm_page_realtime_count++;
4736 					}
4737 					vm_page_unlock_queues();
4738 				}
4739 				assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p object=%p", m, m_object, object);
4740 				assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
4741 				if (caller_pmap) {
4742 					kr = vm_fault_enter(m,
4743 					    caller_pmap,
4744 					    caller_pmap_addr,
4745 					    fault_page_size,
4746 					    fault_phys_offset,
4747 					    prot,
4748 					    caller_prot,
4749 					    wired,
4750 					    change_wiring,
4751 					    wire_tag,
4752 					    &fault_info,
4753 					    need_retry_ptr,
4754 					    &type_of_fault);
4755 				} else {
4756 					kr = vm_fault_enter(m,
4757 					    pmap,
4758 					    vaddr,
4759 					    fault_page_size,
4760 					    fault_phys_offset,
4761 					    prot,
4762 					    caller_prot,
4763 					    wired,
4764 					    change_wiring,
4765 					    wire_tag,
4766 					    &fault_info,
4767 					    need_retry_ptr,
4768 					    &type_of_fault);
4769 				}
4770 
4771 				vm_fault_complete(
4772 					map,
4773 					real_map,
4774 					object,
4775 					m_object,
4776 					m,
4777 					offset,
4778 					trace_real_vaddr,
4779 					&fault_info,
4780 					caller_prot,
4781 					real_vaddr,
4782 					vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
4783 					need_retry,
4784 					kr,
4785 					physpage_p,
4786 					prot,
4787 					top_object,
4788 					need_collapse,
4789 					cur_offset,
4790 					fault_type,
4791 					&written_on_object,
4792 					&written_on_pager,
4793 					&written_on_offset);
4794 				top_object = VM_OBJECT_NULL;
4795 				if (need_retry == TRUE) {
4796 					/*
4797 					 * vm_fault_enter couldn't complete the PMAP_ENTER...
4798 					 * at this point we don't hold any locks so it's safe
4799 					 * to ask the pmap layer to expand the page table to
4800 					 * accommodate this mapping... once expanded, we'll
4801 					 * re-drive the fault which should result in vm_fault_enter
4802 					 * being able to successfully enter the mapping this time around
4803 					 */
4804 					(void)pmap_enter_options(
4805 						pmap, vaddr, 0, 0, 0, 0, 0,
4806 						PMAP_OPTIONS_NOENTER, NULL);
4807 
4808 					need_retry = FALSE;
4809 					goto RetryFault;
4810 				}
4811 				goto done;
4812 			}
4813 			/*
4814 			 * COPY ON WRITE FAULT
4815 			 */
4816 			assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
4817 
4818 			/*
4819 			 * If objects match, then
4820 			 * object->copy must not be NULL (else control
4821 			 * would be in previous code block), and we
4822 			 * have a potential push into the copy object
4823 			 * with which we can't cope here.
4824 			 */
4825 			if (cur_object == object) {
4826 				/*
4827 				 * must take the slow path to
4828 				 * deal with the copy push
4829 				 */
4830 				break;
4831 			}
4832 
4833 			/*
4834 			 * This is now a shadow based copy on write
4835 			 * fault -- it requires a copy up the shadow
4836 			 * chain.
4837 			 */
4838 			assert(m_object == VM_PAGE_OBJECT(m));
4839 
4840 			if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
4841 			    vm_fault_cs_need_validation(NULL, m, m_object,
4842 			    PAGE_SIZE, 0)) {
4843 				goto upgrade_lock_and_retry;
4844 			}
4845 
4846 #if MACH_ASSERT
4847 			if (resilient_media_retry &&
4848 			    vm_fault_resilient_media_inject_error2_rate != 0 &&
4849 			    (++vm_fault_resilient_media_inject_error2 % vm_fault_resilient_media_inject_error2_rate) == 0) {
4850 				/* inject an error */
4851 				cur_m = m;
4852 				m = VM_PAGE_NULL;
4853 				m_object = VM_OBJECT_NULL;
4854 				break;
4855 			}
4856 #endif /* MACH_ASSERT */
4857 			/*
4858 			 * Allocate a page in the original top level
4859 			 * object. Give up if allocate fails.  Also
4860 			 * need to remember current page, as it's the
4861 			 * source of the copy.
4862 			 *
4863 			 * at this point we hold locks on both
4864 			 * object and cur_object... no need to take
4865 			 * paging refs or mark pages BUSY since
4866 			 * we don't drop either object lock until
4867 			 * the page has been copied and inserted
4868 			 */
4869 			cur_m = m;
4870 			m = vm_page_grab_options(grab_options);
4871 			m_object = NULL;
4872 
4873 			if (m == VM_PAGE_NULL) {
4874 				/*
4875 				 * no free page currently available...
4876 				 * must take the slow path
4877 				 */
4878 				break;
4879 			}
4880 
4881 			/*
4882 			 * Now do the copy.  Mark the source page busy...
4883 			 *
4884 			 *	NOTE: This code holds the map lock across
4885 			 *	the page copy.
4886 			 */
4887 			vm_page_copy(cur_m, m);
4888 			vm_page_insert(m, object, vm_object_trunc_page(offset));
4889 			if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
4890 				DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4891 			}
4892 			m_object = object;
4893 			SET_PAGE_DIRTY(m, FALSE);
4894 
4895 			/*
4896 			 * Now cope with the source page and object
4897 			 */
4898 			if (object->ref_count > 1 && cur_m->vmp_pmapped) {
4899 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4900 			} else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
4901 				/*
4902 				 * We've copied the full 16K page but we're
4903 				 * about to call vm_fault_enter() only for
4904 				 * the 4K chunk we're faulting on.  The other
4905 				 * three 4K chunks in that page could still
4906 				 * be pmapped in this pmap.
4907 				 * Since the VM object layer thinks that the
4908 				 * entire page has been dealt with and the
4909 				 * original page might no longer be needed,
4910 				 * it might collapse/bypass the original VM
4911 				 * object and free its pages, which would be
4912 				 * bad (and would trigger pmap_verify_free()
4913 				 * assertions) if the other 4K chunks are still
4914 				 * pmapped.
4915 				 */
4916 				/*
4917 				 * XXX FBDP TODO4K: to be revisited
4918 				 * Technically, we need to pmap_disconnect()
4919 				 * only the target pmap's mappings for the 4K
4920 				 * chunks of this 16K VM page.  If other pmaps
4921 				 * have PTEs on these chunks, that means that
4922 				 * the associated VM map must have a reference
4923 				 * on the VM object, so no need to worry about
4924 				 * those.
4925 				 * pmap_protect() for each 4K chunk would be
4926 				 * better but we'd have to check which chunks
4927 				 * are actually mapped before and after this
4928 				 * one.
4929 				 * A full-blown pmap_disconnect() is easier
4930 				 * for now but not efficient.
4931 				 */
4932 				DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
4933 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4934 			}
4935 
4936 			if (cur_m->vmp_clustered) {
4937 				VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4938 				VM_PAGE_CONSUME_CLUSTERED(cur_m);
4939 				vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
4940 			}
4941 			need_collapse = TRUE;
4942 
4943 			if (!cur_object->internal &&
4944 			    cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4945 				/*
4946 				 * The object from which we've just
4947 				 * copied a page is most probably backed
4948 				 * by a vnode.  We don't want to waste too
4949 				 * much time trying to collapse the VM objects
4950 				 * and create a bottleneck when several tasks
4951 				 * map the same file.
4952 				 */
4953 				if (cur_object->copy == object) {
4954 					/*
4955 					 * Shared mapping or no COW yet.
4956 					 * We can never collapse a copy
4957 					 * object into its backing object.
4958 					 */
4959 					need_collapse = FALSE;
4960 				} else if (cur_object->copy == object->shadow &&
4961 				    object->shadow->resident_page_count == 0) {
4962 					/*
4963 					 * Shared mapping after a COW occurred.
4964 					 */
4965 					need_collapse = FALSE;
4966 				}
4967 			}
4968 			vm_object_unlock(cur_object);
4969 
4970 			if (need_collapse == FALSE) {
4971 				vm_fault_collapse_skipped++;
4972 			}
4973 			vm_fault_collapse_total++;
4974 
4975 			type_of_fault = DBG_COW_FAULT;
4976 			counter_inc(&vm_statistics_cow_faults);
4977 			DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4978 			counter_inc(&current_task()->cow_faults);
4979 
4980 			goto FastPmapEnter;
4981 		} else {
4982 			/*
4983 			 * No page at cur_object, cur_offset... m == NULL
4984 			 */
4985 			if (cur_object->pager_created) {
4986 				vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4987 
4988 				if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4989 					int             my_fault_type;
4990 					uint8_t         c_flags = C_DONT_BLOCK;
4991 					bool            insert_cur_object = FALSE;
4992 
4993 					/*
4994 					 * May have to talk to a pager...
4995 					 * if so, take the slow path by
4996 					 * doing a 'break' from the while (TRUE) loop
4997 					 *
4998 					 * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4999 					 * if the compressor is active and the page exists there
5000 					 */
5001 					if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
5002 						break;
5003 					}
5004 
5005 					if (map == kernel_map || real_map == kernel_map) {
5006 						/*
5007 						 * can't call into the compressor with the kernel_map
5008 						 * lock held, since the compressor may try to operate
5009 						 * on the kernel map in order to return an empty c_segment
5010 						 */
5011 						break;
5012 					}
5013 					if (object != cur_object) {
5014 						if (fault_type & VM_PROT_WRITE) {
5015 							c_flags |= C_KEEP;
5016 						} else {
5017 							insert_cur_object = TRUE;
5018 						}
5019 					}
5020 					if (insert_cur_object == TRUE) {
5021 						if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5022 							cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5023 
5024 							if (vm_object_lock_upgrade(cur_object) == FALSE) {
5025 								/*
5026 								 * couldn't upgrade so go do a full retry
5027 								 * immediately since we can no longer be
5028 								 * certain about cur_object (since we
5029 								 * don't hold a reference on it)...
5030 								 * first drop the top object lock
5031 								 */
5032 								vm_object_unlock(object);
5033 
5034 								vm_map_unlock_read(map);
5035 								if (real_map != map) {
5036 									vm_map_unlock(real_map);
5037 								}
5038 
5039 								goto RetryFault;
5040 							}
5041 						}
5042 					} else if (object_lock_type == OBJECT_LOCK_SHARED) {
5043 						object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5044 
5045 						if (object != cur_object) {
5046 							/*
5047 							 * we can't go for the upgrade on the top
5048 							 * lock since the upgrade may block waiting
5049 							 * for readers to drain... since we hold
5050 							 * cur_object locked at this point, waiting
5051 							 * for the readers to drain would represent
5052 							 * a lock order inversion since the lock order
5053 							 * for objects is the reference order in the
5054 							 * shadow chain
5055 							 */
5056 							vm_object_unlock(object);
5057 							vm_object_unlock(cur_object);
5058 
5059 							vm_map_unlock_read(map);
5060 							if (real_map != map) {
5061 								vm_map_unlock(real_map);
5062 							}
5063 
5064 							goto RetryFault;
5065 						}
5066 						if (vm_object_lock_upgrade(object) == FALSE) {
5067 							/*
5068 							 * couldn't upgrade, so explicitly take the lock
5069 							 * exclusively and go relookup the page since we
5070 							 * will have dropped the object lock and
5071 							 * a different thread could have inserted
5072 							 * a page at this offset
5073 							 * no need for a full retry since we're
5074 							 * at the top level of the object chain
5075 							 */
5076 							vm_object_lock(object);
5077 
5078 							continue;
5079 						}
5080 					}
5081 					m = vm_page_grab_options(grab_options);
5082 					m_object = NULL;
5083 
5084 					if (m == VM_PAGE_NULL) {
5085 						/*
5086 						 * no free page currently available...
5087 						 * must take the slow path
5088 						 */
5089 						break;
5090 					}
5091 
5092 					/*
5093 					 * The object is and remains locked
5094 					 * so no need to take a
5095 					 * "paging_in_progress" reference.
5096 					 */
5097 					bool      shared_lock;
5098 					if ((object == cur_object &&
5099 					    object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
5100 					    (object != cur_object &&
5101 					    cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
5102 						shared_lock = FALSE;
5103 					} else {
5104 						shared_lock = TRUE;
5105 					}
5106 
5107 					kr = vm_compressor_pager_get(
5108 						cur_object->pager,
5109 						(vm_object_trunc_page(cur_offset)
5110 						+ cur_object->paging_offset),
5111 						VM_PAGE_GET_PHYS_PAGE(m),
5112 						&my_fault_type,
5113 						c_flags,
5114 						&compressed_count_delta);
5115 
5116 					vm_compressor_pager_count(
5117 						cur_object->pager,
5118 						compressed_count_delta,
5119 						shared_lock,
5120 						cur_object);
5121 
5122 					if (kr != KERN_SUCCESS) {
5123 						vm_page_release(m, FALSE);
5124 						m = VM_PAGE_NULL;
5125 					}
5126 					/*
5127 					 * If vm_compressor_pager_get() returns
5128 					 * KERN_MEMORY_FAILURE, then the
5129 					 * compressed data is permanently lost,
5130 					 * so return this error immediately.
5131 					 */
5132 					if (kr == KERN_MEMORY_FAILURE) {
5133 						if (object != cur_object) {
5134 							vm_object_unlock(cur_object);
5135 						}
5136 						vm_object_unlock(object);
5137 						vm_map_unlock_read(map);
5138 						if (real_map != map) {
5139 							vm_map_unlock(real_map);
5140 						}
5141 
5142 						goto done;
5143 					} else if (kr != KERN_SUCCESS) {
5144 						break;
5145 					}
5146 					m->vmp_dirty = TRUE;
5147 
5148 					/*
5149 					 * If the object is purgeable, its
5150 					 * owner's purgeable ledgers will be
5151 					 * updated in vm_page_insert() but the
5152 					 * page was also accounted for in a
5153 					 * "compressed purgeable" ledger, so
5154 					 * update that now.
5155 					 */
5156 					if (object != cur_object &&
5157 					    !insert_cur_object) {
5158 						/*
5159 						 * We're not going to insert
5160 						 * the decompressed page into
5161 						 * the object it came from.
5162 						 *
5163 						 * We're dealing with a
5164 						 * copy-on-write fault on
5165 						 * "object".
5166 						 * We're going to decompress
5167 						 * the page directly into the
5168 						 * target "object" while
5169 						 * keeping the compressed
5170 						 * page for "cur_object", so
5171 						 * no ledger update in that
5172 						 * case.
5173 						 */
5174 					} else if (((cur_object->purgable ==
5175 					    VM_PURGABLE_DENY) &&
5176 					    (!cur_object->vo_ledger_tag)) ||
5177 					    (cur_object->vo_owner ==
5178 					    NULL)) {
5179 						/*
5180 						 * "cur_object" is not purgeable
5181 						 * and is not ledger-tagged, or
5182 						 * there's no owner for it,
5183 						 * so no owner's ledgers to
5184 						 * update.
5185 						 */
5186 					} else {
5187 						/*
5188 						 * One less compressed
5189 						 * purgeable/tagged page for
5190 						 * cur_object's owner.
5191 						 */
5192 						vm_object_owner_compressed_update(
5193 							cur_object,
5194 							-1);
5195 					}
5196 
5197 					if (insert_cur_object) {
5198 						vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset));
5199 						m_object = cur_object;
5200 					} else {
5201 						vm_page_insert(m, object, vm_object_trunc_page(offset));
5202 						m_object = object;
5203 					}
5204 
5205 					if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
5206 						/*
5207 						 * If the page is not cacheable,
5208 						 * we can't let its contents
5209 						 * linger in the data cache
5210 						 * after the decompression.
5211 						 */
5212 						pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
5213 					}
5214 
5215 					type_of_fault = my_fault_type;
5216 
5217 					VM_STAT_DECOMPRESSIONS();
5218 
5219 					if (cur_object != object) {
5220 						if (insert_cur_object) {
5221 							top_object = object;
5222 							/*
5223 							 * switch to the object that has the new page
5224 							 */
5225 							object = cur_object;
5226 							object_lock_type = cur_object_lock_type;
5227 						} else {
5228 							vm_object_unlock(cur_object);
5229 							cur_object = object;
5230 						}
5231 					}
5232 					goto FastPmapEnter;
5233 				}
5234 				/*
5235 				 * existence map present and indicates
5236 				 * that the pager doesn't have this page
5237 				 */
5238 			}
5239 			if (cur_object->shadow == VM_OBJECT_NULL ||
5240 			    resilient_media_retry) {
5241 				/*
5242 				 * Zero fill fault.  Page gets
5243 				 * inserted into the original object.
5244 				 */
5245 				if (cur_object->shadow_severed ||
5246 				    VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
5247 				    cur_object == compressor_object ||
5248 				    cur_object == kernel_object) {
5249 					if (object != cur_object) {
5250 						vm_object_unlock(cur_object);
5251 					}
5252 					vm_object_unlock(object);
5253 
5254 					vm_map_unlock_read(map);
5255 					if (real_map != map) {
5256 						vm_map_unlock(real_map);
5257 					}
5258 					if (VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object)) {
5259 						ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
5260 					}
5261 
5262 					if (cur_object->shadow_severed) {
5263 						ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), 0 /* arg */);
5264 					}
5265 
5266 					kr = KERN_MEMORY_ERROR;
5267 					goto done;
5268 				}
5269 				if (cur_object != object) {
5270 					vm_object_unlock(cur_object);
5271 
5272 					cur_object = object;
5273 				}
5274 				if (object_lock_type == OBJECT_LOCK_SHARED) {
5275 					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5276 
5277 					if (vm_object_lock_upgrade(object) == FALSE) {
5278 						/*
5279 						 * couldn't upgrade so do a full retry on the fault
5280 						 * since we dropped the object lock which
5281 						 * could allow another thread to insert
5282 						 * a page at this offset
5283 						 */
5284 						vm_map_unlock_read(map);
5285 						if (real_map != map) {
5286 							vm_map_unlock(real_map);
5287 						}
5288 
5289 						goto RetryFault;
5290 					}
5291 				}
5292 				if (!object->internal) {
5293 					panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
5294 				}
5295 #if MACH_ASSERT
5296 				if (resilient_media_retry &&
5297 				    vm_fault_resilient_media_inject_error3_rate != 0 &&
5298 				    (++vm_fault_resilient_media_inject_error3 % vm_fault_resilient_media_inject_error3_rate) == 0) {
5299 					/* inject an error */
5300 					m_object = NULL;
5301 					break;
5302 				}
5303 #endif /* MACH_ASSERT */
5304 				m = vm_page_alloc(object, vm_object_trunc_page(offset));
5305 				m_object = NULL;
5306 
5307 				if (m == VM_PAGE_NULL) {
5308 					/*
5309 					 * no free page currently available...
5310 					 * must take the slow path
5311 					 */
5312 					break;
5313 				}
5314 				m_object = object;
5315 
5316 				if ((prot & VM_PROT_WRITE) &&
5317 				    !(fault_type & VM_PROT_WRITE) &&
5318 				    object->copy != VM_OBJECT_NULL) {
5319 					/*
5320 					 * This is not a write fault and
5321 					 * we might have a copy-on-write
5322 					 * obligation to honor (copy object or
5323 					 * "needs_copy" map entry), so do not
5324 					 * give write access yet.
5325 					 * We'll need to catch the first write
5326 					 * to resolve the copy-on-write by
5327 					 * pushing this page to a copy object
5328 					 * or making a shadow object.
5329 					 */
5330 					if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
5331 						prot &= ~VM_PROT_WRITE;
5332 					} else {
5333 						assert(fault_info.cs_bypass);
5334 					}
5335 				}
5336 
5337 				/*
5338 				 * Zeroing the page and entering into it into the pmap
5339 				 * represents a significant amount of the zero fill fault handler's work.
5340 				 *
5341 				 * To improve fault scalability, we'll drop the object lock, if it appears contended,
5342 				 * now that we've inserted the page into the vm object.
5343 				 * Before dropping the lock, we need to check protection bits and set the
5344 				 * mapped bits on the page. Then we can mark the page busy, drop the lock,
5345 				 * zero it, and do the pmap enter. We'll need to reacquire the lock
5346 				 * to clear the busy bit and wake up any waiters.
5347 				 */
5348 				vm_fault_cs_clear(m);
5349 				m->vmp_pmapped = TRUE;
5350 				if (map->no_zero_fill) {
5351 					type_of_fault = DBG_NZF_PAGE_FAULT;
5352 				} else {
5353 					type_of_fault = DBG_ZERO_FILL_FAULT;
5354 				}
5355 				{
5356 					pmap_t destination_pmap;
5357 					vm_map_offset_t destination_pmap_vaddr;
5358 					vm_prot_t enter_fault_type;
5359 					if (caller_pmap) {
5360 						destination_pmap = caller_pmap;
5361 						destination_pmap_vaddr = caller_pmap_addr;
5362 					} else {
5363 						destination_pmap = pmap;
5364 						destination_pmap_vaddr = vaddr;
5365 					}
5366 					if (change_wiring) {
5367 						enter_fault_type = VM_PROT_NONE;
5368 					} else {
5369 						enter_fault_type = caller_prot;
5370 					}
5371 					assertf(VM_PAGE_OBJECT(m) == object, "m=%p object=%p", m, object);
5372 					kr = vm_fault_enter_prepare(m,
5373 					    destination_pmap,
5374 					    destination_pmap_vaddr,
5375 					    &prot,
5376 					    caller_prot,
5377 					    fault_page_size,
5378 					    fault_phys_offset,
5379 					    change_wiring,
5380 					    enter_fault_type,
5381 					    &fault_info,
5382 					    &type_of_fault,
5383 					    &page_needs_data_sync);
5384 					if (kr != KERN_SUCCESS) {
5385 						goto zero_fill_cleanup;
5386 					}
5387 
5388 					if (object_is_contended) {
5389 						/*
5390 						 * At this point the page is in the vm object, but not on a paging queue.
5391 						 * Since it's accessible to another thread but its contents are invalid
5392 						 * (it hasn't been zeroed) mark it busy before dropping the object lock.
5393 						 */
5394 						m->vmp_busy = TRUE;
5395 						vm_object_unlock(object);
5396 					}
5397 					if (type_of_fault == DBG_ZERO_FILL_FAULT) {
5398 						/*
5399 						 * Now zero fill page...
5400 						 * the page is probably going to
5401 						 * be written soon, so don't bother
5402 						 * to clear the modified bit
5403 						 *
5404 						 *   NOTE: This code holds the map
5405 						 *   lock across the zero fill.
5406 						 */
5407 						vm_page_zero_fill(m);
5408 						counter_inc(&vm_statistics_zero_fill_count);
5409 						DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
5410 					}
5411 					if (page_needs_data_sync) {
5412 						pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
5413 					}
5414 
5415 					if (top_object != VM_OBJECT_NULL) {
5416 						need_retry_ptr = &need_retry;
5417 					} else {
5418 						need_retry_ptr = NULL;
5419 					}
5420 					if (object_is_contended) {
5421 						kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr,
5422 						    fault_page_size, fault_phys_offset,
5423 						    m, &prot, caller_prot, enter_fault_type, wired,
5424 						    fault_info.pmap_options, need_retry_ptr);
5425 						vm_object_lock(object);
5426 					} else {
5427 						kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr,
5428 						    fault_page_size, fault_phys_offset,
5429 						    m, &prot, caller_prot, enter_fault_type, wired,
5430 						    fault_info.pmap_options, need_retry_ptr);
5431 					}
5432 				}
5433 zero_fill_cleanup:
5434 				if (!VM_DYNAMIC_PAGING_ENABLED() &&
5435 				    (object->purgable == VM_PURGABLE_DENY ||
5436 				    object->purgable == VM_PURGABLE_NONVOLATILE ||
5437 				    object->purgable == VM_PURGABLE_VOLATILE)) {
5438 					vm_page_lockspin_queues();
5439 					if (!VM_DYNAMIC_PAGING_ENABLED()) {
5440 						vm_fault_enqueue_throttled_locked(m);
5441 					}
5442 					vm_page_unlock_queues();
5443 				}
5444 				vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info.no_cache, &type_of_fault, kr);
5445 
5446 				if (__improbable(rtfault &&
5447 				    !m->vmp_realtime &&
5448 				    vm_pageout_protect_realtime)) {
5449 					vm_page_lock_queues();
5450 					if (!m->vmp_realtime) {
5451 						m->vmp_realtime = true;
5452 						vm_page_realtime_count++;
5453 					}
5454 					vm_page_unlock_queues();
5455 				}
5456 				vm_fault_complete(
5457 					map,
5458 					real_map,
5459 					object,
5460 					m_object,
5461 					m,
5462 					offset,
5463 					trace_real_vaddr,
5464 					&fault_info,
5465 					caller_prot,
5466 					real_vaddr,
5467 					type_of_fault,
5468 					need_retry,
5469 					kr,
5470 					physpage_p,
5471 					prot,
5472 					top_object,
5473 					need_collapse,
5474 					cur_offset,
5475 					fault_type,
5476 					&written_on_object,
5477 					&written_on_pager,
5478 					&written_on_offset);
5479 				top_object = VM_OBJECT_NULL;
5480 				if (need_retry == TRUE) {
5481 					/*
5482 					 * vm_fault_enter couldn't complete the PMAP_ENTER...
5483 					 * at this point we don't hold any locks so it's safe
5484 					 * to ask the pmap layer to expand the page table to
5485 					 * accommodate this mapping... once expanded, we'll
5486 					 * re-drive the fault which should result in vm_fault_enter
5487 					 * being able to successfully enter the mapping this time around
5488 					 */
5489 					(void)pmap_enter_options(
5490 						pmap, vaddr, 0, 0, 0, 0, 0,
5491 						PMAP_OPTIONS_NOENTER, NULL);
5492 
5493 					need_retry = FALSE;
5494 					goto RetryFault;
5495 				}
5496 				goto done;
5497 			}
5498 			/*
5499 			 * On to the next level in the shadow chain
5500 			 */
5501 			cur_offset += cur_object->vo_shadow_offset;
5502 			new_object = cur_object->shadow;
5503 			fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset);
5504 
5505 			/*
5506 			 * take the new_object's lock with the indicated state
5507 			 */
5508 			if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5509 				vm_object_lock_shared(new_object);
5510 			} else {
5511 				vm_object_lock(new_object);
5512 			}
5513 
5514 			if (cur_object != object) {
5515 				vm_object_unlock(cur_object);
5516 			}
5517 
5518 			cur_object = new_object;
5519 
5520 			continue;
5521 		}
5522 	}
5523 	/*
5524 	 * Cleanup from fast fault failure.  Drop any object
5525 	 * lock other than original and drop map lock.
5526 	 */
5527 	if (object != cur_object) {
5528 		vm_object_unlock(cur_object);
5529 	}
5530 
5531 	/*
5532 	 * must own the object lock exclusively at this point
5533 	 */
5534 	if (object_lock_type == OBJECT_LOCK_SHARED) {
5535 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5536 
5537 		if (vm_object_lock_upgrade(object) == FALSE) {
5538 			/*
5539 			 * couldn't upgrade, so explictly
5540 			 * take the lock exclusively
5541 			 * no need to retry the fault at this
5542 			 * point since "vm_fault_page" will
5543 			 * completely re-evaluate the state
5544 			 */
5545 			vm_object_lock(object);
5546 		}
5547 	}
5548 
5549 handle_copy_delay:
5550 	vm_map_unlock_read(map);
5551 	if (real_map != map) {
5552 		vm_map_unlock(real_map);
5553 	}
5554 
5555 	if (__improbable(object == compressor_object ||
5556 	    object == kernel_object)) {
5557 		/*
5558 		 * These objects are explicitly managed and populated by the
5559 		 * kernel.  The virtual ranges backed by these objects should
5560 		 * either have wired pages or "holes" that are not supposed to
5561 		 * be accessed at all until they get explicitly populated.
5562 		 * We should never have to resolve a fault on a mapping backed
5563 		 * by one of these VM objects and providing a zero-filled page
5564 		 * would be wrong here, so let's fail the fault and let the
5565 		 * caller crash or recover.
5566 		 */
5567 		vm_object_unlock(object);
5568 		kr = KERN_MEMORY_ERROR;
5569 		goto done;
5570 	}
5571 
5572 	resilient_media_ref_transfer = false;
5573 	if (resilient_media_retry) {
5574 		/*
5575 		 * We could get here if we failed to get a free page
5576 		 * to zero-fill and had to take the slow path again.
5577 		 * Reset our "recovery-from-failed-media" state.
5578 		 */
5579 		assert(resilient_media_object != VM_OBJECT_NULL);
5580 		assert(resilient_media_offset != (vm_object_offset_t)-1);
5581 		/* release our extra reference on failed object */
5582 //             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
5583 		if (object == resilient_media_object) {
5584 			/*
5585 			 * We're holding "object"'s lock, so we can't release
5586 			 * our extra reference at this point.
5587 			 * We need an extra reference on "object" anyway
5588 			 * (see below), so let's just transfer this reference.
5589 			 */
5590 			resilient_media_ref_transfer = true;
5591 		} else {
5592 			vm_object_lock_assert_notheld(resilient_media_object);
5593 			vm_object_deallocate(resilient_media_object);
5594 		}
5595 		resilient_media_object = VM_OBJECT_NULL;
5596 		resilient_media_offset = (vm_object_offset_t)-1;
5597 		resilient_media_retry = false;
5598 		vm_fault_resilient_media_abort2++;
5599 	}
5600 
5601 	/*
5602 	 * Make a reference to this object to
5603 	 * prevent its disposal while we are messing with
5604 	 * it.  Once we have the reference, the map is free
5605 	 * to be diddled.  Since objects reference their
5606 	 * shadows (and copies), they will stay around as well.
5607 	 */
5608 	if (resilient_media_ref_transfer) {
5609 		/* we already have an extra reference on this object */
5610 		resilient_media_ref_transfer = false;
5611 	} else {
5612 		vm_object_reference_locked(object);
5613 	}
5614 	vm_object_paging_begin(object);
5615 
5616 	set_thread_pagein_error(cthread, 0);
5617 	error_code = 0;
5618 
5619 	result_page = VM_PAGE_NULL;
5620 	kr = vm_fault_page(object, offset, fault_type,
5621 	    (change_wiring && !wired),
5622 	    FALSE,                /* page not looked up */
5623 	    &prot, &result_page, &top_page,
5624 	    &type_of_fault,
5625 	    &error_code, map->no_zero_fill,
5626 	    &fault_info);
5627 
5628 	/*
5629 	 * if kr != VM_FAULT_SUCCESS, then the paging reference
5630 	 * has been dropped and the object unlocked... the ref_count
5631 	 * is still held
5632 	 *
5633 	 * if kr == VM_FAULT_SUCCESS, then the paging reference
5634 	 * is still held along with the ref_count on the original object
5635 	 *
5636 	 *	the object is returned locked with a paging reference
5637 	 *
5638 	 *	if top_page != NULL, then it's BUSY and the
5639 	 *	object it belongs to has a paging reference
5640 	 *	but is returned unlocked
5641 	 */
	if (kr != VM_FAULT_SUCCESS &&
	    kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
		if (kr == VM_FAULT_MEMORY_ERROR &&
		    fault_info.resilient_media) {
			assertf(object->internal, "object %p", object);
			/*
			 * This fault failed but the mapping was
			 * "media resilient", so we'll retry the fault in
			 * recovery mode to get a zero-filled page in the
			 * top object.
			 * Keep the reference on the failing object so
			 * that we can check that the mapping is still
			 * pointing to it when we retry the fault.
			 */
//                     printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
			assert(!resilient_media_retry); /* no double retry */
			assert(resilient_media_object == VM_OBJECT_NULL);
			assert(resilient_media_offset == (vm_object_offset_t)-1);
			/* record the failing object/offset for the recovery pass */
			resilient_media_retry = true;
			resilient_media_object = object;
			resilient_media_offset = offset;
//                     printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_media_offset);
			vm_fault_resilient_media_initiate++;
			goto RetryFault;
		} else {
			/*
			 * we didn't succeed, lose the object reference
			 * immediately.
			 */
			vm_object_deallocate(object);
			object = VM_OBJECT_NULL; /* no longer valid */
		}
5674 
5675 		/*
5676 		 * See why we failed, and take corrective action.
5677 		 */
5678 		switch (kr) {
5679 		case VM_FAULT_MEMORY_SHORTAGE:
5680 			if (vm_page_wait((change_wiring) ?
5681 			    THREAD_UNINT :
5682 			    THREAD_ABORTSAFE)) {
5683 				goto RetryFault;
5684 			}
5685 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_MEMORY_SHORTAGE), 0 /* arg */);
5686 			OS_FALLTHROUGH;
5687 		case VM_FAULT_INTERRUPTED:
5688 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
5689 			kr = KERN_ABORTED;
5690 			goto done;
5691 		case VM_FAULT_RETRY:
5692 			goto RetryFault;
5693 		case VM_FAULT_MEMORY_ERROR:
5694 			if (error_code) {
5695 				kr = error_code;
5696 			} else {
5697 				kr = KERN_MEMORY_ERROR;
5698 			}
5699 			goto done;
5700 		default:
5701 			panic("vm_fault: unexpected error 0x%x from "
5702 			    "vm_fault_page()\n", kr);
5703 		}
5704 	}
5705 	m = result_page;
5706 	m_object = NULL;
5707 
5708 	if (m != VM_PAGE_NULL) {
5709 		m_object = VM_PAGE_OBJECT(m);
5710 		assert((change_wiring && !wired) ?
5711 		    (top_page == VM_PAGE_NULL) :
5712 		    ((top_page == VM_PAGE_NULL) == (m_object == object)));
5713 	}
5714 
5715 	/*
5716 	 * What to do with the resulting page from vm_fault_page
5717 	 * if it doesn't get entered into the physical map:
5718 	 */
5719 #define RELEASE_PAGE(m)                                 \
5720 	MACRO_BEGIN                                     \
5721 	PAGE_WAKEUP_DONE(m);                            \
5722 	if ( !VM_PAGE_PAGEABLE(m)) {                    \
5723 	        vm_page_lockspin_queues();              \
5724 	        if ( !VM_PAGE_PAGEABLE(m))              \
5725 	                vm_page_activate(m);            \
5726 	        vm_page_unlock_queues();                \
5727 	}                                               \
5728 	MACRO_END
5729 
5730 
	object_locks_dropped = FALSE;
	/*
	 * We must verify that the maps have not changed
	 * since our last lookup. vm_map_verify() needs the
	 * map lock (shared) but we are holding object locks.
	 * So we do a try_lock() first and, if that fails, we
	 * drop the object locks and go in for the map lock again.
	 */
	if (!vm_map_try_lock_read(original_map)) {
		if (m != VM_PAGE_NULL) {
			/*
			 * remember the copy object as of the moment we drop
			 * the lock, so a copy-on-write race can be detected
			 * once the lock is re-taken (old_copy_object is
			 * compared against m_object->copy further down).
			 */
			old_copy_object = m_object->copy;
			vm_object_unlock(m_object);
		} else {
			old_copy_object = VM_OBJECT_NULL;
			vm_object_unlock(object);
		}

		object_locks_dropped = TRUE;

		vm_map_lock_read(original_map);
	}
5752 
5753 	if ((map != original_map) || !vm_map_verify(map, &version)) {
5754 		if (object_locks_dropped == FALSE) {
5755 			if (m != VM_PAGE_NULL) {
5756 				old_copy_object = m_object->copy;
5757 				vm_object_unlock(m_object);
5758 			} else {
5759 				old_copy_object = VM_OBJECT_NULL;
5760 				vm_object_unlock(object);
5761 			}
5762 
5763 			object_locks_dropped = TRUE;
5764 		}
5765 
5766 		/*
5767 		 * no object locks are held at this point
5768 		 */
5769 		vm_object_t             retry_object;
5770 		vm_object_offset_t      retry_offset;
5771 		vm_prot_t               retry_prot;
5772 
5773 		/*
5774 		 * To avoid trying to write_lock the map while another
5775 		 * thread has it read_locked (in vm_map_pageable), we
5776 		 * do not try for write permission.  If the page is
5777 		 * still writable, we will get write permission.  If it
5778 		 * is not, or has been marked needs_copy, we enter the
5779 		 * mapping without write permission, and will merely
5780 		 * take another fault.
5781 		 */
5782 		map = original_map;
5783 
5784 		kr = vm_map_lookup_and_lock_object(&map, vaddr,
5785 		    fault_type & ~VM_PROT_WRITE,
5786 		    OBJECT_LOCK_EXCLUSIVE, &version,
5787 		    &retry_object, &retry_offset, &retry_prot,
5788 		    &wired,
5789 		    &fault_info,
5790 		    &real_map,
5791 		    NULL);
5792 		pmap = real_map->pmap;
5793 
5794 		if (kr != KERN_SUCCESS) {
5795 			vm_map_unlock_read(map);
5796 
5797 			if (m != VM_PAGE_NULL) {
5798 				assert(VM_PAGE_OBJECT(m) == m_object);
5799 
5800 				/*
5801 				 * retake the lock so that
5802 				 * we can drop the paging reference
5803 				 * in vm_fault_cleanup and do the
5804 				 * PAGE_WAKEUP_DONE in RELEASE_PAGE
5805 				 */
5806 				vm_object_lock(m_object);
5807 
5808 				RELEASE_PAGE(m);
5809 
5810 				vm_fault_cleanup(m_object, top_page);
5811 			} else {
5812 				/*
5813 				 * retake the lock so that
5814 				 * we can drop the paging reference
5815 				 * in vm_fault_cleanup
5816 				 */
5817 				vm_object_lock(object);
5818 
5819 				vm_fault_cleanup(object, top_page);
5820 			}
5821 			vm_object_deallocate(object);
5822 
5823 			if (kr == KERN_INVALID_ADDRESS) {
5824 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0 /* arg */);
5825 			}
5826 			goto done;
5827 		}
5828 		vm_object_unlock(retry_object);
5829 
5830 		if ((retry_object != object) || (retry_offset != offset)) {
5831 			vm_map_unlock_read(map);
5832 			if (real_map != map) {
5833 				vm_map_unlock(real_map);
5834 			}
5835 
5836 			if (m != VM_PAGE_NULL) {
5837 				assert(VM_PAGE_OBJECT(m) == m_object);
5838 
5839 				/*
5840 				 * retake the lock so that
5841 				 * we can drop the paging reference
5842 				 * in vm_fault_cleanup and do the
5843 				 * PAGE_WAKEUP_DONE in RELEASE_PAGE
5844 				 */
5845 				vm_object_lock(m_object);
5846 
5847 				RELEASE_PAGE(m);
5848 
5849 				vm_fault_cleanup(m_object, top_page);
5850 			} else {
5851 				/*
5852 				 * retake the lock so that
5853 				 * we can drop the paging reference
5854 				 * in vm_fault_cleanup
5855 				 */
5856 				vm_object_lock(object);
5857 
5858 				vm_fault_cleanup(object, top_page);
5859 			}
5860 			vm_object_deallocate(object);
5861 
5862 			goto RetryFault;
5863 		}
5864 		/*
5865 		 * Check whether the protection has changed or the object
5866 		 * has been copied while we left the map unlocked.
5867 		 */
5868 		if (pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) {
5869 			/* If the pmap layer cares, pass the full set. */
5870 			prot = retry_prot;
5871 		} else {
5872 			prot &= retry_prot;
5873 		}
5874 	}
5875 
5876 	if (object_locks_dropped == TRUE) {
5877 		if (m != VM_PAGE_NULL) {
5878 			assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
5879 			assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
5880 			vm_object_lock(m_object);
5881 
5882 			if (m_object->copy != old_copy_object) {
5883 				/*
5884 				 * The copy object changed while the top-level object
5885 				 * was unlocked, so take away write permission.
5886 				 */
5887 				assert(!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot));
5888 				prot &= ~VM_PROT_WRITE;
5889 			}
5890 		} else {
5891 			vm_object_lock(object);
5892 		}
5893 
5894 		object_locks_dropped = FALSE;
5895 	}
5896 
5897 	if (!need_copy &&
5898 	    !fault_info.no_copy_on_read &&
5899 	    m != VM_PAGE_NULL &&
5900 	    VM_PAGE_OBJECT(m) != object &&
5901 	    !VM_PAGE_OBJECT(m)->pager_trusted &&
5902 	    vm_protect_privileged_from_untrusted &&
5903 	    !VM_PAGE_OBJECT(m)->code_signed &&
5904 	    current_proc_is_privileged()) {
5905 		/*
5906 		 * We found the page we want in an "untrusted" VM object
5907 		 * down the shadow chain.  Since the target is "privileged"
5908 		 * we want to perform a copy-on-read of that page, so that the
5909 		 * mapped object gets a stable copy and does not have to
5910 		 * rely on the "untrusted" object to provide the same
5911 		 * contents if the page gets reclaimed and has to be paged
5912 		 * in again later on.
5913 		 *
5914 		 * Special case: if the mapping is executable and the untrusted
5915 		 * object is code-signed and the process is "cs_enforced", we
5916 		 * do not copy-on-read because that would break code-signing
5917 		 * enforcement expectations (an executable page must belong
5918 		 * to a code-signed object) and we can rely on code-signing
5919 		 * to re-validate the page if it gets evicted and paged back in.
5920 		 */
5921 //		printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5922 		vm_copied_on_read++;
5923 		need_copy_on_read = TRUE;
5924 		need_copy = TRUE;
5925 	} else {
5926 		need_copy_on_read = FALSE;
5927 	}
5928 
5929 	/*
5930 	 * If we want to wire down this page, but no longer have
5931 	 * adequate permissions, we must start all over.
5932 	 * If we decided to copy-on-read, we must also start all over.
5933 	 */
5934 	if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
5935 	    need_copy_on_read) {
5936 		vm_map_unlock_read(map);
5937 		if (real_map != map) {
5938 			vm_map_unlock(real_map);
5939 		}
5940 
5941 		if (m != VM_PAGE_NULL) {
5942 			assert(VM_PAGE_OBJECT(m) == m_object);
5943 
5944 			RELEASE_PAGE(m);
5945 
5946 			vm_fault_cleanup(m_object, top_page);
5947 		} else {
5948 			vm_fault_cleanup(object, top_page);
5949 		}
5950 
5951 		vm_object_deallocate(object);
5952 
5953 		goto RetryFault;
5954 	}
5955 	if (m != VM_PAGE_NULL) {
5956 		/*
5957 		 * Put this page into the physical map.
5958 		 * We had to do the unlock above because pmap_enter
5959 		 * may cause other faults.  The page may be on
5960 		 * the pageout queues.  If the pageout daemon comes
5961 		 * across the page, it will remove it from the queues.
5962 		 */
5963 		if (fault_page_size < PAGE_SIZE) {
5964 			DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
5965 			assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
5966 			    fault_phys_offset < PAGE_SIZE),
5967 			    "0x%llx\n", (uint64_t)fault_phys_offset);
5968 		} else {
5969 			assertf(fault_phys_offset == 0,
5970 			    "0x%llx\n", (uint64_t)fault_phys_offset);
5971 		}
5972 		assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
5973 		assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
5974 		if (caller_pmap) {
5975 			kr = vm_fault_enter(m,
5976 			    caller_pmap,
5977 			    caller_pmap_addr,
5978 			    fault_page_size,
5979 			    fault_phys_offset,
5980 			    prot,
5981 			    caller_prot,
5982 			    wired,
5983 			    change_wiring,
5984 			    wire_tag,
5985 			    &fault_info,
5986 			    NULL,
5987 			    &type_of_fault);
5988 		} else {
5989 			kr = vm_fault_enter(m,
5990 			    pmap,
5991 			    vaddr,
5992 			    fault_page_size,
5993 			    fault_phys_offset,
5994 			    prot,
5995 			    caller_prot,
5996 			    wired,
5997 			    change_wiring,
5998 			    wire_tag,
5999 			    &fault_info,
6000 			    NULL,
6001 			    &type_of_fault);
6002 		}
6003 		assert(VM_PAGE_OBJECT(m) == m_object);
6004 
6005 		{
6006 			int     event_code = 0;
6007 
6008 			if (m_object->internal) {
6009 				event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
6010 			} else if (m_object->object_is_shared_cache) {
6011 				event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
6012 			} else {
6013 				event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
6014 			}
6015 
6016 			KDBG_RELEASE(event_code | DBG_FUNC_NONE, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid());
6017 			KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid());
6018 
6019 			DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
6020 		}
6021 		if (kr != KERN_SUCCESS) {
6022 			/* abort this page fault */
6023 			vm_map_unlock_read(map);
6024 			if (real_map != map) {
6025 				vm_map_unlock(real_map);
6026 			}
6027 			PAGE_WAKEUP_DONE(m);
6028 			vm_fault_cleanup(m_object, top_page);
6029 			vm_object_deallocate(object);
6030 			goto done;
6031 		}
6032 		if (physpage_p != NULL) {
6033 			/* for vm_map_wire_and_extract() */
6034 			*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6035 			if (prot & VM_PROT_WRITE) {
6036 				vm_object_lock_assert_exclusive(m_object);
6037 				m->vmp_dirty = TRUE;
6038 			}
6039 		}
6040 	} else {
6041 		vm_map_entry_t          entry;
6042 		vm_map_offset_t         laddr;
6043 		vm_map_offset_t         ldelta, hdelta;
6044 
6045 		/*
6046 		 * do a pmap block mapping from the physical address
6047 		 * in the object
6048 		 */
6049 
6050 		if (real_map != map) {
6051 			vm_map_unlock(real_map);
6052 		}
6053 
6054 		if (original_map != map) {
6055 			vm_map_unlock_read(map);
6056 			vm_map_lock_read(original_map);
6057 			map = original_map;
6058 		}
6059 		real_map = map;
6060 
6061 		laddr = vaddr;
6062 		hdelta = ldelta = (vm_map_offset_t)0xFFFFFFFFFFFFF000ULL;
6063 
6064 		while (vm_map_lookup_entry(map, laddr, &entry)) {
6065 			if (ldelta > (laddr - entry->vme_start)) {
6066 				ldelta = laddr - entry->vme_start;
6067 			}
6068 			if (hdelta > (entry->vme_end - laddr)) {
6069 				hdelta = entry->vme_end - laddr;
6070 			}
6071 			if (entry->is_sub_map) {
6072 				laddr = ((laddr - entry->vme_start)
6073 				    + VME_OFFSET(entry));
6074 				vm_map_lock_read(VME_SUBMAP(entry));
6075 
6076 				if (map != real_map) {
6077 					vm_map_unlock_read(map);
6078 				}
6079 				if (entry->use_pmap) {
6080 					vm_map_unlock_read(real_map);
6081 					real_map = VME_SUBMAP(entry);
6082 				}
6083 				map = VME_SUBMAP(entry);
6084 			} else {
6085 				break;
6086 			}
6087 		}
6088 
6089 		if (vm_map_lookup_entry(map, laddr, &entry) &&
6090 		    (!entry->is_sub_map) &&
6091 		    (object != VM_OBJECT_NULL) &&
6092 		    (VME_OBJECT(entry) == object)) {
6093 			uint16_t superpage;
6094 
6095 			if (!object->pager_created &&
6096 			    object->phys_contiguous &&
6097 			    VME_OFFSET(entry) == 0 &&
6098 			    (entry->vme_end - entry->vme_start == object->vo_size) &&
6099 			    VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
6100 				superpage = VM_MEM_SUPERPAGE;
6101 			} else {
6102 				superpage = 0;
6103 			}
6104 
6105 			if (superpage && physpage_p) {
6106 				/* for vm_map_wire_and_extract() */
6107 				*physpage_p = (ppnum_t)
6108 				    ((((vm_map_offset_t)
6109 				    object->vo_shadow_offset)
6110 				    + VME_OFFSET(entry)
6111 				    + (laddr - entry->vme_start))
6112 				    >> PAGE_SHIFT);
6113 			}
6114 
6115 			if (caller_pmap) {
6116 				/*
6117 				 * Set up a block mapped area
6118 				 */
6119 				assert((uint32_t)((ldelta + hdelta) >> fault_page_shift) == ((ldelta + hdelta) >> fault_page_shift));
6120 				kr = pmap_map_block_addr(caller_pmap,
6121 				    (addr64_t)(caller_pmap_addr - ldelta),
6122 				    (pmap_paddr_t)(((vm_map_offset_t) (object->vo_shadow_offset)) +
6123 				    VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta),
6124 				    (uint32_t)((ldelta + hdelta) >> fault_page_shift), prot,
6125 				    (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
6126 
6127 				if (kr != KERN_SUCCESS) {
6128 					goto cleanup;
6129 				}
6130 			} else {
6131 				/*
6132 				 * Set up a block mapped area
6133 				 */
6134 				assert((uint32_t)((ldelta + hdelta) >> fault_page_shift) == ((ldelta + hdelta) >> fault_page_shift));
6135 				kr = pmap_map_block_addr(real_map->pmap,
6136 				    (addr64_t)(vaddr - ldelta),
6137 				    (pmap_paddr_t)(((vm_map_offset_t)(object->vo_shadow_offset)) +
6138 				    VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta),
6139 				    (uint32_t)((ldelta + hdelta) >> fault_page_shift), prot,
6140 				    (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
6141 
6142 				if (kr != KERN_SUCCESS) {
6143 					goto cleanup;
6144 				}
6145 			}
6146 		}
6147 	}
6148 
6149 	/*
6150 	 * Success
6151 	 */
6152 	kr = KERN_SUCCESS;
6153 
6154 	/*
6155 	 * TODO: could most of the done cases just use cleanup?
6156 	 */
6157 cleanup:
6158 	/*
6159 	 * Unlock everything, and return
6160 	 */
6161 	vm_map_unlock_read(map);
6162 	if (real_map != map) {
6163 		vm_map_unlock(real_map);
6164 	}
6165 
6166 	if (m != VM_PAGE_NULL) {
6167 		if (__improbable(rtfault &&
6168 		    !m->vmp_realtime &&
6169 		    vm_pageout_protect_realtime)) {
6170 			vm_page_lock_queues();
6171 			if (!m->vmp_realtime) {
6172 				m->vmp_realtime = true;
6173 				vm_page_realtime_count++;
6174 			}
6175 			vm_page_unlock_queues();
6176 		}
6177 		assert(VM_PAGE_OBJECT(m) == m_object);
6178 
6179 		if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
6180 			vm_object_paging_begin(m_object);
6181 
6182 			assert(written_on_object == VM_OBJECT_NULL);
6183 			written_on_object = m_object;
6184 			written_on_pager = m_object->pager;
6185 			written_on_offset = m_object->paging_offset + m->vmp_offset;
6186 		}
6187 		PAGE_WAKEUP_DONE(m);
6188 
6189 		vm_fault_cleanup(m_object, top_page);
6190 	} else {
6191 		vm_fault_cleanup(object, top_page);
6192 	}
6193 
6194 	vm_object_deallocate(object);
6195 
6196 #undef  RELEASE_PAGE
6197 
6198 done:
6199 	thread_interrupt_level(interruptible_state);
6200 
6201 	if (resilient_media_object != VM_OBJECT_NULL) {
6202 		assert(resilient_media_retry);
6203 		assert(resilient_media_offset != (vm_object_offset_t)-1);
6204 		/* release extra reference on failed object */
6205 //             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
6206 		vm_object_lock_assert_notheld(resilient_media_object);
6207 		vm_object_deallocate(resilient_media_object);
6208 		resilient_media_object = VM_OBJECT_NULL;
6209 		resilient_media_offset = (vm_object_offset_t)-1;
6210 		resilient_media_retry = false;
6211 		vm_fault_resilient_media_release++;
6212 	}
6213 	assert(!resilient_media_retry);
6214 
6215 	/*
6216 	 * Only I/O throttle on faults which cause a pagein/swapin.
6217 	 */
6218 	if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
6219 		throttle_lowpri_io(1);
6220 	} else {
6221 		if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
6222 			if ((throttle_delay = vm_page_throttled(TRUE))) {
6223 				if (vm_debug_events) {
6224 					if (type_of_fault == DBG_COMPRESSOR_FAULT) {
6225 						VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6226 					} else if (type_of_fault == DBG_COW_FAULT) {
6227 						VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6228 					} else {
6229 						VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6230 					}
6231 				}
6232 				__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
6233 			}
6234 		}
6235 	}
6236 
6237 	if (written_on_object) {
6238 		vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
6239 
6240 		vm_object_lock(written_on_object);
6241 		vm_object_paging_end(written_on_object);
6242 		vm_object_unlock(written_on_object);
6243 
6244 		written_on_object = VM_OBJECT_NULL;
6245 	}
6246 
6247 	if (rtfault) {
6248 		vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
6249 	}
6250 
6251 	KDBG_RELEASE(
6252 		(MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
6253 		((uint64_t)trace_vaddr >> 32),
6254 		trace_vaddr,
6255 		kr,
6256 		vm_fault_type_for_tracing(need_copy_on_read, type_of_fault));
6257 
6258 	if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) {
6259 		DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr);
6260 	}
6261 
6262 	return kr;
6263 }
6264 
6265 /*
6266  *	vm_fault_wire:
6267  *
6268  *	Wire down a range of virtual addresses in a map.
6269  */
kern_return_t
vm_fault_wire(
	vm_map_t        map,
	vm_map_entry_t  entry,          /* entry to wire; must be in_transition */
	vm_prot_t       prot,
	vm_tag_t        wire_tag,       /* VM tag the wired pages are accounted to */
	pmap_t          pmap,           /* physical map receiving the wirings */
	vm_map_offset_t pmap_addr,      /* pmap address corresponding to entry->vme_start */
	ppnum_t         *physpage_p)    /* OUT (may be NULL): physical page number, for
	                                 * vm_map_wire_and_extract() */
{
	vm_map_offset_t va;
	vm_map_offset_t end_addr = entry->vme_end;
	kern_return_t   rc;
	vm_map_size_t   effective_page_size;

	assert(entry->in_transition);

	/*
	 * Physically contiguous objects are wired by construction,
	 * so there is nothing to do for them here.
	 */
	if (!entry->is_sub_map &&
	    VME_OBJECT(entry) != VM_OBJECT_NULL &&
	    VME_OBJECT(entry)->phys_contiguous) {
		return KERN_SUCCESS;
	}

	/*
	 *	Inform the physical mapping system that the
	 *	range of addresses may not fault, so that
	 *	page tables and such can be locked down as well.
	 */

	pmap_pageable(pmap, pmap_addr,
	    pmap_addr + (end_addr - entry->vme_start), FALSE);

	/*
	 *	We simulate a fault to get the page and enter it
	 *	in the physical map.
	 */

	effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
	for (va = entry->vme_start;
	    va < end_addr;
	    va += effective_page_size) {
		/* Try the cheap resident-page path first... */
		rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
		    pmap_addr + (va - entry->vme_start),
		    physpage_p);
		if (rc != KERN_SUCCESS) {
			/*
			 * ...and fall back to the full fault handler.
			 * Kernel wirings must not be interrupted; user
			 * wirings may be aborted safely.
			 */
			rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
			    ((pmap == kernel_pmap)
			    ? THREAD_UNINT
			    : THREAD_ABORTSAFE),
			    pmap,
			    (pmap_addr +
			    (va - entry->vme_start)),
			    physpage_p);
			DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
		}

		if (rc != KERN_SUCCESS) {
			struct vm_map_entry     tmp_entry = *entry;

			/* unwire the pages wired so far before failing */
			tmp_entry.vme_end = va;
			vm_fault_unwire(map,
			    &tmp_entry, FALSE, pmap, pmap_addr);

			return rc;
		}
	}
	return KERN_SUCCESS;
}
6339 
6340 /*
6341  *	vm_fault_unwire:
6342  *
6343  *	Unwire a range of virtual addresses in a map.
6344  */
void
vm_fault_unwire(
	vm_map_t        map,
	vm_map_entry_t  entry,
	boolean_t       deallocate,     /* TRUE: free the pages outright instead of unwiring */
	pmap_t          pmap,
	vm_map_offset_t pmap_addr)      /* pmap address corresponding to entry->vme_start */
{
	vm_map_offset_t va;
	vm_map_offset_t end_addr = entry->vme_end;
	vm_object_t             object;
	struct vm_object_fault_info fault_info = {};
	unsigned int    unwired_pages;
	vm_map_size_t   effective_page_size;

	object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);

	/*
	 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
	 * do anything since such memory is wired by default.  So we don't have
	 * anything to undo here.
	 */

	if (object != VM_OBJECT_NULL && object->phys_contiguous) {
		return;
	}

	fault_info.interruptible = THREAD_UNINT;
	fault_info.behavior = entry->behavior;
	fault_info.user_tag = VME_ALIAS(entry);
	if (entry->iokit_acct ||
	    (!entry->is_sub_map && !entry->use_pmap)) {
		fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
	}
	fault_info.lo_offset = VME_OFFSET(entry);
	fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
	fault_info.no_cache = entry->no_cache;
	/* stealth: don't disturb the pages' positions in the paging queues */
	fault_info.stealth = TRUE;

	unwired_pages = 0;

	/*
	 *	Since the pages are wired down, we must be able to
	 *	get their mappings from the physical map system.
	 */

	effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
	for (va = entry->vme_start;
	    va < end_addr;
	    va += effective_page_size) {
		if (object == VM_OBJECT_NULL) {
			/* submap entry: undo the wiring in the pmap only */
			if (pmap) {
				pmap_change_wiring(pmap,
				    pmap_addr + (va - entry->vme_start), FALSE);
			}
			(void) vm_fault(map, va, VM_PROT_NONE,
			    TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
		} else {
			vm_prot_t       prot;
			vm_page_t       result_page;
			vm_page_t       top_page;
			vm_object_t     result_object;
			vm_fault_return_t result;

			/* cap cluster size at maximum UPL size */
			upl_size_t cluster_size;
			if (os_sub_overflow(end_addr, va, &cluster_size)) {
				cluster_size = 0 - (upl_size_t)PAGE_SIZE;
			}
			fault_info.cluster_size = cluster_size;

			do {
				prot = VM_PROT_NONE;

				vm_object_lock(object);
				vm_object_paging_begin(object);
				result_page = VM_PAGE_NULL;
				result = vm_fault_page(
					object,
					(VME_OFFSET(entry) +
					(va - entry->vme_start)),
					VM_PROT_NONE, TRUE,
					FALSE, /* page not looked up */
					&prot, &result_page, &top_page,
					(int *)0,
					NULL, map->no_zero_fill,
					&fault_info);
			} while (result == VM_FAULT_RETRY);

			/*
			 * If this was a mapping to a file on a device that has been forcibly
			 * unmounted, then we won't get a page back from vm_fault_page().  Just
			 * move on to the next one in case the remaining pages are mapped from
			 * different objects.  During a forced unmount, the object is terminated
			 * so the alive flag will be false if this happens.  A forced unmount
			 * will occur when an external disk is unplugged before the user does an
			 * eject, so we don't want to panic in that situation.
			 */

			if (result == VM_FAULT_MEMORY_ERROR) {
				if (!object->alive) {
					continue;
				}
				if (!object->internal && object->pager == NULL) {
					continue;
				}
			}

			if (result == VM_FAULT_MEMORY_ERROR &&
			    object == kernel_object) {
				/*
				 * This must have been allocated with
				 * KMA_KOBJECT and KMA_VAONLY and there's
				 * no physical page at this offset.
				 * We're done (no page to free).
				 */
				assert(deallocate);
				continue;
			}

			if (result != VM_FAULT_SUCCESS) {
				panic("vm_fault_unwire: failure");
			}

			result_object = VM_PAGE_OBJECT(result_page);

			if (deallocate) {
				/* caller wants the pages gone, not just unwired */
				assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
				    vm_page_fictitious_addr);
				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
				if (VM_PAGE_WIRED(result_page)) {
					unwired_pages++;
				}
				VM_PAGE_FREE(result_page);
			} else {
				/* guard pages were never entered in the pmap */
				if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) {
					pmap_change_wiring(pmap,
					    pmap_addr + (va - entry->vme_start), FALSE);
				}


				if (VM_PAGE_WIRED(result_page)) {
					vm_page_lockspin_queues();
					vm_page_unwire(result_page, TRUE);
					vm_page_unlock_queues();
					unwired_pages++;
				}
				if (entry->zero_wired_pages) {
					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
					entry->zero_wired_pages = FALSE;
				}

				PAGE_WAKEUP_DONE(result_page);
			}
			vm_fault_cleanup(result_object, top_page);
		}
	}

	/*
	 *	Inform the physical mapping system that the range
	 *	of addresses may fault, so that page tables and
	 *	such may be unwired themselves.
	 */

	pmap_pageable(pmap, pmap_addr,
	    pmap_addr + (end_addr - entry->vme_start), TRUE);

	if (kernel_object == object) {
		/*
		 * Would like to make user_tag in vm_object_fault_info
		 * vm_tag_t (unsigned short) but user_tag derives its value from
		 * VME_ALIAS(entry) at a few places and VME_ALIAS, in turn, casts
		 * to an _unsigned int_ which is used by non-fault_info paths throughout the
		 * code at many places.
		 *
		 * So, for now, an explicit truncation to unsigned short (vm_tag_t).
		 */
		assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag,
		    "VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK));
		vm_tag_update_size((vm_tag_t) fault_info.user_tag, -ptoa_64(unwired_pages));
	}
}
6527 
6528 /*
6529  *	vm_fault_wire_fast:
6530  *
6531  *	Handle common case of a wire down page fault at the given address.
6532  *	If successful, the page is inserted into the associated physical map.
6533  *	The map entry is passed in to avoid the overhead of a map lookup.
6534  *
6535  *	NOTE: the given address should be truncated to the
6536  *	proper page address.
6537  *
6538  *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
6539  *	a standard error specifying why the fault is fatal is returned.
6540  *
6541  *	The map in question must be referenced, and remains so.
6542  *	Caller has a read lock on the map.
6543  *
6544  *	This is a stripped version of vm_fault() for wiring pages.  Anything
6545  *	other than the common case will return KERN_FAILURE, and the caller
6546  *	is expected to call vm_fault().
6547  */
static kern_return_t
vm_fault_wire_fast(
	__unused vm_map_t       map,
	vm_map_offset_t va,
	__unused vm_prot_t       caller_prot,
	vm_tag_t        wire_tag,
	vm_map_entry_t  entry,
	pmap_t          pmap,
	vm_map_offset_t pmap_addr,
	ppnum_t         *physpage_p)
{
	vm_object_t             object;
	vm_object_offset_t      offset;
	vm_page_t               m;
	vm_prot_t               prot;
	thread_t                thread = current_thread();
	int                     type_of_fault;
	kern_return_t           kr;
	vm_map_size_t           fault_page_size;
	vm_map_offset_t         fault_phys_offset;
	struct vm_object_fault_info fault_info = {};

	counter_inc(&vm_statistics_faults);

	if (thread != THREAD_NULL) {
		counter_inc(&get_threadtask(thread)->faults);
	}

/*
 *	Recovery actions
 */

/* Undo the wiring done below and wake any waiters on the page. */
#undef  RELEASE_PAGE
#define RELEASE_PAGE(m) {                               \
	PAGE_WAKEUP_DONE(m);                            \
	vm_page_lockspin_queues();                      \
	vm_page_unwire(m, TRUE);                        \
	vm_page_unlock_queues();                        \
}


/* Drop the paging reference and the object lock taken below. */
#undef  UNLOCK_THINGS
#define UNLOCK_THINGS   {                               \
	vm_object_paging_end(object);                      \
	vm_object_unlock(object);                          \
}

#undef  UNLOCK_AND_DEALLOCATE
#define UNLOCK_AND_DEALLOCATE   {                       \
	UNLOCK_THINGS;                                  \
	vm_object_deallocate(object);                   \
}
/*
 *	Give up and have caller do things the hard way.
 */

#define GIVE_UP {                                       \
	UNLOCK_AND_DEALLOCATE;                          \
	return(KERN_FAILURE);                           \
}


	/*
	 *	If this entry is not directly to a vm_object, bail out.
	 */
	if (entry->is_sub_map) {
		assert(physpage_p == NULL);
		return KERN_FAILURE;
	}

	/*
	 *	Find the backing store object and offset into it.
	 */

	object = VME_OBJECT(entry);
	offset = (va - entry->vme_start) + VME_OFFSET(entry);
	prot = entry->protection;

	/*
	 *	Make a reference to this object to prevent its
	 *	disposal while we are messing with it.
	 */

	vm_object_lock(object);
	vm_object_reference_locked(object);
	vm_object_paging_begin(object);

	/*
	 *	INVARIANTS (through entire routine):
	 *
	 *	1)	At all times, we must either have the object
	 *		lock or a busy page in some object to prevent
	 *		some other thread from trying to bring in
	 *		the same page.
	 *
	 *	2)	Once we have a busy page, we must remove it from
	 *		the pageout queues, so that the pageout daemon
	 *		will not grab it away.
	 *
	 */

	/*
	 *	Look for page in top-level object.  If it's not there or
	 *	there's something going on, give up.
	 */
	m = vm_page_lookup(object, vm_object_trunc_page(offset));
	if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
	    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
		GIVE_UP;
	}
	if (m->vmp_fictitious &&
	    VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
		/*
		 * Guard pages are fictitious pages and are never
		 * entered into a pmap, so let's say it's been wired...
		 */
		kr = KERN_SUCCESS;
		goto done;
	}

	/*
	 *	Wire the page down now.  All bail outs beyond this
	 *	point must unwire the page.
	 */

	vm_page_lockspin_queues();
	vm_page_wire(m, wire_tag, TRUE);
	vm_page_unlock_queues();

	/*
	 *	Mark page busy for other threads.
	 */
	assert(!m->vmp_busy);
	m->vmp_busy = TRUE;
	assert(!m->vmp_absent);

	/*
	 *	Give up if the page is being written and there's a copy object
	 */
	if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
		RELEASE_PAGE(m);
		GIVE_UP;
	}

	fault_info.user_tag = VME_ALIAS(entry);
	fault_info.pmap_options = 0;
	if (entry->iokit_acct ||
	    (!entry->is_sub_map && !entry->use_pmap)) {
		fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
	}

	fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
	fault_phys_offset = offset - vm_object_trunc_page(offset);

	/*
	 *	Put this page into the physical map.
	 */
	type_of_fault = DBG_CACHE_HIT_FAULT;
	assertf(VM_PAGE_OBJECT(m) == object, "m=%p object=%p", m, object);
	assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
	kr = vm_fault_enter(m,
	    pmap,
	    pmap_addr,
	    fault_page_size,
	    fault_phys_offset,
	    prot,
	    prot,
	    TRUE,                  /* wired */
	    FALSE,                 /* change_wiring */
	    wire_tag,
	    &fault_info,
	    NULL,
	    &type_of_fault);
	if (kr != KERN_SUCCESS) {
		RELEASE_PAGE(m);
		GIVE_UP;
	}

done:
	/*
	 *	Unlock everything, and return
	 */

	if (physpage_p) {
		/* for vm_map_wire_and_extract() */
		if (kr == KERN_SUCCESS) {
			assert(object == VM_PAGE_OBJECT(m));
			*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
			if (prot & VM_PROT_WRITE) {
				vm_object_lock_assert_exclusive(object);
				m->vmp_dirty = TRUE;
			}
		} else {
			*physpage_p = 0;
		}
	}

	PAGE_WAKEUP_DONE(m);
	UNLOCK_AND_DEALLOCATE;

	return kr;
}
6750 
6751 /*
6752  *	Routine:	vm_fault_copy_cleanup
6753  *	Purpose:
6754  *		Release a page used by vm_fault_copy.
6755  */
6756 
6757 static void
vm_fault_copy_cleanup(vm_page_t page,vm_page_t top_page)6758 vm_fault_copy_cleanup(
6759 	vm_page_t       page,
6760 	vm_page_t       top_page)
6761 {
6762 	vm_object_t     object = VM_PAGE_OBJECT(page);
6763 
6764 	vm_object_lock(object);
6765 	PAGE_WAKEUP_DONE(page);
6766 	if (!VM_PAGE_PAGEABLE(page)) {
6767 		vm_page_lockspin_queues();
6768 		if (!VM_PAGE_PAGEABLE(page)) {
6769 			vm_page_activate(page);
6770 		}
6771 		vm_page_unlock_queues();
6772 	}
6773 	vm_fault_cleanup(object, top_page);
6774 }
6775 
6776 static void
vm_fault_copy_dst_cleanup(vm_page_t page)6777 vm_fault_copy_dst_cleanup(
6778 	vm_page_t       page)
6779 {
6780 	vm_object_t     object;
6781 
6782 	if (page != VM_PAGE_NULL) {
6783 		object = VM_PAGE_OBJECT(page);
6784 		vm_object_lock(object);
6785 		vm_page_lockspin_queues();
6786 		vm_page_unwire(page, TRUE);
6787 		vm_page_unlock_queues();
6788 		vm_object_paging_end(object);
6789 		vm_object_unlock(object);
6790 	}
6791 }
6792 
6793 /*
6794  *	Routine:	vm_fault_copy
6795  *
6796  *	Purpose:
6797  *		Copy pages from one virtual memory object to another --
6798  *		neither the source nor destination pages need be resident.
6799  *
6800  *		Before actually copying a page, the version associated with
 *		the destination address map will be verified.
6802  *
6803  *	In/out conditions:
6804  *		The caller must hold a reference, but not a lock, to
6805  *		each of the source and destination objects and to the
6806  *		destination map.
6807  *
6808  *	Results:
6809  *		Returns KERN_SUCCESS if no errors were encountered in
6810  *		reading or writing the data.  Returns KERN_INTERRUPTED if
6811  *		the operation was interrupted (only possible if the
6812  *		"interruptible" argument is asserted).  Other return values
6813  *		indicate a permanent error in copying the data.
6814  *
6815  *		The actual amount of data copied will be returned in the
6816  *		"copy_size" argument.  In the event that the destination map
6817  *		verification failed, this amount may be less than the amount
6818  *		requested.
6819  */
kern_return_t
vm_fault_copy(
	vm_object_t             src_object,
	vm_object_offset_t      src_offset,
	vm_map_size_t           *copy_size,             /* INOUT */
	vm_object_t             dst_object,
	vm_object_offset_t      dst_offset,
	vm_map_t                dst_map,
	vm_map_version_t         *dst_version,
	int                     interruptible)
{
	vm_page_t               result_page;

	vm_page_t               src_page;
	vm_page_t               src_top_page;
	vm_prot_t               src_prot;

	vm_page_t               dst_page;
	vm_page_t               dst_top_page;
	vm_prot_t               dst_prot;

	vm_map_size_t           amount_left;
	vm_object_t             old_copy_object;
	vm_object_t             result_page_object = NULL;
	kern_return_t           error = 0;
	vm_fault_return_t       result;

	vm_map_size_t           part_size;
	struct vm_object_fault_info fault_info_src = {};
	struct vm_object_fault_info fault_info_dst = {};

	/*
	 * In order not to confuse the clustered pageins, align
	 * the different offsets on a page boundary.
	 */

/* Report how much was actually copied before returning. */
#define RETURN(x)                                       \
	MACRO_BEGIN                                     \
	*copy_size -= amount_left;                      \
	MACRO_RETURN(x);                                \
	MACRO_END

	amount_left = *copy_size;

	fault_info_src.interruptible = interruptible;
	fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
	fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
	fault_info_src.stealth = TRUE;

	fault_info_dst.interruptible = interruptible;
	fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
	fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
	fault_info_dst.stealth = TRUE;

	do { /* while (amount_left > 0) */
		/*
		 * There may be a deadlock if both source and destination
		 * pages are the same. To avoid this deadlock, the copy must
		 * start by getting the destination page in order to apply
		 * COW semantics if any.
		 */

RetryDestinationFault:;

		dst_prot = VM_PROT_WRITE | VM_PROT_READ;

		vm_object_lock(dst_object);
		vm_object_paging_begin(dst_object);

		/* cap cluster size at maximum UPL size */
		upl_size_t cluster_size;
		if (os_convert_overflow(amount_left, &cluster_size)) {
			cluster_size = 0 - (upl_size_t)PAGE_SIZE;
		}
		fault_info_dst.cluster_size = cluster_size;

		dst_page = VM_PAGE_NULL;
		result = vm_fault_page(dst_object,
		    vm_object_trunc_page(dst_offset),
		    VM_PROT_WRITE | VM_PROT_READ,
		    FALSE,
		    FALSE,                    /* page not looked up */
		    &dst_prot, &dst_page, &dst_top_page,
		    (int *)0,
		    &error,
		    dst_map->no_zero_fill,
		    &fault_info_dst);
		switch (result) {
		case VM_FAULT_SUCCESS:
			break;
		case VM_FAULT_RETRY:
			goto RetryDestinationFault;
		case VM_FAULT_MEMORY_SHORTAGE:
			if (vm_page_wait(interruptible)) {
				goto RetryDestinationFault;
			}
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_COPY_MEMORY_SHORTAGE), 0 /* arg */);
			OS_FALLTHROUGH;
		case VM_FAULT_INTERRUPTED:
			RETURN(MACH_SEND_INTERRUPTED);
		case VM_FAULT_SUCCESS_NO_VM_PAGE:
			/* success but no VM page: fail the copy */
			vm_object_paging_end(dst_object);
			vm_object_unlock(dst_object);
			OS_FALLTHROUGH;
		case VM_FAULT_MEMORY_ERROR:
			if (error) {
				return error;
			} else {
				return KERN_MEMORY_ERROR;
			}
		default:
			panic("vm_fault_copy: unexpected error 0x%x from "
			    "vm_fault_page()\n", result);
		}
		assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);

		assert(dst_object == VM_PAGE_OBJECT(dst_page));
		/* remember the copy object so we can detect changes below */
		old_copy_object = dst_object->copy;

		/*
		 * There exists the possibility that the source and
		 * destination page are the same.  But we can't
		 * easily determine that now.  If they are the
		 * same, the call to vm_fault_page() for the
		 * destination page will deadlock.  To prevent this we
		 * wire the page so we can drop busy without having
		 * the page daemon steal the page.  We clean up the
		 * top page but keep the paging reference on the object
		 * holding the dest page so it doesn't go away.
		 */

		vm_page_lockspin_queues();
		vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
		vm_page_unlock_queues();
		PAGE_WAKEUP_DONE(dst_page);
		vm_object_unlock(dst_object);

		if (dst_top_page != VM_PAGE_NULL) {
			vm_object_lock(dst_object);
			VM_PAGE_FREE(dst_top_page);
			vm_object_paging_end(dst_object);
			vm_object_unlock(dst_object);
		}

RetrySourceFault:;

		if (src_object == VM_OBJECT_NULL) {
			/*
			 *	No source object.  We will just
			 *	zero-fill the page in dst_object.
			 */
			src_page = VM_PAGE_NULL;
			result_page = VM_PAGE_NULL;
		} else {
			vm_object_lock(src_object);
			src_page = vm_page_lookup(src_object,
			    vm_object_trunc_page(src_offset));
			if (src_page == dst_page) {
				/* same page: copy would be a no-op / deadlock */
				src_prot = dst_prot;
				result_page = VM_PAGE_NULL;
			} else {
				src_prot = VM_PROT_READ;
				vm_object_paging_begin(src_object);

				/* cap cluster size at maximum UPL size */
				if (os_convert_overflow(amount_left, &cluster_size)) {
					cluster_size = 0 - (upl_size_t)PAGE_SIZE;
				}
				fault_info_src.cluster_size = cluster_size;

				result_page = VM_PAGE_NULL;
				result = vm_fault_page(
					src_object,
					vm_object_trunc_page(src_offset),
					VM_PROT_READ, FALSE,
					FALSE, /* page not looked up */
					&src_prot,
					&result_page, &src_top_page,
					(int *)0, &error, FALSE,
					&fault_info_src);

				switch (result) {
				case VM_FAULT_SUCCESS:
					break;
				case VM_FAULT_RETRY:
					goto RetrySourceFault;
				case VM_FAULT_MEMORY_SHORTAGE:
					if (vm_page_wait(interruptible)) {
						goto RetrySourceFault;
					}
					OS_FALLTHROUGH;
				case VM_FAULT_INTERRUPTED:
					vm_fault_copy_dst_cleanup(dst_page);
					RETURN(MACH_SEND_INTERRUPTED);
				case VM_FAULT_SUCCESS_NO_VM_PAGE:
					/* success but no VM page: fail */
					vm_object_paging_end(src_object);
					vm_object_unlock(src_object);
					OS_FALLTHROUGH;
				case VM_FAULT_MEMORY_ERROR:
					vm_fault_copy_dst_cleanup(dst_page);
					if (error) {
						return error;
					} else {
						return KERN_MEMORY_ERROR;
					}
				default:
					panic("vm_fault_copy(2): unexpected "
					    "error 0x%x from "
					    "vm_fault_page()\n", result);
				}

				result_page_object = VM_PAGE_OBJECT(result_page);
				assert((src_top_page == VM_PAGE_NULL) ==
				    (result_page_object == src_object));
			}
			assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
			vm_object_unlock(result_page_object);
		}

		vm_map_lock_read(dst_map);

		/*
		 * The destination map may have changed while we held no
		 * locks on it; if so, give up and report the partial copy.
		 */
		if (!vm_map_verify(dst_map, dst_version)) {
			vm_map_unlock_read(dst_map);
			if (result_page != VM_PAGE_NULL && src_page != dst_page) {
				vm_fault_copy_cleanup(result_page, src_top_page);
			}
			vm_fault_copy_dst_cleanup(dst_page);
			break;
		}
		assert(dst_object == VM_PAGE_OBJECT(dst_page));

		vm_object_lock(dst_object);

		/* a new copy object means our wired dst page may be stale */
		if (dst_object->copy != old_copy_object) {
			vm_object_unlock(dst_object);
			vm_map_unlock_read(dst_map);
			if (result_page != VM_PAGE_NULL && src_page != dst_page) {
				vm_fault_copy_cleanup(result_page, src_top_page);
			}
			vm_fault_copy_dst_cleanup(dst_page);
			break;
		}
		vm_object_unlock(dst_object);

		/*
		 *	Copy the page, and note that it is dirty
		 *	immediately.
		 */

		if (!page_aligned(src_offset) ||
		    !page_aligned(dst_offset) ||
		    !page_aligned(amount_left)) {
			/* partial-page copy */
			vm_object_offset_t      src_po,
			    dst_po;

			src_po = src_offset - vm_object_trunc_page(src_offset);
			dst_po = dst_offset - vm_object_trunc_page(dst_offset);

			if (dst_po > src_po) {
				part_size = PAGE_SIZE - dst_po;
			} else {
				part_size = PAGE_SIZE - src_po;
			}
			if (part_size > (amount_left)) {
				part_size = amount_left;
			}

			if (result_page == VM_PAGE_NULL) {
				assert((vm_offset_t) dst_po == dst_po);
				assert((vm_size_t) part_size == part_size);
				vm_page_part_zero_fill(dst_page,
				    (vm_offset_t) dst_po,
				    (vm_size_t) part_size);
			} else {
				assert((vm_offset_t) src_po == src_po);
				assert((vm_offset_t) dst_po == dst_po);
				assert((vm_size_t) part_size == part_size);
				vm_page_part_copy(result_page,
				    (vm_offset_t) src_po,
				    dst_page,
				    (vm_offset_t) dst_po,
				    (vm_size_t)part_size);
				if (!dst_page->vmp_dirty) {
					vm_object_lock(dst_object);
					SET_PAGE_DIRTY(dst_page, TRUE);
					vm_object_unlock(dst_object);
				}
			}
		} else {
			/* whole-page copy */
			part_size = PAGE_SIZE;

			if (result_page == VM_PAGE_NULL) {
				vm_page_zero_fill(dst_page);
			} else {
				vm_object_lock(result_page_object);
				vm_page_copy(result_page, dst_page);
				vm_object_unlock(result_page_object);

				if (!dst_page->vmp_dirty) {
					vm_object_lock(dst_object);
					SET_PAGE_DIRTY(dst_page, TRUE);
					vm_object_unlock(dst_object);
				}
			}
		}

		/*
		 *	Unlock everything, and return
		 */

		vm_map_unlock_read(dst_map);

		if (result_page != VM_PAGE_NULL && src_page != dst_page) {
			vm_fault_copy_cleanup(result_page, src_top_page);
		}
		vm_fault_copy_dst_cleanup(dst_page);

		amount_left -= part_size;
		src_offset += part_size;
		dst_offset += part_size;
	} while (amount_left > 0);

	RETURN(KERN_SUCCESS);
#undef  RETURN

	/*NOTREACHED*/
}
7151 
7152 #if     VM_FAULT_CLASSIFY
/*
 *	Temporary statistics gathering support.
 */

/*
 *	Statistics arrays:
 */
/* dimensions of the fault-classification histogram below */
#define VM_FAULT_TYPES_MAX      5
#define VM_FAULT_LEVEL_MAX      8

/* fault counts, bucketed by [fault type][shadow-chain depth] */
int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];

/* fault type indices for the first dimension of vm_fault_stats */
#define VM_FAULT_TYPE_ZERO_FILL 0
#define VM_FAULT_TYPE_MAP_IN    1
#define VM_FAULT_TYPE_PAGER     2
#define VM_FAULT_TYPE_COPY      3
#define VM_FAULT_TYPE_OTHER     4

7171 
7172 void
vm_fault_classify(vm_object_t object,vm_object_offset_t offset,vm_prot_t fault_type)7173 vm_fault_classify(vm_object_t           object,
7174     vm_object_offset_t    offset,
7175     vm_prot_t             fault_type)
7176 {
7177 	int             type, level = 0;
7178 	vm_page_t       m;
7179 
7180 	while (TRUE) {
7181 		m = vm_page_lookup(object, offset);
7182 		if (m != VM_PAGE_NULL) {
7183 			if (m->vmp_busy || VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent) {
7184 				type = VM_FAULT_TYPE_OTHER;
7185 				break;
7186 			}
7187 			if (((fault_type & VM_PROT_WRITE) == 0) ||
7188 			    ((level == 0) && object->copy == VM_OBJECT_NULL)) {
7189 				type = VM_FAULT_TYPE_MAP_IN;
7190 				break;
7191 			}
7192 			type = VM_FAULT_TYPE_COPY;
7193 			break;
7194 		} else {
7195 			if (object->pager_created) {
7196 				type = VM_FAULT_TYPE_PAGER;
7197 				break;
7198 			}
7199 			if (object->shadow == VM_OBJECT_NULL) {
7200 				type = VM_FAULT_TYPE_ZERO_FILL;
7201 				break;
7202 			}
7203 
7204 			offset += object->vo_shadow_offset;
7205 			object = object->shadow;
7206 			level++;
7207 			continue;
7208 		}
7209 	}
7210 
7211 	if (level > VM_FAULT_LEVEL_MAX) {
7212 		level = VM_FAULT_LEVEL_MAX;
7213 	}
7214 
7215 	vm_fault_stats[type][level] += 1;
7216 
7217 	return;
7218 }
7219 
7220 /* cleanup routine to call from debugger */
7221 
7222 void
vm_fault_classify_init(void)7223 vm_fault_classify_init(void)
7224 {
7225 	int type, level;
7226 
7227 	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
7228 		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
7229 			vm_fault_stats[type][level] = 0;
7230 		}
7231 	}
7232 
7233 	return;
7234 }
7235 #endif  /* VM_FAULT_CLASSIFY */
7236 
/*
 * kdp_lightweight_fault:
 *
 * Debugger-context (KDP) translation of a page-aligned user virtual
 * address in "map" to the physical address of its backing page,
 * without taking any locks or blocking.  Returns 0 whenever the
 * translation cannot be performed safely (a needed lock is held
 * exclusively, the page is busy/absent/in an unusual state, etc.).
 * For compressed pages, the data is decompressed into a dedicated
 * KDP scratch page and that page's physical address is returned.
 */
vm_offset_t
kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
{
	vm_map_entry_t  entry;
	vm_object_t     object;
	vm_offset_t     object_offset;
	vm_page_t       m;
	int             compressor_external_state, compressed_count_delta;
	int             compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
	int             my_fault_type = VM_PROT_READ;
	kern_return_t   kr;
	int effective_page_mask, effective_page_size;

	/* honor the map's page size when it is smaller than the kernel's */
	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(map);
		effective_page_size = VM_MAP_PAGE_SIZE(map);
	} else {
		effective_page_mask = PAGE_MASK;
		effective_page_size = PAGE_SIZE;
	}

	if (not_in_kdp) {
		panic("kdp_lightweight_fault called from outside of debugger context");
	}

	assert(map != VM_MAP_NULL);

	/* the caller must supply a page-aligned address */
	assert((cur_target_addr & effective_page_mask) == 0);
	if ((cur_target_addr & effective_page_mask) != 0) {
		return 0;
	}

	/* can't safely walk a map whose lock is held exclusively */
	if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
		return 0;
	}

	if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
		return 0;
	}

	if (entry->is_sub_map) {
		return 0;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		return 0;
	}

	object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);

	/* walk the shadow chain looking for the page */
	while (TRUE) {
		/* can't safely examine an exclusively-locked object */
		if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
			return 0;
		}

		/* in-flight paging activity could mutate the object under us */
		if (object->pager_created && (object->paging_in_progress ||
		    object->activity_in_progress)) {
			return 0;
		}

		m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset));

		if (m != VM_PAGE_NULL) {
			/* only default-cacheability mappings are safe to read */
			if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
				return 0;
			}

			/* the page must be in a stable, ordinary state */
			if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_cleaning ||
			    m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
				return 0;
			}

			assert(!m->vmp_private);
			if (m->vmp_private) {
				return 0;
			}

			assert(!m->vmp_fictitious);
			if (m->vmp_fictitious) {
				return 0;
			}

			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
			if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
				return 0;
			}

			return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
		}

		compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;

		if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
			if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
				/*
				 * Decompress the page into the dedicated KDP
				 * scratch page, without blocking (C_DONT_BLOCK)
				 * and keeping the compressed copy (C_KEEP).
				 */
				kr = vm_compressor_pager_get(object->pager,
				    vm_object_trunc_page(object_offset + object->paging_offset),
				    kdp_compressor_decompressed_page_ppnum, &my_fault_type,
				    compressor_flags, &compressed_count_delta);
				if (kr == KERN_SUCCESS) {
					return kdp_compressor_decompressed_page_paddr;
				} else {
					return 0;
				}
			}
		}

		if (object->shadow == VM_OBJECT_NULL) {
			return 0;
		}

		/* move down one level of the shadow chain */
		object_offset += object->vo_shadow_offset;
		object = object->shadow;
	}
}
7352 
/*
 * vm_page_validate_cs_fast():
 * Performs a few quick checks to determine if the page's code signature
 * really needs to be fully validated.  It could:
 *	1. have been modified (i.e. automatically tainted),
 *	2. have already been validated,
 *	3. have already been found to be tainted,
 *	4. no longer have a backing store.
 * Returns FALSE if the page needs to be fully validated.
 */
static boolean_t
vm_page_validate_cs_fast(
	vm_page_t       page,
	vm_map_size_t   fault_page_size,
	vm_map_offset_t fault_phys_offset)
{
	vm_object_t     object;

	object = VM_PAGE_OBJECT(page);
	vm_object_lock_assert_held(object);

	/* case 1: a write mapping existed at some point: auto-taint */
	if (page->vmp_wpmapped &&
	    !VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
		/*
		 * This page was mapped for "write" access sometime in the
		 * past and could still be modifiable in the future.
		 * Consider it tainted.
		 * [ If the page was already found to be "tainted", no
		 * need to re-validate. ]
		 */
		vm_object_lock_assert_exclusive(object);
		VMP_CS_SET_VALIDATED(page, fault_page_size, fault_phys_offset, TRUE);
		VMP_CS_SET_TAINTED(page, fault_page_size, fault_phys_offset, TRUE);
		if (cs_debug) {
			printf("CODESIGNING: %s: "
			    "page %p obj %p off 0x%llx "
			    "was modified\n",
			    __FUNCTION__,
			    page, object, page->vmp_offset);
		}
		vm_cs_validated_dirtied++;
	}

	/* cases 2 and 3: already validated, or already known tainted */
	if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) ||
	    VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
		return TRUE;
	}
	vm_object_lock_assert_exclusive(object);

#if CHECK_CS_VALIDATION_BITMAP
	kern_return_t kr;

	/* a set bitmap entry means this page was validated before */
	kr = vnode_pager_cs_check_validation_bitmap(
		object->pager,
		page->vmp_offset + object->paging_offset,
		CS_BITMAP_CHECK);
	if (kr == KERN_SUCCESS) {
		page->vmp_cs_validated = VMP_CS_ALL_TRUE;
		page->vmp_cs_tainted = VMP_CS_ALL_FALSE;
		vm_cs_bitmap_validated++;
		return TRUE;
	}
#endif /* CHECK_CS_VALIDATION_BITMAP */

	/* case 4: no backing store to validate against */
	if (!object->alive || object->terminating || object->pager == NULL) {
		/*
		 * The object is terminating and we don't have its pager
		 * so we can't validate the data...
		 */
		return TRUE;
	}

	/* we need to really validate this page */
	vm_object_lock_assert_exclusive(object);
	return FALSE;
}
7429 
7430 void
vm_page_validate_cs_mapped_slow(vm_page_t page,const void * kaddr)7431 vm_page_validate_cs_mapped_slow(
7432 	vm_page_t       page,
7433 	const void      *kaddr)
7434 {
7435 	vm_object_t             object;
7436 	memory_object_offset_t  mo_offset;
7437 	memory_object_t         pager;
7438 	struct vnode            *vnode;
7439 	int                     validated, tainted, nx;
7440 
7441 	assert(page->vmp_busy);
7442 	object = VM_PAGE_OBJECT(page);
7443 	vm_object_lock_assert_exclusive(object);
7444 
7445 	vm_cs_validates++;
7446 
7447 	/*
7448 	 * Since we get here to validate a page that was brought in by
7449 	 * the pager, we know that this pager is all setup and ready
7450 	 * by now.
7451 	 */
7452 	assert(object->code_signed);
7453 	assert(!object->internal);
7454 	assert(object->pager != NULL);
7455 	assert(object->pager_ready);
7456 
7457 	pager = object->pager;
7458 	assert(object->paging_in_progress);
7459 	vnode = vnode_pager_lookup_vnode(pager);
7460 	mo_offset = page->vmp_offset + object->paging_offset;
7461 
7462 	/* verify the SHA1 hash for this page */
7463 	validated = 0;
7464 	tainted = 0;
7465 	nx = 0;
7466 	cs_validate_page(vnode,
7467 	    pager,
7468 	    mo_offset,
7469 	    (const void *)((const char *)kaddr),
7470 	    &validated,
7471 	    &tainted,
7472 	    &nx);
7473 
7474 	page->vmp_cs_validated |= validated;
7475 	page->vmp_cs_tainted |= tainted;
7476 	page->vmp_cs_nx |= nx;
7477 
7478 #if CHECK_CS_VALIDATION_BITMAP
7479 	if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
7480 	    page->vmp_cs_tainted == VMP_CS_ALL_FALSE) {
7481 		vnode_pager_cs_check_validation_bitmap(object->pager,
7482 		    mo_offset,
7483 		    CS_BITMAP_SET);
7484 	}
7485 #endif /* CHECK_CS_VALIDATION_BITMAP */
7486 }
7487 
7488 void
vm_page_validate_cs_mapped(vm_page_t page,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset,const void * kaddr)7489 vm_page_validate_cs_mapped(
7490 	vm_page_t       page,
7491 	vm_map_size_t   fault_page_size,
7492 	vm_map_offset_t fault_phys_offset,
7493 	const void      *kaddr)
7494 {
7495 	if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7496 		vm_page_validate_cs_mapped_slow(page, kaddr);
7497 	}
7498 }
7499 
/*
 * Map the given page into the kernel address space and run the full
 * code-signature validation (vm_page_validate_cs_mapped_slow) on it.
 * The page is kept busy and the object holds a paging reference for
 * the duration of the mapping; the object's exclusive lock is held
 * throughout.
 */
static void
vm_page_map_and_validate_cs(
	vm_object_t     object,
	vm_page_t       page)
{
	vm_object_offset_t      offset;
	vm_map_offset_t         koffset;
	vm_map_size_t           ksize;
	vm_offset_t             kaddr;
	kern_return_t           kr;
	boolean_t               busy_page;
	boolean_t               need_unmap;

	vm_object_lock_assert_exclusive(object);

	assert(object->code_signed);
	offset = page->vmp_offset;

	busy_page = page->vmp_busy;
	if (!busy_page) {
		/* keep page busy while we map (and unlock) the VM object */
		page->vmp_busy = TRUE;
	}

	/*
	 * Take a paging reference on the VM object
	 * to protect it from collapse or bypass,
	 * and keep it from disappearing too.
	 */
	vm_object_paging_begin(object);

	/* map the page in the kernel address space */
	ksize = PAGE_SIZE_64;
	koffset = 0;
	need_unmap = FALSE;
	kr = vm_paging_map_object(page,
	    object,
	    offset,
	    VM_PROT_READ,
	    FALSE,                       /* can't unlock object ! */
	    &ksize,
	    &koffset,
	    &need_unmap);
	if (kr != KERN_SUCCESS) {
		panic("%s: could not map page: 0x%x", __FUNCTION__, kr);
	}
	kaddr = CAST_DOWN(vm_offset_t, koffset);

	/* validate the mapped page */
	vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);

	assert(page->vmp_busy);
	assert(object == VM_PAGE_OBJECT(page));
	vm_object_lock_assert_exclusive(object);

	if (!busy_page) {
		/* we made the page busy above: wake up any waiters */
		PAGE_WAKEUP_DONE(page);
	}
	if (need_unmap) {
		/* unmap the map from the kernel address space */
		vm_paging_unmap_object(object, koffset, koffset + ksize);
		koffset = 0;
		ksize = 0;
		kaddr = 0;
	}
	vm_object_paging_end(object);
}
7567 
7568 void
vm_page_validate_cs(vm_page_t page,vm_map_size_t fault_page_size,vm_map_offset_t fault_phys_offset)7569 vm_page_validate_cs(
7570 	vm_page_t       page,
7571 	vm_map_size_t   fault_page_size,
7572 	vm_map_offset_t fault_phys_offset)
7573 {
7574 	vm_object_t             object;
7575 
7576 	object = VM_PAGE_OBJECT(page);
7577 	vm_object_lock_assert_held(object);
7578 
7579 	if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7580 		return;
7581 	}
7582 	vm_page_map_and_validate_cs(object, page);
7583 }
7584 
/*
 * Validate the code signature of a sub-page chunk of an already
 * kernel-mapped page.  On return, *validated_p reports whether the
 * chunk matched its signature and *tainted_p carries the taint flags
 * returned by cs_validate_range().  Silently returns (both outputs
 * FALSE/0) if the object no longer has a usable pager.
 */
void
vm_page_validate_cs_mapped_chunk(
	vm_page_t       page,
	const void      *kaddr,
	vm_offset_t     chunk_offset,
	vm_size_t       chunk_size,
	boolean_t       *validated_p,
	unsigned        *tainted_p)
{
	vm_object_t             object;
	vm_object_offset_t      offset, offset_in_page;
	memory_object_t         pager;
	struct vnode            *vnode;
	boolean_t               validated;
	unsigned                tainted;

	*validated_p = FALSE;
	*tainted_p = 0;

	assert(page->vmp_busy);
	object = VM_PAGE_OBJECT(page);
	vm_object_lock_assert_exclusive(object);

	assert(object->code_signed);
	offset = page->vmp_offset;

	if (!object->alive || object->terminating || object->pager == NULL) {
		/*
		 * The object is terminating and we don't have its pager
		 * so we can't validate the data...
		 */
		return;
	}
	/*
	 * Since we get here to validate a page that was brought in by
	 * the pager, we know that this pager is all setup and ready
	 * by now.
	 */
	assert(!object->internal);
	assert(object->pager != NULL);
	assert(object->pager_ready);

	pager = object->pager;
	assert(object->paging_in_progress);
	vnode = vnode_pager_lookup_vnode(pager);

	/* verify the signature for this chunk */
	offset_in_page = chunk_offset;
	assert(offset_in_page < PAGE_SIZE);

	tainted = 0;
	validated = cs_validate_range(vnode,
	    pager,
	    (object->paging_offset +
	    offset +
	    offset_in_page),
	    (const void *)((const char *)kaddr
	    + offset_in_page),
	    chunk_size,
	    &tainted);
	if (validated) {
		*validated_p = TRUE;
	}
	if (tainted) {
		*tainted_p = tainted;
	}
}
7652 
/* take the spin lock protecting the real-time fault record buffer */
static void
vm_rtfrecord_lock(void)
{
	lck_spin_lock(&vm_rtfr_slock);
}
7658 
/* release the spin lock protecting the real-time fault record buffer */
static void
vm_rtfrecord_unlock(void)
{
	lck_spin_unlock(&vm_rtfr_slock);
}
7664 
/*
 * Size in bytes of the buffer needed to hold every real-time fault
 * record.  NOTE(review): the product is returned as unsigned int —
 * presumably vmrtf_num_records is small enough not to overflow; verify
 * where it is initialized.
 */
unsigned int
vmrtfaultinfo_bufsz(void)
{
	return vmrtf_num_records * sizeof(vm_rtfault_record_t);
}
7670 
7671 #include <kern/backtrace.h>
7672 
7673 __attribute__((noinline))
7674 static void
vm_record_rtfault(thread_t cthread,uint64_t fstart,vm_map_offset_t fault_vaddr,int type_of_fault)7675 vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
7676 {
7677 	uint64_t fend = mach_continuous_time();
7678 
7679 	uint64_t cfpc = 0;
7680 	uint64_t ctid = cthread->thread_id;
7681 	uint64_t cupid = get_current_unique_pid();
7682 
7683 	uintptr_t bpc = 0;
7684 	errno_t btr = 0;
7685 
7686 	/*
7687 	 * Capture a single-frame backtrace.  This extracts just the program
7688 	 * counter at the point of the fault, and should not use copyin to get
7689 	 * Rosetta save state.
7690 	 */
7691 	struct backtrace_control ctl = {
7692 		.btc_user_thread = cthread,
7693 		.btc_user_copy = backtrace_user_copy_error,
7694 	};
7695 	unsigned int bfrs = backtrace_user(&bpc, 1U, &ctl, NULL);
7696 	if ((btr == 0) && (bfrs > 0)) {
7697 		cfpc = bpc;
7698 	}
7699 
7700 	assert((fstart != 0) && fend >= fstart);
7701 	vm_rtfrecord_lock();
7702 	assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
7703 
7704 	vmrtfrs.vmrtf_total++;
7705 	vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
7706 
7707 	cvmr->rtfabstime = fstart;
7708 	cvmr->rtfduration = fend - fstart;
7709 	cvmr->rtfaddr = fault_vaddr;
7710 	cvmr->rtfpc = cfpc;
7711 	cvmr->rtftype = type_of_fault;
7712 	cvmr->rtfupid = cupid;
7713 	cvmr->rtftid = ctid;
7714 
7715 	if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
7716 		vmrtfrs.vmrtfr_curi = 0;
7717 	}
7718 
7719 	vm_rtfrecord_unlock();
7720 }
7721 
/*
 * Copy real-time fault records into the caller-supplied buffer
 * "vrecords" of "vrecordsz" bytes.  Only records whose unique pid
 * matches "cupid" are returned, except that root sees all records on
 * DEVELOPMENT/DEBUG kernels.  Sets *vmrtfrv to the number of records
 * extracted and returns TRUE if the buffer filled before the whole
 * record array was scanned.
 */
int
vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void *vrecords, unsigned long *vmrtfrv)
{
	vm_rtfault_record_t *cvmrd = vrecords;
	size_t residue = vrecordsz;
	size_t numextracted = 0;
	boolean_t early_exit = FALSE;

	vm_rtfrecord_lock();

	for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
		/* stop if the remaining buffer can't hold another record */
		if (residue < sizeof(vm_rtfault_record_t)) {
			early_exit = TRUE;
			break;
		}

		if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
#if     DEVELOPMENT || DEBUG
			/* root sees everyone's records on development kernels */
			if (isroot == FALSE) {
				continue;
			}
#else
			continue;
#endif /* DEVELOPMENT || DEBUG */
		}

		*cvmrd = vmrtfrs.vm_rtf_records[vmfi];
		cvmrd++;
		residue -= sizeof(vm_rtfault_record_t);
		numextracted++;
	}

	vm_rtfrecord_unlock();

	*vmrtfrv = numextracted;
	return early_exit;
}
7759 
/*
 * Only allow one diagnosis to be in flight at a time, to avoid
 * creating too much additional memory usage.
 */
static volatile uint_t vmtc_diagnosing;   /* compare-and-swapped between 0 and 1 */
unsigned int vmtc_total = 0;              /* count of corrupt text pages detected */

/*
 * Type used to update telemetry for the diagnosis counts.
 */
CA_EVENT(vmtc_telemetry,
    CA_INT, vmtc_num_byte,            /* number of corrupt bytes found */
    CA_BOOL, vmtc_undiagnosed,        /* undiagnosed because more than 1 at a time */
    CA_BOOL, vmtc_not_eligible,       /* the page didn't qualify */
    CA_BOOL, vmtc_copyin_fail,        /* unable to copy in the page */
    CA_BOOL, vmtc_not_found,          /* no corruption found even though CS failed */
    CA_BOOL, vmtc_one_bit_flip,       /* single bit flip */
    CA_BOOL, vmtc_testing);           /* caused on purpose by testing */

#if DEVELOPMENT || DEBUG
/*
 * Buffers used to compare before/after page contents.
 * Stashed to aid when debugging crashes.
 */
static size_t vmtc_last_buffer_size = 0;
static uint64_t *vmtc_last_before_buffer = NULL;
static uint64_t *vmtc_last_after_buffer = NULL;

/*
 * Needed to record corruptions due to testing.
 */
static uintptr_t corruption_test_va = 0;
#endif /* DEVELOPMENT || DEBUG */
7793 
/*
 * Stash a copy of data from a possibly corrupt page.
 * Returns a size-aligned buffer allocated from kernel_map (the caller
 * frees it with kmem_free), or NULL if the allocation failed.
 */
static uint64_t *
vmtc_get_page_data(
	vm_map_offset_t code_addr,
	vm_page_t       page)
{
	uint64_t        *buffer = NULL;
	addr64_t        buffer_paddr;
	addr64_t        page_paddr;
	extern void     bcopy_phys(addr64_t from, addr64_t to, vm_size_t bytes);
	uint_t          size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);

	/*
	 * Need an aligned buffer to do a physical copy.
	 * (the allocation mask of size - 1 requests "size" alignment)
	 */
	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)&buffer,
	    size, size - 1, KMA_KOBJECT, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) {
		return NULL;
	}
	buffer_paddr = kvtophys((vm_offset_t)buffer);
	page_paddr = ptoa(VM_PAGE_GET_PHYS_PAGE(page));

	/* adjust the page start address if we need only 4K of a 16K page */
	if (size < PAGE_SIZE) {
		uint_t subpage_start = ((code_addr & (PAGE_SIZE - 1)) & ~(size - 1));
		page_paddr += subpage_start;
	}

	/* copy the page contents via their physical addresses */
	bcopy_phys(page_paddr, buffer_paddr, size);
	return buffer;
}
7827 
/*
 * Set things up so we can diagnose a potential text page corruption.
 * Grabs the single diagnosis slot (vmtc_diagnosing) and stashes a copy
 * of the page's current contents.  Returns NULL — with the telemetry
 * event updated accordingly — if another diagnosis is already in
 * flight or the page copy failed.
 */
static uint64_t *
vmtc_text_page_diagnose_setup(
	vm_map_offset_t code_addr,
	vm_page_t       page,
	CA_EVENT_TYPE(vmtc_telemetry) *event)
{
	uint64_t        *buffer = NULL;

	/*
	 * If another is being diagnosed, skip this one.
	 */
	if (!OSCompareAndSwap(0, 1, &vmtc_diagnosing)) {
		event->vmtc_undiagnosed = true;
		return NULL;
	}

	/*
	 * Get the contents of the corrupt page.
	 */
	buffer = vmtc_get_page_data(code_addr, page);
	if (buffer == NULL) {
		/* couldn't copy the page: release the diagnosis slot */
		event->vmtc_copyin_fail = true;
		if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
			panic("Bad compare and swap in setup!");
		}
		return NULL;
	}
	return buffer;
}
7860 
/*
 * Diagnose the text page by comparing its contents with
 * the one we've previously saved.
 * Updates the telemetry event with the number of differing bytes and
 * whether the difference was a single bit flip.  Consumes
 * old_code_buffer and releases the vmtc_diagnosing slot when done.
 */
static void
vmtc_text_page_diagnose(
	vm_map_offset_t code_addr,
	uint64_t        *old_code_buffer,
	CA_EVENT_TYPE(vmtc_telemetry) *event)
{
	uint64_t        *new_code_buffer;
	size_t          size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
	uint_t          count = (uint_t)size / sizeof(uint64_t);
	uint_t          diff_count = 0;
	bool            bit_flip = false;
	uint_t          b;
	uint64_t        *new;
	uint64_t        *old;

	new_code_buffer = kalloc_data(size, Z_WAITOK);
	assert(new_code_buffer != NULL);
	/* fault in a fresh copy of the page from the user address space */
	if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), new_code_buffer, size) != 0) {
		/* copyin error, so undo things */
		event->vmtc_copyin_fail = true;
		goto done;
	}

	/* walk both copies one 64-bit word at a time */
	new = new_code_buffer;
	old = old_code_buffer;
	for (; count-- > 0; ++new, ++old) {
		if (*new == *old) {
			continue;
		}

		/*
		 * On first diff, check for a single bit flip
		 */
		if (diff_count == 0) {
			uint64_t x = (*new ^ *old);
			assert(x != 0);
			/* XOR with a power-of-two value: exactly one bit differs */
			if ((x & (x - 1)) == 0) {
				bit_flip = true;
				++diff_count;
				continue;
			}
		}

		/*
		 * count up the number of different bytes.
		 */
		for (b = 0; b < sizeof(uint64_t); ++b) {
			char *n = (char *)new;
			char *o = (char *)old;
			if (n[b] != o[b]) {
				++diff_count;
			}
		}
	}

	/* a lone bit flip only counts if it was the only difference */
	if (diff_count > 1) {
		bit_flip = false;
	}

	if (diff_count == 0) {
		event->vmtc_not_found = true;
	} else {
		event->vmtc_num_byte = diff_count;
	}
	if (bit_flip) {
		event->vmtc_one_bit_flip = true;
	}

done:
	/*
	 * Free up the code copy buffers, but save the last
	 * set on development / debug kernels in case they
	 * can provide evidence for debugging memory stomps.
	 */
#if DEVELOPMENT || DEBUG
	if (vmtc_last_before_buffer != NULL) {
		kmem_free(kernel_map, (vm_offset_t)vmtc_last_before_buffer, vmtc_last_buffer_size);
	}
	if (vmtc_last_after_buffer != NULL) {
		kfree_data(vmtc_last_after_buffer, vmtc_last_buffer_size);
	}
	vmtc_last_before_buffer = old_code_buffer;
	vmtc_last_after_buffer = new_code_buffer;
	vmtc_last_buffer_size = size;
#else /* DEVELOPMENT || DEBUG */
	kfree_data(new_code_buffer, size);
	kmem_free(kernel_map, (vm_offset_t)old_code_buffer, size);
#endif /* DEVELOPMENT || DEBUG */

	/*
	 * We're finished, so clear the diagnosing flag.
	 */
	if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
		panic("Bad compare and swap in diagnose!");
	}
}
7961 
/*
 * For the given map, virt address, find the object, offset, and page.
 * This has to lookup the map entry, verify protections, walk any shadow chains.
 * If found, returns with the object locked.
 */
static kern_return_t
vmtc_revalidate_lookup(
	vm_map_t               map,
	vm_map_offset_t        vaddr,
	vm_object_t            *ret_object,
	vm_object_offset_t     *ret_offset,
	vm_page_t              *ret_page,
	vm_prot_t              *ret_prot)
{
	vm_object_t            object;
	vm_object_offset_t     offset;
	vm_page_t              page;
	kern_return_t          kr = KERN_SUCCESS;
	uint8_t                object_lock_type = OBJECT_LOCK_EXCLUSIVE;
	vm_map_version_t       version;
	boolean_t              wired;
	struct vm_object_fault_info fault_info = {};
	vm_map_t               real_map = NULL;
	vm_prot_t              prot;
	vm_object_t            shadow;

	/*
	 * Find the object/offset for the given location/map.
	 * Note this returns with the object locked.
	 */
restart:
	vm_map_lock_read(map);
	object = VM_OBJECT_NULL;        /* in case we come around the restart path */
	kr = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
	    object_lock_type, &version, &object, &offset, &prot, &wired,
	    &fault_info, &real_map, NULL);
	vm_map_unlock_read(map);
	/* the lookup may have locked a submap's real map: drop it */
	if (real_map != NULL && real_map != map) {
		vm_map_unlock(real_map);
	}

	/*
	 * If there's no page here, fail.
	 */
	if (kr != KERN_SUCCESS || object == NULL) {
		kr = KERN_FAILURE;
		goto done;
	}

	/*
	 * Chase down any shadow chains to find the actual page.
	 */
	for (;;) {
		/*
		 * See if the page is on the current object.
		 */
		page = vm_page_lookup(object, vm_object_trunc_page(offset));
		if (page != NULL) {
			/* restart the lookup */
			if (page->vmp_restart) {
				vm_object_unlock(object);
				goto restart;
			}

			/*
			 * If this page is busy, we need to wait for it.
			 */
			if (page->vmp_busy) {
				PAGE_SLEEP(object, page, TRUE);
				vm_object_unlock(object);
				goto restart;
			}
			break;
		}

		/*
		 * If the object doesn't have the page and
		 * has no shadow, then we can quit.
		 */
		shadow = object->shadow;
		if (shadow == NULL) {
			kr = KERN_FAILURE;
			goto done;
		}

		/*
		 * Move to the next object
		 * (lock the shadow before dropping the current object's lock)
		 */
		offset += object->vo_shadow_offset;
		vm_object_lock(shadow);
		vm_object_unlock(object);
		object = shadow;
		shadow = VM_OBJECT_NULL;
	}
	*ret_object = object;
	*ret_offset = vm_object_trunc_page(offset);
	*ret_page = page;
	*ret_prot = prot;

done:
	/* on failure, don't leak the object lock */
	if (kr != KERN_SUCCESS && object != NULL) {
		vm_object_unlock(object);
	}
	return kr;
}
8067 
8068 /*
8069  * Check if a page is wired, needs extra locking.
8070  */
8071 static bool
is_page_wired(vm_page_t page)8072 is_page_wired(vm_page_t page)
8073 {
8074 	bool result;
8075 	vm_page_lock_queues();
8076 	result = VM_PAGE_WIRED(page);
8077 	vm_page_unlock_queues();
8078 	return result;
8079 }
8080 
/*
 * A fatal process error has occurred in the given task.
 * Recheck the code signing of the text page at the given
 * address to check for a text page corruption.
 *
 * Returns KERN_FAILURE if a page was found to be corrupt
 * by failing to match its code signature. KERN_SUCCESS
 * means the page is either valid or we don't have the
 * information to say it's corrupt.
 */
kern_return_t
revalidate_text_page(task_t task, vm_map_offset_t code_addr)
{
	kern_return_t          kr;
	vm_map_t               map;
	vm_object_t            object = NULL;
	vm_object_offset_t     offset;
	vm_page_t              page = NULL;
	struct vnode           *vnode;
	uint64_t               *diagnose_buffer = NULL;
	CA_EVENT_TYPE(vmtc_telemetry) * event = NULL;
	ca_event_t             ca_event = NULL;
	vm_prot_t              prot;

	map = task->map;
	if (task->map == NULL) {
		return KERN_SUCCESS;
	}

	/* find the object/offset/page backing code_addr; object comes back locked */
	kr = vmtc_revalidate_lookup(map, code_addr, &object, &offset, &page, &prot);
	if (kr != KERN_SUCCESS) {
		goto done;
	}

	/*
	 * The page must be executable.
	 */
	if (!(prot & VM_PROT_EXECUTE)) {
		goto done;
	}

	/*
	 * The object needs to have a pager.
	 */
	if (object->pager == NULL) {
		goto done;
	}

	/*
	 * Needs to be a vnode backed page to have a signature.
	 */
	vnode = vnode_pager_lookup_vnode(object->pager);
	if (vnode == NULL) {
		goto done;
	}

	/*
	 * Object checks to see if we should proceed.
	 */
	if (!object->code_signed ||     /* no code signature to check */
	    object->internal ||         /* internal objects aren't signed */
	    object->terminating ||      /* the object and its pages are already going away */
	    !object->pager_ready) {     /* this should happen, but check shouldn't hurt */
		goto done;
	}


	/*
	 * Check the code signature of the page in question.
	 */
	vm_page_map_and_validate_cs(object, page);

	/*
	 * At this point:
	 * vmp_cs_validated |= validated (set if a code signature exists)
	 * vmp_cs_tainted |= tainted (set if code signature violation)
	 * vmp_cs_nx |= nx;  ??
	 *
	 * if vmp_pmapped then have to pmap_disconnect..
	 * other flags to check on object or page?
	 */
	if (page->vmp_cs_tainted != VMP_CS_ALL_FALSE) {
#if DEBUG || DEVELOPMENT
		/*
		 * On development builds, a boot-arg can be used to cause
		 * a panic, instead of a quiet repair.
		 */
		if (vmtc_panic_instead) {
			panic("Text page corruption detected: vm_page_t 0x%llx", (long long)(uintptr_t)page);
		}
#endif /* DEBUG || DEVELOPMENT */

		/*
		 * We're going to invalidate this page. Grab a copy of it for comparison.
		 */
		ca_event = CA_EVENT_ALLOCATE(vmtc_telemetry);
		event = ca_event->data;
		diagnose_buffer = vmtc_text_page_diagnose_setup(code_addr, page, event);

		/*
		 * Invalidate, i.e. toss, the corrupted page.
		 * Only pages in a plain, reclaimable state qualify.
		 */
		if (!page->vmp_cleaning &&
		    !page->vmp_laundry &&
		    !page->vmp_fictitious &&
		    !page->vmp_precious &&
		    !page->vmp_absent &&
		    !VMP_ERROR_GET(page) &&
		    !page->vmp_dirty &&
		    !is_page_wired(page)) {
			if (page->vmp_pmapped) {
				/* pull all mappings and fold ref/mod state into the page */
				int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
				if (refmod & VM_MEM_MODIFIED) {
					SET_PAGE_DIRTY(page, FALSE);
				}
				if (refmod & VM_MEM_REFERENCED) {
					page->vmp_reference = TRUE;
				}
			}
			/* If the page seems intentionally modified, don't trash it. */
			if (!page->vmp_dirty) {
				VM_PAGE_FREE(page);
			} else {
				event->vmtc_not_eligible = true;
			}
		} else {
			event->vmtc_not_eligible = true;
		}
		vm_object_unlock(object);
		object = VM_OBJECT_NULL;

		/*
		 * Now try to diagnose the type of failure by faulting
		 * in a new copy and diff'ing it with what we saved.
		 */
		if (diagnose_buffer != NULL) {
			vmtc_text_page_diagnose(code_addr, diagnose_buffer, event);
		}
#if DEBUG || DEVELOPMENT
		if (corruption_test_va != 0) {
			corruption_test_va = 0;
			event->vmtc_testing = true;
		}
#endif /* DEBUG || DEVELOPMENT */
		ktriage_record(thread_tid(current_thread()),
		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_TEXT_CORRUPTION),
		    0 /* arg */);
		CA_EVENT_SEND(ca_event);
		printf("Text page corruption detected for pid %d\n", proc_selfpid());
		++vmtc_total;
		return KERN_FAILURE; /* failure means we definitely found a corrupt page */
	}
done:
	if (object != NULL) {
		vm_object_unlock(object);
	}
	return KERN_SUCCESS;
}
8239 
8240 #if DEBUG || DEVELOPMENT
8241 /*
8242  * For implementing unit tests - ask the pmap to corrupt a text page.
8243  * We have to find the page, to get the physical address, then invoke
8244  * the pmap.
8245  */
8246 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
8247 
8248 kern_return_t
vm_corrupt_text_addr(uintptr_t va)8249 vm_corrupt_text_addr(uintptr_t va)
8250 {
8251 	task_t                 task = current_task();
8252 	vm_map_t               map;
8253 	kern_return_t          kr = KERN_SUCCESS;
8254 	vm_object_t            object = VM_OBJECT_NULL;
8255 	vm_object_offset_t     offset;
8256 	vm_page_t              page = NULL;
8257 	pmap_paddr_t           pa;
8258 	vm_prot_t              prot;
8259 
8260 	map = task->map;
8261 	if (task->map == NULL) {
8262 		printf("corrupt_text_addr: no map\n");
8263 		return KERN_FAILURE;
8264 	}
8265 
8266 	kr = vmtc_revalidate_lookup(map, (vm_map_offset_t)va, &object, &offset, &page, &prot);
8267 	if (kr != KERN_SUCCESS) {
8268 		printf("corrupt_text_addr: page lookup failed\n");
8269 		return kr;
8270 	}
8271 	if (!(prot & VM_PROT_EXECUTE)) {
8272 		printf("corrupt_text_addr: page not executable\n");
8273 		return KERN_FAILURE;
8274 	}
8275 
8276 	/* get the physical address to use */
8277 	pa = ptoa(VM_PAGE_GET_PHYS_PAGE(page)) + (va - vm_object_trunc_page(va));
8278 
8279 	/*
8280 	 * Check we have something we can work with.
8281 	 * Due to racing with pageout as we enter the sysctl,
8282 	 * it's theoretically possible to have the page disappear, just
8283 	 * before the lookup.
8284 	 *
8285 	 * That's highly likely to happen often. I've filed a radar 72857482
8286 	 * to bubble up the error here to the sysctl result and have the
8287 	 * test not FAIL in that case.
8288 	 */
8289 	if (page->vmp_busy) {
8290 		printf("corrupt_text_addr: vmp_busy\n");
8291 		kr = KERN_FAILURE;
8292 	}
8293 	if (page->vmp_cleaning) {
8294 		printf("corrupt_text_addr: vmp_cleaning\n");
8295 		kr = KERN_FAILURE;
8296 	}
8297 	if (page->vmp_laundry) {
8298 		printf("corrupt_text_addr: vmp_cleaning\n");
8299 		kr = KERN_FAILURE;
8300 	}
8301 	if (page->vmp_fictitious) {
8302 		printf("corrupt_text_addr: vmp_fictitious\n");
8303 		kr = KERN_FAILURE;
8304 	}
8305 	if (page->vmp_precious) {
8306 		printf("corrupt_text_addr: vmp_precious\n");
8307 		kr = KERN_FAILURE;
8308 	}
8309 	if (page->vmp_absent) {
8310 		printf("corrupt_text_addr: vmp_absent\n");
8311 		kr = KERN_FAILURE;
8312 	}
8313 	if (VMP_ERROR_GET(page)) {
8314 		printf("corrupt_text_addr: vmp_error\n");
8315 		kr = KERN_FAILURE;
8316 	}
8317 	if (page->vmp_dirty) {
8318 		printf("corrupt_text_addr: vmp_dirty\n");
8319 		kr = KERN_FAILURE;
8320 	}
8321 	if (is_page_wired(page)) {
8322 		printf("corrupt_text_addr: wired\n");
8323 		kr = KERN_FAILURE;
8324 	}
8325 	if (!page->vmp_pmapped) {
8326 		printf("corrupt_text_addr: !vmp_pmapped\n");
8327 		kr = KERN_FAILURE;
8328 	}
8329 
8330 	if (kr == KERN_SUCCESS) {
8331 		printf("corrupt_text_addr: using physaddr 0x%llx\n", (long long)pa);
8332 		kr = pmap_test_text_corruption(pa);
8333 		if (kr != KERN_SUCCESS) {
8334 			printf("corrupt_text_addr: pmap error %d\n", kr);
8335 		} else {
8336 			corruption_test_va = va;
8337 		}
8338 	} else {
8339 		printf("corrupt_text_addr: object %p\n", object);
8340 		printf("corrupt_text_addr: offset 0x%llx\n", (uint64_t)offset);
8341 		printf("corrupt_text_addr: va 0x%llx\n", (uint64_t)va);
8342 		printf("corrupt_text_addr: vm_object_trunc_page(va) 0x%llx\n", (uint64_t)vm_object_trunc_page(va));
8343 		printf("corrupt_text_addr: vm_page_t %p\n", page);
8344 		printf("corrupt_text_addr: ptoa(PHYS_PAGE) 0x%llx\n", (uint64_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page)));
8345 		printf("corrupt_text_addr: using physaddr 0x%llx\n", (uint64_t)pa);
8346 	}
8347 
8348 	if (object != VM_OBJECT_NULL) {
8349 		vm_object_unlock(object);
8350 	}
8351 	return kr;
8352 }
8353 
8354 #endif /* DEBUG || DEVELOPMENT */
8355